├── JData_第一炉香_12_代码运行说明 解题思路.pdf
├── start.sh
├── merge_result.py
├── README.md
├── preprocessing.py
├── Umodel_1.py
└── Umodel_2.py

--------------------------------------------------------------------------------
/JData_第一炉香_12_代码运行说明 解题思路.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hecongqing/2017-jdata-competition/HEAD/JData_第一炉香_12_代码运行说明 解题思路.pdf

--------------------------------------------------------------------------------
/start.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | python preprocessing.py
3 | python USModel.py
4 | python Umodel_0.py
5 | python Umodel_1.py
6 | python Umodel_2.py
7 | python merge_result.py
8 | 
9 | 

--------------------------------------------------------------------------------
/merge_result.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #-*-coding:utf-8-*-
3 | import pandas as pd
4 | # Blend the three U-model scores with fixed weights and keep the top 700 users
5 | def u_id():
6 |     df1=pd.read_csv('./sub/Umodel_0.csv')
7 |     df1.columns=['user_id','label1']
8 | 
9 |     df2=pd.read_csv('./sub/Umodel_1.csv')
10 |     df2.columns=['user_id','label2']
11 | 
12 |     df3=pd.read_csv('./sub/Umodel_2.csv')
13 |     df3.columns=['user_id','label3']
14 | 
15 |     df=pd.merge(df1,df2,on='user_id',how='outer')
16 |     df=pd.merge(df,df3,on='user_id',how='outer')
17 |     df['label']=0.3*df['label1']+0.3*df['label2']+0.4*df['label3']
18 |     df.sort_values(by=['label'],ascending=[0],inplace=True)
19 |     df=df[['user_id','label']].reset_index(drop=True)
20 |     df=df[['user_id']]
21 |     return df[:700]
22 | # Take the top 325 users from the US model's output
23 | def us_id():
24 |     df=pd.read_csv('./sub/USModel.csv')
25 |     df=df[['user_id']]
26 |     return df[:325]
27 | # Union of the U-model top 700 and the US-model top 325 (802 unique users)
28 | def merge_u_us():
29 |     u = u_id()
30 |     us = us_id()
31 |     df=pd.merge(u,us,on='user_id',how='outer')
32 |     df=df.drop_duplicates('user_id')
33 |     return df
34 | 
35 | # Join the 802 users with the US model's ['user_id','sku_id'] pairs to build the submission
36 | def result():
37 |     u = merge_u_us()
38 |     us=pd.read_csv('./sub/USModel.csv')
39 |     us=us[['user_id','sku_id']]
40 |     us=us.astype('int')
41 |     result=pd.merge(u,us,how='left',on='user_id')
42 |     print('===========>>> printing final result:')
43 |     result=result.fillna(0)
44 |     result=result.astype('int')
45 | 
46 |     result.to_csv('./sub/best_result.csv',index=False)
47 |     return result
48 | 
49 | print(result())
50 | 

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 2017 JData Competition
2 | High-potential-user purchase-intent prediction - rank 12
3 | 
4 | # Competition page
5 | 
6 | https://www.datafountain.cn/competitions/247/details/data-evaluation
7 | 
8 | # Task
9 | The contest is built on real (anonymized) user, product, and behavior data from JD.com. Teams apply data-mining and machine-learning techniques to build a model of users' purchase behavior and
10 | output matches between high-potential users and target products, providing high-quality audiences for precision marketing. The organizers also hope teams will uncover the latent meaning behind the data and give e-commerce users a
11 | simpler, faster, and easier shopping experience.
12 | Participants use historical sales data for products in several JD categories to build a model that predicts users' purchase intent, over the next 5 days, for products in a target category. For every user appearing in the
13 | training set, the model must predict whether that user buys a product from the target category within the next 5 days and, if so, the SKU_ID of the purchased product. Submissions are scored with a weighted metric.
14 | 
15 | # Scoring
16 | The submitted result file contains purchase-intent predictions for all users. Each user's prediction has two parts:
17 | 
18 | 1. Whether the user orders a product from P between 2016-04-16 and 2016-04-20. The file contains only users predicted to order; users predicted not to order must be omitted. If the prediction is correct, the evaluator sets label=1, otherwise label=0.
19 | 
20 | 2. If an order is predicted, the ordered sku_id (submit exactly one sku_id). If the sku_id is correct, the evaluator sets pred=1, otherwise pred=0.
21 | 
22 | The submission is scored as:
23 | 
24 |     Score = 0.4 * F11 + 0.6 * F12
25 | 
26 | The two F1 terms are defined as:
27 | 
28 |     F11 = 6 * Recall * Precise / (5 * Recall + Precise)
29 | 
30 |     F12 = 5 * Recall * Precise / (2 * Recall + 3 * Precise)
31 | 
32 | where Precise is precision and Recall is recall; F11 is the F1 over the label (user) predictions and F12 is the F1 over the pred (user-sku) predictions.
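
The score is easy to reproduce from these formulas. A minimal sketch (not part of the repository; the four precision/recall inputs are assumed to be computed by the evaluator):

```python
def jdata_score(label_precise, label_recall, pred_precise, pred_recall):
    """Weighted competition score: 0.4 * F11 + 0.6 * F12."""
    f11 = (6 * label_recall * label_precise / (5 * label_recall + label_precise)
           if label_recall + label_precise > 0 else 0.0)
    f12 = (5 * pred_recall * pred_precise / (2 * pred_recall + 3 * pred_precise)
           if pred_recall + pred_precise > 0 else 0.0)
    return 0.4 * f11 + 0.6 * f12
```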
33 | 
--------------------------------------------------------------------------------
/preprocessing.py:
--------------------------------------------------------------------------------
1 | 
2 | # coding: utf-8
3 | 
4 | # In[1]:
5 | 
6 | import pandas as pd
7 | import numpy as np
8 | 
9 | path = './'
10 | 
11 | def concat_action():
12 |     action1 = pd.read_csv(path+'/data/JData_Action_201602.csv')
13 |     action2 = pd.read_csv(path+'/data/JData_Action_201603.csv')
14 |     action3 = pd.read_csv(path+'/data/JData_Action_201604.csv')
15 |     action = pd.concat([action1,action2,action3]).sort_values(by='time')
16 |     action.to_csv('./data/JData_Action.csv', index=False)
17 | 
18 | def map_user_reg(x):
19 |     if d >= 0 and d <= 3:
30 |         d = 1
31 |     elif d > 3 and d <= 6:
32 |         d = 2
33 |     elif d > 6 and d <= 12:
34 |         d = 3
35 |     elif d > 12 and d <= 24:
36 |         d = 4
37 |     elif d > 24 and d <= 48:
38 |         d = 5
39 |     else:
40 |         d = 6
41 |     return d
42 | 
43 | def user_process():
44 |     user = pd.read_csv(path + '/data/JData_User.csv', encoding='gbk', parse_dates=[4])
45 |     user = user.drop_duplicates('user_id')
46 |     #user = user[user['user_reg_tm']
112 |         action_1 = action_1[(action_1.time >= start_date) & (action_1.time < end_date)]
113 |         action_2 = get_actions_2()
114 |         action_2 = action_2[(action_2.time >= start_date) & (action_2.time < end_date)]
115 |         actions = pd.concat([action_1, action_2])
116 |         action_3 = get_actions_3()
117 |         action_3 = action_3[(action_3.time >= start_date) & (action_3.time < end_date)]
118 |         actions = pd.concat([actions, action_3])  # type: pd.DataFrame
119 |         actions = actions[(actions.time >= start_date) & (actions.time < end_date)]
120 |         actions.to_csv(dump_path, index=False)
121 |         # actions['user_id']=actions['user_id'].astype('int')
122 |     return actions
123 | 
124 | # Number of whole days between timestamp x and end_date
125 | def get_day_chaju(x, end_date):
126 |     # x=x.split(' ')[0]
127 |     x = datetime.strptime(x, '%Y-%m-%d %H:%M:%S')
128 |     end_date = datetime.strptime(end_date, '%Y-%m-%d')
129 |     return (end_date - x).days
130 | 
131 | 
132 | def get_action_feat(start_date, end_date,k):
133 |     dump_path = './cache/u_action_%s_%s_%s.csv' % (start_date, end_date,k)
134 |     if os.path.exists(dump_path):
135 |         actions = pd.read_csv(dump_path)
136 |     else:
137 |         start_days=pd.to_datetime(end_date)-timedelta(days=k)
138 |         start_days=str(start_days).split(' ')[0]
139 |         actions = get_actions(start_days, end_date)
140 |         actions = actions[['user_id', 'type']]
141 |         df = pd.get_dummies(actions['type'], prefix='type')
142 |         actions = pd.concat([actions, df], axis=1)  # type: pd.DataFrame
143 |         actions = actions.groupby('user_id', as_index=False).sum()
144 |         min_max_scaler = preprocessing.MinMaxScaler()
145 |         df = min_max_scaler.fit_transform(actions.drop(['user_id','type'],axis=1).values)
146 |         df = pd.DataFrame(df)
147 |         df.columns=['u_action_'+str(k)+'_'+str(i) for i in range(1,df.shape[1]+1)]
148 |         actions = pd.concat([actions[['user_id']], df], axis=1)
149 |         actions.to_csv(dump_path, index=False)
150 |     return actions
151 | 
152 | 
153 | 
154 | 
155 | 
156 | 
157 | # User action-to-purchase conversion ratios
158 | def get_action_user_feat1(start_date, end_date):
159 |     feature = ['user_id', 'user_action_1_ratio', 'user_action_2_ratio', 'user_action_3_ratio',
160 |                'user_action_5_ratio', 'user_action_6_ratio']
161 |     dump_path = './cache/user_feat_accumulate_xiugai_%s_%s.csv' % (start_date, end_date)
162 |     if os.path.exists(dump_path):
163 |         actions = pd.read_csv(dump_path)
164 |     else:
165 |         actions = get_actions(start_date, end_date)
166 |         df = pd.get_dummies(actions['type'], prefix='action')
167 |         actions = pd.concat([actions['user_id'], df], axis=1)
168 | 
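# The groupby-sum below collapses the one-hot action columns into per-user
# counts (action_1 .. action_6). The ratio features then divide purchases
# (action_4) by the other counts (ratio 3 divides cart-deletes by cart-adds),
# so a zero denominator yields inf, which downstream consumers must handle.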
actions = actions.groupby(['user_id'], as_index=False).sum() 169 | actions['user_action_1_ratio'] = actions['action_4'] / actions['action_1'] 170 | actions['user_action_2_ratio'] = actions['action_4'] / actions['action_2'] 171 | # actions['user_action_3_ratio'] = actions['action_4'] / actions['action_3'] 172 | actions['user_action_3_ratio'] = actions['action_3'] / actions['action_2'] 173 | actions['user_action_5_ratio'] = actions['action_4'] / actions['action_5'] 174 | actions['user_action_6_ratio'] = actions['action_4'] / actions['action_6'] 175 | # 3.购物车删除 176 | actions = actions[feature] 177 | actions.to_csv(dump_path, index=False) 178 | return actions 179 | 180 | 181 | # print get_accumulate_user_feat('2016-03-10','2016-04-11') 182 | # 用户购买前访问天数 183 | # 用户购买/加入购物车/关注前访问天数 184 | def get_action_user_feat2(start_date, end_date): 185 | dump_path = './cache/user_feat2_after_%s_%s.csv' % (start_date, end_date) 186 | if os.path.exists(dump_path): 187 | actions = pd.read_csv(dump_path) 188 | 189 | else: 190 | # 用户购买前访问天数 191 | def user_feat_2_1(start_date, end_date): 192 | actions = get_actions(start_date, end_date)[['user_id', 'type', 'time']] 193 | actions['time'] = actions['time'].map(lambda x: x.split(' ')[0]) 194 | # actions=actions.drop_duplicates(['user_id','time'],keep='first') 195 | visit = actions[actions['type'] == 1] 196 | visit = visit.drop_duplicates(['user_id', 'time'], keep='first') 197 | del visit['time'] 198 | del actions['time'] 199 | visit = visit.groupby('user_id', as_index=False).count() 200 | visit.columns = ['user_id', 'visit'] 201 | buy = actions[actions['type'] == 4] 202 | buy = buy.groupby('user_id', as_index=False).count() 203 | buy.columns = ['user_id', 'buy'] 204 | actions = pd.merge(visit, buy, on='user_id', how='left') 205 | actions['visit_day_before_buy'] = actions['visit'] / actions['buy'] 206 | del actions['buy'] 207 | del actions['visit'] 208 | return actions 209 | 210 | # 用户加入购物车前访问天数 211 | def user_feat_2_2(start_date, end_date): 212 | actions = get_actions(start_date, end_date)[['user_id', 'type', 'time']] 213 | actions['time'] = actions['time'].map(lambda x: x.split(' ')[0]) 214 | # actions=actions.drop_duplicates(['user_id','time'],keep='first') 215 | visit = actions[actions['type'] == 1] 216 | visit = visit.drop_duplicates(['user_id', 'time'], keep='first') 217 | del visit['time'] 218 | del actions['time'] 219 | visit = visit.groupby('user_id', as_index=False).count() 220 | visit.columns = ['user_id', 'visit'] 221 | addtoshopping = actions[actions['type'] == 2] 222 | addtoshopping = addtoshopping.groupby('user_id', as_index=False).count() 223 | addtoshopping.columns = ['user_id', 'addtoshopping'] 224 | actions = pd.merge(visit, addtoshopping, on='user_id', how='left') 225 | actions['visit_day_before_addtoshopping'] = actions['visit'] / actions['addtoshopping'] 226 | del actions['addtoshopping'] 227 | del actions['visit'] 228 | return actions 229 | 230 | # 用户关注前访问天数 231 | def user_feat_2_3(start_date, end_date): 232 | actions = get_actions(start_date, end_date)[['user_id', 'type', 'time']] 233 | actions['time'] = actions['time'].map(lambda x: x.split(' ')[0]) 234 | # actions=actions.drop_duplicates(['user_id','time'],keep='first') 235 | visit = actions[actions['type'] == 1] 236 | visit = visit.drop_duplicates(['user_id', 'time'], keep='first') 237 | del visit['time'] 238 | del actions['time'] 239 | visit = visit.groupby('user_id', as_index=False).count() 240 | visit.columns = ['user_id', 'visit'] 241 | guanzhu = actions[actions['type'] == 5] 242 | 
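# "guanzhu" = follow/favorite. Assumed action-type encoding used throughout
# this file: 1 browse, 2 add-to-cart, 3 cart-delete, 4 purchase, 5 follow,
# 6 click.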
guanzhu = guanzhu.groupby('user_id', as_index=False).count() 243 | guanzhu.columns = ['user_id', 'guanzhu'] 244 | actions = pd.merge(visit, guanzhu, on='user_id', how='left') 245 | actions['visit_day_before_guanzhu'] = actions['visit'] / actions['guanzhu'] 246 | del actions['guanzhu'] 247 | del actions['visit'] 248 | return actions 249 | 250 | # 用户购买前加入购物车天数 251 | def user_feat_2_4(start_date, end_date): 252 | actions = get_actions(start_date, end_date)[['user_id', 'type', 'time']] 253 | actions['time'] = actions['time'].map(lambda x: x.split(' ')[0]) 254 | # actions=actions.drop_duplicates(['user_id','time'],keep='first') 255 | addtoshopping = actions[actions['type'] == 2] 256 | addtoshopping = addtoshopping.drop_duplicates(['user_id', 'time'], keep='first') 257 | del addtoshopping['time'] 258 | del actions['time'] 259 | addtoshopping = addtoshopping.groupby('user_id', as_index=False).count() 260 | addtoshopping.columns = ['user_id', 'addtoshopping'] 261 | buy = actions[actions['type'] == 4] 262 | buy = buy.groupby('user_id', as_index=False).count() 263 | buy.columns = ['user_id', 'buy'] 264 | actions = pd.merge(addtoshopping, buy, on='user_id', how='left') 265 | actions['addtoshopping_day_before_buy'] = actions['addtoshopping'] / actions['buy'] 266 | del actions['buy'] 267 | del actions['addtoshopping'] 268 | return actions 269 | 270 | # 用户购买前关注天数 271 | def user_feat_2_5(start_date, end_date): 272 | actions = get_actions(start_date, end_date)[['user_id', 'type', 'time']] 273 | actions['time'] = actions['time'].map(lambda x: x.split(' ')[0]) 274 | guanzhu = actions[actions['type'] == 5] 275 | guanzhu = guanzhu.drop_duplicates(['user_id', 'time'], keep='first') 276 | del guanzhu['time'] 277 | del actions['time'] 278 | guanzhu = guanzhu.groupby('user_id', as_index=False).count() 279 | guanzhu.columns = ['user_id', 'guanzhu'] 280 | buy = actions[actions['type'] == 4] 281 | buy = buy.groupby('user_id', as_index=False).count() 282 | buy.columns = ['user_id', 'buy'] 283 | actions = pd.merge(guanzhu, buy, on='user_id', how='left') 284 | actions['guanzhu_day_before_buy'] = actions['guanzhu'] / actions['buy'] 285 | del actions['buy'] 286 | del actions['guanzhu'] 287 | return actions 288 | 289 | actions = pd.merge(user_feat_2_1(start_date, end_date), user_feat_2_2(start_date, end_date), on='user_id', 290 | how='outer') 291 | actions = pd.merge(actions, user_feat_2_3(start_date, end_date), on='user_id', how='outer') 292 | actions = pd.merge(actions, user_feat_2_4(start_date, end_date), on='user_id', how='outer') 293 | actions = pd.merge(actions, user_feat_2_5(start_date, end_date), on='user_id', how='outer') 294 | user_id = actions['user_id'] 295 | del actions['user_id'] 296 | actions = actions.fillna(0) 297 | min_max_scale = preprocessing.MinMaxScaler() 298 | actions = min_max_scale.fit_transform(actions.values) 299 | actions = pd.concat([user_id, pd.DataFrame(actions)], axis=1) 300 | actions.to_csv(dump_path, index=False) 301 | actions.columns = ['user_id'] + ['u_feat2_' + str(i) for i in range(1, actions.shape[1])] 302 | return actions 303 | 304 | 305 | 306 | 307 | # # 用户总购买品牌数 308 | # def get_action_user_feat5(start_date, end_date): 309 | # dump_path = './cache/user_feat5_%s_%s.csv' % (start_date, end_date) 310 | # if os.path.exists(dump_path): 311 | # actions = pd.read_csv(dump_path) 312 | # else: 313 | # actions = get_actions(start_date, end_date)[['user_id', 'sku_id']] 314 | # actions = actions.drop_duplicates(['user_id', 'sku_id'], keep='first') 315 | # actions = actions.groupby('user_id', 
as_index=False).count() 316 | # actions.columns = ['user_id', 'sku_num'] 317 | # actions['sku_num'] = actions['sku_num'].astype('float') 318 | # actions['sku_num'] = actions['sku_num'].map( 319 | # lambda x: (x - actions['sku_num'].min()) / (actions['sku_num'].max() - actions['sku_num'].min())) 320 | # actions.to_csv(dump_path, index=False) 321 | # actions.columns = ['user_id'] + ['u_feat5_' + str(i) for i in range(1, actions.shape[1])] 322 | # return actions 323 | 324 | 325 | # 用户平均访问间隔 326 | def get_action_user_feat6(start_date, end_date): 327 | dump_path = './cache/user_feat6_%s_%s.csv' % (start_date, end_date) 328 | if os.path.exists(dump_path): 329 | actions = pd.read_csv(dump_path) 330 | else: 331 | 332 | df = get_actions(start_date, end_date)[['user_id', 'time']] 333 | # df['user_id']=df['user_id'].astype('int') 334 | df['time'] = df['time'].map(lambda x: x.split(' ')[0]) 335 | df = df.drop_duplicates(['user_id', 'time'], keep='first') 336 | df['time'] = df['time'].map(lambda x: datetime.strptime(x, '%Y-%m-%d')) 337 | actions = df.groupby('user_id', as_index=False).agg(lambda x: x['time'].diff().mean()) 338 | actions['avg_visit'] = actions['time'].dt.days 339 | del actions['time'] 340 | actions.to_csv(dump_path, index=False) 341 | actions.columns = ['user_id'] + ['u_feat6_' + str(i) for i in range(1, actions.shape[1])] 342 | return actions 343 | 344 | 345 | # 用户平均六种行为的访问间隔 346 | def get_action_user_feat6_six(start_date, end_date): 347 | dump_path = './cache/user_feat6_six_%s_%s.csv' % (start_date, end_date) 348 | if os.path.exists(dump_path): 349 | actions = pd.read_csv(dump_path) 350 | else: 351 | df = get_actions(start_date, end_date)[['user_id', 'time', 'type']] 352 | df['time'] = df['time'].map(lambda x: (-1) * get_day_chaju(x, start_date)) 353 | df = df.drop_duplicates(['user_id', 'time', 'type'], keep='first') 354 | actions = df.groupby(['user_id', 'type']).agg(lambda x: np.diff(x).mean()) 355 | actions = actions.unstack() 356 | actions.columns = list(range(actions.shape[1])) 357 | actions = actions.reset_index() 358 | actions.to_csv(dump_path, index=False) 359 | actions.columns = ['user_id'] + ['u_feat6_six_' + str(i) for i in range(1, actions.shape[1])] 360 | return actions 361 | 362 | 363 | # 用户购买频率 364 | def get_action_user_feat7(start_date, end_date): 365 | dump_path = './cache/user_feat7_six_%s_%s.csv' % (start_date, end_date) 366 | if os.path.exists(dump_path): 367 | actions = pd.read_csv(dump_path) 368 | else: 369 | df = get_actions(start_date, end_date)[['user_id', 'type', 'time']] 370 | actions = df.groupby(['user_id', 'type'], as_index=False).count() 371 | 372 | time_min = df.groupby(['user_id', 'type'], as_index=False).min() 373 | time_max = df.groupby(['user_id', 'type'], as_index=False).max() 374 | 375 | time_cha = pd.merge(time_max, time_min, on=['user_id', 'type'], how='left') 376 | time_cha['time_x'] = time_cha['time_x'].map(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S')) 377 | time_cha['time_y'] = time_cha['time_y'].map(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S')) 378 | 379 | time_cha['cha_hour'] = 1 + (time_cha['time_x'] - time_cha['time_y']).dt.days * 24 + (time_cha['time_x'] - 380 | time_cha[ 381 | 'time_y']).dt.seconds // 3600 382 | del time_cha['time_x'] 383 | del time_cha['time_y'] 384 | # time_cha=time_cha.fillna(1) 385 | 386 | actions = pd.merge(time_cha, actions, on=['user_id', 'type'], how="left") 387 | actions = actions.groupby(['user_id', 'type']).sum() 388 | actions['cnt/time'] = actions['time'] / actions["cha_hour"] 389 | actions = 
actions.unstack() 390 | actions.columns = list(range(actions.shape[1])) 391 | actions = actions.reset_index() 392 | actions = actions.fillna(0) 393 | actions.to_csv(dump_path, index=False) 394 | actions.columns = ['user_id'] + ['u_feat7_' + str(i) for i in range(1, actions.shape[1])] 395 | return actions 396 | 397 | 398 | def user_top_k_0_1(start_date, end_date): 399 | actions = get_actions(start_date, end_date) 400 | actions = actions[['user_id', 'sku_id', 'type']] 401 | df = pd.get_dummies(actions['type'], prefix='%s-%s-action' % (start_date, end_date)) 402 | actions = pd.concat([actions, df], axis=1) # type: pd.DataFrame 403 | actions = actions.groupby('user_id', as_index=False).sum() 404 | del actions['type'] 405 | del actions['sku_id'] 406 | user_id = actions['user_id'] 407 | del actions['user_id'] 408 | actions = actions.applymap(lambda x: 1 if x > 0 else 0) 409 | actions = pd.concat([user_id, actions], axis=1) 410 | return actions 411 | 412 | 413 | # 用户最近K天行为0/1提取 414 | def get_action_user_feat8(start_date, end_date): 415 | dump_path = './cache/user_feat8_%s_%s.csv' % (start_date, end_date) 416 | if os.path.exists(dump_path): 417 | actions = pd.read_csv(dump_path) 418 | else: 419 | actions = None 420 | for i in (1, 2, 3, 4, 5, 6, 7, 15, 30): 421 | print(i) 422 | start_days = datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=i) 423 | start_days = start_days.strftime('%Y-%m-%d') 424 | if actions is None: 425 | actions = user_top_k_0_1(start_days, end_date) 426 | else: 427 | actions = pd.merge(actions, user_top_k_0_1(start_days, end_date), how='outer', on='user_id') 428 | actions.to_csv(dump_path, index=False) 429 | actions.columns = ['user_id'] + ['u_feat8_' + str(i) for i in range(1, actions.shape[1])] 430 | return actions 431 | 432 | 433 | # 获取用户的重复购买率 434 | def get_action_user_feat8_2(start_date, end_date): 435 | dump_path = './cache/product_feat8_2_%s_%s.csv' % (start_date, end_date) 436 | if os.path.exists(dump_path): 437 | actions = pd.read_csv(dump_path) 438 | else: 439 | df = get_actions(start_date, end_date)[['user_id', 'sku_id', 'type']] 440 | df = df[df['type'] == 4] # 购买的行为 441 | df = df.groupby(['user_id', 'sku_id'], as_index=False).count() 442 | df.columns = ['user_id', 'sku_id', 'count1'] 443 | df['count1'] = df['count1'].map(lambda x: 1 if x > 1 else 0) 444 | grouped = df.groupby(['user_id'], as_index=False) 445 | actions = grouped.count()[['user_id', 'count1']] 446 | actions.columns = ['user_id', 'count'] 447 | re_count = grouped.sum()[['user_id', 'count1']] 448 | re_count.columns = ['user_id', 're_count'] 449 | actions = pd.merge(actions, re_count, on='user_id', how='left') 450 | re_buy_rate = actions['re_count'] / actions['count'] 451 | actions = pd.concat([actions['user_id'], re_buy_rate], axis=1) 452 | actions.columns = ['user_id', 're_buy_rate'] 453 | actions.to_csv(dump_path, index=False) 454 | actions.columns = ['user_id'] + ['u_feat8_2_' + str(i) for i in range(1, actions.shape[1])] 455 | return actions 456 | 457 | 458 | # 获取最近一次行为的时间距离当前时间的差距 459 | def get_action_user_feat9(start_date, end_date): 460 | dump_path = './cache/user_feat9_%s_%s.csv' % (start_date, end_date) 461 | if os.path.exists(dump_path): 462 | actions = pd.read_csv(dump_path) 463 | else: 464 | df = get_actions(start_date, end_date)[['user_id', 'time', 'type']] 465 | # df['time'] = df['time'].map(lambda x: (-1)*get_day_chaju(x,start_date)) 466 | df = df.drop_duplicates(['user_id', 'type'], keep='last') 467 | df['time'] = df['time'].map(lambda x: get_day_chaju(x, end_date) + 1) 468 | 
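# The action log is time-sorted, so drop_duplicates(keep='last') above keeps
# each user's most recent event per type; after unstacking, every column is
# "days since the last action of that type", with 30 filled in for types the
# user never performed.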
actions = df.groupby(['user_id', 'type']).sum() 469 | actions = actions.unstack() 470 | actions.columns = list(range(actions.shape[1])) 471 | actions = actions.reset_index() 472 | actions = actions.fillna(30) 473 | actions.to_csv(dump_path, index=False) 474 | actions.columns = ['user_id'] + ['u_feat9_' + str(i) for i in range(1, actions.shape[1])] 475 | return actions 476 | 477 | 478 | # 获取最后一次行为的次数并且进行归一化 479 | def get_action_user_feat10(start_date, end_date): 480 | dump_path = './cache/user_feat10_%s_%s.csv' % (start_date, end_date) 481 | if os.path.exists(dump_path): 482 | actions = pd.read_csv(dump_path) 483 | else: 484 | 485 | df = get_actions(start_date, end_date)[['user_id', 'time', 'type']] 486 | df['time'] = df['time'].map(lambda x: get_day_chaju(x, end_date) + 1) 487 | 488 | idx = df.groupby(['user_id', 'type'])['time'].transform(min) 489 | idx1 = idx == df['time'] 490 | actions = df[idx1].groupby(["user_id", "type"]).count() 491 | actions = actions.unstack() 492 | actions.columns = list(range(actions.shape[1])) 493 | actions = actions.fillna(0) 494 | actions = actions.reset_index() 495 | 496 | user_sku = actions[['user_id']] 497 | del actions['user_id'] 498 | min_max_scaler = preprocessing.MinMaxScaler() 499 | actions = min_max_scaler.fit_transform(actions.values) 500 | actions = pd.DataFrame(actions) 501 | actions = pd.concat([user_sku, actions], axis=1) 502 | 503 | actions.to_csv(dump_path, index=False) 504 | actions.columns = ['user_id'] + ['u_feat10_' + str(i) for i in range(1, actions.shape[1])] 505 | return actions 506 | 507 | 508 | # 获取人物该层级最后一层的各种行为的统计数量 509 | def get_action_user_feat11(start_date, end_date, n): 510 | dump_path = './cache/user_feat11_%s_%s_%s.csv' % (start_date, end_date, n) 511 | if os.path.exists(dump_path): 512 | actions = pd.read_csv(dump_path) 513 | else: 514 | 515 | df = get_actions(start_date, end_date)[['user_id', 'time', 'type']] 516 | df['time'] = df['time'].map(lambda x: get_day_chaju(x, end_date) // n) 517 | df = df[df['time'] == 0] 518 | del df['time'] 519 | temp = pd.get_dummies(df['type'], prefix='type') 520 | del df['type'] 521 | actions = pd.concat([df, temp], axis=1) 522 | actions = actions.groupby(['user_id'], as_index=False).sum() 523 | user_sku = actions[['user_id']] 524 | del actions['user_id'] 525 | min_max_scaler = preprocessing.MinMaxScaler() 526 | actions = min_max_scaler.fit_transform(actions.values) 527 | actions = pd.DataFrame(actions) 528 | actions = pd.concat([user_sku, actions], axis=1) 529 | actions.to_csv(dump_path, index=False) 530 | actions.columns = ['user_id'] + ['u_feat11_' + str(n) + '_' + str(i) for i in range(1, actions.shape[1])] 531 | return actions 532 | 533 | 534 | def get_action_user_feat12(start_date, end_date): 535 | dump_path = './cache/user_feat12_%s_%s.csv' % (start_date, end_date) 536 | if os.path.exists(dump_path): 537 | actions = pd.read_csv(dump_path) 538 | else: 539 | actions = get_actions(start_date, end_date)[['user_id', 'time', 'type']] 540 | actions['time'] = actions['time'].map(lambda x: x.split(' ')[0]) 541 | actions = actions.drop_duplicates(['user_id', 'time', 'type'], keep='first') 542 | actions['day'] = actions['time'].map( 543 | lambda x: (datetime.strptime(end_date, '%Y-%m-%d') - datetime.strptime(x, '%Y-%m-%d')).days) 544 | result = None 545 | for i in (2, 3, 7, 14, 28): # 层级个数 546 | print ('i%s' % i) 547 | actions['level%s' % i] = actions['day'].map(lambda x: x // i) 548 | a=set(actions['level%s' % i].tolist()) 549 | for j in (1, 2,3,4, 5, 6): # type 550 | print ('j%s' % j) 551 | df = 
actions[actions['type'] == j][['user_id', 'level%s' % i, 'time']] 552 | df = df.groupby(['user_id', 'level%s' % i]).count() 553 | df = df.unstack() 554 | b=df.columns.levels[1].tolist() 555 | df.columns = ['u_feat12_' + str('level%s_' % i) + str(j) + '_' + str(k) for k in df.columns.levels[1].tolist()] 556 | if len(list(a-set(b)))!=0: 557 | c=list(a-set(b)) 558 | for k in c: 559 | df['u_feat12_'+str('level%s_' % i)+str(j)+'_'+ str(k)]=0 560 | columns=df.columns 561 | dict={} 562 | for column in columns: 563 | k=int(column.split('_')[-1]) 564 | dict[column]=k 565 | columns=sorted(dict.items(),key=lambda x: x[1]) 566 | columns=[(columns[t])[0] for t in range(len(columns))] 567 | df=df[columns] 568 | df = df.reset_index() 569 | if result is None: 570 | result = df 571 | else: 572 | result = pd.merge(result, df, on='user_id', how='left') 573 | columns = result.columns 574 | user_id = result['user_id'] 575 | del result['user_id'] 576 | actions = result.fillna(0) 577 | 578 | min_max_scaler = preprocessing.MinMaxScaler() 579 | actions = min_max_scaler.fit_transform(actions.values) 580 | actions = pd.DataFrame(actions) 581 | actions = pd.concat([user_id, actions], axis=1) 582 | actions.columns=columns 583 | actions.to_csv(dump_path, index=False) 584 | return actions 585 | 586 | 587 | 588 | # 层级的天数 589 | def get_action_user_feat13(start_date, end_date, n): 590 | dump_path = './cache/user_feat13_%s_%s_%s.csv' % (start_date, end_date, n) 591 | if os.path.exists(dump_path): 592 | actions = pd.read_csv(dump_path) 593 | else: 594 | df = get_actions(start_date, end_date)[['user_id', 'time', 'type']] 595 | df['time'] = df['time'].map(lambda x: get_day_chaju(x, end_date) // n) 596 | df = df.drop_duplicates(['user_id', 'type', 'time'], keep='first') 597 | actions = df.groupby(['user_id', 'type']).count() 598 | actions = actions.unstack() 599 | actions.columns = list(range(actions.shape[1])) 600 | actions = actions.fillna(0) 601 | actions = actions.reset_index() 602 | user_sku = actions[['user_id']] 603 | del actions['user_id'] 604 | min_max_scaler = preprocessing.MinMaxScaler() 605 | actions = min_max_scaler.fit_transform(actions.values) 606 | actions = pd.DataFrame(actions) 607 | actions = pd.concat([user_sku, actions], axis=1) 608 | actions.to_csv(dump_path, index=False) 609 | actions.columns = ['user_id'] + ['u_feat13_' + str(n) + '_' + str(i) for i in range(1, actions.shape[1])] 610 | return actions 611 | 612 | 613 | def get_action_user_feat14(start_date, end_date): 614 | dump_path = './cache/user_feat14_%s_%s.csv' % (start_date, end_date) 615 | if os.path.exists(dump_path): 616 | actions = pd.read_csv(dump_path) 617 | else: 618 | n = 5 619 | df = get_actions(start_date, end_date)[['user_id', 'time', 'type']] 620 | df = df[df['type'] == 4][['user_id', 'time']] 621 | df['time'] = df['time'].map(lambda x: get_day_chaju(x, end_date) // n) 622 | days = np.max(df['time']) 623 | 624 | df['cnt'] = 0 625 | actions = df.groupby(['user_id', 'time']).count() 626 | 627 | actions = actions.unstack() 628 | 629 | actions.columns = list(range(actions.shape[1])) 630 | actions = actions.reset_index() 631 | 632 | actions = actions.fillna(0) 633 | user_sku = actions[['user_id']] 634 | del actions['user_id'] 635 | min_max_scaler = preprocessing.MinMaxScaler() 636 | actions = min_max_scaler.fit_transform(actions.values) 637 | actions = pd.DataFrame(actions) 638 | actions = pd.concat([user_sku, actions], axis=1) 639 | actions.to_csv(dump_path, index=False) 640 | actions.columns = ['user_id'] + ['u_feat14_' + str(i) for i in 
range(1, actions.shape[1])] 641 | return actions 642 | 643 | 644 | # 用户购买/加入购物车/关注前访问次数 645 | def get_action_user_feat15(start_date, end_date): 646 | dump_path = './cache/user_feat15_%s_%s.csv' % (start_date, end_date) 647 | if os.path.exists(dump_path): 648 | actions = pd.read_csv(dump_path) 649 | else: 650 | # 用户购买前访问次数 651 | def user_feat_15_1(start_date, end_date): 652 | actions = get_actions(start_date, end_date)[['user_id', 'type']] 653 | visit = actions[actions['type'] == 1] 654 | visit = visit.groupby('user_id', as_index=False).count() 655 | visit.columns = ['user_id', 'visit'] 656 | buy = actions[actions['type'] == 4] 657 | buy = buy.groupby('user_id', as_index=False).count() 658 | buy.columns = ['user_id', 'buy'] 659 | actions = pd.merge(visit, buy, on='user_id', how='left') 660 | actions['visit_num_before_buy'] = actions['visit'] / actions['buy'] 661 | del actions['buy'] 662 | del actions['visit'] 663 | return actions 664 | 665 | # 用户加入购物车前访问次数 666 | def user_feat_15_2(start_date, end_date): 667 | actions = get_actions(start_date, end_date)[['user_id', 'type']] 668 | visit = actions[actions['type'] == 1] 669 | visit = visit.groupby('user_id', as_index=False).count() 670 | visit.columns = ['user_id', 'visit'] 671 | addtoshopping = actions[actions['type'] == 2] 672 | addtoshopping = addtoshopping.groupby('user_id', as_index=False).count() 673 | addtoshopping.columns = ['user_id', 'addtoshopping'] 674 | actions = pd.merge(visit, addtoshopping, on='user_id', how='left') 675 | actions['visit_num_before_addtoshopping'] = actions['visit'] / actions['addtoshopping'] 676 | del actions['addtoshopping'] 677 | del actions['visit'] 678 | return actions 679 | 680 | # 用户关注前访问次数 681 | def user_feat_15_3(start_date, end_date): 682 | actions = get_actions(start_date, end_date)[['user_id', 'type']] 683 | visit = actions[actions['type'] == 1] 684 | visit = visit.groupby('user_id', as_index=False).count() 685 | visit.columns = ['user_id', 'visit'] 686 | guanzhu = actions[actions['type'] == 5] 687 | guanzhu = guanzhu.groupby('user_id', as_index=False).count() 688 | guanzhu.columns = ['user_id', 'guanzhu'] 689 | actions = pd.merge(visit, guanzhu, on='user_id', how='left') 690 | actions['visit_num_before_guanzhu'] = actions['visit'] / actions['guanzhu'] 691 | del actions['guanzhu'] 692 | del actions['visit'] 693 | return actions 694 | 695 | # 用户购买前加入购物车次数 696 | def user_feat_15_4(start_date, end_date): 697 | actions = get_actions(start_date, end_date)[['user_id', 'type']] 698 | addtoshopping = actions[actions['type'] == 2] 699 | addtoshopping = addtoshopping.groupby('user_id', as_index=False).count() 700 | addtoshopping.columns = ['user_id', 'addtoshopping'] 701 | buy = actions[actions['type'] == 4] 702 | buy = buy.groupby('user_id', as_index=False).count() 703 | buy.columns = ['user_id', 'buy'] 704 | actions = pd.merge(addtoshopping, buy, on='user_id', how='left') 705 | actions['addtoshopping_num_before_buy'] = actions['addtoshopping'] / actions['buy'] 706 | del actions['buy'] 707 | del actions['addtoshopping'] 708 | return actions 709 | 710 | # 用户购买前关注次数 711 | def user_feat_15_5(start_date, end_date): 712 | actions = get_actions(start_date, end_date)[['user_id', 'type']] 713 | guanzhu = actions[actions['type'] == 5] 714 | guanzhu = guanzhu.groupby('user_id', as_index=False).count() 715 | guanzhu.columns = ['user_id', 'guanzhu'] 716 | buy = actions[actions['type'] == 4] 717 | buy = buy.groupby('user_id', as_index=False).count() 718 | buy.columns = ['user_id', 'buy'] 719 | actions = pd.merge(guanzhu, 
buy, on='user_id', how='left')
720 |             actions['guanzhu_num_before_buy'] = actions['guanzhu'] / actions['buy']
721 |             del actions['buy']
722 |             del actions['guanzhu']
723 |             return actions
724 | 
725 |         actions = pd.merge(user_feat_15_1(start_date, end_date), user_feat_15_2(start_date, end_date), on='user_id',
726 |                            how='outer')
727 |         actions = pd.merge(actions, user_feat_15_3(start_date, end_date), on='user_id', how='outer')
728 |         actions = pd.merge(actions, user_feat_15_4(start_date, end_date), on='user_id', how='outer')
729 |         actions = pd.merge(actions, user_feat_15_5(start_date, end_date), on='user_id', how='outer')
730 |         user_id = actions['user_id']
731 |         del actions['user_id']
732 |         actions = actions.fillna(0)
733 |         min_max_scale = preprocessing.MinMaxScaler()
734 |         actions = min_max_scale.fit_transform(actions.values)
735 |         actions = pd.concat([user_id, pd.DataFrame(actions)], axis=1)
736 | 
737 |         actions.to_csv(dump_path, index=False)
738 |     actions.columns = ['user_id'] + ['u_feat15_' + str(i) for i in range(1, actions.shape[1])]
739 |     return actions
740 | 
741 | 
809 | # Cross features over the user's action types: each type's share of the user's total actions
810 | def get_action_user_feat16(start_date,end_date):
811 |     dump_path = './cache/user_feat16_%s_%s.csv' % (start_date, end_date)
812 |     if os.path.exists(dump_path):
813 |         actions = pd.read_csv(dump_path)
814 |     else:
815 |         actions=get_actions(start_date, end_date)[['user_id', 'type']]
816 |         actions['cnt']=0
817 |         action1 = actions.groupby(['user_id', 'type']).count()
818 |         action1=action1.unstack()
819 |         index_col=list(range(action1.shape[1]))
820 |         action1.columns=index_col
821 |         action1=action1.reset_index()
822 |         action2 = actions.groupby('user_id', as_index=False).count()
823 |         del action2['type']
824 |         action2.columns = ['user_id', 'cnt']
825 |         actions = pd.merge(action1, action2, how='left', on='user_id')
826 |         for i in index_col:
827 |             actions[i] = actions[i] / actions['cnt']
828 |         del actions['cnt']
829 |         actions.to_csv(dump_path,index=False)
830 |     actions.columns = ['user_id'] + ['u_feat16_' + str(i) for i in range(1, actions.shape[1])]
831 |     return actions
832 | 
833 | # Over the last k days: the user's actions on P-set items vs. overall (for k <= 7 the raw subset counts are kept; for k > 7 they are divided by the overall counts)
834 | def get_action_user_feat0509_1_30(start_date,end_date,n):
835 |     dump_path='./cache/user_feat0509_1_30_%s_%s_%s.csv'%(start_date,end_date,n)
836 |     if os.path.exists(dump_path):
837 |         actions = pd.read_csv(dump_path)
838 |     else:
839 | 
840 |         start_days=datetime.strptime(end_date,'%Y-%m-%d')-timedelta(days=n)
841 |         start_days=datetime.strftime(start_days,'%Y-%m-%d')
842 | 
843 |         actions=get_actions(start_days,end_date)[['user_id','sku_id','type']]
844 |         actions_dummy=pd.get_dummies(actions['type'],prefix='actions')
845 |         actions=pd.concat([actions,actions_dummy],axis=1)
846 |         del actions['type']
847 | 
848 |         P = get_basic_product_feat()[['sku_id']]
849 |         P['label']=1
850 |         actions_sub=pd.merge(actions,P,on='sku_id',how='left')
851 |         actions_sub=actions_sub[actions_sub['label']==1]
852 |         del actions_sub['label']
853 | 
854 |         actions_sub=actions_sub.groupby(['user_id'],as_index=False).sum()
855 |         del actions_sub['sku_id']
856 |         actions_all=actions.groupby(['user_id'],as_index=False).sum()
857 |         del actions_all['sku_id']
858 | 
859 |         if n>7:
860 |             actions=pd.merge(actions_all,actions_sub,on=['user_id'],how='left')
861 |             #print actions.head()
862 |             for i in range(1,7):
863 |                 actions['actions_%s'%i]=actions['actions_%s_y'%i]/actions['actions_%s_x'%i]
864 |             #actions=actions[['user_id','actions_1','actions_2','actions_3','actions_4','actions_5','actions_6']]
865 | 
866 |         else:
867 |             actions = pd.merge(actions_all, actions_sub, on=['user_id'], how='left')
868 |         actions.to_csv(dump_path,index=False)
869 |     actions.columns = ['user_id'] + ['u_feat30_' +str(n)+'_'+ str(i) for i in range(1, actions.shape[1])]
870 |     # user_id = actions[['user_id']]
871 |     # del actions['user_id']
872 |     # actions = actions.fillna(0)
873 |     # actions=actions.replace(np.inf,0)
874 |     # # print(actions.head())
875 |     # columns = actions.columns
876 | 
877 |     # min_max_scale = preprocessing.MinMaxScaler()
878 |     # actions=actions.replace(np.inf,0)
879 |     # actions = min_max_scale.fit_transform(actions.values)
880 |     # actions = pd.concat([user_id, pd.DataFrame(actions,columns = columns)], axis=1)
881 | 
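# For n > 7 the actions_%s columns hold the subset/overall ratios computed
# above; for windows of 7 days or less the merged raw counts are returned
# without normalization, as the comment above the function notes.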
return actions 882 | 883 | #用户点击到购买的时间间隔 884 | def get_action_user_feat0515_2_1(start_date,end_date): 885 | dump_path='./cache/get_action_user_feat0515_2_1_%s_%s.csv'%(start_date,end_date) 886 | if os.path.exists(dump_path): 887 | actions = pd.read_csv(dump_path) 888 | else: 889 | actions = get_actions(start_date,end_date) 890 | actions_dianji=actions[actions['type']==6][['user_id','sku_id','time']] 891 | actions_dianji['time_dianji'] = actions_dianji['time'].map(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S')) 892 | actions_dianji = actions_dianji[['user_id', 'sku_id','time_dianji']] 893 | actions_dianji= actions_dianji.drop_duplicates(['user_id', 'sku_id'], keep='first') 894 | 895 | 896 | actions_goumai=actions[actions['type']==4][['user_id','sku_id','time']] 897 | actions_goumai['time_goumai'] = actions_goumai['time'].map(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S')) 898 | actions_goumai = actions_goumai[['user_id', 'sku_id','time_goumai']] 899 | actions_goumai= actions_goumai.drop_duplicates(['user_id', 'sku_id'], keep='last') 900 | 901 | actions = pd.merge(actions_dianji,actions_goumai,on=['user_id','sku_id'],how='inner') 902 | actions['time_jiange']=actions['time_goumai']-actions['time_dianji'] 903 | actions=actions.drop(['sku_id','time_goumai','time_dianji'],axis=1) 904 | actions['time_jiange']=actions['time_jiange'].map(lambda x:x.days*24+x.seconds//3600+1) 905 | 906 | actions_min = actions.groupby('user_id').min().reset_index() 907 | actions_min.columns = ['user_id','time_min'] 908 | # actions_mean = actions.groupby('user_id').mean().reset_index() 909 | # actions_mean.columns = ['user_id','time_mean'] 910 | actions_max = actions.groupby('user_id').max().reset_index() 911 | actions_max.columns = ['user_id','time_max'] 912 | actions=pd.merge(actions_min,actions_max,on='user_id',how='left') 913 | 914 | user_id = actions[['user_id']] 915 | del actions['user_id'] 916 | actions = actions.fillna(0) 917 | columns = actions.columns 918 | min_max_scale = preprocessing.MinMaxScaler() 919 | actions = min_max_scale.fit_transform(actions.values) 920 | actions = pd.concat([user_id, pd.DataFrame(actions,columns = columns)], axis=1) 921 | actions.to_csv(dump_path,index=False) 922 | return actions 923 | 924 | 925 | #用户购买每种cate的数量 926 | def get_action_user_feat0515_2_2(start_date,end_date): 927 | dump_path='./cache/get_action_user_feat0515_2_2_%s_%s.csv'%(start_date,end_date) 928 | if os.path.exists(dump_path): 929 | actions = pd.read_csv(dump_path) 930 | else: 931 | actions = get_actions(start_date,end_date) 932 | actions = get_actions(start_date,end_date)[['user_id','cate']] 933 | cate_col = pd.get_dummies(actions['cate'],prefix='cate') 934 | actions=pd.concat([actions[['user_id']],cate_col],axis=1) 935 | actions= actions.groupby('user_id').sum().reset_index() 936 | 937 | user_id = actions[['user_id']] 938 | del actions['user_id'] 939 | actions = actions.fillna(0) 940 | columns = actions.columns 941 | min_max_scale = preprocessing.MinMaxScaler() 942 | actions = min_max_scale.fit_transform(actions.values) 943 | actions = pd.concat([user_id, pd.DataFrame(actions,columns = columns)], axis=1) 944 | actions.to_csv(dump_path,index=False) 945 | return actions 946 | 947 | 948 | #获取某人某段时间内加入购物车的数量以及关注的数量 949 | def get_action_user_feat0515_2_3(start_date, end_date, n): 950 | dump_path = './cache/get_action_user_feat0515_2_3_%s_%s_%s_1.csv' % (start_date, end_date, n) 951 | if os.path.exists(dump_path): 952 | actions = pd.read_csv(dump_path) 953 | else: 954 | 955 | start_days = 
datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=n) 956 | start_days = datetime.strftime(start_days, '%Y-%m-%d') 957 | 958 | actions = get_actions(start_days,end_date)[['user_id','type','cate']] 959 | actions_gouwuche=actions[actions['type']==2] 960 | actions_gouwuche_1= actions_gouwuche[['user_id','type']] 961 | actions_gouwuche_1= actions_gouwuche_1.groupby('user_id').count().reset_index() 962 | actions_gouwuche_1.columns = ['user_id',str(n)+'gouwuche_add'] 963 | 964 | actions_gouwuche_2= actions_gouwuche[actions_gouwuche['cate']==8][['user_id','type']] 965 | actions_gouwuche_2= actions_gouwuche_2.groupby('user_id').count().reset_index() 966 | actions_gouwuche_2.columns = ['user_id',str(n)+'gouwuche_add_cate_8'] 967 | 968 | actions_guanzhu=actions[actions['type']==5] 969 | actions_guanzhu_1= actions_guanzhu[['user_id','type']] 970 | actions_guanzhu_1= actions_guanzhu_1.groupby('user_id').count().reset_index() 971 | actions_guanzhu_1.columns = ['user_id',str(n)+'guanzhu_add'] 972 | 973 | actions_guanzhu_2= actions_guanzhu[actions_guanzhu['cate']==8][['user_id','type']] 974 | actions_guanzhu_2= actions_guanzhu_2.groupby('user_id').count().reset_index() 975 | actions_guanzhu_2.columns = ['user_id',str(n)+'guanzhu_add_cate_8'] 976 | 977 | actions = pd.merge(actions_gouwuche_1,actions_gouwuche_2,on='user_id',how ='outer') 978 | actions = pd.merge(actions,actions_guanzhu_1,on='user_id',how ='outer') 979 | actions = pd.merge(actions,actions_guanzhu_2,on='user_id',how ='outer') 980 | actions=actions.fillna(0) 981 | 982 | user_id = actions[['user_id']] 983 | del actions['user_id'] 984 | actions = actions.fillna(0) 985 | columns = actions.columns 986 | min_max_scale = preprocessing.MinMaxScaler() 987 | actions = min_max_scale.fit_transform(actions.values) 988 | actions = pd.concat([user_id, pd.DataFrame(actions,columns = columns)], axis=1) 989 | actions.to_csv(dump_path, index=False) 990 | 991 | 992 | return actions 993 | 994 | #top n 中 某人使用了多少天产生了该行为 995 | def get_action_user_feat0515_2_4(start_date, end_date, n): 996 | dump_path = './cache/get_action_user_feat0515_2_4_%s_%s_%s.csv' % (start_date, end_date, n) 997 | if os.path.exists(dump_path): 998 | actions = pd.read_csv(dump_path) 999 | else: 1000 | 1001 | start_days = datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=n) 1002 | start_days = datetime.strftime(start_days, '%Y-%m-%d') 1003 | 1004 | actions = get_actions(start_days,end_date)[['user_id','type','time']] 1005 | actions['time'] = actions['time'].map(lambda x: (datetime.strptime(end_date,'%Y-%m-%d')-datetime.strptime(x, '%Y-%m-%d %H:%M:%S')).days) 1006 | actions=actions.drop_duplicates(['user_id','type','time']) 1007 | actions = actions.groupby(['user_id','type']).count() 1008 | actions.columns = [str(n)+'day_nums'] 1009 | actions=actions.unstack() 1010 | actions=actions.reset_index() 1011 | actions.columns = ['user_id'] + ['get_action_user_feat0515_2_4_' + str(n) + '_' + str(i) for i in range(1, actions.shape[1])] 1012 | actions=actions.fillna(0) 1013 | 1014 | user_id = actions[['user_id']] 1015 | del actions['user_id'] 1016 | actions = actions.fillna(0) 1017 | columns = actions.columns 1018 | min_max_scale = preprocessing.MinMaxScaler() 1019 | actions = min_max_scale.fit_transform(actions.values) 1020 | actions = pd.concat([user_id, pd.DataFrame(actions,columns = columns)], axis=1) 1021 | actions.to_csv(dump_path, index=False) 1022 | return actions 1023 | 1024 | 1025 | # 用户总购买/加购/关注/点击/浏览品牌数 1026 | def get_action_user_feat5(start_date, end_date): 1027 | dump_path = 
'./cache/user_feat5_a_%s_%s.csv' % (start_date, end_date) 1028 | if os.path.exists(dump_path): 1029 | actions = pd.read_csv(dump_path) 1030 | else: 1031 | actions = get_actions(start_date, end_date) 1032 | action=None 1033 | for i in (1,2,4,5,6): 1034 | df=actions[actions['type']==i][['user_id', 'sku_id']] 1035 | df = df.drop_duplicates(['user_id', 'sku_id'], keep='first') 1036 | df = df.groupby('user_id', as_index=False).count() 1037 | df.columns = ['user_id', 'num_%s'%i] 1038 | if i==1: 1039 | action=df 1040 | else: 1041 | action=pd.merge(action,df,on='user_id',how='outer') 1042 | actions=action.fillna(0) 1043 | actions = actions.astype('float') 1044 | user=actions[['user_id']] 1045 | min_max_scaler = preprocessing.MinMaxScaler() 1046 | actions = min_max_scaler.fit_transform(actions.drop(['user_id'],axis=1).values) 1047 | actions = pd.DataFrame(actions) 1048 | actions = pd.concat([user, actions], axis=1) 1049 | actions.to_csv(dump_path, index=False) 1050 | actions.columns = ['user_id'] + ['u_feat5_' + str(i) for i in range(1, actions.shape[1])] 1051 | return actions 1052 | 1053 | #top k 用户总购买/加购/关注/点击/浏览品牌数 1054 | def get_action_u0515_feat5(start_date,end_date,k): 1055 | dump_path = './cache/u0515_feat5_%s_%s_%s.csv' % (start_date, end_date,k) 1056 | if os.path.exists(dump_path): 1057 | actions = pd.read_csv(dump_path) 1058 | else: 1059 | start_days=pd.to_datetime(end_date)-timedelta(days=k) 1060 | start_days=str(start_days).split(' ')[0] 1061 | actions=get_action_user_feat5(start_days, end_date) 1062 | actions.to_csv(dump_path,index=False) 1063 | actions.columns=['user_id']+['u0515_feat5_'+str(k)+'_'+str(i) for i in range(1,actions.shape[1])] 1064 | return actions 1065 | 1066 | 1067 | #最早交互时间 1068 | def get_action_u0524_feat1(start_date,end_date): 1069 | dump_path = './cache/u0524_feat1_%s_%s.csv' % (start_date, end_date,) 1070 | if os.path.exists(dump_path): 1071 | actions = pd.read_csv(dump_path) 1072 | else: 1073 | #全集 1074 | actions=get_actions(start_date,end_date)[['user_id','time']] 1075 | actions=actions.groupby('user_id',as_index=False).first() 1076 | actions['time_diff_early']=pd.to_datetime(end_date)-pd.to_datetime(actions['time']) 1077 | actions['time_diff_early']=actions['time_diff_early'].dt.days*24+actions['time_diff_early'].dt.seconds//3600 1078 | actions=actions[['user_id','time_diff_early']] 1079 | #子集 1080 | sub_actions=sub_get_actions(start_date,end_date)[['user_id','time']] 1081 | sub_actions=sub_actions.groupby('user_id',as_index=False).first() 1082 | sub_actions['sub_time_diff_early']=pd.to_datetime(end_date)-pd.to_datetime(sub_actions['time']) 1083 | sub_actions['sub_time_diff_early']=sub_actions['sub_time_diff_early'].dt.days*24+sub_actions['sub_time_diff_early'].dt.seconds//3600 1084 | sub_actions = sub_actions[['user_id', 'sub_time_diff_early']] 1085 | 1086 | actions=pd.merge(actions,sub_actions,on='user_id',how='left') 1087 | actions=actions.fillna(0) 1088 | min_max_scale = preprocessing.MinMaxScaler() 1089 | action = min_max_scale.fit_transform(actions.drop(['user_id'], axis=1).values) 1090 | actions = pd.concat([actions[['user_id']], pd.DataFrame(action)], axis=1) 1091 | actions.to_csv(dump_path,index=False) 1092 | actions.columns=['user_id']+['u0524_feat1_'+str(i)for i in range(1,actions.shape[1])] 1093 | return actions 1094 | 1095 | #最晚交互时间 1096 | def get_action_u0524_feat2(start_date,end_date): 1097 | dump_path = './cache/u0524_feat2_%s_%s.csv' % (start_date, end_date,) 1098 | if os.path.exists(dump_path): 1099 | actions = pd.read_csv(dump_path) 1100 | 
else: 1101 | # 全集 1102 | actions = get_actions(start_date, end_date)[['user_id', 'time']] 1103 | actions = actions.groupby('user_id', as_index=False).last() 1104 | actions['time_diff_recent'] = pd.to_datetime(end_date) - pd.to_datetime(actions['time']) 1105 | actions['time_diff_recent'] = actions['time_diff_recent'].dt.days * 24 + actions['time_diff_recent'].dt.seconds // 3600 1106 | actions = actions[['user_id', 'time_diff_recent']] 1107 | # 子集 1108 | sub_actions = sub_get_actions(start_date, end_date)[['user_id', 'time']] 1109 | sub_actions = sub_actions.groupby('user_id', as_index=False).last() 1110 | sub_actions['sub_time_diff_recent'] = pd.to_datetime(end_date) - pd.to_datetime(sub_actions['time']) 1111 | sub_actions['sub_time_diff_recent'] = sub_actions['sub_time_diff_recent'].dt.days * 24 + sub_actions['sub_time_diff_recent'].dt.seconds // 3600 1112 | sub_actions = sub_actions[['user_id', 'sub_time_diff_recent']] 1113 | 1114 | actions = pd.merge(actions, sub_actions, on='user_id', how='left') 1115 | actions=actions.fillna(0) 1116 | min_max_scale = preprocessing.MinMaxScaler() 1117 | action = min_max_scale.fit_transform(actions.drop(['user_id'], axis=1).values) 1118 | actions = pd.concat([actions[['user_id']], pd.DataFrame(action)], axis=1) 1119 | actions.to_csv(dump_path,index=False) 1120 | actions.columns = ['user_id'] + ['u0524_feat2_' + str(i) for i in range(1, actions.shape[1])] 1121 | return actions 1122 | 1123 | 1124 | #活跃天数 1125 | def get_action_u0524_feat3(start_date,end_date): 1126 | dump_path = './cache/u0524_feat3_%s_%s.csv' % (start_date, end_date,) 1127 | if os.path.exists(dump_path): 1128 | actions = pd.read_csv(dump_path) 1129 | else: 1130 | #全集 1131 | actions=get_actions(start_date,end_date) 1132 | actions['time']=pd.to_datetime(actions['time']).dt.date 1133 | actions=actions.drop_duplicates(['user_id','time'])[['user_id','time']] 1134 | actions=actions.groupby('user_id',as_index=False).count() 1135 | #子集 1136 | sub_actions=sub_get_actions(start_date,end_date) 1137 | sub_actions['time']=pd.to_datetime(sub_actions['time']).dt.date 1138 | sub_actions=sub_actions.drop_duplicates(['user_id','time'])[['user_id','time']] 1139 | sub_actions=sub_actions.groupby('user_id',as_index=False).count() 1140 | actions=pd.merge(actions,sub_actions,on='user_id',how='left') 1141 | actions=actions.fillna(0) 1142 | min_max_scale = preprocessing.MinMaxScaler() 1143 | action = min_max_scale.fit_transform(actions.drop(['user_id'], axis=1).values) 1144 | actions = pd.concat([actions[['user_id']], pd.DataFrame(action)], axis=1) 1145 | actions.to_csv(dump_path,index=False) 1146 | actions.columns=['user_id']+['u0524_feat3_'+str(i) for i in range(1,actions.shape[1])] 1147 | return actions 1148 | 1149 | 1150 | #点击模块 1151 | def get_action_user_feat0509_1_31(start_date,end_date,n): 1152 | dump_path='./cache/user_feat0509_1_31_%s_%s_%s.csv'%(start_date,end_date,n) 1153 | if os.path.exists(dump_path): 1154 | actions = pd.read_csv(dump_path) 1155 | else: 1156 | start_days=datetime.strptime(end_date,'%Y-%m-%d')-timedelta(days=n) 1157 | start_days=datetime.strftime(start_days,'%Y-%m-%d') 1158 | actions=get_actions(start_days,end_date) 1159 | actions=actions[actions['type']==6][['user_id','model_id']] 1160 | 1161 | # actions = actions.drop('type',axis=1) 1162 | 1163 | actions_click_sum=actions[['user_id','model_id']].groupby('user_id').count().reset_index() 1164 | actions_click_sum.columns = ['user_id',str(n)+'click_sum_all'] 1165 | actions[str(n)+'u_click14_history'] = actions['model_id'].map(lambda x: 
int(x == 14)) 1166 | actions[str(n)+'u_click21_history'] = actions['model_id'].map(lambda x: int(x == 21)) 1167 | actions[str(n)+'u_click28_history'] = actions['model_id'].map(lambda x: int(x == 28)) 1168 | actions[str(n)+'u_click110_history'] = actions['model_id'].map(lambda x: int(x == 110)) 1169 | actions[str(n)+'u_click210_history'] = actions['model_id'].map(lambda x: int(x == 210)) 1170 | actions = actions.groupby('user_id').sum().reset_index().drop('model_id', axis=1) 1171 | # actions.to_csv(dump_path,index=False) 1172 | actions = pd.merge(actions,actions_click_sum,how='left',on='user_id') 1173 | 1174 | actions[str(n)+'u_click14/click_sum_history'] = actions[str(n)+'u_click14_history']/actions[str(n)+'click_sum_all'] 1175 | actions[str(n)+'u_click21/click_sum_history'] = actions[str(n)+'u_click21_history']/actions[str(n)+'click_sum_all'] 1176 | actions[str(n)+'u_click28/click_sum_history'] = actions[str(n)+'u_click28_history']/actions[str(n)+'click_sum_all'] 1177 | actions[str(n)+'u_click110/click_sum_history'] = actions[str(n)+'u_click110_history']/actions[str(n)+'click_sum_all'] 1178 | actions[str(n)+'u_click210/click_sum_history'] = actions[str(n)+'u_click210_history']/actions[str(n)+'click_sum_all'] 1179 | 1180 | user_id = actions[['user_id']] 1181 | del actions['user_id'] 1182 | actions = actions.fillna(0) 1183 | columns = actions.columns 1184 | min_max_scale = preprocessing.MinMaxScaler() 1185 | actions = min_max_scale.fit_transform(actions.values) 1186 | actions = pd.concat([user_id, pd.DataFrame(actions,columns = columns)], axis=1) 1187 | actions.to_csv(dump_path,index=False) 1188 | return actions 1189 | #u模型cate=8的购买者和不是cate=8的购买者 1190 | def get_action_u0513_feat16(start_date,end_date): 1191 | dump_path = './cache/u0513_feat16_%s_%s.csv' % (start_date, end_date) 1192 | if os.path.exists(dump_path): 1193 | actions = pd.read_csv(dump_path) 1194 | else: 1195 | df = get_actions(start_date, end_date)[['user_id', 'type', 'cate']] 1196 | df = df[df['type'] == 4] 1197 | df = df.groupby(['user_id', 'cate']).count() 1198 | df = df.unstack().reset_index() 1199 | df.columns = ['user_id'] + ['cate' + str(i) for i in range(4, 12)] 1200 | df = df.fillna(0) 1201 | sum1 = df.drop(['user_id', 'cate8'], axis=1).apply(sum, axis=1) 1202 | sum2 = df.drop(['user_id'], axis=1).apply(sum, axis=1) 1203 | actions = pd.concat([df[['user_id', 'cate8']], sum1, sum2], axis=1) 1204 | actions.columns = ['user_id', 'cate8', 'sum_other_cate', 'sum'] 1205 | actions['cate8_rate'] = actions['cate8'] / actions['sum'] 1206 | actions['sum_other_cate_rate'] = actions['sum_other_cate'] / actions['sum'] 1207 | del actions['sum'] 1208 | actions.to_csv(dump_path,index=False) 1209 | return actions 1210 | 1211 | #get_action_u0513_feat16('2016-02-01','2016-04-16') 1212 | # 用户层级特征 1213 | def get_action_user_feat_six_xingwei(start_date, end_date, n): 1214 | dump_path = './cache/user_six_action_%s_%s_%s_int.csv' % (start_date, end_date, n) 1215 | if os.path.exists(dump_path): 1216 | actions = pd.read_csv(dump_path) 1217 | print("user_zlzl" + str(n)) 1218 | 1219 | else: 1220 | actions = get_actions(start_date, end_date) 1221 | actions['time'] = actions['time'].map(lambda x: get_day_chaju(x, end_date) // n) 1222 | num_day = np.max(actions['time']) 1223 | df = None 1224 | print(num_day) 1225 | for i in range(min(num_day + 1, 6)): 1226 | in_temp = pd.get_dummies(actions['type'], prefix="user_action_time_" + str(i)) 1227 | temp = actions[actions['time'] == i] 1228 | temp = pd.concat([temp['user_id'], in_temp], axis=1) 1229 | 1230 
| feature = ['user_id'] 1231 | for j in range(1, 7, 1): 1232 | feature.append('user_action_time_' + str(i) + '_' + str(j)) 1233 | 1234 | temp = temp.groupby(['user_id'], as_index=False).sum() 1235 | temp.columns = feature 1236 | if df is None: 1237 | df = temp 1238 | else: 1239 | df = pd.merge(df, temp, how='outer', on='user_id') 1240 | df.columns = ['user_id'] + ['get_action_user_feat_six_xingwei_' + str(n) + '_' + str(i) for i in range(1, df.shape[1])] 1241 | df.to_csv(dump_path, index=False) 1242 | actions=df 1243 | 1244 | # user_id = actions[['user_id']] 1245 | # del actions['user_id'] 1246 | # actions = actions.fillna(0) 1247 | # actions=actions.replace(np.inf,0) 1248 | # # print(actions.head()) 1249 | # columns = actions.columns 1250 | 1251 | # min_max_scale = preprocessing.MinMaxScaler() 1252 | # actions=actions.replace(np.inf,0) 1253 | # actions = min_max_scale.fit_transform(actions.values) 1254 | # actions = pd.concat([user_id, pd.DataFrame(actions,columns = columns)], axis=1) 1255 | actions.columns = ['user_id'] + ['get_action_user_feat_six_xingwei_' + str(n) + '_' + str(i) for i in range(1, actions.shape[1])] 1256 | return actions 1257 | 1258 | 1259 | def deal_user_six_deal(start_date, end_date, n): 1260 | dump_path = './cache/deal_user_six_action_%s_%s_%s_int.csv' % (start_date, end_date, n) 1261 | if os.path.exists(dump_path): 1262 | actions = pd.read_csv(dump_path) 1263 | actions.columns = ['user_id'] + ['u_featsix_deal_' + str(n) + '_' + str(i) for i in range(1, actions.shape[1])] 1264 | return actions 1265 | else: 1266 | temp = get_action_user_feat_six_xingwei(start_date, end_date, n) # 修改 1267 | time1 = datetime.now() 1268 | columns = ["user_id"] 1269 | all_col = temp.shape[1] - 1 1270 | temp.columns = columns + list(range(all_col)) 1271 | temp = temp.fillna(0) 1272 | columns = ['user_id'] 1273 | for j in range(0, 6, 1): 1274 | temp["zl_" + str(j)] = 0 1275 | columns.append("zl_" + str(j)) 1276 | for k in range(j, all_col, 6): 1277 | temp["zl_" + str(j)] = temp["zl_" + str(j)] + temp[k].map(lambda x: x * ((k // 6 + 1) ** (-0.67))) 1278 | temp["zl_" + str(j)] = temp["zl_" + str(j)].map(lambda x: (x - np.min(temp["zl_" + str(j)])) / ( 1279 | np.max(temp["zl_" + str(j)]) - np.min(temp["zl_" + str(j)]))) 1280 | temp = temp[columns] 1281 | temp.to_csv(dump_path, index=False) 1282 | return temp 1283 | 1284 | # # get user sku 1285 | # def get_user(start_date, end_date): 1286 | # dump_path = './cache/user_sku_%s_%s.csv' % (start_date, end_date) 1287 | # if os.path.exists(dump_path): 1288 | # actions = pd.read_csv(dump_path) 1289 | # else: 1290 | # actions = get_actions(start_date, end_date) 1291 | # actions = actions[(actions['type'] == 2) | (actions['type'] == 5) | (actions['type'] == 4)] 1292 | # actions=actions[actions['cate']==8] 1293 | # actions = actions[['user_id']] 1294 | # actions = actions.drop_duplicates(['user_id'], keep='first') 1295 | # actions.to_csv(dump_path, index=False) 1296 | # return actions 1297 | 1298 | 1299 | #用户购买前的行为 1300 | def get_action_u0509_feat_28(start_date, end_date,k): 1301 | dump_path = './cache/u0509_feat_28_%s_%s_%s.csv' % (start_date, end_date,k) 1302 | if os.path.exists(dump_path): 1303 | actions = pd.read_csv(dump_path) 1304 | else: 1305 | actions = get_actions(start_date, end_date) 1306 | actions = actions[actions['type'] == 4] 1307 | actions['time_buy'] = actions['time'].map(lambda x: datetime.strptime(x.split(' ')[0], '%Y-%m-%d')) 1308 | actions = actions[['user_id', 'sku_id', 'time_buy']].reset_index(drop=True) 1309 | 
actions['before_time_buy'] = actions['time_buy'] - timedelta(days=k)
1310 | 
1311 |         df = get_actions('2016-02-01','2016-04-16')[['user_id', 'sku_id', 'time', 'type']]
1312 |         df['time'] = df['time'].map(lambda x: datetime.strptime(x.split(' ')[0], '%Y-%m-%d'))
1313 |         df = pd.merge(df, actions, on=['user_id', 'sku_id'], how='left')
1314 |         df = df.dropna(axis=0, how='any')
1315 |         df['before_days'] = (df['time'] - df['before_time_buy']).dt.days
1316 |         df['days'] = (df['time'] - df['time_buy']).dt.days
1317 |         df = df[(df['before_days'] >= 0) & (df['days'] < 0)]
1318 |         df_dummy = pd.get_dummies(df['type'], prefix='type')
1319 | 
1320 |         df = pd.concat([df, df_dummy], axis=1)[
1321 |             ['user_id', 'sku_id', 'type_1', 'type_2', 'type_3', 'type_4', 'type_5', 'type_6']]
1322 | 
1323 |         df = df.groupby(['user_id', 'sku_id'], as_index=False).sum()
1324 |         del df['sku_id']
1325 |         df = df.groupby('user_id', as_index=False).agg(['min', 'max', 'mean'])
1326 |         df = df.reset_index()
1327 |         df.columns = ['user_id'] + ['u0509_feat28_' + str(k) + '_' + i for i in (
1328 |             'type_1_min', 'type_1_max', 'type_1_mean', 'type_2_min', 'type_2_max', 'type_2_mean',
1329 |             'type_3_min', 'type_3_max', 'type_3_mean', 'type_4_min', 'type_4_max', 'type_4_mean',
1330 |             'type_5_min', 'type_5_max', 'type_5_mean', 'type_6_min', 'type_6_max', 'type_6_mean')]
1331 |         min_max_scaler = preprocessing.MinMaxScaler()
1332 |         actions = min_max_scaler.fit_transform(df.drop('user_id', axis=1).values)
1333 |         actions = pd.DataFrame(actions)
1334 |         actions = pd.concat([df[['user_id']], actions], axis=1)
1335 |         actions.columns = ['user_id']+['u0509_feat_28_'+str(i) for i in range(1,actions.shape[1])]
1336 |         actions.to_csv(dump_path,index=False)
1337 |     actions.columns = ['user_id']+['u0509_feat_28_'+str(k)+"_"+str(i) for i in range(1,actions.shape[1])]
1338 |     return actions
1339 | 
1340 | # number of distinct cate=8 brands the user viewed, and viewed cate=8 brands / all viewed brands
1341 | def get_action_u0509_feat_29(start_date,end_date):
1342 |     dump_path = './cache/u0509_feat_29_%s_%s.csv' % (start_date, end_date)
1343 |     if os.path.exists(dump_path):
1344 |         actions = pd.read_csv(dump_path)
1345 |     else:
1346 |         actions=get_actions(start_date,end_date)
1347 |         df1=actions[actions['cate']==8].drop_duplicates(['user_id','brand'])[['user_id','brand']]
1348 |         df1=df1.groupby(['user_id'],as_index=False).count()
1349 |         df1.columns=['user_id','brand_cate=8']
1350 |         df2=actions.drop_duplicates(['user_id','brand'])[['user_id','brand']]
1351 |         df2 = df2.groupby(['user_id'], as_index=False).count()
1352 |         df2.columns=['user_id','brand_cate_all']
1353 |         df=pd.merge(df1,df2,on='user_id',how='right')
1354 |         df['rate']=df['brand_cate=8']/df['brand_cate_all']
1355 |         # print df
1356 |         actions=df.fillna(0)
1357 |         actions.to_csv(dump_path,index=False)
1358 |     actions.columns=['user_id']+['u0509_feat_29'+str(i) for i in range(1,actions.shape[1])]
1359 |     return actions
1360 | 
1361 | def get_action_u0521_feat_31(start_date,end_date,k):
1362 |     dump_path = './cache/u0509_feat_31_%s_%s_%s.csv' % (start_date, end_date,k)
1363 |     if os.path.exists(dump_path):
1364 |         actions = pd.read_csv(dump_path)
1365 |     else:
1366 |         start_days=pd.to_datetime(end_date)-timedelta(days=k)
1367 |         start_days=datetime.strftime(start_days,'%Y-%m-%d')  # format the window start as a calendar date
1368 |         actions=get_actions(start_days,end_date)
1369 |         df1=actions[actions['cate']==8].drop_duplicates(['user_id','cate'])[['user_id','cate']]
1370 |         df1=df1.groupby('user_id',as_index=False).count()
1371 |         df1.columns=['user_id','cate8']
1372 | 
df2=actions.drop_duplicates(['user_id','cate'])[['user_id','cate']] 1373 | df2=df2.groupby('user_id',as_index=False).count() 1374 | actions=pd.merge(df1,df2,on='user_id',how='right') 1375 | actions['cate8/cate']=actions['cate8']/actions['cate'] 1376 | actions=actions.fillna(0) 1377 | min_max_scaler = preprocessing.MinMaxScaler() 1378 | df = min_max_scaler.fit_transform(actions[['cate8','cate']].values) 1379 | df = pd.DataFrame(df) 1380 | actions = pd.concat([actions[['user_id','cate8/cate']], df], axis=1) 1381 | actions.to_csv(dump_path,index=False) 1382 | actions.columns=['user_id']+['u0509_feat_31_'+str(k)+'_'+str(i)for i in range(1,actions.shape[1])] 1383 | return actions 1384 | 1385 | 1386 | def get_action_u0521_feat_32(start_date,end_date): 1387 | dump_path = './cache/u0509_feat_32_%s_%s.csv' % (start_date, end_date) 1388 | if os.path.exists(dump_path): 1389 | actions = pd.read_csv(dump_path) 1390 | else: 1391 | actions=get_actions(start_date,end_date) 1392 | actions=actions[actions['cate']==8][['user_id','brand']] 1393 | df1=actions.drop_duplicates(['user_id','brand']).groupby('user_id',as_index=False).count() 1394 | df1.columns=['user_id','brand_num'] 1395 | df2=actions.groupby('user_id',as_index=False).count() 1396 | actions=pd.merge(df1,df2,on='user_id',how='left') 1397 | actions['brand_num/brand']=actions['brand']/actions['brand_num'] 1398 | actions=actions.fillna(0) 1399 | min_max_scaler = preprocessing.MinMaxScaler() 1400 | df = min_max_scaler.fit_transform(actions.drop(['user_id'],axis=1).values) 1401 | df = pd.DataFrame(df) 1402 | actions = pd.concat([actions[['user_id']], df], axis=1) 1403 | actions.to_csv(dump_path, index=False) 1404 | actions.columns = ['user_id'] + ['u0509_feat_32_' + str(i) for i in range(1, actions.shape[1])] 1405 | return actions 1406 | 1407 | def get_action_user_feat7_0522_huachuang(start_date, end_date,n): 1408 | dump_path = './cache/user_feat7_six_%s_%s_%s_0522.csv' % (start_date, end_date,n) 1409 | if os.path.exists(dump_path): 1410 | actions = pd.read_csv(dump_path) 1411 | else: 1412 | start_days = datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=n) 1413 | start_days = datetime.strftime(start_days, '%Y-%m-%d') 1414 | 1415 | df = get_actions(start_days, end_date)[['user_id', 'type', 'time']] 1416 | actions = df.groupby(['user_id', 'type'], as_index=False).count() 1417 | 1418 | time_min = df.groupby(['user_id', 'type'], as_index=False).min() 1419 | time_max = df.groupby(['user_id', 'type'], as_index=False).max() 1420 | 1421 | time_cha = pd.merge(time_max, time_min, on=['user_id', 'type'], how='left') 1422 | time_cha['time_x'] = time_cha['time_x'].map(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S')) 1423 | time_cha['time_y'] = time_cha['time_y'].map(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S')) 1424 | 1425 | time_cha['cha_hour'] = 1 + (time_cha['time_x'] - time_cha['time_y']).dt.days * 24 + (time_cha['time_x'] - 1426 | time_cha[ 1427 | 'time_y']).dt.seconds // 3600 1428 | del time_cha['time_x'] 1429 | del time_cha['time_y'] 1430 | # time_cha=time_cha.fillna(1) 1431 | 1432 | actions = pd.merge(time_cha, actions, on=['user_id', 'type'], how="left") 1433 | actions = actions.groupby(['user_id', 'type']).sum() 1434 | actions['cnt/time'] = actions['time'] / actions["cha_hour"] 1435 | actions = actions.unstack() 1436 | actions.columns = list(range(actions.shape[1])) 1437 | actions = actions.reset_index() 1438 | actions = actions.fillna(0) 1439 | actions.to_csv(dump_path, index=False) 1440 | actions.columns = ['user_id'] + ['u_feat7_' 
+str(n)+"_"+ str(i) for i in range(1, actions.shape[1])] 1441 | return actions 1442 | 1443 | def get_user_labels(test_start_date,test_end_date): 1444 | dump_path = './cache/user_labels_%s_%s_11.csv' % (test_start_date, test_end_date) 1445 | if os.path.exists(dump_path): 1446 | actions = pd.read_csv(dump_path) 1447 | else: 1448 | actions = get_actions(test_start_date, test_end_date) 1449 | actions = actions[actions['cate']==8] 1450 | actions = actions[actions['type'] == 4].drop_duplicates(['user_id'])[['user_id']] 1451 | actions['label'] = 1 1452 | 1453 | return actions 1454 | 1455 | print("U model 1 finish part_0") 1456 | 1457 | ######################################################################################################### 1458 | 1459 | 1460 | # In[ ]: 1461 | 1462 | 1463 | 1464 | 1465 | # In[ ]: 1466 | 1467 | 1468 | 1469 | 1470 | # In[ ]: 1471 | 1472 | 1473 | 1474 | 1475 | # In[ ]: 1476 | 1477 | 1478 | 1479 | 1480 | # In[ ]: 1481 | 1482 | 1483 | 1484 | 1485 | # In[ ]: 1486 | 1487 | 1488 | 1489 | 1490 | # In[ ]: 1491 | 1492 | 1493 | 1494 | 1495 | # In[ ]: 1496 | 1497 | 1498 | 1499 | 1500 | # In[ ]: 1501 | 1502 | 1503 | 1504 | 1505 | # In[ ]: 1506 | 1507 | 1508 | 1509 | 1510 | # In[2]: 1511 | 1512 | import os 1513 | from datetime import datetime 1514 | from datetime import timedelta 1515 | 1516 | # -*- coding: utf-8 -*- 1517 | """ 1518 | Created on Sun May 14 10:27:41 2017 1519 | @author: 老虎趴趴走 1520 | """ 1521 | import pandas as pd 1522 | import numpy as np 1523 | # import datetime 1524 | import math 1525 | 1526 | def user_features(user, ful_action, sub_action, end_date): 1527 | dump_path='./cache/user_features_%s_0514_2.csv'%(end_date) 1528 | if os.path.exists(dump_path): 1529 | actions = pd.read_csv(dump_path) 1530 | 1531 | else: 1532 | end_date=pd.to_datetime(end_date) 1533 | day = timedelta(1, 0) 1534 | print('=====> 提取特征...') 1535 | sub_1 = sub_action[(sub_action['time']>=end_date-1*day) & (sub_action['time']=end_date-3*day) & (sub_action['time']=end_date-5*day) & (sub_action['time']=end_date-30*day) & (sub_action['time']=end_date-5*day) & (ful_action['time']=end_date-30*day) & (ful_action['time'] 完成!') 1763 | actions.to_csv(dump_path,index=False) 1764 | 1765 | # user_id = actions[['user_id']] 1766 | # del actions['user_id'] 1767 | # actions = actions.fillna(0) 1768 | # actions=actions.replace(np.inf,0) 1769 | # print(actions.head()) 1770 | # columns = actions.columns 1771 | 1772 | # min_max_scale = preprocessing.MinMaxScaler() 1773 | # actions=actions.replace(np.inf,0) 1774 | # actions = min_max_scale.fit_transform(actions.values) 1775 | # actions = pd.concat([user_id, pd.DataFrame(actions,columns = columns)], axis=1) 1776 | return actions 1777 | 1778 | import pandas as pd 1779 | ful_action = pd.read_csv('./data/JData_Action.csv', parse_dates=[2], infer_datetime_format=True) 1780 | sub_action = pd.read_csv('./data/JData_subset_action.csv', parse_dates=[2, 7], infer_datetime_format=True) 1781 | user = pd.read_csv('./data/JData_modified_user.csv', parse_dates=[4]) 1782 | # user_features(user,ful_action,sel_action,'2016-04-11') 1783 | print("U model 1 finish part_1") 1784 | ###################################################################################### 1785 | 1786 | 1787 | # In[ ]: 1788 | 1789 | 1790 | 1791 | 1792 | # In[6]: 1793 | 1794 | # 测试集 1795 | # ful_action = pd.read_csv('./data/JData_Action.csv', parse_dates=[2], infer_datetime_format=True) 1796 | # sel_action = pd.read_csv('./data/JData_subset_action.csv', parse_dates=[2, 7], infer_datetime_format=True) 1797 | 
def make_test_set(train_start_date, train_end_date,user,ful_action,sub_action): 1798 | dump_path = './cache/bu0525model_1_u_test_set_%s_%s.csv' % (train_start_date, train_end_date) 1799 | if os.path.exists(dump_path): 1800 | actions = pd.read_csv(dump_path) 1801 | else: 1802 | start_days=str(pd.to_datetime(train_end_date)-timedelta(days=30)).split(' ')[0] 1803 | actions_1 = get_actions(start_days, train_end_date) 1804 | actions=actions_1[actions_1['cate']==8][['user_id']].drop_duplicates(['user_id']) 1805 | 1806 | 1807 | 1808 | print (actions.shape) 1809 | 1810 | start_days = "2016-02-01" 1811 | # actions = pd.merge(actions,get_basic_user_feat() , how='left', on='user_id') 1812 | # print(actions.shape) 1813 | # 1814 | 1815 | # actions = pd.merge(actions, get_action_user_feat1(start_days, train_end_date), how='left', on='user_id') 1816 | # print(actions.shape) 1817 | actions = pd.merge(actions, get_action_user_feat2(start_days, train_end_date), how='left', on='user_id') 1818 | print(actions.shape) 1819 | actions = pd.merge(actions, get_action_user_feat5(start_days, train_end_date), how='left', on='user_id') 1820 | print(actions.shape) 1821 | actions = pd.merge(actions, get_action_user_feat6(start_days, train_end_date), how='left', on='user_id') 1822 | print(actions.shape) 1823 | actions = pd.merge(actions, get_action_user_feat6_six(start_days, train_end_date), how='left', on='user_id') 1824 | print(actions.shape) 1825 | actions = pd.merge(actions, get_action_user_feat7(start_days, train_end_date), how='left', on='user_id') 1826 | print(actions.shape) 1827 | actions = pd.merge(actions, get_action_user_feat8(start_days, train_end_date), how='left', on='user_id') 1828 | print (actions.shape) 1829 | actions = pd.merge(actions, get_action_user_feat8_2(start_days, train_end_date), how='left', on='user_id') 1830 | print (actions.shape) 1831 | actions = pd.merge(actions, get_action_user_feat9(start_days, train_end_date), how='left', on='user_id') 1832 | print (actions.shape) 1833 | actions = pd.merge(actions, get_action_user_feat10(start_days, train_end_date), how='left', on='user_id') 1834 | print (actions.shape) 1835 | actions = pd.merge(actions, get_action_user_feat12(train_start_date, train_end_date), how='left', on='user_id') 1836 | print (actions.shape) 1837 | actions = pd.merge(actions, get_action_user_feat14(train_start_date, train_end_date), how='left', on='user_id') 1838 | print (actions.shape) 1839 | actions = pd.merge(actions, get_action_user_feat15(start_days, train_end_date), how='left', on='user_id') 1840 | print (actions.shape) 1841 | actions = pd.merge(actions, get_action_user_feat16(start_days, train_end_date), how='left', on='user_id') 1842 | print (actions.shape) 1843 | actions = pd.merge(actions, get_action_u0513_feat16(start_days, train_end_date), how='left', on='user_id') 1844 | print (actions.shape) 1845 | actions = pd.merge(actions, user_features(user,ful_action,sub_action,train_end_date), how='left', on='user_id') 1846 | print (actions.shape) 1847 | actions = pd.merge(actions, get_action_user_feat0515_2_1(train_start_date, train_end_date), how='left', on='user_id') 1848 | print (actions.shape) 1849 | actions = pd.merge(actions, get_action_user_feat0515_2_2(train_start_date, train_end_date), how='left', on='user_id') 1850 | print (actions.shape) 1851 | 1852 | #模型1 和 模型二 1853 | actions = pd.merge(actions, get_action_u0509_feat_29(train_start_date, train_end_date), how='left', on='user_id') 1854 | print (actions.shape) 1855 | #模型 二 1856 | # actions = pd.merge(actions, 
get_action_u0521_feat_32(train_start_date, train_end_date), how='left', on='user_id') 1857 | 1858 | 1859 | # actions = pd.merge(actions, get_action_u0524_feat1(start_days, train_end_date), how='left', on='user_id') 1860 | # print (actions.shape) 1861 | 1862 | # actions = pd.merge(actions, get_action_u0524_feat2(start_days, train_end_date), how='left', on='user_id') 1863 | # print (actions.shape) 1864 | # actions = pd.merge(actions, get_action_u0524_feat3(start_days, train_end_date), how='left', on='user_id') 1865 | # print (actions.shape) 1866 | 1867 | for i in (1, 2, 3, 7, 14, 28): 1868 | actions = pd.merge(actions, get_action_user_feat_six_xingwei(train_start_date, train_end_date, i), how='left',on='user_id') 1869 | actions = pd.merge(actions, deal_user_six_deal(train_start_date, train_end_date, i), how='left',on='user_id') 1870 | actions = pd.merge(actions, get_action_user_feat11(train_start_date, train_end_date, i), how='left',on='user_id') 1871 | actions = pd.merge(actions, get_action_user_feat13(train_start_date, train_end_date, i), how='left',on='user_id') 1872 | actions = pd.merge(actions, get_action_user_feat0509_1_30(train_start_date, train_end_date, i), how='left',on='user_id') 1873 | actions = pd.merge(actions, get_action_user_feat0515_2_3(train_start_date, train_end_date, i), how='left',on='user_id') 1874 | actions = pd.merge(actions, get_action_feat(train_start_date, train_end_date,i), how='left', on='user_id') 1875 | actions = pd.merge(actions, get_action_user_feat0515_2_4(train_start_date, train_end_date,i), how='left', on='user_id') 1876 | actions = pd.merge(actions, get_action_u0515_feat5(train_start_date, train_end_date,i), how='left', on='user_id') 1877 | #模型1 和 模型二 1878 | actions = pd.merge(actions, get_action_u0509_feat_28(train_start_date, train_end_date,i), how='left', on='user_id') 1879 | if(i<=10): 1880 | actions = pd.merge(actions,get_action_user_feat0509_1_31(train_start_date, train_end_date,i), how='left', on='user_id') 1881 | #模型 二 1882 | # actions = pd.merge(actions, get_action_u0521_feat_31(train_start_date, train_end_date,i), how='left', on='user_id') 1883 | # actions = pd.merge(actions, get_action_user_feat7_0522_huachuang(train_start_date, train_end_date,i), how='left', on='user_id') 1884 | print(actions.shape) 1885 | print(actions.shape) 1886 | 1887 | actions = actions.fillna(0) 1888 | # user_id = actions[['user_id']] 1889 | # del actions['user_id'] 1890 | # actions = actions.fillna(0) 1891 | # actions=actions.replace(np.inf,0) 1892 | # # print(actions.head()) 1893 | # columns = actions.columns 1894 | 1895 | # min_max_scale = preprocessing.MinMaxScaler() 1896 | # actions=actions.replace(np.inf,0) 1897 | # actions = min_max_scale.fit_transform(actions.values) 1898 | # actions = pd.concat([user_id, pd.DataFrame(actions,columns = columns)], axis=1) 1899 | # actions.to_csv(dump_path,index=False) 1900 | return actions 1901 | 1902 | 1903 | # 训练集 1904 | def make_train_set(train_start_date, train_end_date, test_start_date, test_end_date,user,ful_action,sub_action): 1905 | dump_path = './cache/bu0525model_1_u_train_set_%s_%s_%s_%s.csv' % (train_start_date, train_end_date, test_start_date, test_end_date) 1906 | if os.path.exists(dump_path): 1907 | actions = pd.read_csv(dump_path) 1908 | else: 1909 | 1910 | start_days=str(pd.to_datetime(train_end_date)-timedelta(days=30)).split(' ')[0] 1911 | actions_1 = get_actions(start_days, train_end_date) 1912 | actions=actions_1[actions_1['cate']==8][['user_id']].drop_duplicates(['user_id']) 1913 | # buy_actions = 
actions_1[(actions_1['type']==4)&(actions_1['cate']==8)][['user_id']].drop_duplicates() 1914 | # actions = actions[actions['user_id'].isin(buy_actions['user_id'])==False] 1915 | 1916 | 1917 | 1918 | # print (actions.shape) 1919 | 1920 | # start_days=str(pd.to_datetime(train_end_date)-timedelta(days=30)).split(' ')[0] 1921 | # actions_1 = get_actions(start_days, train_end_date) 1922 | # actions_1 = actions_1[(actions_1['type']==2)|(actions_1['type']==4)|(actions_1['type']==5)] 1923 | # actions_1=actions_1[actions_1['cate']==8][['user_id']].drop_duplicates(['user_id']) 1924 | # actions = pd.concat([actions,actions_1]).drop_duplicates(['user_id']) 1925 | print (actions.shape) 1926 | # start_days = train_start_date 1927 | start_days = "2016-02-01" 1928 | # actions = pd.merge(actions,get_basic_user_feat() , how='left', on='user_id') 1929 | print(actions.shape) 1930 | 1931 | # actions = pd.merge(actions, get_action_user_feat1(start_days, train_end_date), how='left', on='user_id') 1932 | # print(actions.shape) 1933 | actions = pd.merge(actions, get_action_user_feat2(start_days, train_end_date), how='left', on='user_id') 1934 | print(actions.shape) 1935 | actions = pd.merge(actions, get_action_user_feat5(start_days, train_end_date), how='left', on='user_id') 1936 | print(actions.shape) 1937 | actions = pd.merge(actions, get_action_user_feat6(start_days, train_end_date), how='left', on='user_id') 1938 | print(actions.shape) 1939 | actions = pd.merge(actions, get_action_user_feat6_six(start_days, train_end_date), how='left', on='user_id') 1940 | print(actions.shape) 1941 | actions = pd.merge(actions, get_action_user_feat7(start_days, train_end_date), how='left', on='user_id') 1942 | print(actions.shape) 1943 | actions = pd.merge(actions, get_action_user_feat8(start_days, train_end_date), how='left', on='user_id') 1944 | print (actions.shape) 1945 | actions = pd.merge(actions, get_action_user_feat8_2(start_days, train_end_date), how='left', on='user_id') 1946 | print (actions.shape) 1947 | actions = pd.merge(actions, get_action_user_feat9(start_days, train_end_date), how='left', on='user_id') 1948 | print (actions.shape) 1949 | actions = pd.merge(actions, get_action_user_feat10(start_days, train_end_date), how='left', on='user_id') 1950 | print (actions.shape) 1951 | actions = pd.merge(actions, get_action_user_feat12(train_start_date, train_end_date), how='left', on='user_id') 1952 | print (actions.shape) 1953 | actions = pd.merge(actions, get_action_user_feat14(train_start_date, train_end_date), how='left', on='user_id') 1954 | print (actions.shape) 1955 | actions = pd.merge(actions, get_action_user_feat15(start_days, train_end_date), how='left', on='user_id') 1956 | print (actions.shape) 1957 | actions = pd.merge(actions, get_action_user_feat16(start_days, train_end_date), how='left', on='user_id') 1958 | print (actions.shape) 1959 | actions = pd.merge(actions, get_action_u0513_feat16(start_days, train_end_date), how='left', on='user_id') 1960 | print (actions.shape) 1961 | actions = pd.merge(actions, user_features(user,ful_action,sub_action,train_end_date), how='left', on='user_id') 1962 | print (actions.shape) 1963 | 1964 | actions = pd.merge(actions, get_action_user_feat0515_2_1(train_start_date, train_end_date), how='left', on='user_id') 1965 | print (actions.shape) 1966 | actions = pd.merge(actions, get_action_user_feat0515_2_2(train_start_date, train_end_date), how='left', on='user_id') 1967 | print (actions.shape) 1968 | 1969 | actions = pd.merge(actions, 
get_action_u0509_feat_29(train_start_date, train_end_date), how='left', on='user_id') 1970 | # actions = pd.merge(actions, get_action_u0521_feat_32(train_start_date, train_end_date), how='left', on='user_id') 1971 | 1972 | # actions = pd.merge(actions, get_action_u0524_feat1(start_days, train_end_date), how='left', on='user_id') 1973 | # print (actions.shape) 1974 | 1975 | # actions = pd.merge(actions, get_action_u0524_feat2(start_days, train_end_date), how='left', on='user_id') 1976 | # print (actions.shape) 1977 | # actions = pd.merge(actions, get_action_u0524_feat3(start_days, train_end_date), how='left', on='user_id') 1978 | # print (actions.shape) 1979 | print (actions.shape) 1980 | for i in (1, 2, 3,7, 14, 28): 1981 | actions = pd.merge(actions, get_action_user_feat_six_xingwei(train_start_date, train_end_date, i), how='left',on='user_id') 1982 | actions = pd.merge(actions, deal_user_six_deal(train_start_date, train_end_date, i), how='left',on='user_id') 1983 | actions = pd.merge(actions, get_action_user_feat11(train_start_date, train_end_date, i), how='left',on='user_id') 1984 | actions = pd.merge(actions, get_action_user_feat13(train_start_date, train_end_date, i), how='left',on='user_id') 1985 | actions = pd.merge(actions, get_action_user_feat0509_1_30(train_start_date, train_end_date, i), how='left',on='user_id') 1986 | actions = pd.merge(actions, get_action_user_feat0515_2_3(train_start_date, train_end_date, i), how='left',on='user_id') 1987 | actions = pd.merge(actions, get_action_feat(train_start_date, train_end_date,i), how='left', on='user_id') 1988 | actions = pd.merge(actions, get_action_user_feat0515_2_4(train_start_date, train_end_date,i), how='left', on='user_id') 1989 | actions = pd.merge(actions, get_action_u0515_feat5(train_start_date, train_end_date,i), how='left', on='user_id') 1990 | actions = pd.merge(actions, get_action_u0509_feat_28(train_start_date, train_end_date,i), how='left', on='user_id') 1991 | if(i<=10): 1992 | actions = pd.merge(actions,get_action_user_feat0509_1_31(train_start_date, train_end_date,i), how='left', on='user_id') 1993 | # actions = pd.merge(actions, get_action_u0521_feat_31(train_start_date, train_end_date,i), how='left', on='user_id') 1994 | 1995 | # actions = pd.merge(actions, get_action_user_feat7_0522_huachuang(train_start_date, train_end_date,i), how='left', on='user_id') 1996 | print(actions.shape) 1997 | actions = pd.merge(actions, get_user_labels(test_start_date, test_end_date), how='left', on='user_id') 1998 | 1999 | actions = actions.fillna(0) 2000 | print(actions.shape) 2001 | # user_id = actions[['user_id']] 2002 | # del actions['user_id'] 2003 | # actions = actions.fillna(0) 2004 | # actions=actions.replace(np.inf,0) 2005 | # # print(actions.head()) 2006 | # columns = actions.columns 2007 | 2008 | # min_max_scale = preprocessing.MinMaxScaler() 2009 | # actions=actions.replace(np.inf,0) 2010 | # actions = min_max_scale.fit_transform(actions.values) 2011 | # actions = pd.concat([user_id, pd.DataFrame(actions,columns = columns)], axis=1) 2012 | # actions.to_csv(dump_path,index=False) 2013 | return actions 2014 | 2015 | print("U model 1 finish part_3") 2016 | 2017 | 2018 | 2019 | 2020 | 2021 | 2022 | ########################################################################################### 2023 | 2024 | 2025 | # In[ ]: 2026 | 2027 | 2028 | 2029 | 2030 | # In[ ]: 2031 | 2032 | 2033 | 2034 | 2035 | # In[ ]: 2036 | 2037 | 2038 | 2039 | 2040 | # In[7]: 2041 | 2042 | #!/usr/bin/python 2043 | 2044 | import numpy as np 2045 | import 
xgboost as xgb
2046 | # from user_feat import *
2047 | from sklearn.model_selection import train_test_split
2048 | 
2049 | 
2050 | train_start_date = '2016-03-10'
2051 | train_end_date = '2016-04-11'
2052 | test_start_date = '2016-04-11'
2053 | test_end_date = '2016-04-16'
2054 | 
2055 | # train_start_date='2016-03-05'
2056 | # train_end_date='2016-04-06'
2057 | # test_start_date='2016-04-06'
2058 | # test_end_date='2016-04-11'
2059 | 
2060 | sub_start_date = '2016-03-15'
2061 | sub_end_date = '2016-04-16'
2062 | 
2063 | # training set
2064 | actions = make_train_set(train_start_date, train_end_date, test_start_date, test_end_date,user,ful_action,sub_action)
2065 | # print(np.isinf(actions))
2066 | # print(np.isnan(actions))
2067 | 
2068 | actions
2069 | 
2070 | 
2071 | feature_name = actions.columns.values
2072 | 
2073 | # for index in feature_name[1:-1]:
2074 | # actions["r"+index]=actions[index].rank(method='max')/actions.shape[0]
2075 | 
2076 | print(actions.shape)
2077 | actions_pos = actions[actions['label']==1]
2078 | actions_neg = actions[actions['label']==0]
2079 | 
2080 | 
2081 | print("+++++++++++++++++++++++")
2082 | 
2083 | 
2084 | 
2085 | train,test=train_test_split(actions.values,test_size=0.2,random_state=0)
2086 | train=pd.DataFrame(train,columns=actions.columns)
2087 | test=pd.DataFrame(test,columns=actions.columns)
2088 | 
2089 | X_train=train.drop(['user_id','label'],axis=1)
2090 | X_test=test.drop(['user_id','label'],axis=1)
2091 | y_train=train[['label']]
2092 | y_test=test[['label']]
2093 | train_index=train[['user_id']].copy()
2094 | test_index=test[['user_id']].copy()
2095 | 
2096 | 
2097 | 
2098 | 
2099 | 
2100 | # test set
2101 | sub_test_data = make_test_set(sub_start_date, sub_end_date,user,ful_action,sub_action)
2102 | 
2103 | feature_name = sub_test_data.columns.values
2104 | # for index in feature_name[1:]:
2105 | # sub_test_data["r"+index]=sub_test_data[index].rank(method='max')/sub_test_data.shape[0]
2106 | 
2107 | 
2108 | sub_trainning_data=sub_test_data.drop(['user_id'],axis=1)
2109 | sub_user_index=sub_test_data[['user_id']].copy()
2110 | 
2111 | print("U model 1 finish part_4")
2112 | 
2113 | ########################################################################
2114 | 
2115 | 
2121 | # In[9]:
2122 | 
2123 | print ('==========>>>train xgboost model ....')
2124 | 
2125 | dtrain = xgb.DMatrix(X_train,label=y_train)
2126 | dtest = xgb.DMatrix(X_test,label=y_test)
2127 | param = {'learning_rate' : 0.1,
2128 |          'n_estimators': 1000,
2129 |          'max_depth': 3,
2130 |          'min_child_weight': 5,
2131 |          'gamma': 0,
2132 |          'subsample': 1.0,
2133 |          'colsample_bytree': 0.8,
2134 |          'eta': 0.05,
2135 |          'silent': 1,
2136 |          'objective':
2137 |              'binary:logistic',
2138 |          'scale_pos_weight':1}
2139 | 
2140 | 
2141 | 
2142 | num_round =120
2143 | plst = list(param.items())
2144 | plst += [('eval_metric', 'logloss')]
2145 | 
2146 | evallist = [(dtest, 'eval'), (dtrain, 'train')]
2147 | bst=xgb.train(plst,dtrain,num_round,evallist,early_stopping_rounds=10)
2148 | 
2149 | 
2150 | 
2151 | 
2152 | # ============================================>>>>
2153 | print ('==========>>>predict test data label')
2154 | 
2155 | 
2156 | sub_trainning_data_1 = xgb.DMatrix(sub_trainning_data)
2157 | y = bst.predict(sub_trainning_data_1)
2158 | pred = sub_user_index
2159 | sub_user_index['label'] = y
2160 | 
2161 | # print(sub_user_index.head())
2162 | 
2163 | pred=sub_user_index
2164 | #pred.sort_values(by=['user_id','label'],ascending=[0,0],inplace=True)
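# The block below turns the raw per-user xgboost scores into the Umodel_1
# submission: rows are sorted so that groupby('user_id').first() keeps exactly
# one row per user (the candidate frame should already be unique on user_id,
# so this is a dedup safeguard), and the surviving rows are then ordered by
# predicted probability. The downstream merge step keeps only the top-ranked
# users, so the ordering of Umodel_1.csv matters more than the raw scores.
2165 | 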
pred=pred.sort_values(by=['user_id','label'],ascending=[0,0]) 2166 | pred = pred.groupby('user_id').first().reset_index() 2167 | result=pred.sort_values(by=['label'],ascending=[0]) 2168 | result['user_id']=result['user_id'].astype('int') 2169 | 2170 | 2171 | result.to_csv('./sub/Umodel_1.csv',index=False,index_label=False ) 2172 | 2173 | print("U model 1 finish part_5") 2174 | 2175 | 2176 | # In[ ]: 2177 | 2178 | 2179 | 2180 | -------------------------------------------------------------------------------- /Umodel_2.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | #!/usr/bin/env python 7 | 8 | import time 9 | 10 | from datetime import datetime 11 | from datetime import timedelta 12 | import pandas as pd 13 | import pickle 14 | import os 15 | import math 16 | import numpy as np 17 | from sklearn import preprocessing 18 | import matplotlib.pyplot as plt 19 | 20 | action_1_path = "./data/JData_Action_201602.csv" 21 | action_2_path = "./data/JData_Action_201603.csv" 22 | action_3_path = "./data/JData_Action_201604.csv" 23 | user_path = "./data/JData_User.csv" 24 | product_path = "./data/JData_Product.csv" 25 | 26 | 27 | def convert_age(age_str): 28 | if age_str == u'-1': 29 | return 0 30 | elif age_str == u'15岁以下': 31 | return 1 32 | elif age_str == u'16-25岁': 33 | return 2 34 | elif age_str == u'26-35岁': 35 | return 3 36 | elif age_str == u'36-45岁': 37 | return 4 38 | elif age_str == u'46-55岁': 39 | return 5 40 | elif age_str == u'56岁以上': 41 | return 6 42 | else: 43 | return -1 44 | 45 | 46 | # 用户的基本信息 47 | def get_basic_user_feat(): 48 | dump_path = './cache/basic_user.csv' 49 | if os.path.exists(dump_path): 50 | user = pd.read_csv(dump_path) 51 | else: 52 | user = pd.read_csv(user_path, encoding='gbk') 53 | user['age'] = user['age'].map(convert_age) 54 | age_df = pd.get_dummies(user["age"], prefix="age") 55 | sex_df = pd.get_dummies(user["sex"], prefix="sex") 56 | user_lv_df = pd.get_dummies(user["user_lv_cd"], prefix="user_lv_cd") 57 | user = pd.concat([user['user_id'], age_df, sex_df, user_lv_df], axis=1) 58 | user.to_csv(dump_path, index=False) 59 | return user 60 | 61 | # 商品的基本信息 62 | def get_basic_product_feat(): 63 | dump_path = './cache/basic_product.csv' 64 | if os.path.exists(dump_path): 65 | product = pd.read_csv(dump_path) 66 | else: 67 | product = pd.read_csv(product_path) 68 | attr1_df = pd.get_dummies(product["a1"], prefix="a1") 69 | attr2_df = pd.get_dummies(product["a2"], prefix="a2") 70 | attr3_df = pd.get_dummies(product["a3"], prefix="a3") 71 | product = pd.concat([product[['sku_id', 'cate', 'brand']], attr1_df, attr2_df, attr3_df], axis=1) 72 | product.to_csv(dump_path, index=False) 73 | return product 74 | 75 | def get_actions_1(): 76 | action = pd.read_csv(action_1_path) 77 | return action 78 | 79 | 80 | def get_actions_2(): 81 | action2 = pd.read_csv(action_2_path) 82 | return action2 83 | 84 | 85 | def get_actions_3(): 86 | action3 = pd.read_csv(action_3_path) 87 | return action3 88 | 89 | def sub_get_actions(start_date,end_date): 90 | dump_path = './cache/sub_action_%s_%s.csv' % (start_date, end_date) 91 | if os.path.exists(dump_path): 92 | actions = pd.read_csv(dump_path) 93 | else: 94 | actions=get_actions(start_date,end_date) 95 | actions=actions[actions['cate']==8] 96 | actions.to_csv(dump_path,index=False) 97 | return actions 98 | 99 | # 行为数据 100 | def get_actions(start_date, end_date): 101 | """ 102 | 103 | :param start_date: 104 | :param end_date: 105 | :return: actions: 
pd.Dataframe 106 | """ 107 | dump_path = './cache/all_action_%s_%s.csv' % (start_date, end_date) 108 | if os.path.exists(dump_path): 109 | actions = pd.read_csv(dump_path) 110 | else: 111 | action_1 = get_actions_1() 112 | action_1 = action_1[(action_1.time >= start_date) & (action_1.time < end_date)] 113 | action_2 = get_actions_2() 114 | action_2 = action_2[(action_2.time >= start_date) & (action_2.time < end_date)] 115 | actions = pd.concat([action_1, action_2]) 116 | action_3 = get_actions_3() 117 | action_3 = action_3[(action_3.time >= start_date) & (action_3.time < end_date)] 118 | actions = pd.concat([actions, action_3]) # type: pd.DataFrame 119 | actions = actions[(actions.time >= start_date) & (actions.time < end_date)] 120 | actions.to_csv(dump_path, index=False) 121 | # actions['user_id']=actions['user_id'].astype('int') 122 | return actions 123 | 124 | # 获取两个时间相差几天 125 | def get_day_chaju(x, end_date): 126 | # x=x.split(' ')[0] 127 | x = datetime.strptime(x, '%Y-%m-%d %H:%M:%S') 128 | end_date = datetime.strptime(end_date, '%Y-%m-%d') 129 | return (end_date - x).days 130 | 131 | 132 | 133 | 134 | # # 所有行为的总和 135 | # def get_action_feat(start_date, end_date): 136 | # dump_path = './cache/action_%s_%s.csv' % (start_date, end_date) 137 | # if os.path.exists(dump_path): 138 | # actions = pd.read_csv(dump_path) 139 | # else: 140 | # actions = get_actions(start_date, end_date) 141 | # actions = actions[['user_id', 'sku_id', 'type']] 142 | # df = pd.get_dummies(actions['type'], prefix='action') 143 | # actions = pd.concat([actions, df], axis=1) # type: pd.DataFrame 144 | # actions = actions.groupby(['user_id', 'sku_id'], as_index=False).sum() 145 | # del actions['type'] 146 | # actions.to_csv(dump_path, index=False) 147 | # return actions 148 | # top k 天的行为次数总和(滑窗处理) 149 | 150 | #user_id,u_action_1_1,u_action_1_2,u_action_1_3,u_action_1_4,u_action_1_5,u_action_1_6 151 | def get_action_feat(start_date, end_date,k): 152 | dump_path = './cache/u_action_%s_%s_%s.csv' % (start_date, end_date,k) 153 | if os.path.exists(dump_path): 154 | actions = pd.read_csv(dump_path) 155 | else: 156 | start_days=pd.to_datetime(end_date)-timedelta(days=k) 157 | start_days=str(start_days).split(' ')[0] 158 | actions = get_actions(start_days, end_date) 159 | actions = actions[['user_id', 'type']] 160 | df = pd.get_dummies(actions['type'], prefix='type') 161 | actions = pd.concat([actions, df], axis=1) # type: pd.DataFrame 162 | actions = actions.groupby('user_id', as_index=False).sum() 163 | min_max_scaler = preprocessing.MinMaxScaler() 164 | df = min_max_scaler.fit_transform(actions.drop(['user_id','type'],axis=1).values) 165 | df = pd.DataFrame(df) 166 | df.columns=['u_action_'+str(k)+'_'+str(i) for i in range(1,df.shape[1]+1)] 167 | actions = pd.concat([actions[['user_id']], df], axis=1) 168 | actions.to_csv(dump_path, index=False) 169 | return actions 170 | 171 | 172 | 173 | 174 | 175 | 176 | # 用户的行为转化率 177 | def get_action_user_feat1(start_date, end_date): 178 | feature = ['user_id', 'user_action_1_ratio', 'user_action_2_ratio', 'user_action_3_ratio', 179 | 'user_action_5_ratio', 'user_action_6_ratio'] 180 | dump_path = './cache/user_feat_accumulate_xiugai_%s_%s.csv' % (start_date, end_date) 181 | if os.path.exists(dump_path): 182 | actions = pd.read_csv(dump_path) 183 | else: 184 | actions = get_actions(start_date, end_date) 185 | df = pd.get_dummies(actions['type'], prefix='action') 186 | actions = pd.concat([actions['user_id'], df], axis=1) 187 | actions = actions.groupby(['user_id'], 
as_index=False).sum() 188 | actions['user_action_1_ratio'] = actions['action_4'] / actions['action_1'] 189 | actions['user_action_2_ratio'] = actions['action_4'] / actions['action_2'] 190 | # actions['user_action_3_ratio'] = actions['action_4'] / actions['action_3'] 191 | actions['user_action_3_ratio'] = actions['action_3'] / actions['action_2'] 192 | actions['user_action_5_ratio'] = actions['action_4'] / actions['action_5'] 193 | actions['user_action_6_ratio'] = actions['action_4'] / actions['action_6'] 194 | # 3.购物车删除 195 | actions = actions[feature] 196 | actions.to_csv(dump_path, index=False) 197 | return actions 198 | 199 | 200 | # print get_accumulate_user_feat('2016-03-10','2016-04-11') 201 | # 用户购买前访问天数 202 | # 用户购买/加入购物车/关注前访问天数 203 | def get_action_user_feat2(start_date, end_date): 204 | dump_path = './cache/user_feat2_after_%s_%s.csv' % (start_date, end_date) 205 | if os.path.exists(dump_path): 206 | actions = pd.read_csv(dump_path) 207 | 208 | else: 209 | # 用户购买前访问天数 210 | def user_feat_2_1(start_date, end_date): 211 | actions = get_actions(start_date, end_date)[['user_id', 'type', 'time']] 212 | actions['time'] = actions['time'].map(lambda x: x.split(' ')[0]) 213 | # actions=actions.drop_duplicates(['user_id','time'],keep='first') 214 | visit = actions[actions['type'] == 1] 215 | visit = visit.drop_duplicates(['user_id', 'time'], keep='first') 216 | del visit['time'] 217 | del actions['time'] 218 | visit = visit.groupby('user_id', as_index=False).count() 219 | visit.columns = ['user_id', 'visit'] 220 | buy = actions[actions['type'] == 4] 221 | buy = buy.groupby('user_id', as_index=False).count() 222 | buy.columns = ['user_id', 'buy'] 223 | actions = pd.merge(visit, buy, on='user_id', how='left') 224 | actions['visit_day_before_buy'] = actions['visit'] / actions['buy'] 225 | del actions['buy'] 226 | del actions['visit'] 227 | return actions 228 | 229 | # 用户加入购物车前访问天数 230 | def user_feat_2_2(start_date, end_date): 231 | actions = get_actions(start_date, end_date)[['user_id', 'type', 'time']] 232 | actions['time'] = actions['time'].map(lambda x: x.split(' ')[0]) 233 | # actions=actions.drop_duplicates(['user_id','time'],keep='first') 234 | visit = actions[actions['type'] == 1] 235 | visit = visit.drop_duplicates(['user_id', 'time'], keep='first') 236 | del visit['time'] 237 | del actions['time'] 238 | visit = visit.groupby('user_id', as_index=False).count() 239 | visit.columns = ['user_id', 'visit'] 240 | addtoshopping = actions[actions['type'] == 2] 241 | addtoshopping = addtoshopping.groupby('user_id', as_index=False).count() 242 | addtoshopping.columns = ['user_id', 'addtoshopping'] 243 | actions = pd.merge(visit, addtoshopping, on='user_id', how='left') 244 | actions['visit_day_before_addtoshopping'] = actions['visit'] / actions['addtoshopping'] 245 | del actions['addtoshopping'] 246 | del actions['visit'] 247 | return actions 248 | 249 | # 用户关注前访问天数 250 | def user_feat_2_3(start_date, end_date): 251 | actions = get_actions(start_date, end_date)[['user_id', 'type', 'time']] 252 | actions['time'] = actions['time'].map(lambda x: x.split(' ')[0]) 253 | # actions=actions.drop_duplicates(['user_id','time'],keep='first') 254 | visit = actions[actions['type'] == 1] 255 | visit = visit.drop_duplicates(['user_id', 'time'], keep='first') 256 | del visit['time'] 257 | del actions['time'] 258 | visit = visit.groupby('user_id', as_index=False).count() 259 | visit.columns = ['user_id', 'visit'] 260 | guanzhu = actions[actions['type'] == 5] 261 | guanzhu = guanzhu.groupby('user_id', 
as_index=False).count() 262 | guanzhu.columns = ['user_id', 'guanzhu'] 263 | actions = pd.merge(visit, guanzhu, on='user_id', how='left') 264 | actions['visit_day_before_guanzhu'] = actions['visit'] / actions['guanzhu'] 265 | del actions['guanzhu'] 266 | del actions['visit'] 267 | return actions 268 | 269 | # 用户购买前加入购物车天数 270 | def user_feat_2_4(start_date, end_date): 271 | actions = get_actions(start_date, end_date)[['user_id', 'type', 'time']] 272 | actions['time'] = actions['time'].map(lambda x: x.split(' ')[0]) 273 | # actions=actions.drop_duplicates(['user_id','time'],keep='first') 274 | addtoshopping = actions[actions['type'] == 2] 275 | addtoshopping = addtoshopping.drop_duplicates(['user_id', 'time'], keep='first') 276 | del addtoshopping['time'] 277 | del actions['time'] 278 | addtoshopping = addtoshopping.groupby('user_id', as_index=False).count() 279 | addtoshopping.columns = ['user_id', 'addtoshopping'] 280 | buy = actions[actions['type'] == 4] 281 | buy = buy.groupby('user_id', as_index=False).count() 282 | buy.columns = ['user_id', 'buy'] 283 | actions = pd.merge(addtoshopping, buy, on='user_id', how='left') 284 | actions['addtoshopping_day_before_buy'] = actions['addtoshopping'] / actions['buy'] 285 | del actions['buy'] 286 | del actions['addtoshopping'] 287 | return actions 288 | 289 | # 用户购买前关注天数 290 | def user_feat_2_5(start_date, end_date): 291 | actions = get_actions(start_date, end_date)[['user_id', 'type', 'time']] 292 | actions['time'] = actions['time'].map(lambda x: x.split(' ')[0]) 293 | guanzhu = actions[actions['type'] == 5] 294 | guanzhu = guanzhu.drop_duplicates(['user_id', 'time'], keep='first') 295 | del guanzhu['time'] 296 | del actions['time'] 297 | guanzhu = guanzhu.groupby('user_id', as_index=False).count() 298 | guanzhu.columns = ['user_id', 'guanzhu'] 299 | buy = actions[actions['type'] == 4] 300 | buy = buy.groupby('user_id', as_index=False).count() 301 | buy.columns = ['user_id', 'buy'] 302 | actions = pd.merge(guanzhu, buy, on='user_id', how='left') 303 | actions['guanzhu_day_before_buy'] = actions['guanzhu'] / actions['buy'] 304 | del actions['buy'] 305 | del actions['guanzhu'] 306 | return actions 307 | 308 | actions = pd.merge(user_feat_2_1(start_date, end_date), user_feat_2_2(start_date, end_date), on='user_id', 309 | how='outer') 310 | actions = pd.merge(actions, user_feat_2_3(start_date, end_date), on='user_id', how='outer') 311 | actions = pd.merge(actions, user_feat_2_4(start_date, end_date), on='user_id', how='outer') 312 | actions = pd.merge(actions, user_feat_2_5(start_date, end_date), on='user_id', how='outer') 313 | user_id = actions['user_id'] 314 | del actions['user_id'] 315 | actions = actions.fillna(0) 316 | min_max_scale = preprocessing.MinMaxScaler() 317 | actions = min_max_scale.fit_transform(actions.values) 318 | actions = pd.concat([user_id, pd.DataFrame(actions)], axis=1) 319 | actions.to_csv(dump_path, index=False) 320 | actions.columns = ['user_id'] + ['u_feat2_' + str(i) for i in range(1, actions.shape[1])] 321 | return actions 322 | 323 | 324 | 325 | 326 | # # 用户总购买品牌数 327 | # def get_action_user_feat5(start_date, end_date): 328 | # dump_path = './cache/user_feat5_%s_%s.csv' % (start_date, end_date) 329 | # if os.path.exists(dump_path): 330 | # actions = pd.read_csv(dump_path) 331 | # else: 332 | # actions = get_actions(start_date, end_date)[['user_id', 'sku_id']] 333 | # actions = actions.drop_duplicates(['user_id', 'sku_id'], keep='first') 334 | # actions = actions.groupby('user_id', as_index=False).count() 335 | # 
actions.columns = ['user_id', 'sku_num'] 336 | # actions['sku_num'] = actions['sku_num'].astype('float') 337 | # actions['sku_num'] = actions['sku_num'].map( 338 | # lambda x: (x - actions['sku_num'].min()) / (actions['sku_num'].max() - actions['sku_num'].min())) 339 | # actions.to_csv(dump_path, index=False) 340 | # actions.columns = ['user_id'] + ['u_feat5_' + str(i) for i in range(1, actions.shape[1])] 341 | # return actions 342 | 343 | 344 | # 用户平均访问间隔 345 | def get_action_user_feat6(start_date, end_date): 346 | dump_path = './cache/user_feat6_%s_%s.csv' % (start_date, end_date) 347 | if os.path.exists(dump_path): 348 | actions = pd.read_csv(dump_path) 349 | else: 350 | 351 | df = get_actions(start_date, end_date)[['user_id', 'time']] 352 | # df['user_id']=df['user_id'].astype('int') 353 | df['time'] = df['time'].map(lambda x: x.split(' ')[0]) 354 | df = df.drop_duplicates(['user_id', 'time'], keep='first') 355 | df['time'] = df['time'].map(lambda x: datetime.strptime(x, '%Y-%m-%d')) 356 | actions = df.groupby('user_id', as_index=False).agg(lambda x: x['time'].diff().mean()) 357 | actions['avg_visit'] = actions['time'].dt.days 358 | del actions['time'] 359 | actions.to_csv(dump_path, index=False) 360 | actions.columns = ['user_id'] + ['u_feat6_' + str(i) for i in range(1, actions.shape[1])] 361 | return actions 362 | 363 | 364 | # 用户平均六种行为的访问间隔 365 | def get_action_user_feat6_six(start_date, end_date): 366 | dump_path = './cache/user_feat6_six_%s_%s.csv' % (start_date, end_date) 367 | if os.path.exists(dump_path): 368 | actions = pd.read_csv(dump_path) 369 | else: 370 | df = get_actions(start_date, end_date)[['user_id', 'time', 'type']] 371 | df['time'] = df['time'].map(lambda x: (-1) * get_day_chaju(x, start_date)) 372 | df = df.drop_duplicates(['user_id', 'time', 'type'], keep='first') 373 | actions = df.groupby(['user_id', 'type']).agg(lambda x: np.diff(x).mean()) 374 | actions = actions.unstack() 375 | actions.columns = list(range(actions.shape[1])) 376 | actions = actions.reset_index() 377 | actions.to_csv(dump_path, index=False) 378 | actions.columns = ['user_id'] + ['u_feat6_six_' + str(i) for i in range(1, actions.shape[1])] 379 | return actions 380 | 381 | 382 | # 用户购买频率 383 | def get_action_user_feat7(start_date, end_date): 384 | dump_path = './cache/user_feat7_six_%s_%s.csv' % (start_date, end_date) 385 | if os.path.exists(dump_path): 386 | actions = pd.read_csv(dump_path) 387 | else: 388 | df = get_actions(start_date, end_date)[['user_id', 'type', 'time']] 389 | actions = df.groupby(['user_id', 'type'], as_index=False).count() 390 | 391 | time_min = df.groupby(['user_id', 'type'], as_index=False).min() 392 | time_max = df.groupby(['user_id', 'type'], as_index=False).max() 393 | 394 | time_cha = pd.merge(time_max, time_min, on=['user_id', 'type'], how='left') 395 | time_cha['time_x'] = time_cha['time_x'].map(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S')) 396 | time_cha['time_y'] = time_cha['time_y'].map(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S')) 397 | 398 | time_cha['cha_hour'] = 1 + (time_cha['time_x'] - time_cha['time_y']).dt.days * 24 + (time_cha['time_x'] - 399 | time_cha[ 400 | 'time_y']).dt.seconds // 3600 401 | del time_cha['time_x'] 402 | del time_cha['time_y'] 403 | # time_cha=time_cha.fillna(1) 404 | 405 | actions = pd.merge(time_cha, actions, on=['user_id', 'type'], how="left") 406 | actions = actions.groupby(['user_id', 'type']).sum() 407 | actions['cnt/time'] = actions['time'] / actions["cha_hour"] 408 | actions = actions.unstack() 409 | 
actions.columns = list(range(actions.shape[1])) 410 | actions = actions.reset_index() 411 | actions = actions.fillna(0) 412 | actions.to_csv(dump_path, index=False) 413 | actions.columns = ['user_id'] + ['u_feat7_' + str(i) for i in range(1, actions.shape[1])] 414 | return actions 415 | 416 | 417 | def user_top_k_0_1(start_date, end_date): 418 | actions = get_actions(start_date, end_date) 419 | actions = actions[['user_id', 'sku_id', 'type']] 420 | df = pd.get_dummies(actions['type'], prefix='%s-%s-action' % (start_date, end_date)) 421 | actions = pd.concat([actions, df], axis=1) # type: pd.DataFrame 422 | actions = actions.groupby('user_id', as_index=False).sum() 423 | del actions['type'] 424 | del actions['sku_id'] 425 | user_id = actions['user_id'] 426 | del actions['user_id'] 427 | actions = actions.applymap(lambda x: 1 if x > 0 else 0) 428 | actions = pd.concat([user_id, actions], axis=1) 429 | return actions 430 | 431 | 432 | # 用户最近K天行为0/1提取 433 | def get_action_user_feat8(start_date, end_date): 434 | dump_path = './cache/user_feat8_%s_%s.csv' % (start_date, end_date) 435 | if os.path.exists(dump_path): 436 | actions = pd.read_csv(dump_path) 437 | else: 438 | actions = None 439 | for i in (1, 2, 3, 4, 5, 6, 7, 15, 30): 440 | print(i) 441 | start_days = datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=i) 442 | start_days = start_days.strftime('%Y-%m-%d') 443 | if actions is None: 444 | actions = user_top_k_0_1(start_days, end_date) 445 | else: 446 | actions = pd.merge(actions, user_top_k_0_1(start_days, end_date), how='outer', on='user_id') 447 | actions.to_csv(dump_path, index=False) 448 | actions.columns = ['user_id'] + ['u_feat8_' + str(i) for i in range(1, actions.shape[1])] 449 | return actions 450 | 451 | 452 | # 获取用户的重复购买率 453 | def get_action_user_feat8_2(start_date, end_date): 454 | dump_path = './cache/product_feat8_2_%s_%s.csv' % (start_date, end_date) 455 | if os.path.exists(dump_path): 456 | actions = pd.read_csv(dump_path) 457 | else: 458 | df = get_actions(start_date, end_date)[['user_id', 'sku_id', 'type']] 459 | df = df[df['type'] == 4] # 购买的行为 460 | df = df.groupby(['user_id', 'sku_id'], as_index=False).count() 461 | df.columns = ['user_id', 'sku_id', 'count1'] 462 | df['count1'] = df['count1'].map(lambda x: 1 if x > 1 else 0) 463 | grouped = df.groupby(['user_id'], as_index=False) 464 | actions = grouped.count()[['user_id', 'count1']] 465 | actions.columns = ['user_id', 'count'] 466 | re_count = grouped.sum()[['user_id', 'count1']] 467 | re_count.columns = ['user_id', 're_count'] 468 | actions = pd.merge(actions, re_count, on='user_id', how='left') 469 | re_buy_rate = actions['re_count'] / actions['count'] 470 | actions = pd.concat([actions['user_id'], re_buy_rate], axis=1) 471 | actions.columns = ['user_id', 're_buy_rate'] 472 | actions.to_csv(dump_path, index=False) 473 | actions.columns = ['user_id'] + ['u_feat8_2_' + str(i) for i in range(1, actions.shape[1])] 474 | return actions 475 | 476 | 477 | # 获取最近一次行为的时间距离当前时间的差距 478 | def get_action_user_feat9(start_date, end_date): 479 | dump_path = './cache/user_feat9_%s_%s.csv' % (start_date, end_date) 480 | if os.path.exists(dump_path): 481 | actions = pd.read_csv(dump_path) 482 | else: 483 | df = get_actions(start_date, end_date)[['user_id', 'time', 'type']] 484 | # df['time'] = df['time'].map(lambda x: (-1)*get_day_chaju(x,start_date)) 485 | df = df.drop_duplicates(['user_id', 'type'], keep='last') 486 | df['time'] = df['time'].map(lambda x: get_day_chaju(x, end_date) + 1) 487 | actions = 
df.groupby(['user_id', 'type']).sum() 488 | actions = actions.unstack() 489 | actions.columns = list(range(actions.shape[1])) 490 | actions = actions.reset_index() 491 | actions = actions.fillna(30) 492 | actions.to_csv(dump_path, index=False) 493 | actions.columns = ['user_id'] + ['u_feat9_' + str(i) for i in range(1, actions.shape[1])] 494 | return actions 495 | 496 | 497 | # 获取最后一次行为的次数并且进行归一化 498 | def get_action_user_feat10(start_date, end_date): 499 | dump_path = './cache/user_feat10_%s_%s.csv' % (start_date, end_date) 500 | if os.path.exists(dump_path): 501 | actions = pd.read_csv(dump_path) 502 | else: 503 | 504 | df = get_actions(start_date, end_date)[['user_id', 'time', 'type']] 505 | df['time'] = df['time'].map(lambda x: get_day_chaju(x, end_date) + 1) 506 | 507 | idx = df.groupby(['user_id', 'type'])['time'].transform(min) 508 | idx1 = idx == df['time'] 509 | actions = df[idx1].groupby(["user_id", "type"]).count() 510 | actions = actions.unstack() 511 | actions.columns = list(range(actions.shape[1])) 512 | actions = actions.fillna(0) 513 | actions = actions.reset_index() 514 | 515 | user_sku = actions[['user_id']] 516 | del actions['user_id'] 517 | min_max_scaler = preprocessing.MinMaxScaler() 518 | actions = min_max_scaler.fit_transform(actions.values) 519 | actions = pd.DataFrame(actions) 520 | actions = pd.concat([user_sku, actions], axis=1) 521 | 522 | actions.to_csv(dump_path, index=False) 523 | actions.columns = ['user_id'] + ['u_feat10_' + str(i) for i in range(1, actions.shape[1])] 524 | return actions 525 | 526 | 527 | # 获取人物该层级最后一层的各种行为的统计数量 528 | def get_action_user_feat11(start_date, end_date, n): 529 | dump_path = './cache/user_feat11_%s_%s_%s.csv' % (start_date, end_date, n) 530 | if os.path.exists(dump_path): 531 | actions = pd.read_csv(dump_path) 532 | else: 533 | 534 | df = get_actions(start_date, end_date)[['user_id', 'time', 'type']] 535 | df['time'] = df['time'].map(lambda x: get_day_chaju(x, end_date) // n) 536 | df = df[df['time'] == 0] 537 | del df['time'] 538 | temp = pd.get_dummies(df['type'], prefix='type') 539 | del df['type'] 540 | actions = pd.concat([df, temp], axis=1) 541 | actions = actions.groupby(['user_id'], as_index=False).sum() 542 | user_sku = actions[['user_id']] 543 | del actions['user_id'] 544 | min_max_scaler = preprocessing.MinMaxScaler() 545 | actions = min_max_scaler.fit_transform(actions.values) 546 | actions = pd.DataFrame(actions) 547 | actions = pd.concat([user_sku, actions], axis=1) 548 | actions.to_csv(dump_path, index=False) 549 | actions.columns = ['user_id'] + ['u_feat11_' + str(n) + '_' + str(i) for i in range(1, actions.shape[1])] 550 | return actions 551 | 552 | 553 | def get_action_user_feat12(start_date, end_date): 554 | dump_path = './cache/user_feat12_%s_%s.csv' % (start_date, end_date) 555 | if os.path.exists(dump_path): 556 | actions = pd.read_csv(dump_path) 557 | else: 558 | actions = get_actions(start_date, end_date)[['user_id', 'time', 'type']] 559 | actions['time'] = actions['time'].map(lambda x: x.split(' ')[0]) 560 | actions = actions.drop_duplicates(['user_id', 'time', 'type'], keep='first') 561 | actions['day'] = actions['time'].map( 562 | lambda x: (datetime.strptime(end_date, '%Y-%m-%d') - datetime.strptime(x, '%Y-%m-%d')).days) 563 | result = None 564 | for i in (2, 3, 7, 14, 28): # 层级个数 565 | print ('i%s' % i) 566 | actions['level%s' % i] = actions['day'].map(lambda x: x // i) 567 | a=set(actions['level%s' % i].tolist()) 568 | for j in (1, 2,3,4, 5, 6): # type 569 | print ('j%s' % j) 570 | df = 
actions[actions['type'] == j][['user_id', 'level%s' % i, 'time']] 571 | df = df.groupby(['user_id', 'level%s' % i]).count() 572 | df = df.unstack() 573 | b=df.columns.levels[1].tolist() 574 | df.columns = ['u_feat12_' + str('level%s_' % i) + str(j) + '_' + str(k) for k in df.columns.levels[1].tolist()] 575 | if len(list(a-set(b)))!=0: 576 | c=list(a-set(b)) 577 | for k in c: 578 | df['u_feat12_'+str('level%s_' % i)+str(j)+'_'+ str(k)]=0 579 | columns=df.columns 580 | dict={} 581 | for column in columns: 582 | k=int(column.split('_')[-1]) 583 | dict[column]=k 584 | columns=sorted(dict.items(),key=lambda x: x[1]) 585 | columns=[(columns[t])[0] for t in range(len(columns))] 586 | df=df[columns] 587 | df = df.reset_index() 588 | if result is None: 589 | result = df 590 | else: 591 | result = pd.merge(result, df, on='user_id', how='left') 592 | columns = result.columns 593 | user_id = result['user_id'] 594 | del result['user_id'] 595 | actions = result.fillna(0) 596 | 597 | min_max_scaler = preprocessing.MinMaxScaler() 598 | actions = min_max_scaler.fit_transform(actions.values) 599 | actions = pd.DataFrame(actions) 600 | actions = pd.concat([user_id, actions], axis=1) 601 | actions.columns=columns 602 | actions.to_csv(dump_path, index=False) 603 | return actions 604 | 605 | 606 | 607 | # 层级的天数 608 | def get_action_user_feat13(start_date, end_date, n): 609 | dump_path = './cache/user_feat13_%s_%s_%s.csv' % (start_date, end_date, n) 610 | if os.path.exists(dump_path): 611 | actions = pd.read_csv(dump_path) 612 | else: 613 | df = get_actions(start_date, end_date)[['user_id', 'time', 'type']] 614 | df['time'] = df['time'].map(lambda x: get_day_chaju(x, end_date) // n) 615 | df = df.drop_duplicates(['user_id', 'type', 'time'], keep='first') 616 | actions = df.groupby(['user_id', 'type']).count() 617 | actions = actions.unstack() 618 | actions.columns = list(range(actions.shape[1])) 619 | actions = actions.fillna(0) 620 | actions = actions.reset_index() 621 | user_sku = actions[['user_id']] 622 | del actions['user_id'] 623 | min_max_scaler = preprocessing.MinMaxScaler() 624 | actions = min_max_scaler.fit_transform(actions.values) 625 | actions = pd.DataFrame(actions) 626 | actions = pd.concat([user_sku, actions], axis=1) 627 | actions.to_csv(dump_path, index=False) 628 | actions.columns = ['user_id'] + ['u_feat13_' + str(n) + '_' + str(i) for i in range(1, actions.shape[1])] 629 | return actions 630 | 631 | 632 | def get_action_user_feat14(start_date, end_date): 633 | dump_path = './cache/user_feat14_%s_%s.csv' % (start_date, end_date) 634 | if os.path.exists(dump_path): 635 | actions = pd.read_csv(dump_path) 636 | else: 637 | n = 5 638 | df = get_actions(start_date, end_date)[['user_id', 'time', 'type']] 639 | df = df[df['type'] == 4][['user_id', 'time']] 640 | df['time'] = df['time'].map(lambda x: get_day_chaju(x, end_date) // n) 641 | days = np.max(df['time']) 642 | 643 | df['cnt'] = 0 644 | actions = df.groupby(['user_id', 'time']).count() 645 | 646 | actions = actions.unstack() 647 | 648 | actions.columns = list(range(actions.shape[1])) 649 | actions = actions.reset_index() 650 | 651 | actions = actions.fillna(0) 652 | user_sku = actions[['user_id']] 653 | del actions['user_id'] 654 | min_max_scaler = preprocessing.MinMaxScaler() 655 | actions = min_max_scaler.fit_transform(actions.values) 656 | actions = pd.DataFrame(actions) 657 | actions = pd.concat([user_sku, actions], axis=1) 658 | actions.to_csv(dump_path, index=False) 659 | actions.columns = ['user_id'] + ['u_feat14_' + str(i) for i in 
range(1, actions.shape[1])] 660 | return actions 661 | 662 | 663 | # 用户购买/加入购物车/关注前访问次数 664 | def get_action_user_feat15(start_date, end_date): 665 | dump_path = './cache/user_feat15_%s_%s.csv' % (start_date, end_date) 666 | if os.path.exists(dump_path): 667 | actions = pd.read_csv(dump_path) 668 | else: 669 | # 用户购买前访问次数 670 | def user_feat_15_1(start_date, end_date): 671 | actions = get_actions(start_date, end_date)[['user_id', 'type']] 672 | visit = actions[actions['type'] == 1] 673 | visit = visit.groupby('user_id', as_index=False).count() 674 | visit.columns = ['user_id', 'visit'] 675 | buy = actions[actions['type'] == 4] 676 | buy = buy.groupby('user_id', as_index=False).count() 677 | buy.columns = ['user_id', 'buy'] 678 | actions = pd.merge(visit, buy, on='user_id', how='left') 679 | actions['visit_num_before_buy'] = actions['visit'] / actions['buy'] 680 | del actions['buy'] 681 | del actions['visit'] 682 | return actions 683 | 684 | # 用户加入购物车前访问次数 685 | def user_feat_15_2(start_date, end_date): 686 | actions = get_actions(start_date, end_date)[['user_id', 'type']] 687 | visit = actions[actions['type'] == 1] 688 | visit = visit.groupby('user_id', as_index=False).count() 689 | visit.columns = ['user_id', 'visit'] 690 | addtoshopping = actions[actions['type'] == 2] 691 | addtoshopping = addtoshopping.groupby('user_id', as_index=False).count() 692 | addtoshopping.columns = ['user_id', 'addtoshopping'] 693 | actions = pd.merge(visit, addtoshopping, on='user_id', how='left') 694 | actions['visit_num_before_addtoshopping'] = actions['visit'] / actions['addtoshopping'] 695 | del actions['addtoshopping'] 696 | del actions['visit'] 697 | return actions 698 | 699 | # 用户关注前访问次数 700 | def user_feat_15_3(start_date, end_date): 701 | actions = get_actions(start_date, end_date)[['user_id', 'type']] 702 | visit = actions[actions['type'] == 1] 703 | visit = visit.groupby('user_id', as_index=False).count() 704 | visit.columns = ['user_id', 'visit'] 705 | guanzhu = actions[actions['type'] == 5] 706 | guanzhu = guanzhu.groupby('user_id', as_index=False).count() 707 | guanzhu.columns = ['user_id', 'guanzhu'] 708 | actions = pd.merge(visit, guanzhu, on='user_id', how='left') 709 | actions['visit_num_before_guanzhu'] = actions['visit'] / actions['guanzhu'] 710 | del actions['guanzhu'] 711 | del actions['visit'] 712 | return actions 713 | 714 | # 用户购买前加入购物车次数 715 | def user_feat_15_4(start_date, end_date): 716 | actions = get_actions(start_date, end_date)[['user_id', 'type']] 717 | addtoshopping = actions[actions['type'] == 2] 718 | addtoshopping = addtoshopping.groupby('user_id', as_index=False).count() 719 | addtoshopping.columns = ['user_id', 'addtoshopping'] 720 | buy = actions[actions['type'] == 4] 721 | buy = buy.groupby('user_id', as_index=False).count() 722 | buy.columns = ['user_id', 'buy'] 723 | actions = pd.merge(addtoshopping, buy, on='user_id', how='left') 724 | actions['addtoshopping_num_before_buy'] = actions['addtoshopping'] / actions['buy'] 725 | del actions['buy'] 726 | del actions['addtoshopping'] 727 | return actions 728 | 729 | # 用户购买前关注次数 730 | def user_feat_15_5(start_date, end_date): 731 | actions = get_actions(start_date, end_date)[['user_id', 'type']] 732 | guanzhu = actions[actions['type'] == 5] 733 | guanzhu = guanzhu.groupby('user_id', as_index=False).count() 734 | guanzhu.columns = ['user_id', 'guanzhu'] 735 | buy = actions[actions['type'] == 4] 736 | buy = buy.groupby('user_id', as_index=False).count() 737 | buy.columns = ['user_id', 'buy'] 738 | actions = pd.merge(guanzhu, 
# Cross feature: share of each action type in a user's total actions
def get_action_user_feat16(start_date, end_date):
    dump_path = './cache/user_feat16_%s_%s.csv' % (start_date, end_date)
    if os.path.exists(dump_path):
        actions = pd.read_csv(dump_path)
    else:
        actions = get_actions(start_date, end_date)[['user_id', 'type']]
        actions['cnt'] = 0
        action1 = actions.groupby(['user_id', 'type']).count()
        action1 = action1.unstack()
        index_col = list(range(action1.shape[1]))
        action1.columns = index_col
        action1 = action1.reset_index()
        action2 = actions.groupby('user_id', as_index=False).count()
        del action2['type']
        action2.columns = ['user_id', 'cnt']
        actions = pd.merge(action1, action2, how='left', on='user_id')
        for i in index_col:
            actions[i] = actions[i] / actions['cnt']
        del actions['cnt']
        actions.to_csv(dump_path, index=False)
    actions.columns = ['user_id'] + ['u_feat16_' + str(i) for i in range(1, actions.shape[1])]
    return actions


# Actions on items in the candidate set P during the last k days vs. actions
# on all items (for k > 7 the per-type ratio is added; for k <= 7 only the raw
# counts are kept)
def get_action_user_feat0509_1_30(start_date, end_date, n):
    dump_path = './cache/user_feat0509_1_30_%s_%s_%s.csv' % (start_date, end_date, n)
    if os.path.exists(dump_path):
        actions = pd.read_csv(dump_path)
    else:
        start_days = datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=n)
        start_days = datetime.strftime(start_days, '%Y-%m-%d')

        actions = get_actions(start_days, end_date)[['user_id', 'sku_id', 'type']]
        actions_dummy = pd.get_dummies(actions['type'], prefix='actions')
        actions = pd.concat([actions, actions_dummy], axis=1)
        del actions['type']

        # Restrict to the candidate product set P
        P = get_basic_product_feat()[['sku_id']]
        P['label'] = 1
        actions_sub = pd.merge(actions, P, on='sku_id', how='left')
        actions_sub = actions_sub[actions_sub['label'] == 1]
        del actions_sub['label']

        actions_sub = actions_sub.groupby(['user_id'], as_index=False).sum()
        del actions_sub['sku_id']
        actions_all = actions.groupby(['user_id'], as_index=False).sum()
        del actions_all['sku_id']

        actions = pd.merge(actions_all, actions_sub, on=['user_id'], how='left')
        if n > 7:
            for i in range(1, 7):
                actions['actions_%s' % i] = actions['actions_%s_y' % i] / actions['actions_%s_x' % i]
        actions.to_csv(dump_path, index=False)
    actions.columns = ['user_id'] + ['u_feat30_' + str(n) + '_' + str(i) for i in range(1, actions.shape[1])]
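# --- Editor's illustrative sketch (toy data) ---
# The label==1 trick above is an inner join in disguise; an equivalent, more
# direct way to get the subset/total share, on hypothetical data:
def _demo_subset_share():
    import pandas as pd
    acts = pd.DataFrame({'user_id': [1, 1, 2], 'sku_id': [10, 11, 10], 'actions_1': [1, 1, 1]})
    P = pd.DataFrame({'sku_id': [10]})   # candidate set
    sub = acts[acts['sku_id'].isin(P['sku_id'])]
    share = sub.groupby('user_id')['actions_1'].sum() / acts.groupby('user_id')['actions_1'].sum()
    return share   # user 1 -> 0.5, user 2 -> 1.0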
return actions


# Time gap (in hours) between a user's first click and last purchase of the same sku
def get_action_user_feat0515_2_1(start_date, end_date):
    dump_path = './cache/get_action_user_feat0515_2_1_%s_%s.csv' % (start_date, end_date)
    if os.path.exists(dump_path):
        actions = pd.read_csv(dump_path)
    else:
        actions = get_actions(start_date, end_date)
        # First click per (user, sku)
        actions_dianji = actions[actions['type'] == 6][['user_id', 'sku_id', 'time']]
        actions_dianji['time_dianji'] = actions_dianji['time'].map(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
        actions_dianji = actions_dianji[['user_id', 'sku_id', 'time_dianji']]
        actions_dianji = actions_dianji.drop_duplicates(['user_id', 'sku_id'], keep='first')
        # Last purchase per (user, sku)
        actions_goumai = actions[actions['type'] == 4][['user_id', 'sku_id', 'time']]
        actions_goumai['time_goumai'] = actions_goumai['time'].map(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
        actions_goumai = actions_goumai[['user_id', 'sku_id', 'time_goumai']]
        actions_goumai = actions_goumai.drop_duplicates(['user_id', 'sku_id'], keep='last')

        actions = pd.merge(actions_dianji, actions_goumai, on=['user_id', 'sku_id'], how='inner')
        actions['time_jiange'] = actions['time_goumai'] - actions['time_dianji']
        actions = actions.drop(['sku_id', 'time_goumai', 'time_dianji'], axis=1)
        actions['time_jiange'] = actions['time_jiange'].map(lambda x: x.days * 24 + x.seconds // 3600 + 1)

        actions_min = actions.groupby('user_id').min().reset_index()
        actions_min.columns = ['user_id', 'time_min']
        actions_max = actions.groupby('user_id').max().reset_index()
        actions_max.columns = ['user_id', 'time_max']
        actions = pd.merge(actions_min, actions_max, on='user_id', how='left')

        user_id = actions[['user_id']]
        del actions['user_id']
        actions = actions.fillna(0)
        columns = actions.columns
        min_max_scale = preprocessing.MinMaxScaler()
        actions = min_max_scale.fit_transform(actions.values)
        actions = pd.concat([user_id, pd.DataFrame(actions, columns=columns)], axis=1)
        actions.to_csv(dump_path, index=False)
    return actions


# Number of actions (all types) a user has in each product category
def get_action_user_feat0515_2_2(start_date, end_date):
    dump_path = './cache/get_action_user_feat0515_2_2_%s_%s.csv' % (start_date, end_date)
    if os.path.exists(dump_path):
        actions = pd.read_csv(dump_path)
    else:
        actions = get_actions(start_date, end_date)[['user_id', 'cate']]
        cate_col = pd.get_dummies(actions['cate'], prefix='cate')
        actions = pd.concat([actions[['user_id']], cate_col], axis=1)
        actions = actions.groupby('user_id').sum().reset_index()

        user_id = actions[['user_id']]
        del actions['user_id']
        actions = actions.fillna(0)
        columns = actions.columns
        min_max_scale = preprocessing.MinMaxScaler()
        actions = min_max_scale.fit_transform(actions.values)
        actions = pd.concat([user_id, pd.DataFrame(actions, columns=columns)], axis=1)
        actions.to_csv(dump_path, index=False)
    return actions


# Cart-add and follow counts in the last n days (overall and within cate 8)
def get_action_user_feat0515_2_3(start_date, end_date, n):
    dump_path = './cache/get_action_user_feat0515_2_3_%s_%s_%s_1.csv' % (start_date, end_date, n)
    if os.path.exists(dump_path):
        actions = pd.read_csv(dump_path)
    else:
        start_days = datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=n)
        start_days = datetime.strftime(start_days, '%Y-%m-%d')

        actions = get_actions(start_days, end_date)[['user_id', 'type', 'cate']]
        actions_gouwuche = actions[actions['type'] == 2]   # cart-adds
        actions_gouwuche_1 = actions_gouwuche[['user_id', 'type']]
        actions_gouwuche_1 = actions_gouwuche_1.groupby('user_id').count().reset_index()
        actions_gouwuche_1.columns = ['user_id', str(n) + 'gouwuche_add']

        actions_gouwuche_2 = actions_gouwuche[actions_gouwuche['cate'] == 8][['user_id', 'type']]
        actions_gouwuche_2 = actions_gouwuche_2.groupby('user_id').count().reset_index()
        actions_gouwuche_2.columns = ['user_id', str(n) + 'gouwuche_add_cate_8']

        actions_guanzhu = actions[actions['type'] == 5]    # follows
        actions_guanzhu_1 = actions_guanzhu[['user_id', 'type']]
        actions_guanzhu_1 = actions_guanzhu_1.groupby('user_id').count().reset_index()
        actions_guanzhu_1.columns = ['user_id', str(n) + 'guanzhu_add']

        actions_guanzhu_2 = actions_guanzhu[actions_guanzhu['cate'] == 8][['user_id', 'type']]
        actions_guanzhu_2 = actions_guanzhu_2.groupby('user_id').count().reset_index()
        actions_guanzhu_2.columns = ['user_id', str(n) + 'guanzhu_add_cate_8']

        actions = pd.merge(actions_gouwuche_1, actions_gouwuche_2, on='user_id', how='outer')
        actions = pd.merge(actions, actions_guanzhu_1, on='user_id', how='outer')
        actions = pd.merge(actions, actions_guanzhu_2, on='user_id', how='outer')
        actions = actions.fillna(0)

        user_id = actions[['user_id']]
        del actions['user_id']
        columns = actions.columns
        min_max_scale = preprocessing.MinMaxScaler()
        actions = min_max_scale.fit_transform(actions.values)
        actions = pd.concat([user_id, pd.DataFrame(actions, columns=columns)], axis=1)
        actions.to_csv(dump_path, index=False)
    return actions
# Number of distinct days within the last n on which each action type occurred
def get_action_user_feat0515_2_4(start_date, end_date, n):
    dump_path = './cache/get_action_user_feat0515_2_4_%s_%s_%s.csv' % (start_date, end_date, n)
    if os.path.exists(dump_path):
        actions = pd.read_csv(dump_path)
    else:
        start_days = datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=n)
        start_days = datetime.strftime(start_days, '%Y-%m-%d')

        actions = get_actions(start_days, end_date)[['user_id', 'type', 'time']]
        actions['time'] = actions['time'].map(lambda x: (datetime.strptime(end_date, '%Y-%m-%d') - datetime.strptime(x, '%Y-%m-%d %H:%M:%S')).days)
        actions = actions.drop_duplicates(['user_id', 'type', 'time'])
        actions = actions.groupby(['user_id', 'type']).count()
        actions.columns = [str(n) + 'day_nums']
        actions = actions.unstack()
        actions = actions.reset_index()
        actions.columns = ['user_id'] + ['get_action_user_feat0515_2_4_' + str(n) + '_' + str(i) for i in range(1, actions.shape[1])]
        actions = actions.fillna(0)

        user_id = actions[['user_id']]
        del actions['user_id']
        columns = actions.columns
        min_max_scale = preprocessing.MinMaxScaler()
        actions = min_max_scale.fit_transform(actions.values)
        actions = pd.concat([user_id, pd.DataFrame(actions, columns=columns)], axis=1)
        actions.to_csv(dump_path, index=False)
    return actions


# Number of distinct skus a user browsed / carted / bought / followed / clicked
def get_action_user_feat5(start_date, end_date):
    dump_path = './cache/user_feat5_a_%s_%s.csv' % (start_date, end_date)
    if os.path.exists(dump_path):
        actions = pd.read_csv(dump_path)
    else:
        actions = get_actions(start_date, end_date)
        action = None
        for i in (1, 2, 4, 5, 6):   # type 3 (cart-delete) is excluded
            df = actions[actions['type'] == i][['user_id', 'sku_id']]
            df = df.drop_duplicates(['user_id', 'sku_id'], keep='first')
            df = df.groupby('user_id', as_index=False).count()
            df.columns = ['user_id', 'num_%s' % i]
            if i == 1:
                action = df
            else:
                action = pd.merge(action, df, on='user_id', how='outer')
        actions = action.fillna(0)
        actions = actions.astype('float')
        user = actions[['user_id']]
        min_max_scaler = preprocessing.MinMaxScaler()
        actions = min_max_scaler.fit_transform(actions.drop(['user_id'], axis=1).values)
        actions = pd.DataFrame(actions)
        actions = pd.concat([user, actions], axis=1)
        actions.to_csv(dump_path, index=False)
    actions.columns = ['user_id'] + ['u_feat5_' + str(i) for i in range(1, actions.shape[1])]
    return actions
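# --- Editor's illustrative sketch (toy data) ---
# get_action_user_feat5 counts *distinct* skus per action type; the key step is
# de-duplicating (user_id, sku_id) pairs before counting:
def _demo_distinct_sku_count():
    import pandas as pd
    df = pd.DataFrame({'user_id': [1, 1, 1], 'sku_id': [10, 10, 11]})
    return df.drop_duplicates(['user_id', 'sku_id']).groupby('user_id', as_index=False).count()  # -> 2 distinct skus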
# Distinct-sku counts per action type, restricted to the last k days
def get_action_u0515_feat5(start_date, end_date, k):
    dump_path = './cache/u0515_feat5_%s_%s_%s.csv' % (start_date, end_date, k)
    if os.path.exists(dump_path):
        actions = pd.read_csv(dump_path)
    else:
        start_days = pd.to_datetime(end_date) - timedelta(days=k)
        start_days = str(start_days).split(' ')[0]
        actions = get_action_user_feat5(start_days, end_date)
        actions.to_csv(dump_path, index=False)
    actions.columns = ['user_id'] + ['u0515_feat5_' + str(k) + '_' + str(i) for i in range(1, actions.shape[1])]
    return actions


# Hours since the user's earliest interaction (full log and subset log)
def get_action_u0524_feat1(start_date, end_date):
    dump_path = './cache/u0524_feat1_%s_%s.csv' % (start_date, end_date)
    if os.path.exists(dump_path):
        actions = pd.read_csv(dump_path)
    else:
        # Full log (relies on get_actions returning time-sorted rows)
        actions = get_actions(start_date, end_date)[['user_id', 'time']]
        actions = actions.groupby('user_id', as_index=False).first()
        actions['time_diff_early'] = pd.to_datetime(end_date) - pd.to_datetime(actions['time'])
        actions['time_diff_early'] = actions['time_diff_early'].dt.days * 24 + actions['time_diff_early'].dt.seconds // 3600
        actions = actions[['user_id', 'time_diff_early']]
        # Subset log
        sub_actions = sub_get_actions(start_date, end_date)[['user_id', 'time']]
        sub_actions = sub_actions.groupby('user_id', as_index=False).first()
        sub_actions['sub_time_diff_early'] = pd.to_datetime(end_date) - pd.to_datetime(sub_actions['time'])
        sub_actions['sub_time_diff_early'] = sub_actions['sub_time_diff_early'].dt.days * 24 + sub_actions['sub_time_diff_early'].dt.seconds // 3600
        sub_actions = sub_actions[['user_id', 'sub_time_diff_early']]

        actions = pd.merge(actions, sub_actions, on='user_id', how='left')
        actions = actions.fillna(0)
        min_max_scale = preprocessing.MinMaxScaler()
        action = min_max_scale.fit_transform(actions.drop(['user_id'], axis=1).values)
        actions = pd.concat([actions[['user_id']], pd.DataFrame(action)], axis=1)
        actions.to_csv(dump_path, index=False)
    actions.columns = ['user_id'] + ['u0524_feat1_' + str(i) for i in range(1, actions.shape[1])]
    return actions


# Hours since the user's most recent interaction (full log and subset log)
def get_action_u0524_feat2(start_date, end_date):
    dump_path = './cache/u0524_feat2_%s_%s.csv' % (start_date, end_date)
    if os.path.exists(dump_path):
        actions = pd.read_csv(dump_path)
    else:
        # Full log
        actions = get_actions(start_date, end_date)[['user_id', 'time']]
        actions = actions.groupby('user_id', as_index=False).last()
        actions['time_diff_recent'] = pd.to_datetime(end_date) - pd.to_datetime(actions['time'])
        actions['time_diff_recent'] = actions['time_diff_recent'].dt.days * 24 + actions['time_diff_recent'].dt.seconds // 3600
        actions = actions[['user_id', 'time_diff_recent']]
        # Subset log
        sub_actions = sub_get_actions(start_date, end_date)[['user_id', 'time']]
        sub_actions = sub_actions.groupby('user_id', as_index=False).last()
        sub_actions['sub_time_diff_recent'] = pd.to_datetime(end_date) - pd.to_datetime(sub_actions['time'])
        sub_actions['sub_time_diff_recent'] = sub_actions['sub_time_diff_recent'].dt.days * 24 + sub_actions['sub_time_diff_recent'].dt.seconds // 3600
        sub_actions = sub_actions[['user_id', 'sub_time_diff_recent']]

        actions = pd.merge(actions, sub_actions, on='user_id', how='left')
        actions = actions.fillna(0)
        min_max_scale = preprocessing.MinMaxScaler()
        action = min_max_scale.fit_transform(actions.drop(['user_id'], axis=1).values)
        actions = pd.concat([actions[['user_id']], pd.DataFrame(action)], axis=1)
        actions.to_csv(dump_path, index=False)
    actions.columns = ['user_id'] + ['u0524_feat2_' + str(i) for i in range(1, actions.shape[1])]
    return actions
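# --- Editor's illustrative sketch ---
# Recency above is measured in whole hours of the gap to end_date:
def _demo_hours_since():
    import pandas as pd
    delta = pd.to_datetime('2016-04-11') - pd.to_datetime('2016-04-09 21:30:00')
    return delta.days * 24 + delta.seconds // 3600   # 1 day 2.5 h -> 26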
# Number of active days (full log and subset log)
def get_action_u0524_feat3(start_date, end_date):
    dump_path = './cache/u0524_feat3_%s_%s.csv' % (start_date, end_date)
    if os.path.exists(dump_path):
        actions = pd.read_csv(dump_path)
    else:
        # Full log
        actions = get_actions(start_date, end_date)
        actions['time'] = pd.to_datetime(actions['time']).dt.date
        actions = actions.drop_duplicates(['user_id', 'time'])[['user_id', 'time']]
        actions = actions.groupby('user_id', as_index=False).count()
        # Subset log
        sub_actions = sub_get_actions(start_date, end_date)
        sub_actions['time'] = pd.to_datetime(sub_actions['time']).dt.date
        sub_actions = sub_actions.drop_duplicates(['user_id', 'time'])[['user_id', 'time']]
        sub_actions = sub_actions.groupby('user_id', as_index=False).count()
        actions = pd.merge(actions, sub_actions, on='user_id', how='left')
        actions = actions.fillna(0)
        min_max_scale = preprocessing.MinMaxScaler()
        action = min_max_scale.fit_transform(actions.drop(['user_id'], axis=1).values)
        actions = pd.concat([actions[['user_id']], pd.DataFrame(action)], axis=1)
        actions.to_csv(dump_path, index=False)
    actions.columns = ['user_id'] + ['u0524_feat3_' + str(i) for i in range(1, actions.shape[1])]
    return actions


# Click-position features: counts and shares of selected click model_ids in the last n days
def get_action_user_feat0509_1_31(start_date, end_date, n):
    dump_path = './cache/user_feat0509_1_31_%s_%s_%s.csv' % (start_date, end_date, n)
    if os.path.exists(dump_path):
        actions = pd.read_csv(dump_path)
    else:
        start_days = datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=n)
        start_days = datetime.strftime(start_days, '%Y-%m-%d')
        actions = get_actions(start_days, end_date)
        actions = actions[actions['type'] == 6][['user_id', 'model_id']]

        actions_click_sum = actions[['user_id', 'model_id']].groupby('user_id').count().reset_index()
        actions_click_sum.columns = ['user_id', str(n) + 'click_sum_all']
        for m in (14, 21, 28, 110, 210):
            actions[str(n) + 'u_click%s_history' % m] = actions['model_id'].map(lambda x: int(x == m))
        actions = actions.groupby('user_id').sum().reset_index().drop('model_id', axis=1)
        actions = pd.merge(actions, actions_click_sum, how='left', on='user_id')
        for m in (14, 21, 28, 110, 210):
            actions[str(n) + 'u_click%s/click_sum_history' % m] = actions[str(n) + 'u_click%s_history' % m] / actions[str(n) + 'click_sum_all']

        user_id = actions[['user_id']]
        del actions['user_id']
        actions = actions.fillna(0)
        columns = actions.columns
        min_max_scale = preprocessing.MinMaxScaler()
        actions = min_max_scale.fit_transform(actions.values)
        actions = pd.concat([user_id, pd.DataFrame(actions, columns=columns)], axis=1)
        actions.to_csv(dump_path, index=False)
    return actions
# U model: purchases of cate 8 vs. purchases of other cates (counts and shares)
def get_action_u0513_feat16(start_date, end_date):
    dump_path = './cache/u0513_feat16_%s_%s.csv' % (start_date, end_date)
    if os.path.exists(dump_path):
        actions = pd.read_csv(dump_path)
    else:
        df = get_actions(start_date, end_date)[['user_id', 'type', 'cate']]
        df = df[df['type'] == 4]
        df = df.groupby(['user_id', 'cate']).count()
        df = df.unstack().reset_index()
        # Assumes purchases in every cate 4..11 occur in the window
        df.columns = ['user_id'] + ['cate' + str(i) for i in range(4, 12)]
        df = df.fillna(0)
        sum1 = df.drop(['user_id', 'cate8'], axis=1).apply(sum, axis=1)
        sum2 = df.drop(['user_id'], axis=1).apply(sum, axis=1)
        actions = pd.concat([df[['user_id', 'cate8']], sum1, sum2], axis=1)
        actions.columns = ['user_id', 'cate8', 'sum_other_cate', 'sum']
        actions['cate8_rate'] = actions['cate8'] / actions['sum']
        actions['sum_other_cate_rate'] = actions['sum_other_cate'] / actions['sum']
        del actions['sum']
        actions.to_csv(dump_path, index=False)
    return actions

# get_action_u0513_feat16('2016-02-01','2016-04-16')

# Bucketed behaviour: per-type action counts in each n-day bucket (at most 6 buckets)
def get_action_user_feat_six_xingwei(start_date, end_date, n):
    dump_path = './cache/user_six_action_%s_%s_%s_int.csv' % (start_date, end_date, n)
    if os.path.exists(dump_path):
        actions = pd.read_csv(dump_path)
        print("user_zlzl" + str(n))
    else:
        actions = get_actions(start_date, end_date)
        actions['time'] = actions['time'].map(lambda x: get_day_chaju(x, end_date) // n)
        num_day = np.max(actions['time'])
        df = None
        print(num_day)
        for i in range(min(num_day + 1, 6)):
            # Dummies are built on the full frame; concat aligns them to the
            # bucket's rows by index, and rows outside the bucket drop out of
            # the groupby because their user_id is NaN
            in_temp = pd.get_dummies(actions['type'], prefix="user_action_time_" + str(i))
            temp = actions[actions['time'] == i]
            temp = pd.concat([temp['user_id'], in_temp], axis=1)
            feature = ['user_id']
            for j in range(1, 7, 1):
                feature.append('user_action_time_' + str(i) + '_' + str(j))
            temp = temp.groupby(['user_id'], as_index=False).sum()
            temp.columns = feature
            if df is None:
                df = temp
            else:
                df = pd.merge(df, temp, how='outer', on='user_id')
        df.columns = ['user_id'] + ['get_action_user_feat_six_xingwei_' + str(n) + '_' + str(i) for i in range(1, df.shape[1])]
        df.to_csv(dump_path, index=False)
        actions = df
    actions.columns = ['user_id'] + ['get_action_user_feat_six_xingwei_' + str(n) + '_' + str(i) for i in range(1, actions.shape[1])]
    return actions
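# --- Editor's illustrative sketch ---
# deal_user_six_deal (below) weights the counts of bucket b by (b + 1) ** -0.67,
# so recent buckets dominate while old activity still contributes:
def _demo_decay_weights():
    return [round((b + 1) ** -0.67, 3) for b in range(6)]   # [1.0, 0.629, 0.479, 0.395, 0.34, 0.301]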
# Decayed bucket counts derived from get_action_user_feat_six_xingwei
def deal_user_six_deal(start_date, end_date, n):
    dump_path = './cache/deal_user_six_action_%s_%s_%s_int.csv' % (start_date, end_date, n)
    if os.path.exists(dump_path):
        actions = pd.read_csv(dump_path)
        actions.columns = ['user_id'] + ['u_featsix_deal_' + str(n) + '_' + str(i) for i in range(1, actions.shape[1])]
        return actions
    else:
        temp = get_action_user_feat_six_xingwei(start_date, end_date, n)
        columns = ["user_id"]
        all_col = temp.shape[1] - 1
        temp.columns = columns + list(range(all_col))
        temp = temp.fillna(0)
        columns = ['user_id']
        for j in range(0, 6, 1):
            # Column k belongs to bucket k // 6; older buckets get weight (bucket + 1) ** -0.67
            temp["zl_" + str(j)] = 0
            columns.append("zl_" + str(j))
            for k in range(j, all_col, 6):
                temp["zl_" + str(j)] = temp["zl_" + str(j)] + temp[k].map(lambda x: x * ((k // 6 + 1) ** (-0.67)))
            # Min-max normalise the decayed sum
            temp["zl_" + str(j)] = temp["zl_" + str(j)].map(lambda x: (x - np.min(temp["zl_" + str(j)])) / (np.max(temp["zl_" + str(j)]) - np.min(temp["zl_" + str(j)])))
        temp = temp[columns]
        temp.to_csv(dump_path, index=False)
        return temp

# # get user sku
# def get_user(start_date, end_date):
#     ...kept commented out in the original: selects the distinct users with
#     cart-add / follow / buy actions on cate 8 in the window...


# Behaviour in the k days before each purchase (min/max/mean per action type)
def get_action_u0509_feat_28(start_date, end_date, k):
    dump_path = './cache/u0509_feat_28_%s_%s_%s.csv' % (start_date, end_date, k)
    if os.path.exists(dump_path):
        actions = pd.read_csv(dump_path)
    else:
        actions = get_actions(start_date, end_date)
        actions = actions[actions['type'] == 4]
        actions['time_buy'] = actions['time'].map(lambda x: datetime.strptime(x.split(' ')[0], '%Y-%m-%d'))
        actions = actions[['user_id', 'sku_id', 'time_buy']].reset_index(drop=True)
        actions['before_time_buy'] = actions['time_buy'] - timedelta(days=k)

        # Join the full history of actions back onto each purchase
        df = get_actions('2016-02-01', '2016-04-16')[['user_id', 'sku_id', 'time', 'type']]
        df['time'] = df['time'].map(lambda x: datetime.strptime(x.split(' ')[0], '%Y-%m-%d'))
        df = pd.merge(df, actions, on=['user_id', 'sku_id'], how='left')
        df = df.dropna(axis=0, how='any')
        df['before_days'] = (df['time'] - df['before_time_buy']).dt.days
        df['days'] = (df['time'] - df['time_buy']).dt.days
        # Keep only the actions that fall within the k days before the purchase
        df = df[(df['before_days'] >= 0) & (df['days'] < 0)]
        df_dummy = pd.get_dummies(df['type'], prefix='type')
        df = pd.concat([df, df_dummy], axis=1)[['user_id', 'sku_id', 'type_1', 'type_2', 'type_3', 'type_4', 'type_5', 'type_6']]

        df = df.groupby(['user_id', 'sku_id'], as_index=False).sum()
        del df['sku_id']
        df = df.groupby('user_id', as_index=False).agg(['min', 'max', 'mean'])
        df = df.reset_index()
        df.columns = ['user_id'] + ['u0509_feat28_' + str(k) + '_' + i for i in (
            'type_1_min', 'type_1_max', 'type_1_mean', 'type_2_min', 'type_2_max', 'type_2_mean',
            'type_3_min', 'type_3_max', 'type_3_mean', 'type_4_min', 'type_4_max', 'type_4_mean',
            'type_5_min', 'type_5_max', 'type_5_mean', 'type_6_min', 'type_6_max', 'type_6_mean')]
        min_max_scaler = preprocessing.MinMaxScaler()
        actions = min_max_scaler.fit_transform(df.drop('user_id', axis=1).values)
        actions = pd.DataFrame(actions)
        actions = pd.concat([df[['user_id']], actions], axis=1)
        actions.columns = ['user_id'] + ['u0509_feat_28_' + str(i) for i in range(1, actions.shape[1])]
        actions.to_csv(dump_path, index=False)
    actions.columns = ['user_id'] + ['u0509_feat_28_' + str(k) + "_" + str(i) for i in range(1, actions.shape[1])]
    return actions
# Number of cate-8 brands a user viewed, and its share of all brands viewed
def get_action_u0509_feat_29(start_date, end_date):
    dump_path = './cache/u0509_feat_29_%s_%s.csv' % (start_date, end_date)
    if os.path.exists(dump_path):
        actions = pd.read_csv(dump_path)
    else:
        actions = get_actions(start_date, end_date)
        df1 = actions[actions['cate'] == 8].drop_duplicates(['user_id', 'brand'])[['user_id', 'brand']]
        df1 = df1.groupby(['user_id'], as_index=False).count()
        df1.columns = ['user_id', 'brand_cate=8']
        df2 = actions.drop_duplicates(['user_id', 'brand'])[['user_id', 'brand']]
        df2 = df2.groupby(['user_id'], as_index=False).count()
        df2.columns = ['user_id', 'brand_cate_all']
        df = pd.merge(df1, df2, on='user_id', how='right')
        df['rate'] = df['brand_cate=8'] / df['brand_cate_all']
        actions = df.fillna(0)
        actions.to_csv(dump_path, index=False)
    actions.columns = ['user_id'] + ['u0509_feat_29' + str(i) for i in range(1, actions.shape[1])]
    return actions


# Distinct cates touched in the last k days, whether cate 8 is among them,
# and the share cate8/cate
def get_action_u0521_feat_31(start_date, end_date, k):
    dump_path = './cache/u0509_feat_31_%s_%s_%s.csv' % (start_date, end_date, k)
    if os.path.exists(dump_path):
        actions = pd.read_csv(dump_path)
    else:
        start_days = pd.to_datetime(end_date) - timedelta(days=k)
        start_days = datetime.strftime(start_days, '%Y-%m-%d')
        actions = get_actions(start_days, end_date)
        df1 = actions[actions['cate'] == 8].drop_duplicates(['user_id', 'cate'])[['user_id', 'cate']]
        df1 = df1.groupby('user_id', as_index=False).count()
        df1.columns = ['user_id', 'cate8']
        df2 = actions.drop_duplicates(['user_id', 'cate'])[['user_id', 'cate']]
        df2 = df2.groupby('user_id', as_index=False).count()
        actions = pd.merge(df1, df2, on='user_id', how='right')
        actions['cate8/cate'] = actions['cate8'] / actions['cate']
        actions = actions.fillna(0)
        min_max_scaler = preprocessing.MinMaxScaler()
        df = min_max_scaler.fit_transform(actions[['cate8', 'cate']].values)
        df = pd.DataFrame(df)
        actions = pd.concat([actions[['user_id', 'cate8/cate']], df], axis=1)
        actions.to_csv(dump_path, index=False)
    actions.columns = ['user_id'] + ['u0509_feat_31_' + str(k) + '_' + str(i) for i in range(1, actions.shape[1])]
    return actions
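# --- Editor's illustrative sketch (toy data) ---
# get_action_u0521_feat_31 above reduces to: how many distinct cates a user
# touched, whether cate 8 is one of them, and the share cate8/cate:
def _demo_cate8_share():
    import pandas as pd
    df = pd.DataFrame({'user_id': [1, 1, 1], 'cate': [8, 8, 5]})
    n_cates = df.drop_duplicates(['user_id', 'cate']).groupby('user_id')['cate'].count()
    has8 = df[df['cate'] == 8].drop_duplicates(['user_id', 'cate']).groupby('user_id')['cate'].count()
    return (has8 / n_cates).fillna(0)   # user 1 -> 0.5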
# Distinct cate-8 brands per user, and average interactions per brand
def get_action_u0521_feat_32(start_date, end_date):
    dump_path = './cache/u0509_feat_32_%s_%s.csv' % (start_date, end_date)
    if os.path.exists(dump_path):
        actions = pd.read_csv(dump_path)
    else:
        actions = get_actions(start_date, end_date)
        actions = actions[actions['cate'] == 8][['user_id', 'brand']]
        df1 = actions.drop_duplicates(['user_id', 'brand']).groupby('user_id', as_index=False).count()
        df1.columns = ['user_id', 'brand_num']
        df2 = actions.groupby('user_id', as_index=False).count()
        actions = pd.merge(df1, df2, on='user_id', how='left')
        actions['brand/brand_num'] = actions['brand'] / actions['brand_num']   # interactions per distinct brand
        actions = actions.fillna(0)
        min_max_scaler = preprocessing.MinMaxScaler()
        df = min_max_scaler.fit_transform(actions.drop(['user_id'], axis=1).values)
        df = pd.DataFrame(df)
        actions = pd.concat([actions[['user_id']], df], axis=1)
        actions.to_csv(dump_path, index=False)
    actions.columns = ['user_id'] + ['u0509_feat_32_' + str(i) for i in range(1, actions.shape[1])]
    return actions


# Action intensity: per-type count divided by the hours between the first and
# last action of that type within the last n days
def get_action_user_feat7_0522_huachuang(start_date, end_date, n):
    dump_path = './cache/user_feat7_six_%s_%s_%s_0522.csv' % (start_date, end_date, n)
    if os.path.exists(dump_path):
        actions = pd.read_csv(dump_path)
    else:
        start_days = datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=n)
        start_days = datetime.strftime(start_days, '%Y-%m-%d')

        df = get_actions(start_days, end_date)[['user_id', 'type', 'time']]
        actions = df.groupby(['user_id', 'type'], as_index=False).count()

        time_min = df.groupby(['user_id', 'type'], as_index=False).min()
        time_max = df.groupby(['user_id', 'type'], as_index=False).max()

        time_cha = pd.merge(time_max, time_min, on=['user_id', 'type'], how='left')
        time_cha['time_x'] = time_cha['time_x'].map(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
        time_cha['time_y'] = time_cha['time_y'].map(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
        time_cha['cha_hour'] = 1 + (time_cha['time_x'] - time_cha['time_y']).dt.days * 24 + (time_cha['time_x'] - time_cha['time_y']).dt.seconds // 3600
        del time_cha['time_x']
        del time_cha['time_y']

        actions = pd.merge(time_cha, actions, on=['user_id', 'type'], how="left")
        actions = actions.groupby(['user_id', 'type']).sum()
        actions['cnt/time'] = actions['time'] / actions["cha_hour"]
        actions = actions.unstack()
        actions.columns = list(range(actions.shape[1]))
        actions = actions.reset_index()
        actions = actions.fillna(0)
        actions.to_csv(dump_path, index=False)
    actions.columns = ['user_id'] + ['u_feat7_' + str(n) + "_" + str(i) for i in range(1, actions.shape[1])]
    return actions
+str(n)+"_"+ str(i) for i in range(1, actions.shape[1])] 1460 | return actions 1461 | 1462 | def get_user_labels(test_start_date,test_end_date): 1463 | dump_path = './cache/user_labels_%s_%s_11.csv' % (test_start_date, test_end_date) 1464 | if os.path.exists(dump_path): 1465 | actions = pd.read_csv(dump_path) 1466 | else: 1467 | actions = get_actions(test_start_date, test_end_date) 1468 | actions = actions[actions['cate']==8] 1469 | actions = actions[actions['type'] == 4].drop_duplicates(['user_id'])[['user_id']] 1470 | actions['label'] = 1 1471 | 1472 | return actions 1473 | 1474 | 1475 | print("U model 2 finish part_0") 1476 | ######################################################################################################### 1477 | 1478 | 1479 | # In[ ]: 1480 | 1481 | 1482 | 1483 | 1484 | # In[ ]: 1485 | 1486 | 1487 | 1488 | 1489 | # In[3]: 1490 | 1491 | import os 1492 | from datetime import datetime 1493 | from datetime import timedelta 1494 | 1495 | # -*- coding: utf-8 -*- 1496 | """ 1497 | Created on Sun May 14 10:27:41 2017 1498 | @author: 老虎趴趴走 1499 | """ 1500 | import pandas as pd 1501 | import numpy as np 1502 | # import datetime 1503 | import math 1504 | 1505 | def user_features(user, ful_action, sub_action, end_date): 1506 | dump_path='./cache/user_features_%s_0514_2.csv'%(end_date) 1507 | if os.path.exists(dump_path): 1508 | actions = pd.read_csv(dump_path) 1509 | 1510 | else: 1511 | end_date=pd.to_datetime(end_date) 1512 | day = timedelta(1, 0) 1513 | print('=====> 提取特征...') 1514 | sub_1 = sub_action[(sub_action['time']>=end_date-1*day) & (sub_action['time']=end_date-3*day) & (sub_action['time']=end_date-5*day) & (sub_action['time']=end_date-30*day) & (sub_action['time']=end_date-5*day) & (ful_action['time']=end_date-30*day) & (ful_action['time'] 完成!') 1742 | actions.to_csv(dump_path,index=False) 1743 | 1744 | # user_id = actions[['user_id']] 1745 | # del actions['user_id'] 1746 | # actions = actions.fillna(0) 1747 | # actions=actions.replace(np.inf,0) 1748 | # print(actions.head()) 1749 | # columns = actions.columns 1750 | 1751 | # min_max_scale = preprocessing.MinMaxScaler() 1752 | # actions=actions.replace(np.inf,0) 1753 | # actions = min_max_scale.fit_transform(actions.values) 1754 | # actions = pd.concat([user_id, pd.DataFrame(actions,columns = columns)], axis=1) 1755 | return actions 1756 | 1757 | import pandas as pd 1758 | ful_action = pd.read_csv('./data/JData_Action.csv', parse_dates=[2], infer_datetime_format=True) 1759 | sub_action = pd.read_csv('./data/JData_subset_action.csv', parse_dates=[2, 7], infer_datetime_format=True) 1760 | user = pd.read_csv('./data/JData_modified_user.csv', parse_dates=[4]) 1761 | # user_features(user,ful_action,sel_action,'2016-04-11') 1762 | 1763 | print("U model 2 finish part_1") 1764 | ###################################################################################### 1765 | 1766 | 1767 | # In[ ]: 1768 | 1769 | 1770 | 1771 | 1772 | # In[ ]: 1773 | 1774 | 1775 | 1776 | 1777 | # In[ ]: 1778 | 1779 | 1780 | 1781 | 1782 | # In[ ]: 1783 | 1784 | 1785 | 1786 | 1787 | # In[8]: 1788 | 1789 | # 测试集 1790 | # ful_action = pd.read_csv('./data/JData_Action.csv', parse_dates=[2], infer_datetime_format=True) 1791 | # sel_action = pd.read_csv('./data/JData_subset_action.csv', parse_dates=[2, 7], infer_datetime_format=True) 1792 | def make_test_set(train_start_date, train_end_date,user,ful_action,sub_action): 1793 | dump_path = './cache/bu10525model_2_u_test_set_%s_%s.csv' % (train_start_date, train_end_date) 1794 | if 
# Submission (test) feature set: candidates are all users who touched cate 8
# in the 30 days before train_end_date
def make_test_set(train_start_date, train_end_date, user, ful_action, sub_action):
    dump_path = './cache/bu10525model_2_u_test_set_%s_%s.csv' % (train_start_date, train_end_date)
    if os.path.exists(dump_path):
        actions = pd.read_csv(dump_path)
    else:
        start_days = str(pd.to_datetime(train_end_date) - timedelta(days=30)).split(' ')[0]
        actions_1 = get_actions(start_days, train_end_date)
        actions = actions_1[actions_1['cate'] == 8][['user_id']].drop_duplicates(['user_id'])
        print(actions.shape)

        # Long-horizon features are computed from the start of the data
        start_days = "2016-02-01"
        for feat in (get_action_user_feat2, get_action_user_feat5, get_action_user_feat6,
                     get_action_user_feat6_six, get_action_user_feat7, get_action_user_feat8,
                     get_action_user_feat8_2, get_action_user_feat9, get_action_user_feat10):
            actions = pd.merge(actions, feat(start_days, train_end_date), how='left', on='user_id')
            print(actions.shape)
        for feat in (get_action_user_feat12, get_action_user_feat14):
            actions = pd.merge(actions, feat(train_start_date, train_end_date), how='left', on='user_id')
            print(actions.shape)
        for feat in (get_action_user_feat15, get_action_user_feat16, get_action_u0513_feat16):
            actions = pd.merge(actions, feat(start_days, train_end_date), how='left', on='user_id')
            print(actions.shape)
        actions = pd.merge(actions, user_features(user, ful_action, sub_action, train_end_date), how='left', on='user_id')
        print(actions.shape)
        actions = pd.merge(actions, get_action_user_feat0515_2_1(train_start_date, train_end_date), how='left', on='user_id')
        print(actions.shape)
        actions = pd.merge(actions, get_action_user_feat0515_2_2(train_start_date, train_end_date), how='left', on='user_id')
        print(actions.shape)
        # Used by both model 1 and model 2
        actions = pd.merge(actions, get_action_u0509_feat_29(train_start_date, train_end_date), how='left', on='user_id')
        print(actions.shape)
        # Model 2 only
        actions = pd.merge(actions, get_action_u0521_feat_32(train_start_date, train_end_date), how='left', on='user_id')

        # Window features over the last 1/2/3/7/14/28 days
        for i in (1, 2, 3, 7, 14, 28):
            for feat in (get_action_user_feat_six_xingwei, deal_user_six_deal, get_action_user_feat11,
                         get_action_user_feat13, get_action_user_feat0509_1_30, get_action_user_feat0515_2_3,
                         get_action_feat, get_action_user_feat0515_2_4, get_action_u0515_feat5,
                         get_action_u0509_feat_28):
                actions = pd.merge(actions, feat(train_start_date, train_end_date, i), how='left', on='user_id')
            if i <= 10:
                actions = pd.merge(actions, get_action_user_feat0509_1_31(train_start_date, train_end_date, i), how='left', on='user_id')
            # Model 2 only
            actions = pd.merge(actions, get_action_u0521_feat_31(train_start_date, train_end_date, i), how='left', on='user_id')
            actions = pd.merge(actions, get_action_user_feat7_0522_huachuang(train_start_date, train_end_date, i), how='left', on='user_id')
            print(actions.shape)

        actions = actions.fillna(0)
        # actions.to_csv(dump_path, index=False)
    return actions
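# --- Editor's note ---
# make_train_set below mirrors make_test_set feature for feature; the only
# differences are the cache key and the final merge with get_user_labels over
# the label window (test_start_date, test_end_date).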
# Training set: the same features over the training window, plus labels
def make_train_set(train_start_date, train_end_date, test_start_date, test_end_date, user, ful_action, sub_action):
    dump_path = './cache/bu10525model_2_u_train_set_%s_%s_%s_%s.csv' % (train_start_date, train_end_date, test_start_date, test_end_date)
    if os.path.exists(dump_path):
        actions = pd.read_csv(dump_path)
    else:
        start_days = str(pd.to_datetime(train_end_date) - timedelta(days=30)).split(' ')[0]
        actions_1 = get_actions(start_days, train_end_date)
        actions = actions_1[actions_1['cate'] == 8][['user_id']].drop_duplicates(['user_id'])
        print(actions.shape)

        # Long-horizon features are computed from the start of the data
        start_days = "2016-02-01"
        for feat in (get_action_user_feat2, get_action_user_feat5, get_action_user_feat6,
                     get_action_user_feat6_six, get_action_user_feat7, get_action_user_feat8,
                     get_action_user_feat8_2, get_action_user_feat9, get_action_user_feat10):
            actions = pd.merge(actions, feat(start_days, train_end_date), how='left', on='user_id')
            print(actions.shape)
        for feat in (get_action_user_feat12, get_action_user_feat14):
            actions = pd.merge(actions, feat(train_start_date, train_end_date), how='left', on='user_id')
            print(actions.shape)
        for feat in (get_action_user_feat15, get_action_user_feat16, get_action_u0513_feat16):
            actions = pd.merge(actions, feat(start_days, train_end_date), how='left', on='user_id')
            print(actions.shape)
        actions = pd.merge(actions, user_features(user, ful_action, sub_action, train_end_date), how='left', on='user_id')
        print(actions.shape)

        actions = pd.merge(actions, get_action_user_feat0515_2_1(train_start_date, train_end_date), how='left', on='user_id')
        print(actions.shape)
        actions = pd.merge(actions, get_action_user_feat0515_2_2(train_start_date, train_end_date), how='left', on='user_id')
        print(actions.shape)
        # Used by both model 1 and model 2
        actions = pd.merge(actions, get_action_u0509_feat_29(train_start_date, train_end_date), how='left', on='user_id')
        # Model 2 only
        actions = pd.merge(actions, get_action_u0521_feat_32(train_start_date, train_end_date), how='left', on='user_id')
        print(actions.shape)

        # Window features over the last 1/2/3/7/14/28 days
        for i in (1, 2, 3, 7, 14, 28):
            for feat in (get_action_user_feat_six_xingwei, deal_user_six_deal, get_action_user_feat11,
                         get_action_user_feat13, get_action_user_feat0509_1_30, get_action_user_feat0515_2_3,
                         get_action_feat, get_action_user_feat0515_2_4, get_action_u0515_feat5,
                         get_action_u0509_feat_28):
                actions = pd.merge(actions, feat(train_start_date, train_end_date, i), how='left', on='user_id')
            if i <= 10:
                actions = pd.merge(actions, get_action_user_feat0509_1_31(train_start_date, train_end_date, i), how='left', on='user_id')
            # Model 2 only
            actions = pd.merge(actions, get_action_u0521_feat_31(train_start_date, train_end_date, i), how='left', on='user_id')
            actions = pd.merge(actions, get_action_user_feat7_0522_huachuang(train_start_date, train_end_date, i), how='left', on='user_id')
            print(actions.shape)

        # Attach labels: users who bought cate 8 in the label window get 1, the rest 0
        actions = pd.merge(actions, get_user_labels(test_start_date, test_end_date), how='left', on='user_id')
        actions = actions.fillna(0)
        print(actions.shape)
        # actions.to_csv(dump_path, index=False)
    return actions
print("U model 2 finish part_3")


###########################################################################################


# In[9]:

#!/usr/bin/python

import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split

# Feature window for training ends where the 5-day label window begins
train_start_date = '2016-03-10'
train_end_date = '2016-04-11'
test_start_date = '2016-04-11'
test_end_date = '2016-04-16'

# Submission feature window, shifted to end at the last day of data
sub_start_date = '2016-03-15'
sub_end_date = '2016-04-16'

# Training data
actions = make_train_set(train_start_date, train_end_date, test_start_date, test_end_date, user, ful_action, sub_action)
print("+++++++++++++++++++++++")

train, test = train_test_split(actions.values, test_size=0.2, random_state=0)
train = pd.DataFrame(train, columns=actions.columns)
test = pd.DataFrame(test, columns=actions.columns)

X_train = train.drop(['user_id', 'label'], axis=1)
X_test = test.drop(['user_id', 'label'], axis=1)
y_train = train[['label']]
y_test = test[['label']]
train_index = train[['user_id']].copy()
test_index = test[['user_id']].copy()

# Submission data
sub_test_data = make_test_set(sub_start_date, sub_end_date, user, ful_action, sub_action)
sub_trainning_data = sub_test_data.drop(['user_id'], axis=1)
sub_user_index = sub_test_data[['user_id']].copy()

print("U model 2 finish part_4")

########################################################################


# In[11]:

print('==========>>>train xgboost model ....')

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
# 'learning_rate' and 'eta' are aliases in XGBoost, so only one of the two
# values is applied (which depends on parameter order), and 'n_estimators'
# is ignored by xgb.train, which uses num_round instead.
param = {'learning_rate': 0.1,
         'n_estimators': 1000,
         'max_depth': 3,
         'min_child_weight': 5,
         'gamma': 0,
         'subsample': 1.0,
         'colsample_bytree': 0.8,
         'eta': 0.05,
         'silent': 1,
         'objective': 'binary:logistic',
         'scale_pos_weight': 1}

num_round = 120
plst = list(param.items())
plst += [('eval_metric', 'logloss')]

evallist = [(dtest, 'eval'), (dtrain, 'train')]
bst = xgb.train(plst, dtrain, num_round, evallist, early_stopping_rounds=10)
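# --- Editor's note (sketch; the exact API depends on the installed xgboost) ---
# With early_stopping_rounds the booster records the best round on the eval
# set, but older xgboost versions still predict with all trees unless told
# otherwise, e.g. y = bst.predict(dmat, ntree_limit=bst.best_ntree_limit).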
# ============================================>>>>
print('==========>>>predict test data label')

sub_trainning_data_1 = xgb.DMatrix(sub_trainning_data)
y = bst.predict(sub_trainning_data_1)
sub_user_index['label'] = y

# Keep the highest score per user, then rank all users by score
pred = sub_user_index
pred = pred.sort_values(by=['user_id', 'label'], ascending=[0, 0])
pred = pred.groupby('user_id').first().reset_index()
result = pred.sort_values(by=['label'], ascending=[0])
result['user_id'] = result['user_id'].astype('int')

result.to_csv('./sub/Umodel_2.csv', index=False)
print("U model 2 finish part_5")

--------------------------------------------------------------------------------