├── JData_第一炉香_12_代码运行说明 解题思路.pdf
├── start.sh
├── merge_result.py
├── README.md
├── preprocessing.py
├── Umodel_1.py
└── Umodel_2.py

--------------------------------------------------------------------------------
/JData_第一炉香_12_代码运行说明 解题思路.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hecongqing/2017-jdata-competition/HEAD/JData_第一炉香_12_代码运行说明 解题思路.pdf

--------------------------------------------------------------------------------
/start.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | python preprocessing.py
3 | python USModel.py
4 | python Umodel_0.py
5 | python Umodel_1.py
6 | python Umodel_2.py
7 | python merge_result.py
8 | 
9 | 

--------------------------------------------------------------------------------
/merge_result.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #-*-coding:utf-8-*-
3 | import pandas as pd
4 | # Blend the three U-model scores with fixed weights and keep the top 700 users
5 | def u_id():
6 |     df1=pd.read_csv('./sub/Umodel_0.csv')
7 |     df1.columns=['user_id','label1']
8 | 
9 |     df2=pd.read_csv('./sub/Umodel_1.csv')
10 |     df2.columns=['user_id','label2']
11 | 
12 |     df3=pd.read_csv('./sub/Umodel_2.csv')
13 |     df3.columns=['user_id','label3']
14 | 
15 |     df=pd.merge(df1,df2,on='user_id',how='outer')
16 |     df=pd.merge(df,df3,on='user_id',how='outer')
17 |     df['label']=0.3*df['label1']+0.3*df['label2']+0.4*df['label3']
18 |     df.sort_values(by=['label'],ascending=[0],inplace=True)
19 |     df=df[['user_id','label']].reset_index(drop=True)
20 |     df=df[['user_id']]
21 |     return df[:700]
22 | # Take the top 325 users from the US model's output
23 | def us_id():
24 |     df=pd.read_csv('./sub/USModel.csv')
25 |     df=df[['user_id']]
26 |     return df[:325]
27 | # Union of the U-model top 700 and the US-model top 325 (802 unique users)
28 | def merge_u_us():
29 |     u = u_id()
30 |     us = us_id()
31 |     df=pd.merge(u,us,on='user_id',how='outer')
32 |     df=df.drop_duplicates('user_id')
33 |     return df
34 | 
35 | # Join the 802 users with the US model's ['user_id','sku_id'] pairs to build the submission
36 | def result():
37 |     u = merge_u_us()
38 |     us=pd.read_csv('./sub/USModel.csv')
39 |     us=us[['user_id','sku_id']]
40 |     us=us.astype('int')
41 |     result=pd.merge(u,us,how='left',on='user_id')
42 |     print('===========>>> printing final result:')
43 |     result=result.fillna(0)
44 |     result=result.astype('int')
45 | 
46 |     result.to_csv('./sub/best_result.csv',index=False)
47 |     return result
48 | 
49 | print(result())
50 | 

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 2017 JData Competition
2 | High-potential-user purchase-intent prediction - rank 12
3 | 
4 | # Competition page
5 | 
6 | https://www.datafountain.cn/competitions/247/details/data-evaluation
7 | 
8 | # Task
9 | The contest is built on real (anonymized) user, product, and behavior data from JD.com. Teams apply data-mining and machine-learning techniques to build a model of users' purchase behavior and
10 | output matches between high-potential users and target products, providing high-quality audiences for precision marketing. The organizers also hope teams will uncover the latent meaning behind the data and give e-commerce users a
11 | simpler, faster, and easier shopping experience.
12 | Participants use historical sales data for products in several JD categories to build a model that predicts users' purchase intent, over the next 5 days, for products in a target category. For every user appearing in the
13 | training set, the model must predict whether that user buys a product from the target category within the next 5 days and, if so, the SKU_ID of the purchased product. Submissions are scored with a weighted metric.
14 | 
15 | # Scoring
16 | The submitted result file contains purchase-intent predictions for all users. Each user's prediction has two parts:
17 | 
18 | 1. Whether the user orders a product from P between 2016-04-16 and 2016-04-20. The file contains only users predicted to order; users predicted not to order must be omitted. If the prediction is correct, the evaluator sets label=1, otherwise label=0.
19 | 
20 | 2. If an order is predicted, the ordered sku_id (submit exactly one sku_id). If the sku_id is correct, the evaluator sets pred=1, otherwise pred=0.
21 | 
22 | The submission is scored as:
23 | 
24 |     Score = 0.4 * F11 + 0.6 * F12
25 | 
26 | The two F1 terms are defined as:
27 | 
28 |     F11 = 6 * Recall * Precise / (5 * Recall + Precise)
29 | 
30 |     F12 = 5 * Recall * Precise / (2 * Recall + 3 * Precise)
31 | 
32 | where Precise is precision and Recall is recall; F11 is the F1 over the label (user) predictions and F12 is the F1 over the pred (user-sku) predictions.
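
The score is easy to reproduce from these formulas. A minimal sketch (not part of the repository; the four precision/recall inputs are assumed to be computed by the evaluator):

```python
def jdata_score(label_precise, label_recall, pred_precise, pred_recall):
    """Weighted competition score: 0.4 * F11 + 0.6 * F12."""
    f11 = (6 * label_recall * label_precise / (5 * label_recall + label_precise)
           if label_recall + label_precise > 0 else 0.0)
    f12 = (5 * pred_recall * pred_precise / (2 * pred_recall + 3 * pred_precise)
           if pred_recall + pred_precise > 0 else 0.0)
    return 0.4 * f11 + 0.6 * f12
```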
33 | 
--------------------------------------------------------------------------------
/preprocessing.py:
--------------------------------------------------------------------------------
1 | 
2 | # coding: utf-8
3 | 
4 | # In[1]:
5 | 
6 | import pandas as pd
7 | import numpy as np
8 | 
9 | path = './'
10 | 
11 | def concat_action():
12 |     action1 = pd.read_csv(path+'/data/JData_Action_201602.csv')
13 |     action2 = pd.read_csv(path+'/data/JData_Action_201603.csv')
14 |     action3 = pd.read_csv(path+'/data/JData_Action_201604.csv')
15 |     action = pd.concat([action1,action2,action3]).sort_values(by='time')
16 |     action.to_csv('./data/JData_Action.csv', index=False)
17 | 
18 | def map_user_reg(x):
19 |     if d >= 0 and d <= 3:
30 |         d = 1
31 |     elif d > 3 and d <= 6:
32 |         d = 2
33 |     elif d > 6 and d <= 12:
34 |         d = 3
35 |     elif d > 12 and d <= 24:
36 |         d = 4
37 |     elif d > 24 and d <= 48:
38 |         d = 5
39 |     else:
40 |         d = 6
41 |     return d
42 | 
43 | def user_process():
44 |     user = pd.read_csv(path + '/data/JData_User.csv', encoding='gbk', parse_dates=[4])
45 |     user = user.drop_duplicates('user_id')
46 |     #user = user[user['user_reg_tm']
112 |         action_1 = action_1[(action_1.time >= start_date) & (action_1.time < end_date)]
113 |         action_2 = get_actions_2()
114 |         action_2 = action_2[(action_2.time >= start_date) & (action_2.time < end_date)]
115 |         actions = pd.concat([action_1, action_2])
116 |         action_3 = get_actions_3()
117 |         action_3 = action_3[(action_3.time >= start_date) & (action_3.time < end_date)]
118 |         actions = pd.concat([actions, action_3])  # type: pd.DataFrame
119 |         actions = actions[(actions.time >= start_date) & (actions.time < end_date)]
120 |         actions.to_csv(dump_path, index=False)
121 |         # actions['user_id']=actions['user_id'].astype('int')
122 |     return actions
123 | 
124 | # Number of whole days between timestamp x and end_date
125 | def get_day_chaju(x, end_date):
126 |     # x=x.split(' ')[0]
127 |     x = datetime.strptime(x, '%Y-%m-%d %H:%M:%S')
128 |     end_date = datetime.strptime(end_date, '%Y-%m-%d')
129 |     return (end_date - x).days
130 | 
131 | 
132 | def get_action_feat(start_date, end_date,k):
133 |     dump_path = './cache/u_action_%s_%s_%s.csv' % (start_date, end_date,k)
134 |     if os.path.exists(dump_path):
135 |         actions = pd.read_csv(dump_path)
136 |     else:
137 |         start_days=pd.to_datetime(end_date)-timedelta(days=k)
138 |         start_days=str(start_days).split(' ')[0]
139 |         actions = get_actions(start_days, end_date)
140 |         actions = actions[['user_id', 'type']]
141 |         df = pd.get_dummies(actions['type'], prefix='type')
142 |         actions = pd.concat([actions, df], axis=1)  # type: pd.DataFrame
143 |         actions = actions.groupby('user_id', as_index=False).sum()
144 |         min_max_scaler = preprocessing.MinMaxScaler()
145 |         df = min_max_scaler.fit_transform(actions.drop(['user_id','type'],axis=1).values)
146 |         df = pd.DataFrame(df)
147 |         df.columns=['u_action_'+str(k)+'_'+str(i) for i in range(1,df.shape[1]+1)]
148 |         actions = pd.concat([actions[['user_id']], df], axis=1)
149 |         actions.to_csv(dump_path, index=False)
150 |     return actions
151 | 
152 | 
153 | 
154 | 
155 | 
156 | 
157 | # User action-to-purchase conversion ratios
158 | def get_action_user_feat1(start_date, end_date):
159 |     feature = ['user_id', 'user_action_1_ratio', 'user_action_2_ratio', 'user_action_3_ratio',
160 |                'user_action_5_ratio', 'user_action_6_ratio']
161 |     dump_path = './cache/user_feat_accumulate_xiugai_%s_%s.csv' % (start_date, end_date)
162 |     if os.path.exists(dump_path):
163 |         actions = pd.read_csv(dump_path)
164 |     else:
165 |         actions = get_actions(start_date, end_date)
166 |         df = pd.get_dummies(actions['type'], prefix='action')
167 |         actions = pd.concat([actions['user_id'], df], axis=1)
168 | 
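# The groupby-sum below collapses the one-hot action columns into per-user
# counts (action_1 .. action_6). The ratio features then divide purchases
# (action_4) by the other counts (ratio 3 divides cart-deletes by cart-adds),
# so a zero denominator yields inf, which downstream consumers must handle.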
actions = actions.groupby(['user_id'], as_index=False).sum() 169 | actions['user_action_1_ratio'] = actions['action_4'] / actions['action_1'] 170 | actions['user_action_2_ratio'] = actions['action_4'] / actions['action_2'] 171 | # actions['user_action_3_ratio'] = actions['action_4'] / actions['action_3'] 172 | actions['user_action_3_ratio'] = actions['action_3'] / actions['action_2'] 173 | actions['user_action_5_ratio'] = actions['action_4'] / actions['action_5'] 174 | actions['user_action_6_ratio'] = actions['action_4'] / actions['action_6'] 175 | # 3.购物车删除 176 | actions = actions[feature] 177 | actions.to_csv(dump_path, index=False) 178 | return actions 179 | 180 | 181 | # print get_accumulate_user_feat('2016-03-10','2016-04-11') 182 | # 用户购买前访问天数 183 | # 用户购买/加入购物车/关注前访问天数 184 | def get_action_user_feat2(start_date, end_date): 185 | dump_path = './cache/user_feat2_after_%s_%s.csv' % (start_date, end_date) 186 | if os.path.exists(dump_path): 187 | actions = pd.read_csv(dump_path) 188 | 189 | else: 190 | # 用户购买前访问天数 191 | def user_feat_2_1(start_date, end_date): 192 | actions = get_actions(start_date, end_date)[['user_id', 'type', 'time']] 193 | actions['time'] = actions['time'].map(lambda x: x.split(' ')[0]) 194 | # actions=actions.drop_duplicates(['user_id','time'],keep='first') 195 | visit = actions[actions['type'] == 1] 196 | visit = visit.drop_duplicates(['user_id', 'time'], keep='first') 197 | del visit['time'] 198 | del actions['time'] 199 | visit = visit.groupby('user_id', as_index=False).count() 200 | visit.columns = ['user_id', 'visit'] 201 | buy = actions[actions['type'] == 4] 202 | buy = buy.groupby('user_id', as_index=False).count() 203 | buy.columns = ['user_id', 'buy'] 204 | actions = pd.merge(visit, buy, on='user_id', how='left') 205 | actions['visit_day_before_buy'] = actions['visit'] / actions['buy'] 206 | del actions['buy'] 207 | del actions['visit'] 208 | return actions 209 | 210 | # 用户加入购物车前访问天数 211 | def user_feat_2_2(start_date, end_date): 212 | actions = get_actions(start_date, end_date)[['user_id', 'type', 'time']] 213 | actions['time'] = actions['time'].map(lambda x: x.split(' ')[0]) 214 | # actions=actions.drop_duplicates(['user_id','time'],keep='first') 215 | visit = actions[actions['type'] == 1] 216 | visit = visit.drop_duplicates(['user_id', 'time'], keep='first') 217 | del visit['time'] 218 | del actions['time'] 219 | visit = visit.groupby('user_id', as_index=False).count() 220 | visit.columns = ['user_id', 'visit'] 221 | addtoshopping = actions[actions['type'] == 2] 222 | addtoshopping = addtoshopping.groupby('user_id', as_index=False).count() 223 | addtoshopping.columns = ['user_id', 'addtoshopping'] 224 | actions = pd.merge(visit, addtoshopping, on='user_id', how='left') 225 | actions['visit_day_before_addtoshopping'] = actions['visit'] / actions['addtoshopping'] 226 | del actions['addtoshopping'] 227 | del actions['visit'] 228 | return actions 229 | 230 | # 用户关注前访问天数 231 | def user_feat_2_3(start_date, end_date): 232 | actions = get_actions(start_date, end_date)[['user_id', 'type', 'time']] 233 | actions['time'] = actions['time'].map(lambda x: x.split(' ')[0]) 234 | # actions=actions.drop_duplicates(['user_id','time'],keep='first') 235 | visit = actions[actions['type'] == 1] 236 | visit = visit.drop_duplicates(['user_id', 'time'], keep='first') 237 | del visit['time'] 238 | del actions['time'] 239 | visit = visit.groupby('user_id', as_index=False).count() 240 | visit.columns = ['user_id', 'visit'] 241 | guanzhu = actions[actions['type'] == 5] 242 | 
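# "guanzhu" = follow/favorite. Assumed action-type encoding used throughout
# this file: 1 browse, 2 add-to-cart, 3 cart-delete, 4 purchase, 5 follow,
# 6 click.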
guanzhu = guanzhu.groupby('user_id', as_index=False).count() 243 | guanzhu.columns = ['user_id', 'guanzhu'] 244 | actions = pd.merge(visit, guanzhu, on='user_id', how='left') 245 | actions['visit_day_before_guanzhu'] = actions['visit'] / actions['guanzhu'] 246 | del actions['guanzhu'] 247 | del actions['visit'] 248 | return actions 249 | 250 | # 用户购买前加入购物车天数 251 | def user_feat_2_4(start_date, end_date): 252 | actions = get_actions(start_date, end_date)[['user_id', 'type', 'time']] 253 | actions['time'] = actions['time'].map(lambda x: x.split(' ')[0]) 254 | # actions=actions.drop_duplicates(['user_id','time'],keep='first') 255 | addtoshopping = actions[actions['type'] == 2] 256 | addtoshopping = addtoshopping.drop_duplicates(['user_id', 'time'], keep='first') 257 | del addtoshopping['time'] 258 | del actions['time'] 259 | addtoshopping = addtoshopping.groupby('user_id', as_index=False).count() 260 | addtoshopping.columns = ['user_id', 'addtoshopping'] 261 | buy = actions[actions['type'] == 4] 262 | buy = buy.groupby('user_id', as_index=False).count() 263 | buy.columns = ['user_id', 'buy'] 264 | actions = pd.merge(addtoshopping, buy, on='user_id', how='left') 265 | actions['addtoshopping_day_before_buy'] = actions['addtoshopping'] / actions['buy'] 266 | del actions['buy'] 267 | del actions['addtoshopping'] 268 | return actions 269 | 270 | # 用户购买前关注天数 271 | def user_feat_2_5(start_date, end_date): 272 | actions = get_actions(start_date, end_date)[['user_id', 'type', 'time']] 273 | actions['time'] = actions['time'].map(lambda x: x.split(' ')[0]) 274 | guanzhu = actions[actions['type'] == 5] 275 | guanzhu = guanzhu.drop_duplicates(['user_id', 'time'], keep='first') 276 | del guanzhu['time'] 277 | del actions['time'] 278 | guanzhu = guanzhu.groupby('user_id', as_index=False).count() 279 | guanzhu.columns = ['user_id', 'guanzhu'] 280 | buy = actions[actions['type'] == 4] 281 | buy = buy.groupby('user_id', as_index=False).count() 282 | buy.columns = ['user_id', 'buy'] 283 | actions = pd.merge(guanzhu, buy, on='user_id', how='left') 284 | actions['guanzhu_day_before_buy'] = actions['guanzhu'] / actions['buy'] 285 | del actions['buy'] 286 | del actions['guanzhu'] 287 | return actions 288 | 289 | actions = pd.merge(user_feat_2_1(start_date, end_date), user_feat_2_2(start_date, end_date), on='user_id', 290 | how='outer') 291 | actions = pd.merge(actions, user_feat_2_3(start_date, end_date), on='user_id', how='outer') 292 | actions = pd.merge(actions, user_feat_2_4(start_date, end_date), on='user_id', how='outer') 293 | actions = pd.merge(actions, user_feat_2_5(start_date, end_date), on='user_id', how='outer') 294 | user_id = actions['user_id'] 295 | del actions['user_id'] 296 | actions = actions.fillna(0) 297 | min_max_scale = preprocessing.MinMaxScaler() 298 | actions = min_max_scale.fit_transform(actions.values) 299 | actions = pd.concat([user_id, pd.DataFrame(actions)], axis=1) 300 | actions.to_csv(dump_path, index=False) 301 | actions.columns = ['user_id'] + ['u_feat2_' + str(i) for i in range(1, actions.shape[1])] 302 | return actions 303 | 304 | 305 | 306 | 307 | # # 用户总购买品牌数 308 | # def get_action_user_feat5(start_date, end_date): 309 | # dump_path = './cache/user_feat5_%s_%s.csv' % (start_date, end_date) 310 | # if os.path.exists(dump_path): 311 | # actions = pd.read_csv(dump_path) 312 | # else: 313 | # actions = get_actions(start_date, end_date)[['user_id', 'sku_id']] 314 | # actions = actions.drop_duplicates(['user_id', 'sku_id'], keep='first') 315 | # actions = actions.groupby('user_id', 
as_index=False).count() 316 | # actions.columns = ['user_id', 'sku_num'] 317 | # actions['sku_num'] = actions['sku_num'].astype('float') 318 | # actions['sku_num'] = actions['sku_num'].map( 319 | # lambda x: (x - actions['sku_num'].min()) / (actions['sku_num'].max() - actions['sku_num'].min())) 320 | # actions.to_csv(dump_path, index=False) 321 | # actions.columns = ['user_id'] + ['u_feat5_' + str(i) for i in range(1, actions.shape[1])] 322 | # return actions 323 | 324 | 325 | # 用户平均访问间隔 326 | def get_action_user_feat6(start_date, end_date): 327 | dump_path = './cache/user_feat6_%s_%s.csv' % (start_date, end_date) 328 | if os.path.exists(dump_path): 329 | actions = pd.read_csv(dump_path) 330 | else: 331 | 332 | df = get_actions(start_date, end_date)[['user_id', 'time']] 333 | # df['user_id']=df['user_id'].astype('int') 334 | df['time'] = df['time'].map(lambda x: x.split(' ')[0]) 335 | df = df.drop_duplicates(['user_id', 'time'], keep='first') 336 | df['time'] = df['time'].map(lambda x: datetime.strptime(x, '%Y-%m-%d')) 337 | actions = df.groupby('user_id', as_index=False).agg(lambda x: x['time'].diff().mean()) 338 | actions['avg_visit'] = actions['time'].dt.days 339 | del actions['time'] 340 | actions.to_csv(dump_path, index=False) 341 | actions.columns = ['user_id'] + ['u_feat6_' + str(i) for i in range(1, actions.shape[1])] 342 | return actions 343 | 344 | 345 | # 用户平均六种行为的访问间隔 346 | def get_action_user_feat6_six(start_date, end_date): 347 | dump_path = './cache/user_feat6_six_%s_%s.csv' % (start_date, end_date) 348 | if os.path.exists(dump_path): 349 | actions = pd.read_csv(dump_path) 350 | else: 351 | df = get_actions(start_date, end_date)[['user_id', 'time', 'type']] 352 | df['time'] = df['time'].map(lambda x: (-1) * get_day_chaju(x, start_date)) 353 | df = df.drop_duplicates(['user_id', 'time', 'type'], keep='first') 354 | actions = df.groupby(['user_id', 'type']).agg(lambda x: np.diff(x).mean()) 355 | actions = actions.unstack() 356 | actions.columns = list(range(actions.shape[1])) 357 | actions = actions.reset_index() 358 | actions.to_csv(dump_path, index=False) 359 | actions.columns = ['user_id'] + ['u_feat6_six_' + str(i) for i in range(1, actions.shape[1])] 360 | return actions 361 | 362 | 363 | # 用户购买频率 364 | def get_action_user_feat7(start_date, end_date): 365 | dump_path = './cache/user_feat7_six_%s_%s.csv' % (start_date, end_date) 366 | if os.path.exists(dump_path): 367 | actions = pd.read_csv(dump_path) 368 | else: 369 | df = get_actions(start_date, end_date)[['user_id', 'type', 'time']] 370 | actions = df.groupby(['user_id', 'type'], as_index=False).count() 371 | 372 | time_min = df.groupby(['user_id', 'type'], as_index=False).min() 373 | time_max = df.groupby(['user_id', 'type'], as_index=False).max() 374 | 375 | time_cha = pd.merge(time_max, time_min, on=['user_id', 'type'], how='left') 376 | time_cha['time_x'] = time_cha['time_x'].map(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S')) 377 | time_cha['time_y'] = time_cha['time_y'].map(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S')) 378 | 379 | time_cha['cha_hour'] = 1 + (time_cha['time_x'] - time_cha['time_y']).dt.days * 24 + (time_cha['time_x'] - 380 | time_cha[ 381 | 'time_y']).dt.seconds // 3600 382 | del time_cha['time_x'] 383 | del time_cha['time_y'] 384 | # time_cha=time_cha.fillna(1) 385 | 386 | actions = pd.merge(time_cha, actions, on=['user_id', 'type'], how="left") 387 | actions = actions.groupby(['user_id', 'type']).sum() 388 | actions['cnt/time'] = actions['time'] / actions["cha_hour"] 389 | actions = 
actions.unstack() 390 | actions.columns = list(range(actions.shape[1])) 391 | actions = actions.reset_index() 392 | actions = actions.fillna(0) 393 | actions.to_csv(dump_path, index=False) 394 | actions.columns = ['user_id'] + ['u_feat7_' + str(i) for i in range(1, actions.shape[1])] 395 | return actions 396 | 397 | 398 | def user_top_k_0_1(start_date, end_date): 399 | actions = get_actions(start_date, end_date) 400 | actions = actions[['user_id', 'sku_id', 'type']] 401 | df = pd.get_dummies(actions['type'], prefix='%s-%s-action' % (start_date, end_date)) 402 | actions = pd.concat([actions, df], axis=1) # type: pd.DataFrame 403 | actions = actions.groupby('user_id', as_index=False).sum() 404 | del actions['type'] 405 | del actions['sku_id'] 406 | user_id = actions['user_id'] 407 | del actions['user_id'] 408 | actions = actions.applymap(lambda x: 1 if x > 0 else 0) 409 | actions = pd.concat([user_id, actions], axis=1) 410 | return actions 411 | 412 | 413 | # 用户最近K天行为0/1提取 414 | def get_action_user_feat8(start_date, end_date): 415 | dump_path = './cache/user_feat8_%s_%s.csv' % (start_date, end_date) 416 | if os.path.exists(dump_path): 417 | actions = pd.read_csv(dump_path) 418 | else: 419 | actions = None 420 | for i in (1, 2, 3, 4, 5, 6, 7, 15, 30): 421 | print(i) 422 | start_days = datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=i) 423 | start_days = start_days.strftime('%Y-%m-%d') 424 | if actions is None: 425 | actions = user_top_k_0_1(start_days, end_date) 426 | else: 427 | actions = pd.merge(actions, user_top_k_0_1(start_days, end_date), how='outer', on='user_id') 428 | actions.to_csv(dump_path, index=False) 429 | actions.columns = ['user_id'] + ['u_feat8_' + str(i) for i in range(1, actions.shape[1])] 430 | return actions 431 | 432 | 433 | # 获取用户的重复购买率 434 | def get_action_user_feat8_2(start_date, end_date): 435 | dump_path = './cache/product_feat8_2_%s_%s.csv' % (start_date, end_date) 436 | if os.path.exists(dump_path): 437 | actions = pd.read_csv(dump_path) 438 | else: 439 | df = get_actions(start_date, end_date)[['user_id', 'sku_id', 'type']] 440 | df = df[df['type'] == 4] # 购买的行为 441 | df = df.groupby(['user_id', 'sku_id'], as_index=False).count() 442 | df.columns = ['user_id', 'sku_id', 'count1'] 443 | df['count1'] = df['count1'].map(lambda x: 1 if x > 1 else 0) 444 | grouped = df.groupby(['user_id'], as_index=False) 445 | actions = grouped.count()[['user_id', 'count1']] 446 | actions.columns = ['user_id', 'count'] 447 | re_count = grouped.sum()[['user_id', 'count1']] 448 | re_count.columns = ['user_id', 're_count'] 449 | actions = pd.merge(actions, re_count, on='user_id', how='left') 450 | re_buy_rate = actions['re_count'] / actions['count'] 451 | actions = pd.concat([actions['user_id'], re_buy_rate], axis=1) 452 | actions.columns = ['user_id', 're_buy_rate'] 453 | actions.to_csv(dump_path, index=False) 454 | actions.columns = ['user_id'] + ['u_feat8_2_' + str(i) for i in range(1, actions.shape[1])] 455 | return actions 456 | 457 | 458 | # 获取最近一次行为的时间距离当前时间的差距 459 | def get_action_user_feat9(start_date, end_date): 460 | dump_path = './cache/user_feat9_%s_%s.csv' % (start_date, end_date) 461 | if os.path.exists(dump_path): 462 | actions = pd.read_csv(dump_path) 463 | else: 464 | df = get_actions(start_date, end_date)[['user_id', 'time', 'type']] 465 | # df['time'] = df['time'].map(lambda x: (-1)*get_day_chaju(x,start_date)) 466 | df = df.drop_duplicates(['user_id', 'type'], keep='last') 467 | df['time'] = df['time'].map(lambda x: get_day_chaju(x, end_date) + 1) 468 | 
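# The action log is time-sorted, so drop_duplicates(keep='last') above keeps
# each user's most recent event per type; after unstacking, every column is
# "days since the last action of that type", with 30 filled in for types the
# user never performed.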
actions = df.groupby(['user_id', 'type']).sum() 469 | actions = actions.unstack() 470 | actions.columns = list(range(actions.shape[1])) 471 | actions = actions.reset_index() 472 | actions = actions.fillna(30) 473 | actions.to_csv(dump_path, index=False) 474 | actions.columns = ['user_id'] + ['u_feat9_' + str(i) for i in range(1, actions.shape[1])] 475 | return actions 476 | 477 | 478 | # 获取最后一次行为的次数并且进行归一化 479 | def get_action_user_feat10(start_date, end_date): 480 | dump_path = './cache/user_feat10_%s_%s.csv' % (start_date, end_date) 481 | if os.path.exists(dump_path): 482 | actions = pd.read_csv(dump_path) 483 | else: 484 | 485 | df = get_actions(start_date, end_date)[['user_id', 'time', 'type']] 486 | df['time'] = df['time'].map(lambda x: get_day_chaju(x, end_date) + 1) 487 | 488 | idx = df.groupby(['user_id', 'type'])['time'].transform(min) 489 | idx1 = idx == df['time'] 490 | actions = df[idx1].groupby(["user_id", "type"]).count() 491 | actions = actions.unstack() 492 | actions.columns = list(range(actions.shape[1])) 493 | actions = actions.fillna(0) 494 | actions = actions.reset_index() 495 | 496 | user_sku = actions[['user_id']] 497 | del actions['user_id'] 498 | min_max_scaler = preprocessing.MinMaxScaler() 499 | actions = min_max_scaler.fit_transform(actions.values) 500 | actions = pd.DataFrame(actions) 501 | actions = pd.concat([user_sku, actions], axis=1) 502 | 503 | actions.to_csv(dump_path, index=False) 504 | actions.columns = ['user_id'] + ['u_feat10_' + str(i) for i in range(1, actions.shape[1])] 505 | return actions 506 | 507 | 508 | # 获取人物该层级最后一层的各种行为的统计数量 509 | def get_action_user_feat11(start_date, end_date, n): 510 | dump_path = './cache/user_feat11_%s_%s_%s.csv' % (start_date, end_date, n) 511 | if os.path.exists(dump_path): 512 | actions = pd.read_csv(dump_path) 513 | else: 514 | 515 | df = get_actions(start_date, end_date)[['user_id', 'time', 'type']] 516 | df['time'] = df['time'].map(lambda x: get_day_chaju(x, end_date) // n) 517 | df = df[df['time'] == 0] 518 | del df['time'] 519 | temp = pd.get_dummies(df['type'], prefix='type') 520 | del df['type'] 521 | actions = pd.concat([df, temp], axis=1) 522 | actions = actions.groupby(['user_id'], as_index=False).sum() 523 | user_sku = actions[['user_id']] 524 | del actions['user_id'] 525 | min_max_scaler = preprocessing.MinMaxScaler() 526 | actions = min_max_scaler.fit_transform(actions.values) 527 | actions = pd.DataFrame(actions) 528 | actions = pd.concat([user_sku, actions], axis=1) 529 | actions.to_csv(dump_path, index=False) 530 | actions.columns = ['user_id'] + ['u_feat11_' + str(n) + '_' + str(i) for i in range(1, actions.shape[1])] 531 | return actions 532 | 533 | 534 | def get_action_user_feat12(start_date, end_date): 535 | dump_path = './cache/user_feat12_%s_%s.csv' % (start_date, end_date) 536 | if os.path.exists(dump_path): 537 | actions = pd.read_csv(dump_path) 538 | else: 539 | actions = get_actions(start_date, end_date)[['user_id', 'time', 'type']] 540 | actions['time'] = actions['time'].map(lambda x: x.split(' ')[0]) 541 | actions = actions.drop_duplicates(['user_id', 'time', 'type'], keep='first') 542 | actions['day'] = actions['time'].map( 543 | lambda x: (datetime.strptime(end_date, '%Y-%m-%d') - datetime.strptime(x, '%Y-%m-%d')).days) 544 | result = None 545 | for i in (2, 3, 7, 14, 28): # 层级个数 546 | print ('i%s' % i) 547 | actions['level%s' % i] = actions['day'].map(lambda x: x // i) 548 | a=set(actions['level%s' % i].tolist()) 549 | for j in (1, 2,3,4, 5, 6): # type 550 | print ('j%s' % j) 551 | df = 
actions[actions['type'] == j][['user_id', 'level%s' % i, 'time']] 552 | df = df.groupby(['user_id', 'level%s' % i]).count() 553 | df = df.unstack() 554 | b=df.columns.levels[1].tolist() 555 | df.columns = ['u_feat12_' + str('level%s_' % i) + str(j) + '_' + str(k) for k in df.columns.levels[1].tolist()] 556 | if len(list(a-set(b)))!=0: 557 | c=list(a-set(b)) 558 | for k in c: 559 | df['u_feat12_'+str('level%s_' % i)+str(j)+'_'+ str(k)]=0 560 | columns=df.columns 561 | dict={} 562 | for column in columns: 563 | k=int(column.split('_')[-1]) 564 | dict[column]=k 565 | columns=sorted(dict.items(),key=lambda x: x[1]) 566 | columns=[(columns[t])[0] for t in range(len(columns))] 567 | df=df[columns] 568 | df = df.reset_index() 569 | if result is None: 570 | result = df 571 | else: 572 | result = pd.merge(result, df, on='user_id', how='left') 573 | columns = result.columns 574 | user_id = result['user_id'] 575 | del result['user_id'] 576 | actions = result.fillna(0) 577 | 578 | min_max_scaler = preprocessing.MinMaxScaler() 579 | actions = min_max_scaler.fit_transform(actions.values) 580 | actions = pd.DataFrame(actions) 581 | actions = pd.concat([user_id, actions], axis=1) 582 | actions.columns=columns 583 | actions.to_csv(dump_path, index=False) 584 | return actions 585 | 586 | 587 | 588 | # 层级的天数 589 | def get_action_user_feat13(start_date, end_date, n): 590 | dump_path = './cache/user_feat13_%s_%s_%s.csv' % (start_date, end_date, n) 591 | if os.path.exists(dump_path): 592 | actions = pd.read_csv(dump_path) 593 | else: 594 | df = get_actions(start_date, end_date)[['user_id', 'time', 'type']] 595 | df['time'] = df['time'].map(lambda x: get_day_chaju(x, end_date) // n) 596 | df = df.drop_duplicates(['user_id', 'type', 'time'], keep='first') 597 | actions = df.groupby(['user_id', 'type']).count() 598 | actions = actions.unstack() 599 | actions.columns = list(range(actions.shape[1])) 600 | actions = actions.fillna(0) 601 | actions = actions.reset_index() 602 | user_sku = actions[['user_id']] 603 | del actions['user_id'] 604 | min_max_scaler = preprocessing.MinMaxScaler() 605 | actions = min_max_scaler.fit_transform(actions.values) 606 | actions = pd.DataFrame(actions) 607 | actions = pd.concat([user_sku, actions], axis=1) 608 | actions.to_csv(dump_path, index=False) 609 | actions.columns = ['user_id'] + ['u_feat13_' + str(n) + '_' + str(i) for i in range(1, actions.shape[1])] 610 | return actions 611 | 612 | 613 | def get_action_user_feat14(start_date, end_date): 614 | dump_path = './cache/user_feat14_%s_%s.csv' % (start_date, end_date) 615 | if os.path.exists(dump_path): 616 | actions = pd.read_csv(dump_path) 617 | else: 618 | n = 5 619 | df = get_actions(start_date, end_date)[['user_id', 'time', 'type']] 620 | df = df[df['type'] == 4][['user_id', 'time']] 621 | df['time'] = df['time'].map(lambda x: get_day_chaju(x, end_date) // n) 622 | days = np.max(df['time']) 623 | 624 | df['cnt'] = 0 625 | actions = df.groupby(['user_id', 'time']).count() 626 | 627 | actions = actions.unstack() 628 | 629 | actions.columns = list(range(actions.shape[1])) 630 | actions = actions.reset_index() 631 | 632 | actions = actions.fillna(0) 633 | user_sku = actions[['user_id']] 634 | del actions['user_id'] 635 | min_max_scaler = preprocessing.MinMaxScaler() 636 | actions = min_max_scaler.fit_transform(actions.values) 637 | actions = pd.DataFrame(actions) 638 | actions = pd.concat([user_sku, actions], axis=1) 639 | actions.to_csv(dump_path, index=False) 640 | actions.columns = ['user_id'] + ['u_feat14_' + str(i) for i in 
range(1, actions.shape[1])] 641 | return actions 642 | 643 | 644 | # 用户购买/加入购物车/关注前访问次数 645 | def get_action_user_feat15(start_date, end_date): 646 | dump_path = './cache/user_feat15_%s_%s.csv' % (start_date, end_date) 647 | if os.path.exists(dump_path): 648 | actions = pd.read_csv(dump_path) 649 | else: 650 | # 用户购买前访问次数 651 | def user_feat_15_1(start_date, end_date): 652 | actions = get_actions(start_date, end_date)[['user_id', 'type']] 653 | visit = actions[actions['type'] == 1] 654 | visit = visit.groupby('user_id', as_index=False).count() 655 | visit.columns = ['user_id', 'visit'] 656 | buy = actions[actions['type'] == 4] 657 | buy = buy.groupby('user_id', as_index=False).count() 658 | buy.columns = ['user_id', 'buy'] 659 | actions = pd.merge(visit, buy, on='user_id', how='left') 660 | actions['visit_num_before_buy'] = actions['visit'] / actions['buy'] 661 | del actions['buy'] 662 | del actions['visit'] 663 | return actions 664 | 665 | # 用户加入购物车前访问次数 666 | def user_feat_15_2(start_date, end_date): 667 | actions = get_actions(start_date, end_date)[['user_id', 'type']] 668 | visit = actions[actions['type'] == 1] 669 | visit = visit.groupby('user_id', as_index=False).count() 670 | visit.columns = ['user_id', 'visit'] 671 | addtoshopping = actions[actions['type'] == 2] 672 | addtoshopping = addtoshopping.groupby('user_id', as_index=False).count() 673 | addtoshopping.columns = ['user_id', 'addtoshopping'] 674 | actions = pd.merge(visit, addtoshopping, on='user_id', how='left') 675 | actions['visit_num_before_addtoshopping'] = actions['visit'] / actions['addtoshopping'] 676 | del actions['addtoshopping'] 677 | del actions['visit'] 678 | return actions 679 | 680 | # 用户关注前访问次数 681 | def user_feat_15_3(start_date, end_date): 682 | actions = get_actions(start_date, end_date)[['user_id', 'type']] 683 | visit = actions[actions['type'] == 1] 684 | visit = visit.groupby('user_id', as_index=False).count() 685 | visit.columns = ['user_id', 'visit'] 686 | guanzhu = actions[actions['type'] == 5] 687 | guanzhu = guanzhu.groupby('user_id', as_index=False).count() 688 | guanzhu.columns = ['user_id', 'guanzhu'] 689 | actions = pd.merge(visit, guanzhu, on='user_id', how='left') 690 | actions['visit_num_before_guanzhu'] = actions['visit'] / actions['guanzhu'] 691 | del actions['guanzhu'] 692 | del actions['visit'] 693 | return actions 694 | 695 | # 用户购买前加入购物车次数 696 | def user_feat_15_4(start_date, end_date): 697 | actions = get_actions(start_date, end_date)[['user_id', 'type']] 698 | addtoshopping = actions[actions['type'] == 2] 699 | addtoshopping = addtoshopping.groupby('user_id', as_index=False).count() 700 | addtoshopping.columns = ['user_id', 'addtoshopping'] 701 | buy = actions[actions['type'] == 4] 702 | buy = buy.groupby('user_id', as_index=False).count() 703 | buy.columns = ['user_id', 'buy'] 704 | actions = pd.merge(addtoshopping, buy, on='user_id', how='left') 705 | actions['addtoshopping_num_before_buy'] = actions['addtoshopping'] / actions['buy'] 706 | del actions['buy'] 707 | del actions['addtoshopping'] 708 | return actions 709 | 710 | # 用户购买前关注次数 711 | def user_feat_15_5(start_date, end_date): 712 | actions = get_actions(start_date, end_date)[['user_id', 'type']] 713 | guanzhu = actions[actions['type'] == 5] 714 | guanzhu = guanzhu.groupby('user_id', as_index=False).count() 715 | guanzhu.columns = ['user_id', 'guanzhu'] 716 | buy = actions[actions['type'] == 4] 717 | buy = buy.groupby('user_id', as_index=False).count() 718 | buy.columns = ['user_id', 'buy'] 719 | actions = pd.merge(guanzhu, 
buy, on='user_id', how='left')
720 |             actions['guanzhu_num_before_buy'] = actions['guanzhu'] / actions['buy']
721 |             del actions['buy']
722 |             del actions['guanzhu']
723 |             return actions
724 | 
725 |         actions = pd.merge(user_feat_15_1(start_date, end_date), user_feat_15_2(start_date, end_date), on='user_id',
726 |                            how='outer')
727 |         actions = pd.merge(actions, user_feat_15_3(start_date, end_date), on='user_id', how='outer')
728 |         actions = pd.merge(actions, user_feat_15_4(start_date, end_date), on='user_id', how='outer')
729 |         actions = pd.merge(actions, user_feat_15_5(start_date, end_date), on='user_id', how='outer')
730 |         user_id = actions['user_id']
731 |         del actions['user_id']
732 |         actions = actions.fillna(0)
733 |         min_max_scale = preprocessing.MinMaxScaler()
734 |         actions = min_max_scale.fit_transform(actions.values)
735 |         actions = pd.concat([user_id, pd.DataFrame(actions)], axis=1)
736 | 
737 |         actions.to_csv(dump_path, index=False)
738 |     actions.columns = ['user_id'] + ['u_feat15_' + str(i) for i in range(1, actions.shape[1])]
739 |     return actions
740 | 
741 | 
809 | # Cross features over the user's action types: each type's share of the user's total actions
810 | def get_action_user_feat16(start_date,end_date):
811 |     dump_path = './cache/user_feat16_%s_%s.csv' % (start_date, end_date)
812 |     if os.path.exists(dump_path):
813 |         actions = pd.read_csv(dump_path)
814 |     else:
815 |         actions=get_actions(start_date, end_date)[['user_id', 'type']]
816 |         actions['cnt']=0
817 |         action1 = actions.groupby(['user_id', 'type']).count()
818 |         action1=action1.unstack()
819 |         index_col=list(range(action1.shape[1]))
820 |         action1.columns=index_col
821 |         action1=action1.reset_index()
822 |         action2 = actions.groupby('user_id', as_index=False).count()
823 |         del action2['type']
824 |         action2.columns = ['user_id', 'cnt']
825 |         actions = pd.merge(action1, action2, how='left', on='user_id')
826 |         for i in index_col:
827 |             actions[i] = actions[i] / actions['cnt']
828 |         del actions['cnt']
829 |         actions.to_csv(dump_path,index=False)
830 |     actions.columns = ['user_id'] + ['u_feat16_' + str(i) for i in range(1, actions.shape[1])]
831 |     return actions
832 | 
833 | # Over the last k days: the user's actions on P-set items vs. overall (for k <= 7 the raw subset counts are kept; for k > 7 they are divided by the overall counts)
834 | def get_action_user_feat0509_1_30(start_date,end_date,n):
835 |     dump_path='./cache/user_feat0509_1_30_%s_%s_%s.csv'%(start_date,end_date,n)
836 |     if os.path.exists(dump_path):
837 |         actions = pd.read_csv(dump_path)
838 |     else:
839 | 
840 |         start_days=datetime.strptime(end_date,'%Y-%m-%d')-timedelta(days=n)
841 |         start_days=datetime.strftime(start_days,'%Y-%m-%d')
842 | 
843 |         actions=get_actions(start_days,end_date)[['user_id','sku_id','type']]
844 |         actions_dummy=pd.get_dummies(actions['type'],prefix='actions')
845 |         actions=pd.concat([actions,actions_dummy],axis=1)
846 |         del actions['type']
847 | 
848 |         P = get_basic_product_feat()[['sku_id']]
849 |         P['label']=1
850 |         actions_sub=pd.merge(actions,P,on='sku_id',how='left')
851 |         actions_sub=actions_sub[actions_sub['label']==1]
852 |         del actions_sub['label']
853 | 
854 |         actions_sub=actions_sub.groupby(['user_id'],as_index=False).sum()
855 |         del actions_sub['sku_id']
856 |         actions_all=actions.groupby(['user_id'],as_index=False).sum()
857 |         del actions_all['sku_id']
858 | 
859 |         if n>7:
860 |             actions=pd.merge(actions_all,actions_sub,on=['user_id'],how='left')
861 |             #print actions.head()
862 |             for i in range(1,7):
863 |                 actions['actions_%s'%i]=actions['actions_%s_y'%i]/actions['actions_%s_x'%i]
864 |             #actions=actions[['user_id','actions_1','actions_2','actions_3','actions_4','actions_5','actions_6']]
865 | 
866 |         else:
867 |             actions = pd.merge(actions_all, actions_sub, on=['user_id'], how='left')
868 |         actions.to_csv(dump_path,index=False)
869 |     actions.columns = ['user_id'] + ['u_feat30_' +str(n)+'_'+ str(i) for i in range(1, actions.shape[1])]
870 |     # user_id = actions[['user_id']]
871 |     # del actions['user_id']
872 |     # actions = actions.fillna(0)
873 |     # actions=actions.replace(np.inf,0)
874 |     # # print(actions.head())
875 |     # columns = actions.columns
876 | 
877 |     # min_max_scale = preprocessing.MinMaxScaler()
878 |     # actions=actions.replace(np.inf,0)
879 |     # actions = min_max_scale.fit_transform(actions.values)
880 |     # actions = pd.concat([user_id, pd.DataFrame(actions,columns = columns)], axis=1)
881 | 
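# For n > 7 the actions_%s columns hold the subset/overall ratios computed
# above; for windows of 7 days or less the merged raw counts are returned
# without normalization, as the comment above the function notes.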
return actions 882 | 883 | #用户点击到购买的时间间隔 884 | def get_action_user_feat0515_2_1(start_date,end_date): 885 | dump_path='./cache/get_action_user_feat0515_2_1_%s_%s.csv'%(start_date,end_date) 886 | if os.path.exists(dump_path): 887 | actions = pd.read_csv(dump_path) 888 | else: 889 | actions = get_actions(start_date,end_date) 890 | actions_dianji=actions[actions['type']==6][['user_id','sku_id','time']] 891 | actions_dianji['time_dianji'] = actions_dianji['time'].map(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S')) 892 | actions_dianji = actions_dianji[['user_id', 'sku_id','time_dianji']] 893 | actions_dianji= actions_dianji.drop_duplicates(['user_id', 'sku_id'], keep='first') 894 | 895 | 896 | actions_goumai=actions[actions['type']==4][['user_id','sku_id','time']] 897 | actions_goumai['time_goumai'] = actions_goumai['time'].map(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S')) 898 | actions_goumai = actions_goumai[['user_id', 'sku_id','time_goumai']] 899 | actions_goumai= actions_goumai.drop_duplicates(['user_id', 'sku_id'], keep='last') 900 | 901 | actions = pd.merge(actions_dianji,actions_goumai,on=['user_id','sku_id'],how='inner') 902 | actions['time_jiange']=actions['time_goumai']-actions['time_dianji'] 903 | actions=actions.drop(['sku_id','time_goumai','time_dianji'],axis=1) 904 | actions['time_jiange']=actions['time_jiange'].map(lambda x:x.days*24+x.seconds//3600+1) 905 | 906 | actions_min = actions.groupby('user_id').min().reset_index() 907 | actions_min.columns = ['user_id','time_min'] 908 | # actions_mean = actions.groupby('user_id').mean().reset_index() 909 | # actions_mean.columns = ['user_id','time_mean'] 910 | actions_max = actions.groupby('user_id').max().reset_index() 911 | actions_max.columns = ['user_id','time_max'] 912 | actions=pd.merge(actions_min,actions_max,on='user_id',how='left') 913 | 914 | user_id = actions[['user_id']] 915 | del actions['user_id'] 916 | actions = actions.fillna(0) 917 | columns = actions.columns 918 | min_max_scale = preprocessing.MinMaxScaler() 919 | actions = min_max_scale.fit_transform(actions.values) 920 | actions = pd.concat([user_id, pd.DataFrame(actions,columns = columns)], axis=1) 921 | actions.to_csv(dump_path,index=False) 922 | return actions 923 | 924 | 925 | #用户购买每种cate的数量 926 | def get_action_user_feat0515_2_2(start_date,end_date): 927 | dump_path='./cache/get_action_user_feat0515_2_2_%s_%s.csv'%(start_date,end_date) 928 | if os.path.exists(dump_path): 929 | actions = pd.read_csv(dump_path) 930 | else: 931 | actions = get_actions(start_date,end_date) 932 | actions = get_actions(start_date,end_date)[['user_id','cate']] 933 | cate_col = pd.get_dummies(actions['cate'],prefix='cate') 934 | actions=pd.concat([actions[['user_id']],cate_col],axis=1) 935 | actions= actions.groupby('user_id').sum().reset_index() 936 | 937 | user_id = actions[['user_id']] 938 | del actions['user_id'] 939 | actions = actions.fillna(0) 940 | columns = actions.columns 941 | min_max_scale = preprocessing.MinMaxScaler() 942 | actions = min_max_scale.fit_transform(actions.values) 943 | actions = pd.concat([user_id, pd.DataFrame(actions,columns = columns)], axis=1) 944 | actions.to_csv(dump_path,index=False) 945 | return actions 946 | 947 | 948 | #获取某人某段时间内加入购物车的数量以及关注的数量 949 | def get_action_user_feat0515_2_3(start_date, end_date, n): 950 | dump_path = './cache/get_action_user_feat0515_2_3_%s_%s_%s_1.csv' % (start_date, end_date, n) 951 | if os.path.exists(dump_path): 952 | actions = pd.read_csv(dump_path) 953 | else: 954 | 955 | start_days = 
datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=n) 956 | start_days = datetime.strftime(start_days, '%Y-%m-%d') 957 | 958 | actions = get_actions(start_days,end_date)[['user_id','type','cate']] 959 | actions_gouwuche=actions[actions['type']==2] 960 | actions_gouwuche_1= actions_gouwuche[['user_id','type']] 961 | actions_gouwuche_1= actions_gouwuche_1.groupby('user_id').count().reset_index() 962 | actions_gouwuche_1.columns = ['user_id',str(n)+'gouwuche_add'] 963 | 964 | actions_gouwuche_2= actions_gouwuche[actions_gouwuche['cate']==8][['user_id','type']] 965 | actions_gouwuche_2= actions_gouwuche_2.groupby('user_id').count().reset_index() 966 | actions_gouwuche_2.columns = ['user_id',str(n)+'gouwuche_add_cate_8'] 967 | 968 | actions_guanzhu=actions[actions['type']==5] 969 | actions_guanzhu_1= actions_guanzhu[['user_id','type']] 970 | actions_guanzhu_1= actions_guanzhu_1.groupby('user_id').count().reset_index() 971 | actions_guanzhu_1.columns = ['user_id',str(n)+'guanzhu_add'] 972 | 973 | actions_guanzhu_2= actions_guanzhu[actions_guanzhu['cate']==8][['user_id','type']] 974 | actions_guanzhu_2= actions_guanzhu_2.groupby('user_id').count().reset_index() 975 | actions_guanzhu_2.columns = ['user_id',str(n)+'guanzhu_add_cate_8'] 976 | 977 | actions = pd.merge(actions_gouwuche_1,actions_gouwuche_2,on='user_id',how ='outer') 978 | actions = pd.merge(actions,actions_guanzhu_1,on='user_id',how ='outer') 979 | actions = pd.merge(actions,actions_guanzhu_2,on='user_id',how ='outer') 980 | actions=actions.fillna(0) 981 | 982 | user_id = actions[['user_id']] 983 | del actions['user_id'] 984 | actions = actions.fillna(0) 985 | columns = actions.columns 986 | min_max_scale = preprocessing.MinMaxScaler() 987 | actions = min_max_scale.fit_transform(actions.values) 988 | actions = pd.concat([user_id, pd.DataFrame(actions,columns = columns)], axis=1) 989 | actions.to_csv(dump_path, index=False) 990 | 991 | 992 | return actions 993 | 994 | #top n 中 某人使用了多少天产生了该行为 995 | def get_action_user_feat0515_2_4(start_date, end_date, n): 996 | dump_path = './cache/get_action_user_feat0515_2_4_%s_%s_%s.csv' % (start_date, end_date, n) 997 | if os.path.exists(dump_path): 998 | actions = pd.read_csv(dump_path) 999 | else: 1000 | 1001 | start_days = datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=n) 1002 | start_days = datetime.strftime(start_days, '%Y-%m-%d') 1003 | 1004 | actions = get_actions(start_days,end_date)[['user_id','type','time']] 1005 | actions['time'] = actions['time'].map(lambda x: (datetime.strptime(end_date,'%Y-%m-%d')-datetime.strptime(x, '%Y-%m-%d %H:%M:%S')).days) 1006 | actions=actions.drop_duplicates(['user_id','type','time']) 1007 | actions = actions.groupby(['user_id','type']).count() 1008 | actions.columns = [str(n)+'day_nums'] 1009 | actions=actions.unstack() 1010 | actions=actions.reset_index() 1011 | actions.columns = ['user_id'] + ['get_action_user_feat0515_2_4_' + str(n) + '_' + str(i) for i in range(1, actions.shape[1])] 1012 | actions=actions.fillna(0) 1013 | 1014 | user_id = actions[['user_id']] 1015 | del actions['user_id'] 1016 | actions = actions.fillna(0) 1017 | columns = actions.columns 1018 | min_max_scale = preprocessing.MinMaxScaler() 1019 | actions = min_max_scale.fit_transform(actions.values) 1020 | actions = pd.concat([user_id, pd.DataFrame(actions,columns = columns)], axis=1) 1021 | actions.to_csv(dump_path, index=False) 1022 | return actions 1023 | 1024 | 1025 | # 用户总购买/加购/关注/点击/浏览品牌数 1026 | def get_action_user_feat5(start_date, end_date): 1027 | dump_path = 
'./cache/user_feat5_a_%s_%s.csv' % (start_date, end_date) 1028 | if os.path.exists(dump_path): 1029 | actions = pd.read_csv(dump_path) 1030 | else: 1031 | actions = get_actions(start_date, end_date) 1032 | action=None 1033 | for i in (1,2,4,5,6): 1034 | df=actions[actions['type']==i][['user_id', 'sku_id']] 1035 | df = df.drop_duplicates(['user_id', 'sku_id'], keep='first') 1036 | df = df.groupby('user_id', as_index=False).count() 1037 | df.columns = ['user_id', 'num_%s'%i] 1038 | if i==1: 1039 | action=df 1040 | else: 1041 | action=pd.merge(action,df,on='user_id',how='outer') 1042 | actions=action.fillna(0) 1043 | actions = actions.astype('float') 1044 | user=actions[['user_id']] 1045 | min_max_scaler = preprocessing.MinMaxScaler() 1046 | actions = min_max_scaler.fit_transform(actions.drop(['user_id'],axis=1).values) 1047 | actions = pd.DataFrame(actions) 1048 | actions = pd.concat([user, actions], axis=1) 1049 | actions.to_csv(dump_path, index=False) 1050 | actions.columns = ['user_id'] + ['u_feat5_' + str(i) for i in range(1, actions.shape[1])] 1051 | return actions 1052 | 1053 | #top k 用户总购买/加购/关注/点击/浏览品牌数 1054 | def get_action_u0515_feat5(start_date,end_date,k): 1055 | dump_path = './cache/u0515_feat5_%s_%s_%s.csv' % (start_date, end_date,k) 1056 | if os.path.exists(dump_path): 1057 | actions = pd.read_csv(dump_path) 1058 | else: 1059 | start_days=pd.to_datetime(end_date)-timedelta(days=k) 1060 | start_days=str(start_days).split(' ')[0] 1061 | actions=get_action_user_feat5(start_days, end_date) 1062 | actions.to_csv(dump_path,index=False) 1063 | actions.columns=['user_id']+['u0515_feat5_'+str(k)+'_'+str(i) for i in range(1,actions.shape[1])] 1064 | return actions 1065 | 1066 | 1067 | #最早交互时间 1068 | def get_action_u0524_feat1(start_date,end_date): 1069 | dump_path = './cache/u0524_feat1_%s_%s.csv' % (start_date, end_date,) 1070 | if os.path.exists(dump_path): 1071 | actions = pd.read_csv(dump_path) 1072 | else: 1073 | #全集 1074 | actions=get_actions(start_date,end_date)[['user_id','time']] 1075 | actions=actions.groupby('user_id',as_index=False).first() 1076 | actions['time_diff_early']=pd.to_datetime(end_date)-pd.to_datetime(actions['time']) 1077 | actions['time_diff_early']=actions['time_diff_early'].dt.days*24+actions['time_diff_early'].dt.seconds//3600 1078 | actions=actions[['user_id','time_diff_early']] 1079 | #子集 1080 | sub_actions=sub_get_actions(start_date,end_date)[['user_id','time']] 1081 | sub_actions=sub_actions.groupby('user_id',as_index=False).first() 1082 | sub_actions['sub_time_diff_early']=pd.to_datetime(end_date)-pd.to_datetime(sub_actions['time']) 1083 | sub_actions['sub_time_diff_early']=sub_actions['sub_time_diff_early'].dt.days*24+sub_actions['sub_time_diff_early'].dt.seconds//3600 1084 | sub_actions = sub_actions[['user_id', 'sub_time_diff_early']] 1085 | 1086 | actions=pd.merge(actions,sub_actions,on='user_id',how='left') 1087 | actions=actions.fillna(0) 1088 | min_max_scale = preprocessing.MinMaxScaler() 1089 | action = min_max_scale.fit_transform(actions.drop(['user_id'], axis=1).values) 1090 | actions = pd.concat([actions[['user_id']], pd.DataFrame(action)], axis=1) 1091 | actions.to_csv(dump_path,index=False) 1092 | actions.columns=['user_id']+['u0524_feat1_'+str(i)for i in range(1,actions.shape[1])] 1093 | return actions 1094 | 1095 | #最晚交互时间 1096 | def get_action_u0524_feat2(start_date,end_date): 1097 | dump_path = './cache/u0524_feat2_%s_%s.csv' % (start_date, end_date,) 1098 | if os.path.exists(dump_path): 1099 | actions = pd.read_csv(dump_path) 1100 | 
else: 1101 | # 全集 1102 | actions = get_actions(start_date, end_date)[['user_id', 'time']] 1103 | actions = actions.groupby('user_id', as_index=False).last() 1104 | actions['time_diff_recent'] = pd.to_datetime(end_date) - pd.to_datetime(actions['time']) 1105 | actions['time_diff_recent'] = actions['time_diff_recent'].dt.days * 24 + actions['time_diff_recent'].dt.seconds // 3600 1106 | actions = actions[['user_id', 'time_diff_recent']] 1107 | # 子集 1108 | sub_actions = sub_get_actions(start_date, end_date)[['user_id', 'time']] 1109 | sub_actions = sub_actions.groupby('user_id', as_index=False).last() 1110 | sub_actions['sub_time_diff_recent'] = pd.to_datetime(end_date) - pd.to_datetime(sub_actions['time']) 1111 | sub_actions['sub_time_diff_recent'] = sub_actions['sub_time_diff_recent'].dt.days * 24 + sub_actions['sub_time_diff_recent'].dt.seconds // 3600 1112 | sub_actions = sub_actions[['user_id', 'sub_time_diff_recent']] 1113 | 1114 | actions = pd.merge(actions, sub_actions, on='user_id', how='left') 1115 | actions=actions.fillna(0) 1116 | min_max_scale = preprocessing.MinMaxScaler() 1117 | action = min_max_scale.fit_transform(actions.drop(['user_id'], axis=1).values) 1118 | actions = pd.concat([actions[['user_id']], pd.DataFrame(action)], axis=1) 1119 | actions.to_csv(dump_path,index=False) 1120 | actions.columns = ['user_id'] + ['u0524_feat2_' + str(i) for i in range(1, actions.shape[1])] 1121 | return actions 1122 | 1123 | 1124 | #活跃天数 1125 | def get_action_u0524_feat3(start_date,end_date): 1126 | dump_path = './cache/u0524_feat3_%s_%s.csv' % (start_date, end_date,) 1127 | if os.path.exists(dump_path): 1128 | actions = pd.read_csv(dump_path) 1129 | else: 1130 | #全集 1131 | actions=get_actions(start_date,end_date) 1132 | actions['time']=pd.to_datetime(actions['time']).dt.date 1133 | actions=actions.drop_duplicates(['user_id','time'])[['user_id','time']] 1134 | actions=actions.groupby('user_id',as_index=False).count() 1135 | #子集 1136 | sub_actions=sub_get_actions(start_date,end_date) 1137 | sub_actions['time']=pd.to_datetime(sub_actions['time']).dt.date 1138 | sub_actions=sub_actions.drop_duplicates(['user_id','time'])[['user_id','time']] 1139 | sub_actions=sub_actions.groupby('user_id',as_index=False).count() 1140 | actions=pd.merge(actions,sub_actions,on='user_id',how='left') 1141 | actions=actions.fillna(0) 1142 | min_max_scale = preprocessing.MinMaxScaler() 1143 | action = min_max_scale.fit_transform(actions.drop(['user_id'], axis=1).values) 1144 | actions = pd.concat([actions[['user_id']], pd.DataFrame(action)], axis=1) 1145 | actions.to_csv(dump_path,index=False) 1146 | actions.columns=['user_id']+['u0524_feat3_'+str(i) for i in range(1,actions.shape[1])] 1147 | return actions 1148 | 1149 | 1150 | #点击模块 1151 | def get_action_user_feat0509_1_31(start_date,end_date,n): 1152 | dump_path='./cache/user_feat0509_1_31_%s_%s_%s.csv'%(start_date,end_date,n) 1153 | if os.path.exists(dump_path): 1154 | actions = pd.read_csv(dump_path) 1155 | else: 1156 | start_days=datetime.strptime(end_date,'%Y-%m-%d')-timedelta(days=n) 1157 | start_days=datetime.strftime(start_days,'%Y-%m-%d') 1158 | actions=get_actions(start_days,end_date) 1159 | actions=actions[actions['type']==6][['user_id','model_id']] 1160 | 1161 | # actions = actions.drop('type',axis=1) 1162 | 1163 | actions_click_sum=actions[['user_id','model_id']].groupby('user_id').count().reset_index() 1164 | actions_click_sum.columns = ['user_id',str(n)+'click_sum_all'] 1165 | actions[str(n)+'u_click14_history'] = actions['model_id'].map(lambda x: 
int(x == 14)) 1166 | actions[str(n)+'u_click21_history'] = actions['model_id'].map(lambda x: int(x == 21)) 1167 | actions[str(n)+'u_click28_history'] = actions['model_id'].map(lambda x: int(x == 28)) 1168 | actions[str(n)+'u_click110_history'] = actions['model_id'].map(lambda x: int(x == 110)) 1169 | actions[str(n)+'u_click210_history'] = actions['model_id'].map(lambda x: int(x == 210)) 1170 | actions = actions.groupby('user_id').sum().reset_index().drop('model_id', axis=1) 1171 | # actions.to_csv(dump_path,index=False) 1172 | actions = pd.merge(actions,actions_click_sum,how='left',on='user_id') 1173 | 1174 | actions[str(n)+'u_click14/click_sum_history'] = actions[str(n)+'u_click14_history']/actions[str(n)+'click_sum_all'] 1175 | actions[str(n)+'u_click21/click_sum_history'] = actions[str(n)+'u_click21_history']/actions[str(n)+'click_sum_all'] 1176 | actions[str(n)+'u_click28/click_sum_history'] = actions[str(n)+'u_click28_history']/actions[str(n)+'click_sum_all'] 1177 | actions[str(n)+'u_click110/click_sum_history'] = actions[str(n)+'u_click110_history']/actions[str(n)+'click_sum_all'] 1178 | actions[str(n)+'u_click210/click_sum_history'] = actions[str(n)+'u_click210_history']/actions[str(n)+'click_sum_all'] 1179 | 1180 | user_id = actions[['user_id']] 1181 | del actions['user_id'] 1182 | actions = actions.fillna(0) 1183 | columns = actions.columns 1184 | min_max_scale = preprocessing.MinMaxScaler() 1185 | actions = min_max_scale.fit_transform(actions.values) 1186 | actions = pd.concat([user_id, pd.DataFrame(actions,columns = columns)], axis=1) 1187 | actions.to_csv(dump_path,index=False) 1188 | return actions 1189 | #u模型cate=8的购买者和不是cate=8的购买者 1190 | def get_action_u0513_feat16(start_date,end_date): 1191 | dump_path = './cache/u0513_feat16_%s_%s.csv' % (start_date, end_date) 1192 | if os.path.exists(dump_path): 1193 | actions = pd.read_csv(dump_path) 1194 | else: 1195 | df = get_actions(start_date, end_date)[['user_id', 'type', 'cate']] 1196 | df = df[df['type'] == 4] 1197 | df = df.groupby(['user_id', 'cate']).count() 1198 | df = df.unstack().reset_index() 1199 | df.columns = ['user_id'] + ['cate' + str(i) for i in range(4, 12)] 1200 | df = df.fillna(0) 1201 | sum1 = df.drop(['user_id', 'cate8'], axis=1).apply(sum, axis=1) 1202 | sum2 = df.drop(['user_id'], axis=1).apply(sum, axis=1) 1203 | actions = pd.concat([df[['user_id', 'cate8']], sum1, sum2], axis=1) 1204 | actions.columns = ['user_id', 'cate8', 'sum_other_cate', 'sum'] 1205 | actions['cate8_rate'] = actions['cate8'] / actions['sum'] 1206 | actions['sum_other_cate_rate'] = actions['sum_other_cate'] / actions['sum'] 1207 | del actions['sum'] 1208 | actions.to_csv(dump_path,index=False) 1209 | return actions 1210 | 1211 | #get_action_u0513_feat16('2016-02-01','2016-04-16') 1212 | # 用户层级特征 1213 | def get_action_user_feat_six_xingwei(start_date, end_date, n): 1214 | dump_path = './cache/user_six_action_%s_%s_%s_int.csv' % (start_date, end_date, n) 1215 | if os.path.exists(dump_path): 1216 | actions = pd.read_csv(dump_path) 1217 | print("user_zlzl" + str(n)) 1218 | 1219 | else: 1220 | actions = get_actions(start_date, end_date) 1221 | actions['time'] = actions['time'].map(lambda x: get_day_chaju(x, end_date) // n) 1222 | num_day = np.max(actions['time']) 1223 | df = None 1224 | print(num_day) 1225 | for i in range(min(num_day + 1, 6)): 1226 | in_temp = pd.get_dummies(actions['type'], prefix="user_action_time_" + str(i)) 1227 | temp = actions[actions['time'] == i] 1228 | temp = pd.concat([temp['user_id'], in_temp], axis=1) 1229 | 1230 
| feature = ['user_id'] 1231 | for j in range(1, 7, 1): 1232 | feature.append('user_action_time_' + str(i) + '_' + str(j)) 1233 | 1234 | temp = temp.groupby(['user_id'], as_index=False).sum() 1235 | temp.columns = feature 1236 | if df is None: 1237 | df = temp 1238 | else: 1239 | df = pd.merge(df, temp, how='outer', on='user_id') 1240 | df.columns = ['user_id'] + ['get_action_user_feat_six_xingwei_' + str(n) + '_' + str(i) for i in range(1, df.shape[1])] 1241 | df.to_csv(dump_path, index=False) 1242 | actions=df 1243 | 1244 | # user_id = actions[['user_id']] 1245 | # del actions['user_id'] 1246 | # actions = actions.fillna(0) 1247 | # actions=actions.replace(np.inf,0) 1248 | # # print(actions.head()) 1249 | # columns = actions.columns 1250 | 1251 | # min_max_scale = preprocessing.MinMaxScaler() 1252 | # actions=actions.replace(np.inf,0) 1253 | # actions = min_max_scale.fit_transform(actions.values) 1254 | # actions = pd.concat([user_id, pd.DataFrame(actions,columns = columns)], axis=1) 1255 | actions.columns = ['user_id'] + ['get_action_user_feat_six_xingwei_' + str(n) + '_' + str(i) for i in range(1, actions.shape[1])] 1256 | return actions 1257 | 1258 | 1259 | def deal_user_six_deal(start_date, end_date, n): 1260 | dump_path = './cache/deal_user_six_action_%s_%s_%s_int.csv' % (start_date, end_date, n) 1261 | if os.path.exists(dump_path): 1262 | actions = pd.read_csv(dump_path) 1263 | actions.columns = ['user_id'] + ['u_featsix_deal_' + str(n) + '_' + str(i) for i in range(1, actions.shape[1])] 1264 | return actions 1265 | else: 1266 | temp = get_action_user_feat_six_xingwei(start_date, end_date, n) # 修改 1267 | time1 = datetime.now() 1268 | columns = ["user_id"] 1269 | all_col = temp.shape[1] - 1 1270 | temp.columns = columns + list(range(all_col)) 1271 | temp = temp.fillna(0) 1272 | columns = ['user_id'] 1273 | for j in range(0, 6, 1): 1274 | temp["zl_" + str(j)] = 0 1275 | columns.append("zl_" + str(j)) 1276 | for k in range(j, all_col, 6): 1277 | temp["zl_" + str(j)] = temp["zl_" + str(j)] + temp[k].map(lambda x: x * ((k // 6 + 1) ** (-0.67))) 1278 | temp["zl_" + str(j)] = temp["zl_" + str(j)].map(lambda x: (x - np.min(temp["zl_" + str(j)])) / ( 1279 | np.max(temp["zl_" + str(j)]) - np.min(temp["zl_" + str(j)]))) 1280 | temp = temp[columns] 1281 | temp.to_csv(dump_path, index=False) 1282 | return temp 1283 | 1284 | # # get user sku 1285 | # def get_user(start_date, end_date): 1286 | # dump_path = './cache/user_sku_%s_%s.csv' % (start_date, end_date) 1287 | # if os.path.exists(dump_path): 1288 | # actions = pd.read_csv(dump_path) 1289 | # else: 1290 | # actions = get_actions(start_date, end_date) 1291 | # actions = actions[(actions['type'] == 2) | (actions['type'] == 5) | (actions['type'] == 4)] 1292 | # actions=actions[actions['cate']==8] 1293 | # actions = actions[['user_id']] 1294 | # actions = actions.drop_duplicates(['user_id'], keep='first') 1295 | # actions.to_csv(dump_path, index=False) 1296 | # return actions 1297 | 1298 | 1299 | #用户购买前的行为 1300 | def get_action_u0509_feat_28(start_date, end_date,k): 1301 | dump_path = './cache/u0509_feat_28_%s_%s_%s.csv' % (start_date, end_date,k) 1302 | if os.path.exists(dump_path): 1303 | actions = pd.read_csv(dump_path) 1304 | else: 1305 | actions = get_actions(start_date, end_date) 1306 | actions = actions[actions['type'] == 4] 1307 | actions['time_buy'] = actions['time'].map(lambda x: datetime.strptime(x.split(' ')[0], '%Y-%m-%d')) 1308 | actions = actions[['user_id', 'sku_id', 'time_buy']].reset_index(drop=True) 1309 | 
actions['before_time_buy'] = actions['time_buy'] - timedelta(days=k)
1310 | 
1311 |         df = get_actions('2016-02-01','2016-04-16')[['user_id', 'sku_id', 'time', 'type']]
1312 |         df['time'] = df['time'].map(lambda x: datetime.strptime(x.split(' ')[0], '%Y-%m-%d'))
1313 |         df = pd.merge(df, actions, on=['user_id', 'sku_id'], how='left')
1314 |         df = df.dropna(axis=0, how='any')
1315 |         df['before_days'] = (df['time'] - df['before_time_buy']).dt.days
1316 |         df['days'] = (df['time'] - df['time_buy']).dt.days
1317 |         df = df[(df['before_days'] >= 0) & (df['days'] < 0)]
1318 |         df_dummy = pd.get_dummies(df['type'], prefix='type')
1319 | 
1320 |         df = pd.concat([df, df_dummy], axis=1)[
1321 |             ['user_id', 'sku_id', 'type_1', 'type_2', 'type_3', 'type_4', 'type_5', 'type_6']]
1322 | 
1323 |         df = df.groupby(['user_id', 'sku_id'], as_index=False).sum()
1324 |         del df['sku_id']
1325 |         df = df.groupby('user_id', as_index=False).agg(['min', 'max', 'mean'])
1326 |         df = df.reset_index()
1327 |         df.columns = ['user_id'] + ['u0509_feat28_' + str(k) + '_' + i for i in (
1328 |             'type_1_min', 'type_1_max', 'type_1_mean', 'type_2_min', 'type_2_max', 'type_2_mean',
1329 |             'type_3_min', 'type_3_max', 'type_3_mean', 'type_4_min', 'type_4_max', 'type_4_mean',
1330 |             'type_5_min', 'type_5_max', 'type_5_mean', 'type_6_min', 'type_6_max', 'type_6_mean')]
1331 |         min_max_scaler = preprocessing.MinMaxScaler()
1332 |         actions = min_max_scaler.fit_transform(df.drop('user_id', axis=1).values)
1333 |         actions = pd.DataFrame(actions)
1334 |         actions = pd.concat([df[['user_id']], actions], axis=1)
1335 |         actions.columns = ['user_id']+['u0509_feat_28_'+str(i) for i in range(1,actions.shape[1])]
1336 |         actions.to_csv(dump_path,index=False)
1337 |     actions.columns = ['user_id']+['u0509_feat_28_'+str(k)+"_"+str(i) for i in range(1,actions.shape[1])]
1338 |     return actions
1339 | 
1340 | # number of distinct cate=8 brands the user viewed, and viewed cate=8 brands / all viewed brands
1341 | def get_action_u0509_feat_29(start_date,end_date):
1342 |     dump_path = './cache/u0509_feat_29_%s_%s.csv' % (start_date, end_date)
1343 |     if os.path.exists(dump_path):
1344 |         actions = pd.read_csv(dump_path)
1345 |     else:
1346 |         actions=get_actions(start_date,end_date)
1347 |         df1=actions[actions['cate']==8].drop_duplicates(['user_id','brand'])[['user_id','brand']]
1348 |         df1=df1.groupby(['user_id'],as_index=False).count()
1349 |         df1.columns=['user_id','brand_cate=8']
1350 |         df2=actions.drop_duplicates(['user_id','brand'])[['user_id','brand']]
1351 |         df2 = df2.groupby(['user_id'], as_index=False).count()
1352 |         df2.columns=['user_id','brand_cate_all']
1353 |         df=pd.merge(df1,df2,on='user_id',how='right')
1354 |         df['rate']=df['brand_cate=8']/df['brand_cate_all']
1355 |         # print df
1356 |         actions=df.fillna(0)
1357 |         actions.to_csv(dump_path,index=False)
1358 |     actions.columns=['user_id']+['u0509_feat_29'+str(i) for i in range(1,actions.shape[1])]
1359 |     return actions
1360 | 
1361 | def get_action_u0521_feat_31(start_date,end_date,k):
1362 |     dump_path = './cache/u0509_feat_31_%s_%s_%s.csv' % (start_date, end_date,k)
1363 |     if os.path.exists(dump_path):
1364 |         actions = pd.read_csv(dump_path)
1365 |     else:
1366 |         start_days=pd.to_datetime(end_date)-timedelta(days=k)
1367 |         start_days=datetime.strftime(start_days,'%Y-%m-%d')  # format the window start as a calendar date
1368 |         actions=get_actions(start_days,end_date)
1369 |         df1=actions[actions['cate']==8].drop_duplicates(['user_id','cate'])[['user_id','cate']]
1370 |         df1=df1.groupby('user_id',as_index=False).count()
1371 |         df1.columns=['user_id','cate8']
1372 | 
df2=actions.drop_duplicates(['user_id','cate'])[['user_id','cate']] 1373 | df2=df2.groupby('user_id',as_index=False).count() 1374 | actions=pd.merge(df1,df2,on='user_id',how='right') 1375 | actions['cate8/cate']=actions['cate8']/actions['cate'] 1376 | actions=actions.fillna(0) 1377 | min_max_scaler = preprocessing.MinMaxScaler() 1378 | df = min_max_scaler.fit_transform(actions[['cate8','cate']].values) 1379 | df = pd.DataFrame(df) 1380 | actions = pd.concat([actions[['user_id','cate8/cate']], df], axis=1) 1381 | actions.to_csv(dump_path,index=False) 1382 | actions.columns=['user_id']+['u0509_feat_31_'+str(k)+'_'+str(i)for i in range(1,actions.shape[1])] 1383 | return actions 1384 | 1385 | 1386 | def get_action_u0521_feat_32(start_date,end_date): 1387 | dump_path = './cache/u0509_feat_32_%s_%s.csv' % (start_date, end_date) 1388 | if os.path.exists(dump_path): 1389 | actions = pd.read_csv(dump_path) 1390 | else: 1391 | actions=get_actions(start_date,end_date) 1392 | actions=actions[actions['cate']==8][['user_id','brand']] 1393 | df1=actions.drop_duplicates(['user_id','brand']).groupby('user_id',as_index=False).count() 1394 | df1.columns=['user_id','brand_num'] 1395 | df2=actions.groupby('user_id',as_index=False).count() 1396 | actions=pd.merge(df1,df2,on='user_id',how='left') 1397 | actions['brand_num/brand']=actions['brand']/actions['brand_num'] 1398 | actions=actions.fillna(0) 1399 | min_max_scaler = preprocessing.MinMaxScaler() 1400 | df = min_max_scaler.fit_transform(actions.drop(['user_id'],axis=1).values) 1401 | df = pd.DataFrame(df) 1402 | actions = pd.concat([actions[['user_id']], df], axis=1) 1403 | actions.to_csv(dump_path, index=False) 1404 | actions.columns = ['user_id'] + ['u0509_feat_32_' + str(i) for i in range(1, actions.shape[1])] 1405 | return actions 1406 | 1407 | def get_action_user_feat7_0522_huachuang(start_date, end_date,n): 1408 | dump_path = './cache/user_feat7_six_%s_%s_%s_0522.csv' % (start_date, end_date,n) 1409 | if os.path.exists(dump_path): 1410 | actions = pd.read_csv(dump_path) 1411 | else: 1412 | start_days = datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=n) 1413 | start_days = datetime.strftime(start_days, '%Y-%m-%d') 1414 | 1415 | df = get_actions(start_days, end_date)[['user_id', 'type', 'time']] 1416 | actions = df.groupby(['user_id', 'type'], as_index=False).count() 1417 | 1418 | time_min = df.groupby(['user_id', 'type'], as_index=False).min() 1419 | time_max = df.groupby(['user_id', 'type'], as_index=False).max() 1420 | 1421 | time_cha = pd.merge(time_max, time_min, on=['user_id', 'type'], how='left') 1422 | time_cha['time_x'] = time_cha['time_x'].map(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S')) 1423 | time_cha['time_y'] = time_cha['time_y'].map(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S')) 1424 | 1425 | time_cha['cha_hour'] = 1 + (time_cha['time_x'] - time_cha['time_y']).dt.days * 24 + (time_cha['time_x'] - 1426 | time_cha[ 1427 | 'time_y']).dt.seconds // 3600 1428 | del time_cha['time_x'] 1429 | del time_cha['time_y'] 1430 | # time_cha=time_cha.fillna(1) 1431 | 1432 | actions = pd.merge(time_cha, actions, on=['user_id', 'type'], how="left") 1433 | actions = actions.groupby(['user_id', 'type']).sum() 1434 | actions['cnt/time'] = actions['time'] / actions["cha_hour"] 1435 | actions = actions.unstack() 1436 | actions.columns = list(range(actions.shape[1])) 1437 | actions = actions.reset_index() 1438 | actions = actions.fillna(0) 1439 | actions.to_csv(dump_path, index=False) 1440 | actions.columns = ['user_id'] + ['u_feat7_' 
+str(n)+"_"+ str(i) for i in range(1, actions.shape[1])] 1441 | return actions 1442 | 1443 | def get_user_labels(test_start_date,test_end_date): 1444 | dump_path = './cache/user_labels_%s_%s_11.csv' % (test_start_date, test_end_date) 1445 | if os.path.exists(dump_path): 1446 | actions = pd.read_csv(dump_path) 1447 | else: 1448 | actions = get_actions(test_start_date, test_end_date) 1449 | actions = actions[actions['cate']==8] 1450 | actions = actions[actions['type'] == 4].drop_duplicates(['user_id'])[['user_id']] 1451 | actions['label'] = 1 1452 | 1453 | return actions 1454 | 1455 | print("U model 1 finish part_0") 1456 | 1457 | ######################################################################################################### 1458 | 1459 | 1460 | # In[ ]: 1461 | 1462 | 1463 | 1464 | 1465 | # In[ ]: 1466 | 1467 | 1468 | 1469 | 1470 | # In[ ]: 1471 | 1472 | 1473 | 1474 | 1475 | # In[ ]: 1476 | 1477 | 1478 | 1479 | 1480 | # In[ ]: 1481 | 1482 | 1483 | 1484 | 1485 | # In[ ]: 1486 | 1487 | 1488 | 1489 | 1490 | # In[ ]: 1491 | 1492 | 1493 | 1494 | 1495 | # In[ ]: 1496 | 1497 | 1498 | 1499 | 1500 | # In[ ]: 1501 | 1502 | 1503 | 1504 | 1505 | # In[ ]: 1506 | 1507 | 1508 | 1509 | 1510 | # In[2]: 1511 | 1512 | import os 1513 | from datetime import datetime 1514 | from datetime import timedelta 1515 | 1516 | # -*- coding: utf-8 -*- 1517 | """ 1518 | Created on Sun May 14 10:27:41 2017 1519 | @author: 老虎趴趴走 1520 | """ 1521 | import pandas as pd 1522 | import numpy as np 1523 | # import datetime 1524 | import math 1525 | 1526 | def user_features(user, ful_action, sub_action, end_date): 1527 | dump_path='./cache/user_features_%s_0514_2.csv'%(end_date) 1528 | if os.path.exists(dump_path): 1529 | actions = pd.read_csv(dump_path) 1530 | 1531 | else: 1532 | end_date=pd.to_datetime(end_date) 1533 | day = timedelta(1, 0) 1534 | print('=====> 提取特征...') 1535 | sub_1 = sub_action[(sub_action['time']>=end_date-1*day) & (sub_action['time']=end_date-3*day) & (sub_action['time']=end_date-5*day) & (sub_action['time']=end_date-30*day) & (sub_action['time']=end_date-5*day) & (ful_action['time']=end_date-30*day) & (ful_action['time'] 完成!') 1763 | actions.to_csv(dump_path,index=False) 1764 | 1765 | # user_id = actions[['user_id']] 1766 | # del actions['user_id'] 1767 | # actions = actions.fillna(0) 1768 | # actions=actions.replace(np.inf,0) 1769 | # print(actions.head()) 1770 | # columns = actions.columns 1771 | 1772 | # min_max_scale = preprocessing.MinMaxScaler() 1773 | # actions=actions.replace(np.inf,0) 1774 | # actions = min_max_scale.fit_transform(actions.values) 1775 | # actions = pd.concat([user_id, pd.DataFrame(actions,columns = columns)], axis=1) 1776 | return actions 1777 | 1778 | import pandas as pd 1779 | ful_action = pd.read_csv('./data/JData_Action.csv', parse_dates=[2], infer_datetime_format=True) 1780 | sub_action = pd.read_csv('./data/JData_subset_action.csv', parse_dates=[2, 7], infer_datetime_format=True) 1781 | user = pd.read_csv('./data/JData_modified_user.csv', parse_dates=[4]) 1782 | # user_features(user,ful_action,sel_action,'2016-04-11') 1783 | print("U model 1 finish part_1") 1784 | ###################################################################################### 1785 | 1786 | 1787 | # In[ ]: 1788 | 1789 | 1790 | 1791 | 1792 | # In[6]: 1793 | 1794 | # 测试集 1795 | # ful_action = pd.read_csv('./data/JData_Action.csv', parse_dates=[2], infer_datetime_format=True) 1796 | # sel_action = pd.read_csv('./data/JData_subset_action.csv', parse_dates=[2, 7], infer_datetime_format=True) 1797 | 
def make_test_set(train_start_date, train_end_date,user,ful_action,sub_action): 1798 | dump_path = './cache/bu0525model_1_u_test_set_%s_%s.csv' % (train_start_date, train_end_date) 1799 | if os.path.exists(dump_path): 1800 | actions = pd.read_csv(dump_path) 1801 | else: 1802 | start_days=str(pd.to_datetime(train_end_date)-timedelta(days=30)).split(' ')[0] 1803 | actions_1 = get_actions(start_days, train_end_date) 1804 | actions=actions_1[actions_1['cate']==8][['user_id']].drop_duplicates(['user_id']) 1805 | 1806 | 1807 | 1808 | print (actions.shape) 1809 | 1810 | start_days = "2016-02-01" 1811 | # actions = pd.merge(actions,get_basic_user_feat() , how='left', on='user_id') 1812 | # print(actions.shape) 1813 | # 1814 | 1815 | # actions = pd.merge(actions, get_action_user_feat1(start_days, train_end_date), how='left', on='user_id') 1816 | # print(actions.shape) 1817 | actions = pd.merge(actions, get_action_user_feat2(start_days, train_end_date), how='left', on='user_id') 1818 | print(actions.shape) 1819 | actions = pd.merge(actions, get_action_user_feat5(start_days, train_end_date), how='left', on='user_id') 1820 | print(actions.shape) 1821 | actions = pd.merge(actions, get_action_user_feat6(start_days, train_end_date), how='left', on='user_id') 1822 | print(actions.shape) 1823 | actions = pd.merge(actions, get_action_user_feat6_six(start_days, train_end_date), how='left', on='user_id') 1824 | print(actions.shape) 1825 | actions = pd.merge(actions, get_action_user_feat7(start_days, train_end_date), how='left', on='user_id') 1826 | print(actions.shape) 1827 | actions = pd.merge(actions, get_action_user_feat8(start_days, train_end_date), how='left', on='user_id') 1828 | print (actions.shape) 1829 | actions = pd.merge(actions, get_action_user_feat8_2(start_days, train_end_date), how='left', on='user_id') 1830 | print (actions.shape) 1831 | actions = pd.merge(actions, get_action_user_feat9(start_days, train_end_date), how='left', on='user_id') 1832 | print (actions.shape) 1833 | actions = pd.merge(actions, get_action_user_feat10(start_days, train_end_date), how='left', on='user_id') 1834 | print (actions.shape) 1835 | actions = pd.merge(actions, get_action_user_feat12(train_start_date, train_end_date), how='left', on='user_id') 1836 | print (actions.shape) 1837 | actions = pd.merge(actions, get_action_user_feat14(train_start_date, train_end_date), how='left', on='user_id') 1838 | print (actions.shape) 1839 | actions = pd.merge(actions, get_action_user_feat15(start_days, train_end_date), how='left', on='user_id') 1840 | print (actions.shape) 1841 | actions = pd.merge(actions, get_action_user_feat16(start_days, train_end_date), how='left', on='user_id') 1842 | print (actions.shape) 1843 | actions = pd.merge(actions, get_action_u0513_feat16(start_days, train_end_date), how='left', on='user_id') 1844 | print (actions.shape) 1845 | actions = pd.merge(actions, user_features(user,ful_action,sub_action,train_end_date), how='left', on='user_id') 1846 | print (actions.shape) 1847 | actions = pd.merge(actions, get_action_user_feat0515_2_1(train_start_date, train_end_date), how='left', on='user_id') 1848 | print (actions.shape) 1849 | actions = pd.merge(actions, get_action_user_feat0515_2_2(train_start_date, train_end_date), how='left', on='user_id') 1850 | print (actions.shape) 1851 | 1852 | #模型1 和 模型二 1853 | actions = pd.merge(actions, get_action_u0509_feat_29(train_start_date, train_end_date), how='left', on='user_id') 1854 | print (actions.shape) 1855 | #模型 二 1856 | # actions = pd.merge(actions, 
get_action_u0521_feat_32(train_start_date, train_end_date), how='left', on='user_id') 1857 | 1858 | 1859 | # actions = pd.merge(actions, get_action_u0524_feat1(start_days, train_end_date), how='left', on='user_id') 1860 | # print (actions.shape) 1861 | 1862 | # actions = pd.merge(actions, get_action_u0524_feat2(start_days, train_end_date), how='left', on='user_id') 1863 | # print (actions.shape) 1864 | # actions = pd.merge(actions, get_action_u0524_feat3(start_days, train_end_date), how='left', on='user_id') 1865 | # print (actions.shape) 1866 | 1867 | for i in (1, 2, 3, 7, 14, 28): 1868 | actions = pd.merge(actions, get_action_user_feat_six_xingwei(train_start_date, train_end_date, i), how='left',on='user_id') 1869 | actions = pd.merge(actions, deal_user_six_deal(train_start_date, train_end_date, i), how='left',on='user_id') 1870 | actions = pd.merge(actions, get_action_user_feat11(train_start_date, train_end_date, i), how='left',on='user_id') 1871 | actions = pd.merge(actions, get_action_user_feat13(train_start_date, train_end_date, i), how='left',on='user_id') 1872 | actions = pd.merge(actions, get_action_user_feat0509_1_30(train_start_date, train_end_date, i), how='left',on='user_id') 1873 | actions = pd.merge(actions, get_action_user_feat0515_2_3(train_start_date, train_end_date, i), how='left',on='user_id') 1874 | actions = pd.merge(actions, get_action_feat(train_start_date, train_end_date,i), how='left', on='user_id') 1875 | actions = pd.merge(actions, get_action_user_feat0515_2_4(train_start_date, train_end_date,i), how='left', on='user_id') 1876 | actions = pd.merge(actions, get_action_u0515_feat5(train_start_date, train_end_date,i), how='left', on='user_id') 1877 | #模型1 和 模型二 1878 | actions = pd.merge(actions, get_action_u0509_feat_28(train_start_date, train_end_date,i), how='left', on='user_id') 1879 | if(i<=10): 1880 | actions = pd.merge(actions,get_action_user_feat0509_1_31(train_start_date, train_end_date,i), how='left', on='user_id') 1881 | #模型 二 1882 | # actions = pd.merge(actions, get_action_u0521_feat_31(train_start_date, train_end_date,i), how='left', on='user_id') 1883 | # actions = pd.merge(actions, get_action_user_feat7_0522_huachuang(train_start_date, train_end_date,i), how='left', on='user_id') 1884 | print(actions.shape) 1885 | print(actions.shape) 1886 | 1887 | actions = actions.fillna(0) 1888 | # user_id = actions[['user_id']] 1889 | # del actions['user_id'] 1890 | # actions = actions.fillna(0) 1891 | # actions=actions.replace(np.inf,0) 1892 | # # print(actions.head()) 1893 | # columns = actions.columns 1894 | 1895 | # min_max_scale = preprocessing.MinMaxScaler() 1896 | # actions=actions.replace(np.inf,0) 1897 | # actions = min_max_scale.fit_transform(actions.values) 1898 | # actions = pd.concat([user_id, pd.DataFrame(actions,columns = columns)], axis=1) 1899 | # actions.to_csv(dump_path,index=False) 1900 | return actions 1901 | 1902 | 1903 | # 训练集 1904 | def make_train_set(train_start_date, train_end_date, test_start_date, test_end_date,user,ful_action,sub_action): 1905 | dump_path = './cache/bu0525model_1_u_train_set_%s_%s_%s_%s.csv' % (train_start_date, train_end_date, test_start_date, test_end_date) 1906 | if os.path.exists(dump_path): 1907 | actions = pd.read_csv(dump_path) 1908 | else: 1909 | 1910 | start_days=str(pd.to_datetime(train_end_date)-timedelta(days=30)).split(' ')[0] 1911 | actions_1 = get_actions(start_days, train_end_date) 1912 | actions=actions_1[actions_1['cate']==8][['user_id']].drop_duplicates(['user_id']) 1913 | # buy_actions = 
actions_1[(actions_1['type']==4)&(actions_1['cate']==8)][['user_id']].drop_duplicates() 1914 | # actions = actions[actions['user_id'].isin(buy_actions['user_id'])==False] 1915 | 1916 | 1917 | 1918 | # print (actions.shape) 1919 | 1920 | # start_days=str(pd.to_datetime(train_end_date)-timedelta(days=30)).split(' ')[0] 1921 | # actions_1 = get_actions(start_days, train_end_date) 1922 | # actions_1 = actions_1[(actions_1['type']==2)|(actions_1['type']==4)|(actions_1['type']==5)] 1923 | # actions_1=actions_1[actions_1['cate']==8][['user_id']].drop_duplicates(['user_id']) 1924 | # actions = pd.concat([actions,actions_1]).drop_duplicates(['user_id']) 1925 | print (actions.shape) 1926 | # start_days = train_start_date 1927 | start_days = "2016-02-01" 1928 | # actions = pd.merge(actions,get_basic_user_feat() , how='left', on='user_id') 1929 | print(actions.shape) 1930 | 1931 | # actions = pd.merge(actions, get_action_user_feat1(start_days, train_end_date), how='left', on='user_id') 1932 | # print(actions.shape) 1933 | actions = pd.merge(actions, get_action_user_feat2(start_days, train_end_date), how='left', on='user_id') 1934 | print(actions.shape) 1935 | actions = pd.merge(actions, get_action_user_feat5(start_days, train_end_date), how='left', on='user_id') 1936 | print(actions.shape) 1937 | actions = pd.merge(actions, get_action_user_feat6(start_days, train_end_date), how='left', on='user_id') 1938 | print(actions.shape) 1939 | actions = pd.merge(actions, get_action_user_feat6_six(start_days, train_end_date), how='left', on='user_id') 1940 | print(actions.shape) 1941 | actions = pd.merge(actions, get_action_user_feat7(start_days, train_end_date), how='left', on='user_id') 1942 | print(actions.shape) 1943 | actions = pd.merge(actions, get_action_user_feat8(start_days, train_end_date), how='left', on='user_id') 1944 | print (actions.shape) 1945 | actions = pd.merge(actions, get_action_user_feat8_2(start_days, train_end_date), how='left', on='user_id') 1946 | print (actions.shape) 1947 | actions = pd.merge(actions, get_action_user_feat9(start_days, train_end_date), how='left', on='user_id') 1948 | print (actions.shape) 1949 | actions = pd.merge(actions, get_action_user_feat10(start_days, train_end_date), how='left', on='user_id') 1950 | print (actions.shape) 1951 | actions = pd.merge(actions, get_action_user_feat12(train_start_date, train_end_date), how='left', on='user_id') 1952 | print (actions.shape) 1953 | actions = pd.merge(actions, get_action_user_feat14(train_start_date, train_end_date), how='left', on='user_id') 1954 | print (actions.shape) 1955 | actions = pd.merge(actions, get_action_user_feat15(start_days, train_end_date), how='left', on='user_id') 1956 | print (actions.shape) 1957 | actions = pd.merge(actions, get_action_user_feat16(start_days, train_end_date), how='left', on='user_id') 1958 | print (actions.shape) 1959 | actions = pd.merge(actions, get_action_u0513_feat16(start_days, train_end_date), how='left', on='user_id') 1960 | print (actions.shape) 1961 | actions = pd.merge(actions, user_features(user,ful_action,sub_action,train_end_date), how='left', on='user_id') 1962 | print (actions.shape) 1963 | 1964 | actions = pd.merge(actions, get_action_user_feat0515_2_1(train_start_date, train_end_date), how='left', on='user_id') 1965 | print (actions.shape) 1966 | actions = pd.merge(actions, get_action_user_feat0515_2_2(train_start_date, train_end_date), how='left', on='user_id') 1967 | print (actions.shape) 1968 | 1969 | actions = pd.merge(actions, 
get_action_u0509_feat_29(train_start_date, train_end_date), how='left', on='user_id') 1970 | # actions = pd.merge(actions, get_action_u0521_feat_32(train_start_date, train_end_date), how='left', on='user_id') 1971 | 1972 | # actions = pd.merge(actions, get_action_u0524_feat1(start_days, train_end_date), how='left', on='user_id') 1973 | # print (actions.shape) 1974 | 1975 | # actions = pd.merge(actions, get_action_u0524_feat2(start_days, train_end_date), how='left', on='user_id') 1976 | # print (actions.shape) 1977 | # actions = pd.merge(actions, get_action_u0524_feat3(start_days, train_end_date), how='left', on='user_id') 1978 | # print (actions.shape) 1979 | print (actions.shape) 1980 | for i in (1, 2, 3,7, 14, 28): 1981 | actions = pd.merge(actions, get_action_user_feat_six_xingwei(train_start_date, train_end_date, i), how='left',on='user_id') 1982 | actions = pd.merge(actions, deal_user_six_deal(train_start_date, train_end_date, i), how='left',on='user_id') 1983 | actions = pd.merge(actions, get_action_user_feat11(train_start_date, train_end_date, i), how='left',on='user_id') 1984 | actions = pd.merge(actions, get_action_user_feat13(train_start_date, train_end_date, i), how='left',on='user_id') 1985 | actions = pd.merge(actions, get_action_user_feat0509_1_30(train_start_date, train_end_date, i), how='left',on='user_id') 1986 | actions = pd.merge(actions, get_action_user_feat0515_2_3(train_start_date, train_end_date, i), how='left',on='user_id') 1987 | actions = pd.merge(actions, get_action_feat(train_start_date, train_end_date,i), how='left', on='user_id') 1988 | actions = pd.merge(actions, get_action_user_feat0515_2_4(train_start_date, train_end_date,i), how='left', on='user_id') 1989 | actions = pd.merge(actions, get_action_u0515_feat5(train_start_date, train_end_date,i), how='left', on='user_id') 1990 | actions = pd.merge(actions, get_action_u0509_feat_28(train_start_date, train_end_date,i), how='left', on='user_id') 1991 | if(i<=10): 1992 | actions = pd.merge(actions,get_action_user_feat0509_1_31(train_start_date, train_end_date,i), how='left', on='user_id') 1993 | # actions = pd.merge(actions, get_action_u0521_feat_31(train_start_date, train_end_date,i), how='left', on='user_id') 1994 | 1995 | # actions = pd.merge(actions, get_action_user_feat7_0522_huachuang(train_start_date, train_end_date,i), how='left', on='user_id') 1996 | print(actions.shape) 1997 | actions = pd.merge(actions, get_user_labels(test_start_date, test_end_date), how='left', on='user_id') 1998 | 1999 | actions = actions.fillna(0) 2000 | print(actions.shape) 2001 | # user_id = actions[['user_id']] 2002 | # del actions['user_id'] 2003 | # actions = actions.fillna(0) 2004 | # actions=actions.replace(np.inf,0) 2005 | # # print(actions.head()) 2006 | # columns = actions.columns 2007 | 2008 | # min_max_scale = preprocessing.MinMaxScaler() 2009 | # actions=actions.replace(np.inf,0) 2010 | # actions = min_max_scale.fit_transform(actions.values) 2011 | # actions = pd.concat([user_id, pd.DataFrame(actions,columns = columns)], axis=1) 2012 | # actions.to_csv(dump_path,index=False) 2013 | return actions 2014 | 2015 | print("U model 1 finish part_3") 2016 | 2017 | 2018 | 2019 | 2020 | 2021 | 2022 | ########################################################################################### 2023 | 2024 | 2025 | # In[ ]: 2026 | 2027 | 2028 | 2029 | 2030 | # In[ ]: 2031 | 2032 | 2033 | 2034 | 2035 | # In[ ]: 2036 | 2037 | 2038 | 2039 | 2040 | # In[7]: 2041 | 2042 | #!/usr/bin/python 2043 | 2044 | import numpy as np 2045 | import 
xgboost as xgb
2046 | # from user_feat import *
2047 | from sklearn.model_selection import train_test_split
2048 | 
2049 | 
2050 | train_start_date = '2016-03-10'
2051 | train_end_date = '2016-04-11'
2052 | test_start_date = '2016-04-11'
2053 | test_end_date = '2016-04-16'
2054 | 
2055 | # train_start_date='2016-03-05'
2056 | # train_end_date='2016-04-06'
2057 | # test_start_date='2016-04-06'
2058 | # test_end_date='2016-04-11'
2059 | 
2060 | sub_start_date = '2016-03-15'
2061 | sub_end_date = '2016-04-16'
2062 | 
2063 | # training set
2064 | actions = make_train_set(train_start_date, train_end_date, test_start_date, test_end_date,user,ful_action,sub_action)
2065 | # print(np.isinf(actions))
2066 | # print(np.isnan(actions))
2067 | 
2068 | actions
2069 | 
2070 | 
2071 | feature_name = actions.columns.values
2072 | 
2073 | # for index in feature_name[1:-1]:
2074 | # actions["r"+index]=actions[index].rank(method='max')/actions.shape[0]
2075 | 
2076 | print(actions.shape)
2077 | actions_pos = actions[actions['label']==1]
2078 | actions_neg = actions[actions['label']==0]
2079 | 
2080 | 
2081 | print("+++++++++++++++++++++++")
2082 | 
2083 | 
2084 | 
2085 | train,test=train_test_split(actions.values,test_size=0.2,random_state=0)
2086 | train=pd.DataFrame(train,columns=actions.columns)
2087 | test=pd.DataFrame(test,columns=actions.columns)
2088 | 
2089 | X_train=train.drop(['user_id','label'],axis=1)
2090 | X_test=test.drop(['user_id','label'],axis=1)
2091 | y_train=train[['label']]
2092 | y_test=test[['label']]
2093 | train_index=train[['user_id']].copy()
2094 | test_index=test[['user_id']].copy()
2095 | 
2096 | 
2097 | 
2098 | 
2099 | 
2100 | # test set
2101 | sub_test_data = make_test_set(sub_start_date, sub_end_date,user,ful_action,sub_action)
2102 | 
2103 | feature_name = sub_test_data.columns.values
2104 | # for index in feature_name[1:]:
2105 | # sub_test_data["r"+index]=sub_test_data[index].rank(method='max')/sub_test_data.shape[0]
2106 | 
2107 | 
2108 | sub_trainning_data=sub_test_data.drop(['user_id'],axis=1)
2109 | sub_user_index=sub_test_data[['user_id']].copy()
2110 | 
2111 | print("U model 1 finish part_4")
2112 | 
2113 | ########################################################################
2114 | 
2115 | 
2121 | # In[9]:
2122 | 
2123 | print ('==========>>>train xgboost model ....')
2124 | 
2125 | dtrain = xgb.DMatrix(X_train,label=y_train)
2126 | dtest = xgb.DMatrix(X_test,label=y_test)
2127 | param = {'learning_rate' : 0.1,
2128 |          'n_estimators': 1000,
2129 |          'max_depth': 3,
2130 |          'min_child_weight': 5,
2131 |          'gamma': 0,
2132 |          'subsample': 1.0,
2133 |          'colsample_bytree': 0.8,
2134 |          'eta': 0.05,
2135 |          'silent': 1,
2136 |          'objective':
2137 |              'binary:logistic',
2138 |          'scale_pos_weight':1}
2139 | 
2140 | 
2141 | 
2142 | num_round =120
2143 | plst = list(param.items())
2144 | plst += [('eval_metric', 'logloss')]
2145 | 
2146 | evallist = [(dtest, 'eval'), (dtrain, 'train')]
2147 | bst=xgb.train(plst,dtrain,num_round,evallist,early_stopping_rounds=10)
2148 | 
2149 | 
2150 | 
2151 | 
2152 | # ============================================>>>>
2153 | print ('==========>>>predict test data label')
2154 | 
2155 | 
2156 | sub_trainning_data_1 = xgb.DMatrix(sub_trainning_data)
2157 | y = bst.predict(sub_trainning_data_1)
2158 | pred = sub_user_index
2159 | sub_user_index['label'] = y
2160 | 
2161 | # print(sub_user_index.head())
2162 | 
2163 | pred=sub_user_index
2164 | #pred.sort_values(by=['user_id','label'],ascending=[0,0],inplace=True)
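# The block below turns the raw per-user xgboost scores into the Umodel_1
# submission: rows are sorted so that groupby('user_id').first() keeps exactly
# one row per user (the candidate frame should already be unique on user_id,
# so this is a dedup safeguard), and the surviving rows are then ordered by
# predicted probability. The downstream merge step keeps only the top-ranked
# users, so the ordering of Umodel_1.csv matters more than the raw scores.
2165 | 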
pred=pred.sort_values(by=['user_id','label'],ascending=[0,0]) 2166 | pred = pred.groupby('user_id').first().reset_index() 2167 | result=pred.sort_values(by=['label'],ascending=[0]) 2168 | result['user_id']=result['user_id'].astype('int') 2169 | 2170 | 2171 | result.to_csv('./sub/Umodel_1.csv',index=False,index_label=False ) 2172 | 2173 | print("U model 1 finish part_5") 2174 | 2175 | 2176 | # In[ ]: 2177 | 2178 | 2179 | 2180 | -------------------------------------------------------------------------------- /Umodel_2.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | #!/usr/bin/env python 7 | 8 | import time 9 | 10 | from datetime import datetime 11 | from datetime import timedelta 12 | import pandas as pd 13 | import pickle 14 | import os 15 | import math 16 | import numpy as np 17 | from sklearn import preprocessing 18 | import matplotlib.pyplot as plt 19 | 20 | action_1_path = "./data/JData_Action_201602.csv" 21 | action_2_path = "./data/JData_Action_201603.csv" 22 | action_3_path = "./data/JData_Action_201604.csv" 23 | user_path = "./data/JData_User.csv" 24 | product_path = "./data/JData_Product.csv" 25 | 26 | 27 | def convert_age(age_str): 28 | if age_str == u'-1': 29 | return 0 30 | elif age_str == u'15岁以下': 31 | return 1 32 | elif age_str == u'16-25岁': 33 | return 2 34 | elif age_str == u'26-35岁': 35 | return 3 36 | elif age_str == u'36-45岁': 37 | return 4 38 | elif age_str == u'46-55岁': 39 | return 5 40 | elif age_str == u'56岁以上': 41 | return 6 42 | else: 43 | return -1 44 | 45 | 46 | # 用户的基本信息 47 | def get_basic_user_feat(): 48 | dump_path = './cache/basic_user.csv' 49 | if os.path.exists(dump_path): 50 | user = pd.read_csv(dump_path) 51 | else: 52 | user = pd.read_csv(user_path, encoding='gbk') 53 | user['age'] = user['age'].map(convert_age) 54 | age_df = pd.get_dummies(user["age"], prefix="age") 55 | sex_df = pd.get_dummies(user["sex"], prefix="sex") 56 | user_lv_df = pd.get_dummies(user["user_lv_cd"], prefix="user_lv_cd") 57 | user = pd.concat([user['user_id'], age_df, sex_df, user_lv_df], axis=1) 58 | user.to_csv(dump_path, index=False) 59 | return user 60 | 61 | # 商品的基本信息 62 | def get_basic_product_feat(): 63 | dump_path = './cache/basic_product.csv' 64 | if os.path.exists(dump_path): 65 | product = pd.read_csv(dump_path) 66 | else: 67 | product = pd.read_csv(product_path) 68 | attr1_df = pd.get_dummies(product["a1"], prefix="a1") 69 | attr2_df = pd.get_dummies(product["a2"], prefix="a2") 70 | attr3_df = pd.get_dummies(product["a3"], prefix="a3") 71 | product = pd.concat([product[['sku_id', 'cate', 'brand']], attr1_df, attr2_df, attr3_df], axis=1) 72 | product.to_csv(dump_path, index=False) 73 | return product 74 | 75 | def get_actions_1(): 76 | action = pd.read_csv(action_1_path) 77 | return action 78 | 79 | 80 | def get_actions_2(): 81 | action2 = pd.read_csv(action_2_path) 82 | return action2 83 | 84 | 85 | def get_actions_3(): 86 | action3 = pd.read_csv(action_3_path) 87 | return action3 88 | 89 | def sub_get_actions(start_date,end_date): 90 | dump_path = './cache/sub_action_%s_%s.csv' % (start_date, end_date) 91 | if os.path.exists(dump_path): 92 | actions = pd.read_csv(dump_path) 93 | else: 94 | actions=get_actions(start_date,end_date) 95 | actions=actions[actions['cate']==8] 96 | actions.to_csv(dump_path,index=False) 97 | return actions 98 | 99 | # 行为数据 100 | def get_actions(start_date, end_date): 101 | """ 102 | 103 | :param start_date: 104 | :param end_date: 105 | :return: actions: 
pd.Dataframe 106 | """ 107 | dump_path = './cache/all_action_%s_%s.csv' % (start_date, end_date) 108 | if os.path.exists(dump_path): 109 | actions = pd.read_csv(dump_path) 110 | else: 111 | action_1 = get_actions_1() 112 | action_1 = action_1[(action_1.time >= start_date) & (action_1.time < end_date)] 113 | action_2 = get_actions_2() 114 | action_2 = action_2[(action_2.time >= start_date) & (action_2.time < end_date)] 115 | actions = pd.concat([action_1, action_2]) 116 | action_3 = get_actions_3() 117 | action_3 = action_3[(action_3.time >= start_date) & (action_3.time < end_date)] 118 | actions = pd.concat([actions, action_3]) # type: pd.DataFrame 119 | actions = actions[(actions.time >= start_date) & (actions.time < end_date)] 120 | actions.to_csv(dump_path, index=False) 121 | # actions['user_id']=actions['user_id'].astype('int') 122 | return actions 123 | 124 | # 获取两个时间相差几天 125 | def get_day_chaju(x, end_date): 126 | # x=x.split(' ')[0] 127 | x = datetime.strptime(x, '%Y-%m-%d %H:%M:%S') 128 | end_date = datetime.strptime(end_date, '%Y-%m-%d') 129 | return (end_date - x).days 130 | 131 | 132 | 133 | 134 | # # 所有行为的总和 135 | # def get_action_feat(start_date, end_date): 136 | # dump_path = './cache/action_%s_%s.csv' % (start_date, end_date) 137 | # if os.path.exists(dump_path): 138 | # actions = pd.read_csv(dump_path) 139 | # else: 140 | # actions = get_actions(start_date, end_date) 141 | # actions = actions[['user_id', 'sku_id', 'type']] 142 | # df = pd.get_dummies(actions['type'], prefix='action') 143 | # actions = pd.concat([actions, df], axis=1) # type: pd.DataFrame 144 | # actions = actions.groupby(['user_id', 'sku_id'], as_index=False).sum() 145 | # del actions['type'] 146 | # actions.to_csv(dump_path, index=False) 147 | # return actions 148 | # top k 天的行为次数总和(滑窗处理) 149 | 150 | #user_id,u_action_1_1,u_action_1_2,u_action_1_3,u_action_1_4,u_action_1_5,u_action_1_6 151 | def get_action_feat(start_date, end_date,k): 152 | dump_path = './cache/u_action_%s_%s_%s.csv' % (start_date, end_date,k) 153 | if os.path.exists(dump_path): 154 | actions = pd.read_csv(dump_path) 155 | else: 156 | start_days=pd.to_datetime(end_date)-timedelta(days=k) 157 | start_days=str(start_days).split(' ')[0] 158 | actions = get_actions(start_days, end_date) 159 | actions = actions[['user_id', 'type']] 160 | df = pd.get_dummies(actions['type'], prefix='type') 161 | actions = pd.concat([actions, df], axis=1) # type: pd.DataFrame 162 | actions = actions.groupby('user_id', as_index=False).sum() 163 | min_max_scaler = preprocessing.MinMaxScaler() 164 | df = min_max_scaler.fit_transform(actions.drop(['user_id','type'],axis=1).values) 165 | df = pd.DataFrame(df) 166 | df.columns=['u_action_'+str(k)+'_'+str(i) for i in range(1,df.shape[1]+1)] 167 | actions = pd.concat([actions[['user_id']], df], axis=1) 168 | actions.to_csv(dump_path, index=False) 169 | return actions 170 | 171 | 172 | 173 | 174 | 175 | 176 | # 用户的行为转化率 177 | def get_action_user_feat1(start_date, end_date): 178 | feature = ['user_id', 'user_action_1_ratio', 'user_action_2_ratio', 'user_action_3_ratio', 179 | 'user_action_5_ratio', 'user_action_6_ratio'] 180 | dump_path = './cache/user_feat_accumulate_xiugai_%s_%s.csv' % (start_date, end_date) 181 | if os.path.exists(dump_path): 182 | actions = pd.read_csv(dump_path) 183 | else: 184 | actions = get_actions(start_date, end_date) 185 | df = pd.get_dummies(actions['type'], prefix='action') 186 | actions = pd.concat([actions['user_id'], df], axis=1) 187 | actions = actions.groupby(['user_id'], 
as_index=False).sum() 188 | actions['user_action_1_ratio'] = actions['action_4'] / actions['action_1'] 189 | actions['user_action_2_ratio'] = actions['action_4'] / actions['action_2'] 190 | # actions['user_action_3_ratio'] = actions['action_4'] / actions['action_3'] 191 | actions['user_action_3_ratio'] = actions['action_3'] / actions['action_2'] 192 | actions['user_action_5_ratio'] = actions['action_4'] / actions['action_5'] 193 | actions['user_action_6_ratio'] = actions['action_4'] / actions['action_6'] 194 | # 3.购物车删除 195 | actions = actions[feature] 196 | actions.to_csv(dump_path, index=False) 197 | return actions 198 | 199 | 200 | # print get_accumulate_user_feat('2016-03-10','2016-04-11') 201 | # 用户购买前访问天数 202 | # 用户购买/加入购物车/关注前访问天数 203 | def get_action_user_feat2(start_date, end_date): 204 | dump_path = './cache/user_feat2_after_%s_%s.csv' % (start_date, end_date) 205 | if os.path.exists(dump_path): 206 | actions = pd.read_csv(dump_path) 207 | 208 | else: 209 | # 用户购买前访问天数 210 | def user_feat_2_1(start_date, end_date): 211 | actions = get_actions(start_date, end_date)[['user_id', 'type', 'time']] 212 | actions['time'] = actions['time'].map(lambda x: x.split(' ')[0]) 213 | # actions=actions.drop_duplicates(['user_id','time'],keep='first') 214 | visit = actions[actions['type'] == 1] 215 | visit = visit.drop_duplicates(['user_id', 'time'], keep='first') 216 | del visit['time'] 217 | del actions['time'] 218 | visit = visit.groupby('user_id', as_index=False).count() 219 | visit.columns = ['user_id', 'visit'] 220 | buy = actions[actions['type'] == 4] 221 | buy = buy.groupby('user_id', as_index=False).count() 222 | buy.columns = ['user_id', 'buy'] 223 | actions = pd.merge(visit, buy, on='user_id', how='left') 224 | actions['visit_day_before_buy'] = actions['visit'] / actions['buy'] 225 | del actions['buy'] 226 | del actions['visit'] 227 | return actions 228 | 229 | # 用户加入购物车前访问天数 230 | def user_feat_2_2(start_date, end_date): 231 | actions = get_actions(start_date, end_date)[['user_id', 'type', 'time']] 232 | actions['time'] = actions['time'].map(lambda x: x.split(' ')[0]) 233 | # actions=actions.drop_duplicates(['user_id','time'],keep='first') 234 | visit = actions[actions['type'] == 1] 235 | visit = visit.drop_duplicates(['user_id', 'time'], keep='first') 236 | del visit['time'] 237 | del actions['time'] 238 | visit = visit.groupby('user_id', as_index=False).count() 239 | visit.columns = ['user_id', 'visit'] 240 | addtoshopping = actions[actions['type'] == 2] 241 | addtoshopping = addtoshopping.groupby('user_id', as_index=False).count() 242 | addtoshopping.columns = ['user_id', 'addtoshopping'] 243 | actions = pd.merge(visit, addtoshopping, on='user_id', how='left') 244 | actions['visit_day_before_addtoshopping'] = actions['visit'] / actions['addtoshopping'] 245 | del actions['addtoshopping'] 246 | del actions['visit'] 247 | return actions 248 | 249 | # 用户关注前访问天数 250 | def user_feat_2_3(start_date, end_date): 251 | actions = get_actions(start_date, end_date)[['user_id', 'type', 'time']] 252 | actions['time'] = actions['time'].map(lambda x: x.split(' ')[0]) 253 | # actions=actions.drop_duplicates(['user_id','time'],keep='first') 254 | visit = actions[actions['type'] == 1] 255 | visit = visit.drop_duplicates(['user_id', 'time'], keep='first') 256 | del visit['time'] 257 | del actions['time'] 258 | visit = visit.groupby('user_id', as_index=False).count() 259 | visit.columns = ['user_id', 'visit'] 260 | guanzhu = actions[actions['type'] == 5] 261 | guanzhu = guanzhu.groupby('user_id', 
as_index=False).count() 262 | guanzhu.columns = ['user_id', 'guanzhu'] 263 | actions = pd.merge(visit, guanzhu, on='user_id', how='left') 264 | actions['visit_day_before_guanzhu'] = actions['visit'] / actions['guanzhu'] 265 | del actions['guanzhu'] 266 | del actions['visit'] 267 | return actions 268 | 269 | # 用户购买前加入购物车天数 270 | def user_feat_2_4(start_date, end_date): 271 | actions = get_actions(start_date, end_date)[['user_id', 'type', 'time']] 272 | actions['time'] = actions['time'].map(lambda x: x.split(' ')[0]) 273 | # actions=actions.drop_duplicates(['user_id','time'],keep='first') 274 | addtoshopping = actions[actions['type'] == 2] 275 | addtoshopping = addtoshopping.drop_duplicates(['user_id', 'time'], keep='first') 276 | del addtoshopping['time'] 277 | del actions['time'] 278 | addtoshopping = addtoshopping.groupby('user_id', as_index=False).count() 279 | addtoshopping.columns = ['user_id', 'addtoshopping'] 280 | buy = actions[actions['type'] == 4] 281 | buy = buy.groupby('user_id', as_index=False).count() 282 | buy.columns = ['user_id', 'buy'] 283 | actions = pd.merge(addtoshopping, buy, on='user_id', how='left') 284 | actions['addtoshopping_day_before_buy'] = actions['addtoshopping'] / actions['buy'] 285 | del actions['buy'] 286 | del actions['addtoshopping'] 287 | return actions 288 | 289 | # 用户购买前关注天数 290 | def user_feat_2_5(start_date, end_date): 291 | actions = get_actions(start_date, end_date)[['user_id', 'type', 'time']] 292 | actions['time'] = actions['time'].map(lambda x: x.split(' ')[0]) 293 | guanzhu = actions[actions['type'] == 5] 294 | guanzhu = guanzhu.drop_duplicates(['user_id', 'time'], keep='first') 295 | del guanzhu['time'] 296 | del actions['time'] 297 | guanzhu = guanzhu.groupby('user_id', as_index=False).count() 298 | guanzhu.columns = ['user_id', 'guanzhu'] 299 | buy = actions[actions['type'] == 4] 300 | buy = buy.groupby('user_id', as_index=False).count() 301 | buy.columns = ['user_id', 'buy'] 302 | actions = pd.merge(guanzhu, buy, on='user_id', how='left') 303 | actions['guanzhu_day_before_buy'] = actions['guanzhu'] / actions['buy'] 304 | del actions['buy'] 305 | del actions['guanzhu'] 306 | return actions 307 | 308 | actions = pd.merge(user_feat_2_1(start_date, end_date), user_feat_2_2(start_date, end_date), on='user_id', 309 | how='outer') 310 | actions = pd.merge(actions, user_feat_2_3(start_date, end_date), on='user_id', how='outer') 311 | actions = pd.merge(actions, user_feat_2_4(start_date, end_date), on='user_id', how='outer') 312 | actions = pd.merge(actions, user_feat_2_5(start_date, end_date), on='user_id', how='outer') 313 | user_id = actions['user_id'] 314 | del actions['user_id'] 315 | actions = actions.fillna(0) 316 | min_max_scale = preprocessing.MinMaxScaler() 317 | actions = min_max_scale.fit_transform(actions.values) 318 | actions = pd.concat([user_id, pd.DataFrame(actions)], axis=1) 319 | actions.to_csv(dump_path, index=False) 320 | actions.columns = ['user_id'] + ['u_feat2_' + str(i) for i in range(1, actions.shape[1])] 321 | return actions 322 | 323 | 324 | 325 | 326 | # # 用户总购买品牌数 327 | # def get_action_user_feat5(start_date, end_date): 328 | # dump_path = './cache/user_feat5_%s_%s.csv' % (start_date, end_date) 329 | # if os.path.exists(dump_path): 330 | # actions = pd.read_csv(dump_path) 331 | # else: 332 | # actions = get_actions(start_date, end_date)[['user_id', 'sku_id']] 333 | # actions = actions.drop_duplicates(['user_id', 'sku_id'], keep='first') 334 | # actions = actions.groupby('user_id', as_index=False).count() 335 | # 
actions.columns = ['user_id', 'sku_num'] 336 | # actions['sku_num'] = actions['sku_num'].astype('float') 337 | # actions['sku_num'] = actions['sku_num'].map( 338 | # lambda x: (x - actions['sku_num'].min()) / (actions['sku_num'].max() - actions['sku_num'].min())) 339 | # actions.to_csv(dump_path, index=False) 340 | # actions.columns = ['user_id'] + ['u_feat5_' + str(i) for i in range(1, actions.shape[1])] 341 | # return actions 342 | 343 | 344 | # 用户平均访问间隔 345 | def get_action_user_feat6(start_date, end_date): 346 | dump_path = './cache/user_feat6_%s_%s.csv' % (start_date, end_date) 347 | if os.path.exists(dump_path): 348 | actions = pd.read_csv(dump_path) 349 | else: 350 | 351 | df = get_actions(start_date, end_date)[['user_id', 'time']] 352 | # df['user_id']=df['user_id'].astype('int') 353 | df['time'] = df['time'].map(lambda x: x.split(' ')[0]) 354 | df = df.drop_duplicates(['user_id', 'time'], keep='first') 355 | df['time'] = df['time'].map(lambda x: datetime.strptime(x, '%Y-%m-%d')) 356 | actions = df.groupby('user_id', as_index=False).agg(lambda x: x['time'].diff().mean()) 357 | actions['avg_visit'] = actions['time'].dt.days 358 | del actions['time'] 359 | actions.to_csv(dump_path, index=False) 360 | actions.columns = ['user_id'] + ['u_feat6_' + str(i) for i in range(1, actions.shape[1])] 361 | return actions 362 | 363 | 364 | # 用户平均六种行为的访问间隔 365 | def get_action_user_feat6_six(start_date, end_date): 366 | dump_path = './cache/user_feat6_six_%s_%s.csv' % (start_date, end_date) 367 | if os.path.exists(dump_path): 368 | actions = pd.read_csv(dump_path) 369 | else: 370 | df = get_actions(start_date, end_date)[['user_id', 'time', 'type']] 371 | df['time'] = df['time'].map(lambda x: (-1) * get_day_chaju(x, start_date)) 372 | df = df.drop_duplicates(['user_id', 'time', 'type'], keep='first') 373 | actions = df.groupby(['user_id', 'type']).agg(lambda x: np.diff(x).mean()) 374 | actions = actions.unstack() 375 | actions.columns = list(range(actions.shape[1])) 376 | actions = actions.reset_index() 377 | actions.to_csv(dump_path, index=False) 378 | actions.columns = ['user_id'] + ['u_feat6_six_' + str(i) for i in range(1, actions.shape[1])] 379 | return actions 380 | 381 | 382 | # 用户购买频率 383 | def get_action_user_feat7(start_date, end_date): 384 | dump_path = './cache/user_feat7_six_%s_%s.csv' % (start_date, end_date) 385 | if os.path.exists(dump_path): 386 | actions = pd.read_csv(dump_path) 387 | else: 388 | df = get_actions(start_date, end_date)[['user_id', 'type', 'time']] 389 | actions = df.groupby(['user_id', 'type'], as_index=False).count() 390 | 391 | time_min = df.groupby(['user_id', 'type'], as_index=False).min() 392 | time_max = df.groupby(['user_id', 'type'], as_index=False).max() 393 | 394 | time_cha = pd.merge(time_max, time_min, on=['user_id', 'type'], how='left') 395 | time_cha['time_x'] = time_cha['time_x'].map(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S')) 396 | time_cha['time_y'] = time_cha['time_y'].map(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S')) 397 | 398 | time_cha['cha_hour'] = 1 + (time_cha['time_x'] - time_cha['time_y']).dt.days * 24 + (time_cha['time_x'] - 399 | time_cha[ 400 | 'time_y']).dt.seconds // 3600 401 | del time_cha['time_x'] 402 | del time_cha['time_y'] 403 | # time_cha=time_cha.fillna(1) 404 | 405 | actions = pd.merge(time_cha, actions, on=['user_id', 'type'], how="left") 406 | actions = actions.groupby(['user_id', 'type']).sum() 407 | actions['cnt/time'] = actions['time'] / actions["cha_hour"] 408 | actions = actions.unstack() 409 | 
actions.columns = list(range(actions.shape[1])) 410 | actions = actions.reset_index() 411 | actions = actions.fillna(0) 412 | actions.to_csv(dump_path, index=False) 413 | actions.columns = ['user_id'] + ['u_feat7_' + str(i) for i in range(1, actions.shape[1])] 414 | return actions 415 | 416 | 417 | def user_top_k_0_1(start_date, end_date): 418 | actions = get_actions(start_date, end_date) 419 | actions = actions[['user_id', 'sku_id', 'type']] 420 | df = pd.get_dummies(actions['type'], prefix='%s-%s-action' % (start_date, end_date)) 421 | actions = pd.concat([actions, df], axis=1) # type: pd.DataFrame 422 | actions = actions.groupby('user_id', as_index=False).sum() 423 | del actions['type'] 424 | del actions['sku_id'] 425 | user_id = actions['user_id'] 426 | del actions['user_id'] 427 | actions = actions.applymap(lambda x: 1 if x > 0 else 0) 428 | actions = pd.concat([user_id, actions], axis=1) 429 | return actions 430 | 431 | 432 | # 用户最近K天行为0/1提取 433 | def get_action_user_feat8(start_date, end_date): 434 | dump_path = './cache/user_feat8_%s_%s.csv' % (start_date, end_date) 435 | if os.path.exists(dump_path): 436 | actions = pd.read_csv(dump_path) 437 | else: 438 | actions = None 439 | for i in (1, 2, 3, 4, 5, 6, 7, 15, 30): 440 | print(i) 441 | start_days = datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=i) 442 | start_days = start_days.strftime('%Y-%m-%d') 443 | if actions is None: 444 | actions = user_top_k_0_1(start_days, end_date) 445 | else: 446 | actions = pd.merge(actions, user_top_k_0_1(start_days, end_date), how='outer', on='user_id') 447 | actions.to_csv(dump_path, index=False) 448 | actions.columns = ['user_id'] + ['u_feat8_' + str(i) for i in range(1, actions.shape[1])] 449 | return actions 450 | 451 | 452 | # 获取用户的重复购买率 453 | def get_action_user_feat8_2(start_date, end_date): 454 | dump_path = './cache/product_feat8_2_%s_%s.csv' % (start_date, end_date) 455 | if os.path.exists(dump_path): 456 | actions = pd.read_csv(dump_path) 457 | else: 458 | df = get_actions(start_date, end_date)[['user_id', 'sku_id', 'type']] 459 | df = df[df['type'] == 4] # 购买的行为 460 | df = df.groupby(['user_id', 'sku_id'], as_index=False).count() 461 | df.columns = ['user_id', 'sku_id', 'count1'] 462 | df['count1'] = df['count1'].map(lambda x: 1 if x > 1 else 0) 463 | grouped = df.groupby(['user_id'], as_index=False) 464 | actions = grouped.count()[['user_id', 'count1']] 465 | actions.columns = ['user_id', 'count'] 466 | re_count = grouped.sum()[['user_id', 'count1']] 467 | re_count.columns = ['user_id', 're_count'] 468 | actions = pd.merge(actions, re_count, on='user_id', how='left') 469 | re_buy_rate = actions['re_count'] / actions['count'] 470 | actions = pd.concat([actions['user_id'], re_buy_rate], axis=1) 471 | actions.columns = ['user_id', 're_buy_rate'] 472 | actions.to_csv(dump_path, index=False) 473 | actions.columns = ['user_id'] + ['u_feat8_2_' + str(i) for i in range(1, actions.shape[1])] 474 | return actions 475 | 476 | 477 | # 获取最近一次行为的时间距离当前时间的差距 478 | def get_action_user_feat9(start_date, end_date): 479 | dump_path = './cache/user_feat9_%s_%s.csv' % (start_date, end_date) 480 | if os.path.exists(dump_path): 481 | actions = pd.read_csv(dump_path) 482 | else: 483 | df = get_actions(start_date, end_date)[['user_id', 'time', 'type']] 484 | # df['time'] = df['time'].map(lambda x: (-1)*get_day_chaju(x,start_date)) 485 | df = df.drop_duplicates(['user_id', 'type'], keep='last') 486 | df['time'] = df['time'].map(lambda x: get_day_chaju(x, end_date) + 1) 487 | actions = 
df.groupby(['user_id', 'type']).sum() 488 | actions = actions.unstack() 489 | actions.columns = list(range(actions.shape[1])) 490 | actions = actions.reset_index() 491 | actions = actions.fillna(30) 492 | actions.to_csv(dump_path, index=False) 493 | actions.columns = ['user_id'] + ['u_feat9_' + str(i) for i in range(1, actions.shape[1])] 494 | return actions 495 | 496 | 497 | # 获取最后一次行为的次数并且进行归一化 498 | def get_action_user_feat10(start_date, end_date): 499 | dump_path = './cache/user_feat10_%s_%s.csv' % (start_date, end_date) 500 | if os.path.exists(dump_path): 501 | actions = pd.read_csv(dump_path) 502 | else: 503 | 504 | df = get_actions(start_date, end_date)[['user_id', 'time', 'type']] 505 | df['time'] = df['time'].map(lambda x: get_day_chaju(x, end_date) + 1) 506 | 507 | idx = df.groupby(['user_id', 'type'])['time'].transform(min) 508 | idx1 = idx == df['time'] 509 | actions = df[idx1].groupby(["user_id", "type"]).count() 510 | actions = actions.unstack() 511 | actions.columns = list(range(actions.shape[1])) 512 | actions = actions.fillna(0) 513 | actions = actions.reset_index() 514 | 515 | user_sku = actions[['user_id']] 516 | del actions['user_id'] 517 | min_max_scaler = preprocessing.MinMaxScaler() 518 | actions = min_max_scaler.fit_transform(actions.values) 519 | actions = pd.DataFrame(actions) 520 | actions = pd.concat([user_sku, actions], axis=1) 521 | 522 | actions.to_csv(dump_path, index=False) 523 | actions.columns = ['user_id'] + ['u_feat10_' + str(i) for i in range(1, actions.shape[1])] 524 | return actions 525 | 526 | 527 | # 获取人物该层级最后一层的各种行为的统计数量 528 | def get_action_user_feat11(start_date, end_date, n): 529 | dump_path = './cache/user_feat11_%s_%s_%s.csv' % (start_date, end_date, n) 530 | if os.path.exists(dump_path): 531 | actions = pd.read_csv(dump_path) 532 | else: 533 | 534 | df = get_actions(start_date, end_date)[['user_id', 'time', 'type']] 535 | df['time'] = df['time'].map(lambda x: get_day_chaju(x, end_date) // n) 536 | df = df[df['time'] == 0] 537 | del df['time'] 538 | temp = pd.get_dummies(df['type'], prefix='type') 539 | del df['type'] 540 | actions = pd.concat([df, temp], axis=1) 541 | actions = actions.groupby(['user_id'], as_index=False).sum() 542 | user_sku = actions[['user_id']] 543 | del actions['user_id'] 544 | min_max_scaler = preprocessing.MinMaxScaler() 545 | actions = min_max_scaler.fit_transform(actions.values) 546 | actions = pd.DataFrame(actions) 547 | actions = pd.concat([user_sku, actions], axis=1) 548 | actions.to_csv(dump_path, index=False) 549 | actions.columns = ['user_id'] + ['u_feat11_' + str(n) + '_' + str(i) for i in range(1, actions.shape[1])] 550 | return actions 551 | 552 | 553 | def get_action_user_feat12(start_date, end_date): 554 | dump_path = './cache/user_feat12_%s_%s.csv' % (start_date, end_date) 555 | if os.path.exists(dump_path): 556 | actions = pd.read_csv(dump_path) 557 | else: 558 | actions = get_actions(start_date, end_date)[['user_id', 'time', 'type']] 559 | actions['time'] = actions['time'].map(lambda x: x.split(' ')[0]) 560 | actions = actions.drop_duplicates(['user_id', 'time', 'type'], keep='first') 561 | actions['day'] = actions['time'].map( 562 | lambda x: (datetime.strptime(end_date, '%Y-%m-%d') - datetime.strptime(x, '%Y-%m-%d')).days) 563 | result = None 564 | for i in (2, 3, 7, 14, 28): # 层级个数 565 | print ('i%s' % i) 566 | actions['level%s' % i] = actions['day'].map(lambda x: x // i) 567 | a=set(actions['level%s' % i].tolist()) 568 | for j in (1, 2,3,4, 5, 6): # type 569 | print ('j%s' % j) 570 | df = 
actions[actions['type'] == j][['user_id', 'level%s' % i, 'time']] 571 | df = df.groupby(['user_id', 'level%s' % i]).count() 572 | df = df.unstack() 573 | b=df.columns.levels[1].tolist() 574 | df.columns = ['u_feat12_' + str('level%s_' % i) + str(j) + '_' + str(k) for k in df.columns.levels[1].tolist()] 575 | if len(list(a-set(b)))!=0: 576 | c=list(a-set(b)) 577 | for k in c: 578 | df['u_feat12_'+str('level%s_' % i)+str(j)+'_'+ str(k)]=0 579 | columns=df.columns 580 | dict={} 581 | for column in columns: 582 | k=int(column.split('_')[-1]) 583 | dict[column]=k 584 | columns=sorted(dict.items(),key=lambda x: x[1]) 585 | columns=[(columns[t])[0] for t in range(len(columns))] 586 | df=df[columns] 587 | df = df.reset_index() 588 | if result is None: 589 | result = df 590 | else: 591 | result = pd.merge(result, df, on='user_id', how='left') 592 | columns = result.columns 593 | user_id = result['user_id'] 594 | del result['user_id'] 595 | actions = result.fillna(0) 596 | 597 | min_max_scaler = preprocessing.MinMaxScaler() 598 | actions = min_max_scaler.fit_transform(actions.values) 599 | actions = pd.DataFrame(actions) 600 | actions = pd.concat([user_id, actions], axis=1) 601 | actions.columns=columns 602 | actions.to_csv(dump_path, index=False) 603 | return actions 604 | 605 | 606 | 607 | # 层级的天数 608 | def get_action_user_feat13(start_date, end_date, n): 609 | dump_path = './cache/user_feat13_%s_%s_%s.csv' % (start_date, end_date, n) 610 | if os.path.exists(dump_path): 611 | actions = pd.read_csv(dump_path) 612 | else: 613 | df = get_actions(start_date, end_date)[['user_id', 'time', 'type']] 614 | df['time'] = df['time'].map(lambda x: get_day_chaju(x, end_date) // n) 615 | df = df.drop_duplicates(['user_id', 'type', 'time'], keep='first') 616 | actions = df.groupby(['user_id', 'type']).count() 617 | actions = actions.unstack() 618 | actions.columns = list(range(actions.shape[1])) 619 | actions = actions.fillna(0) 620 | actions = actions.reset_index() 621 | user_sku = actions[['user_id']] 622 | del actions['user_id'] 623 | min_max_scaler = preprocessing.MinMaxScaler() 624 | actions = min_max_scaler.fit_transform(actions.values) 625 | actions = pd.DataFrame(actions) 626 | actions = pd.concat([user_sku, actions], axis=1) 627 | actions.to_csv(dump_path, index=False) 628 | actions.columns = ['user_id'] + ['u_feat13_' + str(n) + '_' + str(i) for i in range(1, actions.shape[1])] 629 | return actions 630 | 631 | 632 | def get_action_user_feat14(start_date, end_date): 633 | dump_path = './cache/user_feat14_%s_%s.csv' % (start_date, end_date) 634 | if os.path.exists(dump_path): 635 | actions = pd.read_csv(dump_path) 636 | else: 637 | n = 5 638 | df = get_actions(start_date, end_date)[['user_id', 'time', 'type']] 639 | df = df[df['type'] == 4][['user_id', 'time']] 640 | df['time'] = df['time'].map(lambda x: get_day_chaju(x, end_date) // n) 641 | days = np.max(df['time']) 642 | 643 | df['cnt'] = 0 644 | actions = df.groupby(['user_id', 'time']).count() 645 | 646 | actions = actions.unstack() 647 | 648 | actions.columns = list(range(actions.shape[1])) 649 | actions = actions.reset_index() 650 | 651 | actions = actions.fillna(0) 652 | user_sku = actions[['user_id']] 653 | del actions['user_id'] 654 | min_max_scaler = preprocessing.MinMaxScaler() 655 | actions = min_max_scaler.fit_transform(actions.values) 656 | actions = pd.DataFrame(actions) 657 | actions = pd.concat([user_sku, actions], axis=1) 658 | actions.to_csv(dump_path, index=False) 659 | actions.columns = ['user_id'] + ['u_feat14_' + str(i) for i in 
range(1, actions.shape[1])] 660 | return actions 661 | 662 | 663 | # 用户购买/加入购物车/关注前访问次数 664 | def get_action_user_feat15(start_date, end_date): 665 | dump_path = './cache/user_feat15_%s_%s.csv' % (start_date, end_date) 666 | if os.path.exists(dump_path): 667 | actions = pd.read_csv(dump_path) 668 | else: 669 | # 用户购买前访问次数 670 | def user_feat_15_1(start_date, end_date): 671 | actions = get_actions(start_date, end_date)[['user_id', 'type']] 672 | visit = actions[actions['type'] == 1] 673 | visit = visit.groupby('user_id', as_index=False).count() 674 | visit.columns = ['user_id', 'visit'] 675 | buy = actions[actions['type'] == 4] 676 | buy = buy.groupby('user_id', as_index=False).count() 677 | buy.columns = ['user_id', 'buy'] 678 | actions = pd.merge(visit, buy, on='user_id', how='left') 679 | actions['visit_num_before_buy'] = actions['visit'] / actions['buy'] 680 | del actions['buy'] 681 | del actions['visit'] 682 | return actions 683 | 684 | # 用户加入购物车前访问次数 685 | def user_feat_15_2(start_date, end_date): 686 | actions = get_actions(start_date, end_date)[['user_id', 'type']] 687 | visit = actions[actions['type'] == 1] 688 | visit = visit.groupby('user_id', as_index=False).count() 689 | visit.columns = ['user_id', 'visit'] 690 | addtoshopping = actions[actions['type'] == 2] 691 | addtoshopping = addtoshopping.groupby('user_id', as_index=False).count() 692 | addtoshopping.columns = ['user_id', 'addtoshopping'] 693 | actions = pd.merge(visit, addtoshopping, on='user_id', how='left') 694 | actions['visit_num_before_addtoshopping'] = actions['visit'] / actions['addtoshopping'] 695 | del actions['addtoshopping'] 696 | del actions['visit'] 697 | return actions 698 | 699 | # 用户关注前访问次数 700 | def user_feat_15_3(start_date, end_date): 701 | actions = get_actions(start_date, end_date)[['user_id', 'type']] 702 | visit = actions[actions['type'] == 1] 703 | visit = visit.groupby('user_id', as_index=False).count() 704 | visit.columns = ['user_id', 'visit'] 705 | guanzhu = actions[actions['type'] == 5] 706 | guanzhu = guanzhu.groupby('user_id', as_index=False).count() 707 | guanzhu.columns = ['user_id', 'guanzhu'] 708 | actions = pd.merge(visit, guanzhu, on='user_id', how='left') 709 | actions['visit_num_before_guanzhu'] = actions['visit'] / actions['guanzhu'] 710 | del actions['guanzhu'] 711 | del actions['visit'] 712 | return actions 713 | 714 | # 用户购买前加入购物车次数 715 | def user_feat_15_4(start_date, end_date): 716 | actions = get_actions(start_date, end_date)[['user_id', 'type']] 717 | addtoshopping = actions[actions['type'] == 2] 718 | addtoshopping = addtoshopping.groupby('user_id', as_index=False).count() 719 | addtoshopping.columns = ['user_id', 'addtoshopping'] 720 | buy = actions[actions['type'] == 4] 721 | buy = buy.groupby('user_id', as_index=False).count() 722 | buy.columns = ['user_id', 'buy'] 723 | actions = pd.merge(addtoshopping, buy, on='user_id', how='left') 724 | actions['addtoshopping_num_before_buy'] = actions['addtoshopping'] / actions['buy'] 725 | del actions['buy'] 726 | del actions['addtoshopping'] 727 | return actions 728 | 729 | # 用户购买前关注次数 730 | def user_feat_15_5(start_date, end_date): 731 | actions = get_actions(start_date, end_date)[['user_id', 'type']] 732 | guanzhu = actions[actions['type'] == 5] 733 | guanzhu = guanzhu.groupby('user_id', as_index=False).count() 734 | guanzhu.columns = ['user_id', 'guanzhu'] 735 | buy = actions[actions['type'] == 4] 736 | buy = buy.groupby('user_id', as_index=False).count() 737 | buy.columns = ['user_id', 'buy'] 738 | actions = pd.merge(guanzhu, 
# Cross feature: share of each action type in a user's total actions
def get_action_user_feat16(start_date, end_date):
    dump_path = './cache/user_feat16_%s_%s.csv' % (start_date, end_date)
    if os.path.exists(dump_path):
        actions = pd.read_csv(dump_path)
    else:
        actions = get_actions(start_date, end_date)[['user_id', 'type']]
        actions['cnt'] = 0
        action1 = actions.groupby(['user_id', 'type']).count()
        action1 = action1.unstack()
        index_col = list(range(action1.shape[1]))
        action1.columns = index_col
        action1 = action1.reset_index()
        action2 = actions.groupby('user_id', as_index=False).count()
        del action2['type']
        action2.columns = ['user_id', 'cnt']
        actions = pd.merge(action1, action2, how='left', on='user_id')
        for i in index_col:
            actions[i] = actions[i] / actions['cnt']
        del actions['cnt']
        actions.to_csv(dump_path, index=False)
    actions.columns = ['user_id'] + ['u_feat16_' + str(i) for i in range(1, actions.shape[1])]
    return actions


# Actions on items in the candidate set P during the last k days vs. actions
# on all items (for k > 7 the per-type ratio is added; for k <= 7 only the raw
# counts are kept)
def get_action_user_feat0509_1_30(start_date, end_date, n):
    dump_path = './cache/user_feat0509_1_30_%s_%s_%s.csv' % (start_date, end_date, n)
    if os.path.exists(dump_path):
        actions = pd.read_csv(dump_path)
    else:
        start_days = datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=n)
        start_days = datetime.strftime(start_days, '%Y-%m-%d')

        actions = get_actions(start_days, end_date)[['user_id', 'sku_id', 'type']]
        actions_dummy = pd.get_dummies(actions['type'], prefix='actions')
        actions = pd.concat([actions, actions_dummy], axis=1)
        del actions['type']

        # Restrict to the candidate product set P
        P = get_basic_product_feat()[['sku_id']]
        P['label'] = 1
        actions_sub = pd.merge(actions, P, on='sku_id', how='left')
        actions_sub = actions_sub[actions_sub['label'] == 1]
        del actions_sub['label']

        actions_sub = actions_sub.groupby(['user_id'], as_index=False).sum()
        del actions_sub['sku_id']
        actions_all = actions.groupby(['user_id'], as_index=False).sum()
        del actions_all['sku_id']

        actions = pd.merge(actions_all, actions_sub, on=['user_id'], how='left')
        if n > 7:
            for i in range(1, 7):
                actions['actions_%s' % i] = actions['actions_%s_y' % i] / actions['actions_%s_x' % i]
        actions.to_csv(dump_path, index=False)
    actions.columns = ['user_id'] + ['u_feat30_' + str(n) + '_' + str(i) for i in range(1, actions.shape[1])]
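# --- Editor's illustrative sketch (toy data) ---
# The label==1 trick above is an inner join in disguise; an equivalent, more
# direct way to get the subset/total share, on hypothetical data:
def _demo_subset_share():
    import pandas as pd
    acts = pd.DataFrame({'user_id': [1, 1, 2], 'sku_id': [10, 11, 10], 'actions_1': [1, 1, 1]})
    P = pd.DataFrame({'sku_id': [10]})   # candidate set
    sub = acts[acts['sku_id'].isin(P['sku_id'])]
    share = sub.groupby('user_id')['actions_1'].sum() / acts.groupby('user_id')['actions_1'].sum()
    return share   # user 1 -> 0.5, user 2 -> 1.0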
return actions


# Time gap (in hours) between a user's first click and last purchase of the same sku
def get_action_user_feat0515_2_1(start_date, end_date):
    dump_path = './cache/get_action_user_feat0515_2_1_%s_%s.csv' % (start_date, end_date)
    if os.path.exists(dump_path):
        actions = pd.read_csv(dump_path)
    else:
        actions = get_actions(start_date, end_date)
        # First click per (user, sku)
        actions_dianji = actions[actions['type'] == 6][['user_id', 'sku_id', 'time']]
        actions_dianji['time_dianji'] = actions_dianji['time'].map(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
        actions_dianji = actions_dianji[['user_id', 'sku_id', 'time_dianji']]
        actions_dianji = actions_dianji.drop_duplicates(['user_id', 'sku_id'], keep='first')
        # Last purchase per (user, sku)
        actions_goumai = actions[actions['type'] == 4][['user_id', 'sku_id', 'time']]
        actions_goumai['time_goumai'] = actions_goumai['time'].map(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
        actions_goumai = actions_goumai[['user_id', 'sku_id', 'time_goumai']]
        actions_goumai = actions_goumai.drop_duplicates(['user_id', 'sku_id'], keep='last')

        actions = pd.merge(actions_dianji, actions_goumai, on=['user_id', 'sku_id'], how='inner')
        actions['time_jiange'] = actions['time_goumai'] - actions['time_dianji']
        actions = actions.drop(['sku_id', 'time_goumai', 'time_dianji'], axis=1)
        actions['time_jiange'] = actions['time_jiange'].map(lambda x: x.days * 24 + x.seconds // 3600 + 1)

        actions_min = actions.groupby('user_id').min().reset_index()
        actions_min.columns = ['user_id', 'time_min']
        actions_max = actions.groupby('user_id').max().reset_index()
        actions_max.columns = ['user_id', 'time_max']
        actions = pd.merge(actions_min, actions_max, on='user_id', how='left')

        user_id = actions[['user_id']]
        del actions['user_id']
        actions = actions.fillna(0)
        columns = actions.columns
        min_max_scale = preprocessing.MinMaxScaler()
        actions = min_max_scale.fit_transform(actions.values)
        actions = pd.concat([user_id, pd.DataFrame(actions, columns=columns)], axis=1)
        actions.to_csv(dump_path, index=False)
    return actions


# Number of actions (all types) a user has in each product category
def get_action_user_feat0515_2_2(start_date, end_date):
    dump_path = './cache/get_action_user_feat0515_2_2_%s_%s.csv' % (start_date, end_date)
    if os.path.exists(dump_path):
        actions = pd.read_csv(dump_path)
    else:
        actions = get_actions(start_date, end_date)[['user_id', 'cate']]
        cate_col = pd.get_dummies(actions['cate'], prefix='cate')
        actions = pd.concat([actions[['user_id']], cate_col], axis=1)
        actions = actions.groupby('user_id').sum().reset_index()

        user_id = actions[['user_id']]
        del actions['user_id']
        actions = actions.fillna(0)
        columns = actions.columns
        min_max_scale = preprocessing.MinMaxScaler()
        actions = min_max_scale.fit_transform(actions.values)
        actions = pd.concat([user_id, pd.DataFrame(actions, columns=columns)], axis=1)
        actions.to_csv(dump_path, index=False)
    return actions


# Cart-add and follow counts in the last n days (overall and within cate 8)
def get_action_user_feat0515_2_3(start_date, end_date, n):
    dump_path = './cache/get_action_user_feat0515_2_3_%s_%s_%s_1.csv' % (start_date, end_date, n)
    if os.path.exists(dump_path):
        actions = pd.read_csv(dump_path)
    else:
        start_days = datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=n)
        start_days = datetime.strftime(start_days, '%Y-%m-%d')

        actions = get_actions(start_days, end_date)[['user_id', 'type', 'cate']]
        actions_gouwuche = actions[actions['type'] == 2]   # cart-adds
        actions_gouwuche_1 = actions_gouwuche[['user_id', 'type']]
        actions_gouwuche_1 = actions_gouwuche_1.groupby('user_id').count().reset_index()
        actions_gouwuche_1.columns = ['user_id', str(n) + 'gouwuche_add']

        actions_gouwuche_2 = actions_gouwuche[actions_gouwuche['cate'] == 8][['user_id', 'type']]
        actions_gouwuche_2 = actions_gouwuche_2.groupby('user_id').count().reset_index()
        actions_gouwuche_2.columns = ['user_id', str(n) + 'gouwuche_add_cate_8']

        actions_guanzhu = actions[actions['type'] == 5]    # follows
        actions_guanzhu_1 = actions_guanzhu[['user_id', 'type']]
        actions_guanzhu_1 = actions_guanzhu_1.groupby('user_id').count().reset_index()
        actions_guanzhu_1.columns = ['user_id', str(n) + 'guanzhu_add']

        actions_guanzhu_2 = actions_guanzhu[actions_guanzhu['cate'] == 8][['user_id', 'type']]
        actions_guanzhu_2 = actions_guanzhu_2.groupby('user_id').count().reset_index()
        actions_guanzhu_2.columns = ['user_id', str(n) + 'guanzhu_add_cate_8']

        actions = pd.merge(actions_gouwuche_1, actions_gouwuche_2, on='user_id', how='outer')
        actions = pd.merge(actions, actions_guanzhu_1, on='user_id', how='outer')
        actions = pd.merge(actions, actions_guanzhu_2, on='user_id', how='outer')
        actions = actions.fillna(0)

        user_id = actions[['user_id']]
        del actions['user_id']
        columns = actions.columns
        min_max_scale = preprocessing.MinMaxScaler()
        actions = min_max_scale.fit_transform(actions.values)
        actions = pd.concat([user_id, pd.DataFrame(actions, columns=columns)], axis=1)
        actions.to_csv(dump_path, index=False)
    return actions
# Number of distinct days within the last n on which each action type occurred
def get_action_user_feat0515_2_4(start_date, end_date, n):
    dump_path = './cache/get_action_user_feat0515_2_4_%s_%s_%s.csv' % (start_date, end_date, n)
    if os.path.exists(dump_path):
        actions = pd.read_csv(dump_path)
    else:
        start_days = datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=n)
        start_days = datetime.strftime(start_days, '%Y-%m-%d')

        actions = get_actions(start_days, end_date)[['user_id', 'type', 'time']]
        actions['time'] = actions['time'].map(lambda x: (datetime.strptime(end_date, '%Y-%m-%d') - datetime.strptime(x, '%Y-%m-%d %H:%M:%S')).days)
        actions = actions.drop_duplicates(['user_id', 'type', 'time'])
        actions = actions.groupby(['user_id', 'type']).count()
        actions.columns = [str(n) + 'day_nums']
        actions = actions.unstack()
        actions = actions.reset_index()
        actions.columns = ['user_id'] + ['get_action_user_feat0515_2_4_' + str(n) + '_' + str(i) for i in range(1, actions.shape[1])]
        actions = actions.fillna(0)

        user_id = actions[['user_id']]
        del actions['user_id']
        columns = actions.columns
        min_max_scale = preprocessing.MinMaxScaler()
        actions = min_max_scale.fit_transform(actions.values)
        actions = pd.concat([user_id, pd.DataFrame(actions, columns=columns)], axis=1)
        actions.to_csv(dump_path, index=False)
    return actions


# Number of distinct skus a user browsed / carted / bought / followed / clicked
def get_action_user_feat5(start_date, end_date):
    dump_path = './cache/user_feat5_a_%s_%s.csv' % (start_date, end_date)
    if os.path.exists(dump_path):
        actions = pd.read_csv(dump_path)
    else:
        actions = get_actions(start_date, end_date)
        action = None
        for i in (1, 2, 4, 5, 6):   # type 3 (cart-delete) is excluded
            df = actions[actions['type'] == i][['user_id', 'sku_id']]
            df = df.drop_duplicates(['user_id', 'sku_id'], keep='first')
            df = df.groupby('user_id', as_index=False).count()
            df.columns = ['user_id', 'num_%s' % i]
            if i == 1:
                action = df
            else:
                action = pd.merge(action, df, on='user_id', how='outer')
        actions = action.fillna(0)
        actions = actions.astype('float')
        user = actions[['user_id']]
        min_max_scaler = preprocessing.MinMaxScaler()
        actions = min_max_scaler.fit_transform(actions.drop(['user_id'], axis=1).values)
        actions = pd.DataFrame(actions)
        actions = pd.concat([user, actions], axis=1)
        actions.to_csv(dump_path, index=False)
    actions.columns = ['user_id'] + ['u_feat5_' + str(i) for i in range(1, actions.shape[1])]
    return actions
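# --- Editor's illustrative sketch (toy data) ---
# get_action_user_feat5 counts *distinct* skus per action type; the key step is
# de-duplicating (user_id, sku_id) pairs before counting:
def _demo_distinct_sku_count():
    import pandas as pd
    df = pd.DataFrame({'user_id': [1, 1, 1], 'sku_id': [10, 10, 11]})
    return df.drop_duplicates(['user_id', 'sku_id']).groupby('user_id', as_index=False).count()  # -> 2 distinct skus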
# Distinct-sku counts per action type, restricted to the last k days
def get_action_u0515_feat5(start_date, end_date, k):
    dump_path = './cache/u0515_feat5_%s_%s_%s.csv' % (start_date, end_date, k)
    if os.path.exists(dump_path):
        actions = pd.read_csv(dump_path)
    else:
        start_days = pd.to_datetime(end_date) - timedelta(days=k)
        start_days = str(start_days).split(' ')[0]
        actions = get_action_user_feat5(start_days, end_date)
        actions.to_csv(dump_path, index=False)
    actions.columns = ['user_id'] + ['u0515_feat5_' + str(k) + '_' + str(i) for i in range(1, actions.shape[1])]
    return actions


# Hours since the user's earliest interaction (full log and subset log)
def get_action_u0524_feat1(start_date, end_date):
    dump_path = './cache/u0524_feat1_%s_%s.csv' % (start_date, end_date)
    if os.path.exists(dump_path):
        actions = pd.read_csv(dump_path)
    else:
        # Full log (relies on get_actions returning time-sorted rows)
        actions = get_actions(start_date, end_date)[['user_id', 'time']]
        actions = actions.groupby('user_id', as_index=False).first()
        actions['time_diff_early'] = pd.to_datetime(end_date) - pd.to_datetime(actions['time'])
        actions['time_diff_early'] = actions['time_diff_early'].dt.days * 24 + actions['time_diff_early'].dt.seconds // 3600
        actions = actions[['user_id', 'time_diff_early']]
        # Subset log
        sub_actions = sub_get_actions(start_date, end_date)[['user_id', 'time']]
        sub_actions = sub_actions.groupby('user_id', as_index=False).first()
        sub_actions['sub_time_diff_early'] = pd.to_datetime(end_date) - pd.to_datetime(sub_actions['time'])
        sub_actions['sub_time_diff_early'] = sub_actions['sub_time_diff_early'].dt.days * 24 + sub_actions['sub_time_diff_early'].dt.seconds // 3600
        sub_actions = sub_actions[['user_id', 'sub_time_diff_early']]

        actions = pd.merge(actions, sub_actions, on='user_id', how='left')
        actions = actions.fillna(0)
        min_max_scale = preprocessing.MinMaxScaler()
        action = min_max_scale.fit_transform(actions.drop(['user_id'], axis=1).values)
        actions = pd.concat([actions[['user_id']], pd.DataFrame(action)], axis=1)
        actions.to_csv(dump_path, index=False)
    actions.columns = ['user_id'] + ['u0524_feat1_' + str(i) for i in range(1, actions.shape[1])]
    return actions


# Hours since the user's most recent interaction (full log and subset log)
def get_action_u0524_feat2(start_date, end_date):
    dump_path = './cache/u0524_feat2_%s_%s.csv' % (start_date, end_date)
    if os.path.exists(dump_path):
        actions = pd.read_csv(dump_path)
    else:
        # Full log
        actions = get_actions(start_date, end_date)[['user_id', 'time']]
        actions = actions.groupby('user_id', as_index=False).last()
        actions['time_diff_recent'] = pd.to_datetime(end_date) - pd.to_datetime(actions['time'])
        actions['time_diff_recent'] = actions['time_diff_recent'].dt.days * 24 + actions['time_diff_recent'].dt.seconds // 3600
        actions = actions[['user_id', 'time_diff_recent']]
        # Subset log
        sub_actions = sub_get_actions(start_date, end_date)[['user_id', 'time']]
        sub_actions = sub_actions.groupby('user_id', as_index=False).last()
        sub_actions['sub_time_diff_recent'] = pd.to_datetime(end_date) - pd.to_datetime(sub_actions['time'])
        sub_actions['sub_time_diff_recent'] = sub_actions['sub_time_diff_recent'].dt.days * 24 + sub_actions['sub_time_diff_recent'].dt.seconds // 3600
        sub_actions = sub_actions[['user_id', 'sub_time_diff_recent']]

        actions = pd.merge(actions, sub_actions, on='user_id', how='left')
        actions = actions.fillna(0)
        min_max_scale = preprocessing.MinMaxScaler()
        action = min_max_scale.fit_transform(actions.drop(['user_id'], axis=1).values)
        actions = pd.concat([actions[['user_id']], pd.DataFrame(action)], axis=1)
        actions.to_csv(dump_path, index=False)
    actions.columns = ['user_id'] + ['u0524_feat2_' + str(i) for i in range(1, actions.shape[1])]
    return actions
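# --- Editor's illustrative sketch ---
# Recency above is measured in whole hours of the gap to end_date:
def _demo_hours_since():
    import pandas as pd
    delta = pd.to_datetime('2016-04-11') - pd.to_datetime('2016-04-09 21:30:00')
    return delta.days * 24 + delta.seconds // 3600   # 1 day 2.5 h -> 26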
# Number of active days (full log and subset log)
def get_action_u0524_feat3(start_date, end_date):
    dump_path = './cache/u0524_feat3_%s_%s.csv' % (start_date, end_date)
    if os.path.exists(dump_path):
        actions = pd.read_csv(dump_path)
    else:
        # Full log
        actions = get_actions(start_date, end_date)
        actions['time'] = pd.to_datetime(actions['time']).dt.date
        actions = actions.drop_duplicates(['user_id', 'time'])[['user_id', 'time']]
        actions = actions.groupby('user_id', as_index=False).count()
        # Subset log
        sub_actions = sub_get_actions(start_date, end_date)
        sub_actions['time'] = pd.to_datetime(sub_actions['time']).dt.date
        sub_actions = sub_actions.drop_duplicates(['user_id', 'time'])[['user_id', 'time']]
        sub_actions = sub_actions.groupby('user_id', as_index=False).count()
        actions = pd.merge(actions, sub_actions, on='user_id', how='left')
        actions = actions.fillna(0)
        min_max_scale = preprocessing.MinMaxScaler()
        action = min_max_scale.fit_transform(actions.drop(['user_id'], axis=1).values)
        actions = pd.concat([actions[['user_id']], pd.DataFrame(action)], axis=1)
        actions.to_csv(dump_path, index=False)
    actions.columns = ['user_id'] + ['u0524_feat3_' + str(i) for i in range(1, actions.shape[1])]
    return actions


# Click-position features: counts and shares of selected click model_ids in the last n days
def get_action_user_feat0509_1_31(start_date, end_date, n):
    dump_path = './cache/user_feat0509_1_31_%s_%s_%s.csv' % (start_date, end_date, n)
    if os.path.exists(dump_path):
        actions = pd.read_csv(dump_path)
    else:
        start_days = datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=n)
        start_days = datetime.strftime(start_days, '%Y-%m-%d')
        actions = get_actions(start_days, end_date)
        actions = actions[actions['type'] == 6][['user_id', 'model_id']]

        actions_click_sum = actions[['user_id', 'model_id']].groupby('user_id').count().reset_index()
        actions_click_sum.columns = ['user_id', str(n) + 'click_sum_all']
        for m in (14, 21, 28, 110, 210):
            actions[str(n) + 'u_click%s_history' % m] = actions['model_id'].map(lambda x: int(x == m))
        actions = actions.groupby('user_id').sum().reset_index().drop('model_id', axis=1)
        actions = pd.merge(actions, actions_click_sum, how='left', on='user_id')
        for m in (14, 21, 28, 110, 210):
            actions[str(n) + 'u_click%s/click_sum_history' % m] = actions[str(n) + 'u_click%s_history' % m] / actions[str(n) + 'click_sum_all']

        user_id = actions[['user_id']]
        del actions['user_id']
        actions = actions.fillna(0)
        columns = actions.columns
        min_max_scale = preprocessing.MinMaxScaler()
        actions = min_max_scale.fit_transform(actions.values)
        actions = pd.concat([user_id, pd.DataFrame(actions, columns=columns)], axis=1)
        actions.to_csv(dump_path, index=False)
    return actions
# U model: purchases of cate 8 vs. purchases of other cates (counts and shares)
def get_action_u0513_feat16(start_date, end_date):
    dump_path = './cache/u0513_feat16_%s_%s.csv' % (start_date, end_date)
    if os.path.exists(dump_path):
        actions = pd.read_csv(dump_path)
    else:
        df = get_actions(start_date, end_date)[['user_id', 'type', 'cate']]
        df = df[df['type'] == 4]
        df = df.groupby(['user_id', 'cate']).count()
        df = df.unstack().reset_index()
        # Assumes purchases in every cate 4..11 occur in the window
        df.columns = ['user_id'] + ['cate' + str(i) for i in range(4, 12)]
        df = df.fillna(0)
        sum1 = df.drop(['user_id', 'cate8'], axis=1).apply(sum, axis=1)
        sum2 = df.drop(['user_id'], axis=1).apply(sum, axis=1)
        actions = pd.concat([df[['user_id', 'cate8']], sum1, sum2], axis=1)
        actions.columns = ['user_id', 'cate8', 'sum_other_cate', 'sum']
        actions['cate8_rate'] = actions['cate8'] / actions['sum']
        actions['sum_other_cate_rate'] = actions['sum_other_cate'] / actions['sum']
        del actions['sum']
        actions.to_csv(dump_path, index=False)
    return actions

# get_action_u0513_feat16('2016-02-01','2016-04-16')

# Bucketed behaviour: per-type action counts in each n-day bucket (at most 6 buckets)
def get_action_user_feat_six_xingwei(start_date, end_date, n):
    dump_path = './cache/user_six_action_%s_%s_%s_int.csv' % (start_date, end_date, n)
    if os.path.exists(dump_path):
        actions = pd.read_csv(dump_path)
        print("user_zlzl" + str(n))
    else:
        actions = get_actions(start_date, end_date)
        actions['time'] = actions['time'].map(lambda x: get_day_chaju(x, end_date) // n)
        num_day = np.max(actions['time'])
        df = None
        print(num_day)
        for i in range(min(num_day + 1, 6)):
            # Dummies are built on the full frame; concat aligns them to the
            # bucket's rows by index, and rows outside the bucket drop out of
            # the groupby because their user_id is NaN
            in_temp = pd.get_dummies(actions['type'], prefix="user_action_time_" + str(i))
            temp = actions[actions['time'] == i]
            temp = pd.concat([temp['user_id'], in_temp], axis=1)
            feature = ['user_id']
            for j in range(1, 7, 1):
                feature.append('user_action_time_' + str(i) + '_' + str(j))
            temp = temp.groupby(['user_id'], as_index=False).sum()
            temp.columns = feature
            if df is None:
                df = temp
            else:
                df = pd.merge(df, temp, how='outer', on='user_id')
        df.columns = ['user_id'] + ['get_action_user_feat_six_xingwei_' + str(n) + '_' + str(i) for i in range(1, df.shape[1])]
        df.to_csv(dump_path, index=False)
        actions = df
    actions.columns = ['user_id'] + ['get_action_user_feat_six_xingwei_' + str(n) + '_' + str(i) for i in range(1, actions.shape[1])]
    return actions
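# --- Editor's illustrative sketch ---
# deal_user_six_deal (below) weights the counts of bucket b by (b + 1) ** -0.67,
# so recent buckets dominate while old activity still contributes:
def _demo_decay_weights():
    return [round((b + 1) ** -0.67, 3) for b in range(6)]   # [1.0, 0.629, 0.479, 0.395, 0.34, 0.301]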
# Decayed bucket counts derived from get_action_user_feat_six_xingwei
def deal_user_six_deal(start_date, end_date, n):
    dump_path = './cache/deal_user_six_action_%s_%s_%s_int.csv' % (start_date, end_date, n)
    if os.path.exists(dump_path):
        actions = pd.read_csv(dump_path)
        actions.columns = ['user_id'] + ['u_featsix_deal_' + str(n) + '_' + str(i) for i in range(1, actions.shape[1])]
        return actions
    else:
        temp = get_action_user_feat_six_xingwei(start_date, end_date, n)
        columns = ["user_id"]
        all_col = temp.shape[1] - 1
        temp.columns = columns + list(range(all_col))
        temp = temp.fillna(0)
        columns = ['user_id']
        for j in range(0, 6, 1):
            # Column k belongs to bucket k // 6; older buckets get weight (bucket + 1) ** -0.67
            temp["zl_" + str(j)] = 0
            columns.append("zl_" + str(j))
            for k in range(j, all_col, 6):
                temp["zl_" + str(j)] = temp["zl_" + str(j)] + temp[k].map(lambda x: x * ((k // 6 + 1) ** (-0.67)))
            # Min-max normalise the decayed sum
            temp["zl_" + str(j)] = temp["zl_" + str(j)].map(lambda x: (x - np.min(temp["zl_" + str(j)])) / (np.max(temp["zl_" + str(j)]) - np.min(temp["zl_" + str(j)])))
        temp = temp[columns]
        temp.to_csv(dump_path, index=False)
        return temp

# # get user sku
# def get_user(start_date, end_date):
#     ...kept commented out in the original: selects the distinct users with
#     cart-add / follow / buy actions on cate 8 in the window...


# Behaviour in the k days before each purchase (min/max/mean per action type)
def get_action_u0509_feat_28(start_date, end_date, k):
    dump_path = './cache/u0509_feat_28_%s_%s_%s.csv' % (start_date, end_date, k)
    if os.path.exists(dump_path):
        actions = pd.read_csv(dump_path)
    else:
        actions = get_actions(start_date, end_date)
        actions = actions[actions['type'] == 4]
        actions['time_buy'] = actions['time'].map(lambda x: datetime.strptime(x.split(' ')[0], '%Y-%m-%d'))
        actions = actions[['user_id', 'sku_id', 'time_buy']].reset_index(drop=True)
        actions['before_time_buy'] = actions['time_buy'] - timedelta(days=k)

        # Join the full history of actions back onto each purchase
        df = get_actions('2016-02-01', '2016-04-16')[['user_id', 'sku_id', 'time', 'type']]
        df['time'] = df['time'].map(lambda x: datetime.strptime(x.split(' ')[0], '%Y-%m-%d'))
        df = pd.merge(df, actions, on=['user_id', 'sku_id'], how='left')
        df = df.dropna(axis=0, how='any')
        df['before_days'] = (df['time'] - df['before_time_buy']).dt.days
        df['days'] = (df['time'] - df['time_buy']).dt.days
        # Keep only the actions that fall within the k days before the purchase
        df = df[(df['before_days'] >= 0) & (df['days'] < 0)]
        df_dummy = pd.get_dummies(df['type'], prefix='type')
        df = pd.concat([df, df_dummy], axis=1)[['user_id', 'sku_id', 'type_1', 'type_2', 'type_3', 'type_4', 'type_5', 'type_6']]

        df = df.groupby(['user_id', 'sku_id'], as_index=False).sum()
        del df['sku_id']
        df = df.groupby('user_id', as_index=False).agg(['min', 'max', 'mean'])
        df = df.reset_index()
        df.columns = ['user_id'] + ['u0509_feat28_' + str(k) + '_' + i for i in (
            'type_1_min', 'type_1_max', 'type_1_mean', 'type_2_min', 'type_2_max', 'type_2_mean',
            'type_3_min', 'type_3_max', 'type_3_mean', 'type_4_min', 'type_4_max', 'type_4_mean',
            'type_5_min', 'type_5_max', 'type_5_mean', 'type_6_min', 'type_6_max', 'type_6_mean')]
        min_max_scaler = preprocessing.MinMaxScaler()
        actions = min_max_scaler.fit_transform(df.drop('user_id', axis=1).values)
        actions = pd.DataFrame(actions)
        actions = pd.concat([df[['user_id']], actions], axis=1)
        actions.columns = ['user_id'] + ['u0509_feat_28_' + str(i) for i in range(1, actions.shape[1])]
        actions.to_csv(dump_path, index=False)
    actions.columns = ['user_id'] + ['u0509_feat_28_' + str(k) + "_" + str(i) for i in range(1, actions.shape[1])]
    return actions
# Number of cate-8 brands a user viewed, and its share of all brands viewed
def get_action_u0509_feat_29(start_date, end_date):
    dump_path = './cache/u0509_feat_29_%s_%s.csv' % (start_date, end_date)
    if os.path.exists(dump_path):
        actions = pd.read_csv(dump_path)
    else:
        actions = get_actions(start_date, end_date)
        df1 = actions[actions['cate'] == 8].drop_duplicates(['user_id', 'brand'])[['user_id', 'brand']]
        df1 = df1.groupby(['user_id'], as_index=False).count()
        df1.columns = ['user_id', 'brand_cate=8']
        df2 = actions.drop_duplicates(['user_id', 'brand'])[['user_id', 'brand']]
        df2 = df2.groupby(['user_id'], as_index=False).count()
        df2.columns = ['user_id', 'brand_cate_all']
        df = pd.merge(df1, df2, on='user_id', how='right')
        df['rate'] = df['brand_cate=8'] / df['brand_cate_all']
        actions = df.fillna(0)
        actions.to_csv(dump_path, index=False)
    actions.columns = ['user_id'] + ['u0509_feat_29' + str(i) for i in range(1, actions.shape[1])]
    return actions


# Distinct cates touched in the last k days, whether cate 8 is among them,
# and the share cate8/cate
def get_action_u0521_feat_31(start_date, end_date, k):
    dump_path = './cache/u0509_feat_31_%s_%s_%s.csv' % (start_date, end_date, k)
    if os.path.exists(dump_path):
        actions = pd.read_csv(dump_path)
    else:
        start_days = pd.to_datetime(end_date) - timedelta(days=k)
        start_days = datetime.strftime(start_days, '%Y-%m-%d')
        actions = get_actions(start_days, end_date)
        df1 = actions[actions['cate'] == 8].drop_duplicates(['user_id', 'cate'])[['user_id', 'cate']]
        df1 = df1.groupby('user_id', as_index=False).count()
        df1.columns = ['user_id', 'cate8']
        df2 = actions.drop_duplicates(['user_id', 'cate'])[['user_id', 'cate']]
        df2 = df2.groupby('user_id', as_index=False).count()
        actions = pd.merge(df1, df2, on='user_id', how='right')
        actions['cate8/cate'] = actions['cate8'] / actions['cate']
        actions = actions.fillna(0)
        min_max_scaler = preprocessing.MinMaxScaler()
        df = min_max_scaler.fit_transform(actions[['cate8', 'cate']].values)
        df = pd.DataFrame(df)
        actions = pd.concat([actions[['user_id', 'cate8/cate']], df], axis=1)
        actions.to_csv(dump_path, index=False)
    actions.columns = ['user_id'] + ['u0509_feat_31_' + str(k) + '_' + str(i) for i in range(1, actions.shape[1])]
    return actions
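# --- Editor's illustrative sketch (toy data) ---
# get_action_u0521_feat_31 above reduces to: how many distinct cates a user
# touched, whether cate 8 is one of them, and the share cate8/cate:
def _demo_cate8_share():
    import pandas as pd
    df = pd.DataFrame({'user_id': [1, 1, 1], 'cate': [8, 8, 5]})
    n_cates = df.drop_duplicates(['user_id', 'cate']).groupby('user_id')['cate'].count()
    has8 = df[df['cate'] == 8].drop_duplicates(['user_id', 'cate']).groupby('user_id')['cate'].count()
    return (has8 / n_cates).fillna(0)   # user 1 -> 0.5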
# Distinct cate-8 brands per user, and average interactions per brand
def get_action_u0521_feat_32(start_date, end_date):
    dump_path = './cache/u0509_feat_32_%s_%s.csv' % (start_date, end_date)
    if os.path.exists(dump_path):
        actions = pd.read_csv(dump_path)
    else:
        actions = get_actions(start_date, end_date)
        actions = actions[actions['cate'] == 8][['user_id', 'brand']]
        df1 = actions.drop_duplicates(['user_id', 'brand']).groupby('user_id', as_index=False).count()
        df1.columns = ['user_id', 'brand_num']
        df2 = actions.groupby('user_id', as_index=False).count()
        actions = pd.merge(df1, df2, on='user_id', how='left')
        actions['brand/brand_num'] = actions['brand'] / actions['brand_num']   # interactions per distinct brand
        actions = actions.fillna(0)
        min_max_scaler = preprocessing.MinMaxScaler()
        df = min_max_scaler.fit_transform(actions.drop(['user_id'], axis=1).values)
        df = pd.DataFrame(df)
        actions = pd.concat([actions[['user_id']], df], axis=1)
        actions.to_csv(dump_path, index=False)
    actions.columns = ['user_id'] + ['u0509_feat_32_' + str(i) for i in range(1, actions.shape[1])]
    return actions


# Action intensity: per-type count divided by the hours between the first and
# last action of that type within the last n days
def get_action_user_feat7_0522_huachuang(start_date, end_date, n):
    dump_path = './cache/user_feat7_six_%s_%s_%s_0522.csv' % (start_date, end_date, n)
    if os.path.exists(dump_path):
        actions = pd.read_csv(dump_path)
    else:
        start_days = datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=n)
        start_days = datetime.strftime(start_days, '%Y-%m-%d')

        df = get_actions(start_days, end_date)[['user_id', 'type', 'time']]
        actions = df.groupby(['user_id', 'type'], as_index=False).count()

        time_min = df.groupby(['user_id', 'type'], as_index=False).min()
        time_max = df.groupby(['user_id', 'type'], as_index=False).max()

        time_cha = pd.merge(time_max, time_min, on=['user_id', 'type'], how='left')
        time_cha['time_x'] = time_cha['time_x'].map(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
        time_cha['time_y'] = time_cha['time_y'].map(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
        time_cha['cha_hour'] = 1 + (time_cha['time_x'] - time_cha['time_y']).dt.days * 24 + (time_cha['time_x'] - time_cha['time_y']).dt.seconds // 3600
        del time_cha['time_x']
        del time_cha['time_y']

        actions = pd.merge(time_cha, actions, on=['user_id', 'type'], how="left")
        actions = actions.groupby(['user_id', 'type']).sum()
        actions['cnt/time'] = actions['time'] / actions["cha_hour"]
        actions = actions.unstack()
        actions.columns = list(range(actions.shape[1]))
        actions = actions.reset_index()
        actions = actions.fillna(0)
        actions.to_csv(dump_path, index=False)
    actions.columns = ['user_id'] + ['u_feat7_' + str(n) + "_" + str(i) for i in range(1, actions.shape[1])]
    return actions
+str(n)+"_"+ str(i) for i in range(1, actions.shape[1])] 1460 | return actions 1461 | 1462 | def get_user_labels(test_start_date,test_end_date): 1463 | dump_path = './cache/user_labels_%s_%s_11.csv' % (test_start_date, test_end_date) 1464 | if os.path.exists(dump_path): 1465 | actions = pd.read_csv(dump_path) 1466 | else: 1467 | actions = get_actions(test_start_date, test_end_date) 1468 | actions = actions[actions['cate']==8] 1469 | actions = actions[actions['type'] == 4].drop_duplicates(['user_id'])[['user_id']] 1470 | actions['label'] = 1 1471 | 1472 | return actions 1473 | 1474 | 1475 | print("U model 2 finish part_0") 1476 | ######################################################################################################### 1477 | 1478 | 1479 | # In[ ]: 1480 | 1481 | 1482 | 1483 | 1484 | # In[ ]: 1485 | 1486 | 1487 | 1488 | 1489 | # In[3]: 1490 | 1491 | import os 1492 | from datetime import datetime 1493 | from datetime import timedelta 1494 | 1495 | # -*- coding: utf-8 -*- 1496 | """ 1497 | Created on Sun May 14 10:27:41 2017 1498 | @author: 老虎趴趴走 1499 | """ 1500 | import pandas as pd 1501 | import numpy as np 1502 | # import datetime 1503 | import math 1504 | 1505 | def user_features(user, ful_action, sub_action, end_date): 1506 | dump_path='./cache/user_features_%s_0514_2.csv'%(end_date) 1507 | if os.path.exists(dump_path): 1508 | actions = pd.read_csv(dump_path) 1509 | 1510 | else: 1511 | end_date=pd.to_datetime(end_date) 1512 | day = timedelta(1, 0) 1513 | print('=====> 提取特征...') 1514 | sub_1 = sub_action[(sub_action['time']>=end_date-1*day) & (sub_action['time']=end_date-3*day) & (sub_action['time']=end_date-5*day) & (sub_action['time']=end_date-30*day) & (sub_action['time']=end_date-5*day) & (ful_action['time']=end_date-30*day) & (ful_action['time'] 完成!') 1742 | actions.to_csv(dump_path,index=False) 1743 | 1744 | # user_id = actions[['user_id']] 1745 | # del actions['user_id'] 1746 | # actions = actions.fillna(0) 1747 | # actions=actions.replace(np.inf,0) 1748 | # print(actions.head()) 1749 | # columns = actions.columns 1750 | 1751 | # min_max_scale = preprocessing.MinMaxScaler() 1752 | # actions=actions.replace(np.inf,0) 1753 | # actions = min_max_scale.fit_transform(actions.values) 1754 | # actions = pd.concat([user_id, pd.DataFrame(actions,columns = columns)], axis=1) 1755 | return actions 1756 | 1757 | import pandas as pd 1758 | ful_action = pd.read_csv('./data/JData_Action.csv', parse_dates=[2], infer_datetime_format=True) 1759 | sub_action = pd.read_csv('./data/JData_subset_action.csv', parse_dates=[2, 7], infer_datetime_format=True) 1760 | user = pd.read_csv('./data/JData_modified_user.csv', parse_dates=[4]) 1761 | # user_features(user,ful_action,sel_action,'2016-04-11') 1762 | 1763 | print("U model 2 finish part_1") 1764 | ###################################################################################### 1765 | 1766 | 1767 | # In[ ]: 1768 | 1769 | 1770 | 1771 | 1772 | # In[ ]: 1773 | 1774 | 1775 | 1776 | 1777 | # In[ ]: 1778 | 1779 | 1780 | 1781 | 1782 | # In[ ]: 1783 | 1784 | 1785 | 1786 | 1787 | # In[8]: 1788 | 1789 | # 测试集 1790 | # ful_action = pd.read_csv('./data/JData_Action.csv', parse_dates=[2], infer_datetime_format=True) 1791 | # sel_action = pd.read_csv('./data/JData_subset_action.csv', parse_dates=[2, 7], infer_datetime_format=True) 1792 | def make_test_set(train_start_date, train_end_date,user,ful_action,sub_action): 1793 | dump_path = './cache/bu10525model_2_u_test_set_%s_%s.csv' % (train_start_date, train_end_date) 1794 | if 
# Submission (test) feature set: candidates are all users who touched cate 8
# in the 30 days before train_end_date
def make_test_set(train_start_date, train_end_date, user, ful_action, sub_action):
    dump_path = './cache/bu10525model_2_u_test_set_%s_%s.csv' % (train_start_date, train_end_date)
    if os.path.exists(dump_path):
        actions = pd.read_csv(dump_path)
    else:
        start_days = str(pd.to_datetime(train_end_date) - timedelta(days=30)).split(' ')[0]
        actions_1 = get_actions(start_days, train_end_date)
        actions = actions_1[actions_1['cate'] == 8][['user_id']].drop_duplicates(['user_id'])
        print(actions.shape)

        # Long-horizon features are computed from the start of the data
        start_days = "2016-02-01"
        for feat in (get_action_user_feat2, get_action_user_feat5, get_action_user_feat6,
                     get_action_user_feat6_six, get_action_user_feat7, get_action_user_feat8,
                     get_action_user_feat8_2, get_action_user_feat9, get_action_user_feat10):
            actions = pd.merge(actions, feat(start_days, train_end_date), how='left', on='user_id')
            print(actions.shape)
        for feat in (get_action_user_feat12, get_action_user_feat14):
            actions = pd.merge(actions, feat(train_start_date, train_end_date), how='left', on='user_id')
            print(actions.shape)
        for feat in (get_action_user_feat15, get_action_user_feat16, get_action_u0513_feat16):
            actions = pd.merge(actions, feat(start_days, train_end_date), how='left', on='user_id')
            print(actions.shape)
        actions = pd.merge(actions, user_features(user, ful_action, sub_action, train_end_date), how='left', on='user_id')
        print(actions.shape)
        actions = pd.merge(actions, get_action_user_feat0515_2_1(train_start_date, train_end_date), how='left', on='user_id')
        print(actions.shape)
        actions = pd.merge(actions, get_action_user_feat0515_2_2(train_start_date, train_end_date), how='left', on='user_id')
        print(actions.shape)
        # Used by both model 1 and model 2
        actions = pd.merge(actions, get_action_u0509_feat_29(train_start_date, train_end_date), how='left', on='user_id')
        print(actions.shape)
        # Model 2 only
        actions = pd.merge(actions, get_action_u0521_feat_32(train_start_date, train_end_date), how='left', on='user_id')

        # Window features over the last 1/2/3/7/14/28 days
        for i in (1, 2, 3, 7, 14, 28):
            for feat in (get_action_user_feat_six_xingwei, deal_user_six_deal, get_action_user_feat11,
                         get_action_user_feat13, get_action_user_feat0509_1_30, get_action_user_feat0515_2_3,
                         get_action_feat, get_action_user_feat0515_2_4, get_action_u0515_feat5,
                         get_action_u0509_feat_28):
                actions = pd.merge(actions, feat(train_start_date, train_end_date, i), how='left', on='user_id')
            if i <= 10:
                actions = pd.merge(actions, get_action_user_feat0509_1_31(train_start_date, train_end_date, i), how='left', on='user_id')
            # Model 2 only
            actions = pd.merge(actions, get_action_u0521_feat_31(train_start_date, train_end_date, i), how='left', on='user_id')
            actions = pd.merge(actions, get_action_user_feat7_0522_huachuang(train_start_date, train_end_date, i), how='left', on='user_id')
            print(actions.shape)

        actions = actions.fillna(0)
        # actions.to_csv(dump_path, index=False)
    return actions
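# --- Editor's note ---
# make_train_set below mirrors make_test_set feature for feature; the only
# differences are the cache key and the final merge with get_user_labels over
# the label window (test_start_date, test_end_date).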
# Training set: the same features over the training window, plus labels
def make_train_set(train_start_date, train_end_date, test_start_date, test_end_date, user, ful_action, sub_action):
    dump_path = './cache/bu10525model_2_u_train_set_%s_%s_%s_%s.csv' % (train_start_date, train_end_date, test_start_date, test_end_date)
    if os.path.exists(dump_path):
        actions = pd.read_csv(dump_path)
    else:
        start_days = str(pd.to_datetime(train_end_date) - timedelta(days=30)).split(' ')[0]
        actions_1 = get_actions(start_days, train_end_date)
        actions = actions_1[actions_1['cate'] == 8][['user_id']].drop_duplicates(['user_id'])
        print(actions.shape)

        # Long-horizon features are computed from the start of the data
        start_days = "2016-02-01"
        for feat in (get_action_user_feat2, get_action_user_feat5, get_action_user_feat6,
                     get_action_user_feat6_six, get_action_user_feat7, get_action_user_feat8,
                     get_action_user_feat8_2, get_action_user_feat9, get_action_user_feat10):
            actions = pd.merge(actions, feat(start_days, train_end_date), how='left', on='user_id')
            print(actions.shape)
        for feat in (get_action_user_feat12, get_action_user_feat14):
            actions = pd.merge(actions, feat(train_start_date, train_end_date), how='left', on='user_id')
            print(actions.shape)
        for feat in (get_action_user_feat15, get_action_user_feat16, get_action_u0513_feat16):
            actions = pd.merge(actions, feat(start_days, train_end_date), how='left', on='user_id')
            print(actions.shape)
        actions = pd.merge(actions, user_features(user, ful_action, sub_action, train_end_date), how='left', on='user_id')
        print(actions.shape)

        actions = pd.merge(actions, get_action_user_feat0515_2_1(train_start_date, train_end_date), how='left', on='user_id')
        print(actions.shape)
        actions = pd.merge(actions, get_action_user_feat0515_2_2(train_start_date, train_end_date), how='left', on='user_id')
        print(actions.shape)
        # Used by both model 1 and model 2
        actions = pd.merge(actions, get_action_u0509_feat_29(train_start_date, train_end_date), how='left', on='user_id')
        # Model 2 only
        actions = pd.merge(actions, get_action_u0521_feat_32(train_start_date, train_end_date), how='left', on='user_id')
        print(actions.shape)

        # Window features over the last 1/2/3/7/14/28 days
        for i in (1, 2, 3, 7, 14, 28):
            for feat in (get_action_user_feat_six_xingwei, deal_user_six_deal, get_action_user_feat11,
                         get_action_user_feat13, get_action_user_feat0509_1_30, get_action_user_feat0515_2_3,
                         get_action_feat, get_action_user_feat0515_2_4, get_action_u0515_feat5,
                         get_action_u0509_feat_28):
                actions = pd.merge(actions, feat(train_start_date, train_end_date, i), how='left', on='user_id')
            if i <= 10:
                actions = pd.merge(actions, get_action_user_feat0509_1_31(train_start_date, train_end_date, i), how='left', on='user_id')
            # Model 2 only
            actions = pd.merge(actions, get_action_u0521_feat_31(train_start_date, train_end_date, i), how='left', on='user_id')
            actions = pd.merge(actions, get_action_user_feat7_0522_huachuang(train_start_date, train_end_date, i), how='left', on='user_id')
            print(actions.shape)

        # Attach labels: users who bought cate 8 in the label window get 1, the rest 0
        actions = pd.merge(actions, get_user_labels(test_start_date, test_end_date), how='left', on='user_id')
        actions = actions.fillna(0)
        print(actions.shape)
        # actions.to_csv(dump_path, index=False)
    return actions
print("U model 2 finish part_3")


###########################################################################################


# In[9]:

#!/usr/bin/python

import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split

# Feature window for training ends where the 5-day label window begins
train_start_date = '2016-03-10'
train_end_date = '2016-04-11'
test_start_date = '2016-04-11'
test_end_date = '2016-04-16'

# Submission feature window, shifted to end at the last day of data
sub_start_date = '2016-03-15'
sub_end_date = '2016-04-16'

# Training data
actions = make_train_set(train_start_date, train_end_date, test_start_date, test_end_date, user, ful_action, sub_action)
print("+++++++++++++++++++++++")

train, test = train_test_split(actions.values, test_size=0.2, random_state=0)
train = pd.DataFrame(train, columns=actions.columns)
test = pd.DataFrame(test, columns=actions.columns)

X_train = train.drop(['user_id', 'label'], axis=1)
X_test = test.drop(['user_id', 'label'], axis=1)
y_train = train[['label']]
y_test = test[['label']]
train_index = train[['user_id']].copy()
test_index = test[['user_id']].copy()

# Submission data
sub_test_data = make_test_set(sub_start_date, sub_end_date, user, ful_action, sub_action)
sub_trainning_data = sub_test_data.drop(['user_id'], axis=1)
sub_user_index = sub_test_data[['user_id']].copy()

print("U model 2 finish part_4")

########################################################################


# In[11]:

print('==========>>>train xgboost model ....')

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
# 'learning_rate' and 'eta' are aliases in XGBoost, so only one of the two
# values is applied (which depends on parameter order), and 'n_estimators'
# is ignored by xgb.train, which uses num_round instead.
param = {'learning_rate': 0.1,
         'n_estimators': 1000,
         'max_depth': 3,
         'min_child_weight': 5,
         'gamma': 0,
         'subsample': 1.0,
         'colsample_bytree': 0.8,
         'eta': 0.05,
         'silent': 1,
         'objective': 'binary:logistic',
         'scale_pos_weight': 1}

num_round = 120
plst = list(param.items())
plst += [('eval_metric', 'logloss')]

evallist = [(dtest, 'eval'), (dtrain, 'train')]
bst = xgb.train(plst, dtrain, num_round, evallist, early_stopping_rounds=10)
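# --- Editor's note (sketch; the exact API depends on the installed xgboost) ---
# With early_stopping_rounds the booster records the best round on the eval
# set, but older xgboost versions still predict with all trees unless told
# otherwise, e.g. y = bst.predict(dmat, ntree_limit=bst.best_ntree_limit).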
# ============================================>>>>
print('==========>>>predict test data label')

sub_trainning_data_1 = xgb.DMatrix(sub_trainning_data)
y = bst.predict(sub_trainning_data_1)
sub_user_index['label'] = y

# Keep the highest score per user, then rank all users by score
pred = sub_user_index
pred = pred.sort_values(by=['user_id', 'label'], ascending=[0, 0])
pred = pred.groupby('user_id').first().reset_index()
result = pred.sort_values(by=['label'], ascending=[0])
result['user_id'] = result['user_id'].astype('int')

result.to_csv('./sub/Umodel_2.csv', index=False)
print("U model 2 finish part_5")

--------------------------------------------------------------------------------