├── src
│   ├── util.pyc
│   ├── util.py
│   ├── model.py
│   ├── feat.py
│   ├── feat.ipynb
│   └── model.ipynb
├── .gitignore
├── data
│   ├── input
│   │   └── README.md
│   └── output
│       └── README.md
└── README.md

/src/util.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShawnyXiao/2018-ICC-TravelService/HEAD/src/util.pyc
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.idea
data/input/test
data/input/train
data/output/sub
data/output/feat
data/output/feat_imp
src/.ipynb_checkpoints
--------------------------------------------------------------------------------
/src/util.py:
--------------------------------------------------------------------------------
# coding=utf-8
import time


def log(stri):
    # Print a message prefixed with the current timestamp.
    now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print(str(now) + ' ' + str(stri))
--------------------------------------------------------------------------------
/data/input/README.md:
--------------------------------------------------------------------------------
# Input data

The files are too large to be uploaded to GitHub. If you need the data, feel free to open an issue to contact me. The directory structure is as follows:

```
input
│  README.md
│
├─test
│      action_test.csv
│      orderFuture_test.csv
│      orderHistory_test.csv
│      userComment_test.csv
│      userProfile_test.csv
│
└─train
        action_train.csv
        orderFuture_train.csv
        orderHistory_train.csv
        userComment_train.csv
        userProfile_train.csv
```
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# 2018-ICC-精品旅行服务成单预测

This is one of the problems of the 2nd Smart China Cup (ICC): **[精品旅行服务成单预测 (Boutique Travel Service Order Prediction)](http://www.dcjingsai.com/common/cmpt/%E7%B2%BE%E5%93%81%E6%97%85%E8%A1%8C%E6%9C%8D%E5%8A%A1%E6%88%90%E5%8D%95%E9%A2%84%E6%B5%8B_%E7%AB%9E%E8%B5%9B%E4%BF%A1%E6%81%AF.html)**. I entered quite late, roughly in the final stage of the competition. My team, "魂斗罗", ultimately won an **Excellence Award**, ranking **14th/1135**. This competition is a good place to learn quite a few **tricks for mining features from sequence data**.

## Tricks for mining features from sequence data

1. Since a user's most recent actions most likely reflect their current intent, I keep the types and timestamps of the last k actions as k features each;
2. Compute statistics over the last k action types and timestamps to build windowed statistical features;
3. Compute statistics over all action types and timestamps to build global statistical features;
4. Take first- and second-order differences of the time series and keep the latest k difference values as k features;
5. Compute statistics over those k first- and second-order difference values;
6. Compute statistics over all first- and second-order difference values to build global statistical features;
7. Group by action type, then take first- and second-order differences of the time series and keep the latest k difference values as k features;
8. ...

And so on. Experiments showed that the difference features are very effective; they were the key to improving the score in this competition. A minimal sketch of a few of these tricks follows.
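The sketch below makes tricks 1, 4, and 5 concrete. It assumes an action log with columns `userid` and `actionTime`, as in `action_train.csv`; the function name and output column names are illustrative only and do not exactly match those used in `src/feat.py`:

```
import pandas as pd

def last_k_diff_features(actions, k=6):
    # Trick 1: keep each user's k most recent actions
    recent = (actions.sort_values(['userid', 'actionTime'])
                     .groupby('userid').tail(k).copy())
    # Rank 1 = most recent action
    recent['rank'] = (recent.groupby('userid')['actionTime']
                            .rank(method='first', ascending=False).astype(int))
    wide = recent.pivot(index='userid', columns='rank', values='actionTime')
    wide = wide[wide.columns[::-1]]            # oldest -> newest, as in src/feat.py
    # Trick 4: first- and second-order differences of the timestamps
    diff1 = wide.diff(1, axis=1).iloc[:, 1:]
    diff2 = diff1.diff(1, axis=1).iloc[:, 1:]
    feats = pd.DataFrame(index=wide.index)
    for name, d in [('diff1', diff1), ('diff2', diff2)]:
        for rank in d.columns:
            feats['act_time_%s(rank_%d)' % (name, rank)] = d[rank]
        # Trick 5: statistics over the k difference values
        feats['act_time_%s_mean' % name] = d.mean(axis=1)
        feats['act_time_%s_std' % name] = d.std(axis=1)
    return feats.reset_index()
```

`src/feat.py` below implements the same idea once per window size, per difference order, and per action type, writing each feature group to `data/output/feat/` for `src/model.py` to merge.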
## Hey!

If you have any thoughts, for example you found a bug somewhere, you think my explanation of some method is incorrect or not thorough enough, or you have a more creative take, feel free to open an issue or a pull request at any time, or discuss it with me directly! And if you could star or fork this project to encourage me, someone who has just stepped into data mining, I would be deeply grateful~
--------------------------------------------------------------------------------
/data/output/README.md:
--------------------------------------------------------------------------------
# Output data

The files are too large to be uploaded to GitHub. If you need the data, feel free to open an issue to contact me. The directory structure is as follows:

```
output
│  README.md
│
├─feat
│  │  action_num_based_on_time_last_window13
│  │  action_num_based_on_time_last_window17
│  │  action_num_based_on_time_last_window20
│  │  action_num_based_on_time_last_window25
│  │  action_num_based_on_time_last_window30
│  │  action_num_based_on_time_last_window4
│  │  action_num_based_on_time_last_window7
│  │  action_order_time_diff
│  │  action_real_time_based_on_time_last_window10_on_type1
│  │  action_real_time_based_on_time_last_window10_on_type5
│  │  action_real_time_based_on_time_last_window10_on_type6
│  │  action_real_time_based_on_time_last_window10_on_type7
│  │  action_real_time_based_on_time_last_window10_on_type8
│  │  action_real_time_based_on_time_last_window10_on_type9
│  │  action_real_time_based_on_time_last_window1_on_type1
│  │  action_real_time_based_on_time_last_window1_on_type2
│  │  action_real_time_based_on_time_last_window1_on_type3
│  │  action_real_time_based_on_time_last_window1_on_type4
│  │  action_real_time_based_on_time_last_window1_on_type5
│  │  action_real_time_based_on_time_last_window1_on_type6
│  │  action_real_time_based_on_time_last_window1_on_type7
│  │  action_real_time_based_on_time_last_window1_on_type8
│  │  action_real_time_based_on_time_last_window1_on_type9
│  │  action_real_time_based_on_time_last_window4_on_type1
│  │  action_real_time_based_on_time_last_window4_on_type5
│  │  action_real_time_based_on_time_last_window4_on_type6
│  │  action_real_time_based_on_time_last_window4_on_type7
│  │  action_real_time_based_on_time_last_window4_on_type8
│  │  action_real_time_based_on_time_last_window4_on_type9
│  │  action_real_time_based_on_time_last_window7_on_type1
│  │  action_real_time_based_on_time_last_window7_on_type5
│  │  action_real_time_based_on_time_last_window7_on_type6
│  │  action_real_time_based_on_time_last_window7_on_type7
│  │  action_real_time_based_on_time_last_window7_on_type8
│  │  action_real_time_based_on_time_last_window7_on_type9
│  │  action_sequence_time_diff_window10
│  │  action_sequence_time_diff_window11
│  │  action_sequence_time_diff_window12
│  │  action_sequence_time_diff_window15
│  │  action_sequence_time_diff_window2
│  │  action_sequence_time_diff_window3
│  │  action_sequence_time_diff_window4
│  │  action_sequence_time_diff_window5
│  │  action_sequence_time_diff_window6
│  │  action_sequence_time_diff_window7
│  │  action_sequence_time_diff_window8
│  │  action_sequence_time_diff_window9
│  │  action_sequence_time_stat_last123
│  │  action_stat_last_every_type
│  │  action_time_2order_based_on_time_last_window10
│  │  action_time_2order_based_on_time_last_window3
│  │  action_time_2order_based_on_time_last_window4
│  │  action_time_2order_based_on_time_last_window5
│  │  action_time_2order_based_on_time_last_window6
│  │  action_time_2order_based_on_time_last_window7
│  │  action_time_2order_based_on_time_last_window8
│  │  action_time_2order_based_on_time_last_window9
│  │  action_time_based_on_time
│  │  action_time_based_on_time_last_window10
│  │  action_time_based_on_time_last_window11
│  │  action_time_based_on_time_last_window12
│  │  action_time_based_on_time_last_window15
│  │  action_time_based_on_time_last_window15_on_type1
│  │  action_time_based_on_time_last_window15_on_type2
│  │  action_time_based_on_time_last_window15_on_type3
│  │  action_time_based_on_time_last_window15_on_type4
│  │  action_time_based_on_time_last_window15_on_type5
│  │  action_time_based_on_time_last_window15_on_type6
│  │  action_time_based_on_time_last_window15_on_type7
│  │  action_time_based_on_time_last_window15_on_type8
│  │  action_time_based_on_time_last_window15_on_type9
│  │  action_time_based_on_time_last_window3
│  │  action_time_based_on_time_last_window6
│  │  action_time_based_on_time_last_window6_on_type1
│  │  action_time_based_on_time_last_window6_on_type5
│  │  action_time_based_on_time_last_window6_on_type6
│  │  action_time_based_on_time_last_window6_on_type7
│  │  action_time_based_on_time_last_window6_on_type8
│  │  action_time_based_on_time_last_window6_on_type9
│  │  action_time_based_on_time_last_window7
│  │  action_time_based_on_time_last_window7_on_type1
│  │  action_time_based_on_time_last_window7_on_type2
│  │  action_time_based_on_time_last_window7_on_type3
│  │  action_time_based_on_time_last_window7_on_type4
│  │  action_time_based_on_time_last_window7_on_type5
│  │  action_time_based_on_time_last_window7_on_type6
│  │  action_time_based_on_time_last_window7_on_type7
│  │  action_time_based_on_time_last_window7_on_type8
│  │  action_time_based_on_time_last_window7_on_type9
│  │  action_time_based_on_time_last_window8
│  │  action_time_based_on_time_last_window9
│  │  action_time_diff2_based_on_time_last_window3
│  │  action_time_diff2_based_on_time_last_window4
│  │  action_time_diff2_based_on_time_last_window5
│  │  action_time_diff2_based_on_time_last_window6
│  │  action_time_diff2_based_on_time_last_window7
│  │  action_time_diff2_based_on_time_last_window8
│  │  action_time_diff_234_56789_last_window6
│  │  action_time_diff_stat
│  │  action_time_diff_stat_last_window3
│  │  action_time_diff_stat_last_window4
│  │  action_time_diff_stat_last_window5
│  │  action_time_diff_stat_last_window6
│  │  action_time_diff_stat_last_window7
│  │  action_time_diff_stat_last_window8
│  │  action_time_diff_stat_last_window9
│  │  action_time_last_on_every_type
│  │  action_time_row_stat_based_on_time_last_window10
│  │  action_time_row_stat_based_on_time_last_window14
│  │  action_time_row_stat_based_on_time_last_window3
│  │  action_time_row_stat_based_on_time_last_window6
│  │  action_type
│  │  action_type_based_on_time
│  │  action_type_based_on_time_last_window3
│  │  action_type_based_on_time_last_window4
│  │  action_type_based_on_time_last_window5
│  │  action_type_based_on_time_last_window6
│  │  action_type_based_on_time_last_window7
│  │  action_type_num_based_on_time_last_window10
│  │  action_type_num_based_on_time_last_window11
│  │  action_type_num_based_on_time_last_window12
│  │  action_type_num_based_on_time_last_window13
│  │  action_type_num_based_on_time_last_window14
│  │  action_type_num_based_on_time_last_window15
│  │  action_type_num_based_on_time_last_window17
│  │  action_type_num_based_on_time_last_window2
│  │  action_type_num_based_on_time_last_window20
│  │  action_type_num_based_on_time_last_window25
│  │  action_type_num_based_on_time_last_window3
│  │  action_type_num_based_on_time_last_window30
│  │  action_type_num_based_on_time_last_window4
│  │  action_type_num_based_on_time_last_window5
│  │  action_type_num_based_on_time_last_window6
│  │  action_type_num_based_on_time_last_window7
│  │  action_type_num_based_on_time_last_window8
│  │  action_type_num_based_on_time_last_window9
│  │  action_type_rate_based_on_time_last_window10
│  │  action_type_rate_based_on_time_last_window11
│  │  action_type_rate_based_on_time_last_window12
│  │  action_type_rate_based_on_time_last_window13
│  │  action_type_rate_based_on_time_last_window14
│  │  action_type_rate_based_on_time_last_window15
│  │  action_type_rate_based_on_time_last_window2
│  │  action_type_rate_based_on_time_last_window20
│  │  action_type_rate_based_on_time_last_window3
│  │  action_type_rate_based_on_time_last_window4
│  │  action_type_rate_based_on_time_last_window5
│  │  action_type_rate_based_on_time_last_window6
│  │  action_type_rate_based_on_time_last_window7
│  │  action_type_rate_based_on_time_last_window8
│  │  action_type_rate_based_on_time_last_window9
│  │  action_type_row_stat_based_on_time_last_window6
│  │  action_type_row_stat_based_on_time_last_window9
│  │  act_ord_act_time_diff_last_window10
│  │  act_ord_act_time_diff_last_window11
│  │  act_ord_act_time_diff_last_window12
│  │  act_ord_act_time_diff_last_window13
│  │  act_ord_act_time_diff_last_window14
│  │  act_ord_act_time_diff_last_window15
│  │  act_ord_act_time_diff_last_window3
│  │  act_ord_act_time_diff_last_window6
│  │  act_ord_act_time_diff_last_window7
│  │  act_ord_act_time_diff_last_window8
│  │  act_ord_act_time_diff_last_window9
│  │  act_ord_before_type1_stat
│  │  act_ord_type1_act_time_diff_last_window14
│  │  act_ord_type1_act_time_diff_last_window2
│  │  act_ord_type1_act_time_diff_last_window3
│  │  act_ord_type1_act_time_diff_last_window4
│  │  act_ord_type1_act_time_diff_last_window6
│  │  act_ord_type1_act_time_diff_last_window9
│  │  order_history
│  │  order_history_last_w
│  │  order_last_order_ydm
│  │  order_type1_ydm
│  │  try
│  │  user_comment
│  │  user_profile
│  │
│  ├─bjw
│  │      all_features_test.csv
│  │      all_features_train.csv
│  │      test_fea.csv
│  │      train_fea.csv
│  │
│  └─stack
│          lgb_prob_test(offline_0.966529).csv
│          lgb_prob_train(offline_0.966529).csv
│
├─feat_imp
│      importance-20180112-0.951112(r1200).csv
│      importance-20180114-0.958592(r1622).csv
│
└─sub
    │  20180112-xgb-0.951112(r1200).csv
    │  20180114-xgb-0.958592(r1622).csv
    │  20180121-lgb-0.961160(r1389).csv
    │  20180123-lgb-0.963202(r1648).csv
    │  20180123-xgb-0.961940(r1742).csv
    │  20180127-lgb-0.965033(r2186).csv
    │  20180128-lgb-0.966097(r1864).csv
    │  20180131-lgb-0.966333(r2566).csv
    │  20180201-lgb-0.966529(r2245).csv
    │  20180202-lgb-0.966497(r1843).csv
    │  20180203-lgb-0.966497(r1843).csv
    │  20180207-lgb-0.970125(r1700).csv
    │  20180210-lgb-0.970330(r2344).csv
    │  shawn_lgb_local9641_online9646.csv
    │  结果提交样例.csv
    │
    ├─bjw
    │      result_addUserid_0125_1.csv
    │
    ├─blend
    │      20180127-0.5+0.5-0.96619.csv
    │      20180128-0.5bjw+0.3+0.2-0.97009.csv
    │      20180128-0.65bjw+0.35-0.96969.csv
    │      20180128-0.6bjw+0.4-0.96979.csv
    │      20180128-0.7bjw+0.3-0.96950.csv
    │      20180203-0.5bjw+0.3+0.1+0.1ym-0.97070.csv
    │      20180203-0.6bjw+0.2+0.1+0.1ym-0.97070.csv
    │
    └─ym
            lz96490.csv
```
--------------------------------------------------------------------------------
/src/model.py:
--------------------------------------------------------------------------------

# coding: utf-8

# In[2]:

from __future__ import division, print_function
import numpy as np
import pandas as pd
from sklearn import preprocessing
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
import time
import datetime
import warnings
warnings.filterwarnings('ignore')
import util


# In[3]:

def merge_feature(
    act_type_window,
    act_type_num_window,
    act_type_rate_window,
    act_type_row_stat_window,
    act_time_window,
    act_time_1type_window,
    act_ord_act_time_diff_window,
    action_sequence_time_diff_window,
    action_time_diff_234_56789_window,
    action_time_diff_stat_window
):
    util.log('Merge feature...')

    order_future_tr = pd.read_csv('../data/input/train/orderFuture_train.csv')
    order_future_te = pd.read_csv('../data/input/test/orderFuture_test.csv')

    user_profile = pd.read_csv('../data/output/feat/%s' % 'user_profile')
    train = pd.merge(order_future_tr, user_profile, on='userid', how='left')
    test = pd.merge(order_future_te, user_profile, on='userid', how='left')

    user_comment = pd.read_csv('../data/output/feat/%s' % 'user_comment')
    train = pd.merge(train, user_comment, on='userid', how='left')
    test = pd.merge(test, user_comment, on='userid', how='left')

    order_history = pd.read_csv('../data/output/feat/%s' % 'order_history')
    train = pd.merge(train, order_history, on='userid', how='left')
    test = pd.merge(test, order_history, on='userid', how='left')

    # order_history_last_w = pd.read_csv('../data/output/feat/%s' % 'order_history_last_w')
    # train = pd.merge(train, order_history_last_w, on='userid', how='left')
    # test = pd.merge(test, order_history_last_w, on='userid', how='left')

    action_type = pd.read_csv('../data/output/feat/%s' % 'action_type')
    train = pd.merge(train, action_type, on='userid', how='left')
    test = pd.merge(test, action_type, on='userid', how='left')

    action_type_based_on_time = pd.read_csv('../data/output/feat/%s' % 'action_type_based_on_time')
    train = pd.merge(train, action_type_based_on_time, on='userid', how='left')
    test = pd.merge(test, action_type_based_on_time, on='userid', how='left')

    util.log('act_type_window=' + str(act_type_window))
    window = act_type_window
    action_type_based_on_time_last_window = pd.read_csv('../data/output/feat/%s%d' % ('action_type_based_on_time_last_window', window))
    train = pd.merge(train, action_type_based_on_time_last_window, on='userid', how='left')
    test = pd.merge(test, action_type_based_on_time_last_window, on='userid', how='left')

    util.log('act_type_num_window=' + str(act_type_num_window))
    window = act_type_num_window
    action_type_num_based_on_time_last_window = pd.read_csv('../data/output/feat/%s%d' % ('action_type_num_based_on_time_last_window', window))
    train = pd.merge(train, action_type_num_based_on_time_last_window, on='userid', how='left')
    test = pd.merge(test, action_type_num_based_on_time_last_window, on='userid', how='left')

    util.log('act_type_rate_window=' + str(act_type_rate_window))
    window = act_type_rate_window
    action_type_rate_based_on_time_last_window = pd.read_csv('../data/output/feat/%s%d' % ('action_type_rate_based_on_time_last_window', window))
    train = pd.merge(train, action_type_rate_based_on_time_last_window, on='userid', how='left')
    test = pd.merge(test, action_type_rate_based_on_time_last_window, on='userid', how='left')

    util.log('act_type_row_stat_window=' + str(act_type_row_stat_window))
    window = act_type_row_stat_window
    action_type_row_stat_based_on_time_last_window_feat = pd.read_csv('../data/output/feat/%s%d' % ('action_type_row_stat_based_on_time_last_window', window))
    train = pd.merge(train, action_type_row_stat_based_on_time_last_window_feat, on='userid', how='left')
    test = pd.merge(test, action_type_row_stat_based_on_time_last_window_feat, on='userid', how='left')

    # util.log('action_num_window=' + str(action_num_window))
    # window = action_num_window
    # action_num_based_on_time_last_window = pd.read_csv('../data/output/feat/%s%d' % ('action_num_based_on_time_last_window', window))
    # train = pd.merge(train, action_num_based_on_time_last_window, on='userid', how='left')
    # test = pd.merge(test, action_num_based_on_time_last_window, on='userid', how='left')

    # util.log('action_type_num_window=' + str(action_type_num_window))
    # window = action_type_num_window
    # action_type_num_based_on_time_last_window = pd.read_csv('../data/output/feat/%s%d' % ('action_type_num_based_on_time_last_window', window))
    # train = pd.merge(train, action_type_num_based_on_time_last_window, on='userid', how='left')
    # test = pd.merge(test, action_type_num_based_on_time_last_window, on='userid', how='left')

    action_time_based_on_time = pd.read_csv('../data/output/feat/%s' % 'action_time_based_on_time')
    train = pd.merge(train, action_time_based_on_time, on='userid', how='left')
    test = pd.merge(test, action_time_based_on_time, on='userid', how='left')

    util.log('act_time_window=' + str(act_time_window))
    window = act_time_window
    action_time_based_on_time_last_window = pd.read_csv('../data/output/feat/%s%d' % ('action_time_based_on_time_last_window', window))
    train = pd.merge(train, action_time_based_on_time_last_window, on='userid', how='left')
    test = pd.merge(test, action_time_based_on_time_last_window, on='userid', how='left')

    # util.log('act_time_row_stat_window=' + str(act_time_row_stat_window))
    # window = act_time_row_stat_window
    # action_time_row_stat_based_on_time_last_window = pd.read_csv('../data/output/feat/%s%d' % ('action_time_row_stat_based_on_time_last_window', window))
    # train = pd.merge(train, action_time_row_stat_based_on_time_last_window, on='userid', how='left')
    # test = pd.merge(test, action_time_row_stat_based_on_time_last_window, on='userid', how='left')

    # util.log('action_time_diff2_window=' + str(action_time_diff2_window))
    # window = action_time_diff2_window
    # action_time_diff2_based_on_time_last_window = pd.read_csv('../data/output/feat/%s%d' % ('action_time_diff2_based_on_time_last_window', window))
    # train = pd.merge(train, action_time_diff2_based_on_time_last_window, on='userid', how='left')
    # test = pd.merge(test, action_time_diff2_based_on_time_last_window, on='userid', how='left')

    util.log('act_time_1type_window=%d' % act_time_1type_window)
    window = act_time_1type_window
    for ttype in [1, 5, 6, 7, 8, 9]:
        action_time_based_on_time_last_window_on_type = pd.read_csv('../data/output/feat/%s%d%s%d' % ('action_time_based_on_time_last_window', window, '_on_type', ttype))
        train = pd.merge(train, action_time_based_on_time_last_window_on_type, on='userid', how='left')
        test = pd.merge(test, action_time_based_on_time_last_window_on_type, on='userid', how='left')

    # util.log('action_time_2order_window=' + str(action_time_2order_window))
    # window = action_time_2order_window
    # action_time_2order_based_on_time_last_window = pd.read_csv('../data/output/feat/%s%d' % ('action_time_2order_based_on_time_last_window', window))
    # train = pd.merge(train, action_time_2order_based_on_time_last_window, on='userid', how='left')
    # test = pd.merge(test, action_time_2order_based_on_time_last_window, on='userid', how='left')

    # util.log('act_real_time_1type_window=%d' % act_real_time_1type_window)
    # window = act_real_time_1type_window
    # for ttype in [1, 5, 6, 7, 8, 9]:
    #     action_real_time_based_on_time_last_window_on_type = pd.read_csv('../data/output/feat/%s%d%s%d' % ('action_real_time_based_on_time_last_window', window, '_on_type', ttype))
    #     train = pd.merge(train, action_real_time_based_on_time_last_window_on_type, on='userid', how='left')
    #     test = pd.merge(test, action_real_time_based_on_time_last_window_on_type, on='userid', how='left')

    # action_order_time_diff = pd.read_csv('../data/output/feat/%s' % 'action_order_time_diff')
    # train = pd.merge(train, action_order_time_diff, on='userid', how='left')
    # test = pd.merge(test, action_order_time_diff, on='userid', how='left')

    # order_last_order_ydm = pd.read_csv('../data/output/feat/%s' % 'order_last_order_ydm')
    # train = pd.merge(train, order_last_order_ydm, on='userid', how='left')
    # test = pd.merge(test, order_last_order_ydm, on='userid', how='left')

    order_type1_ydm = pd.read_csv('../data/output/feat/%s' % 'order_type1_ydm')
    train = pd.merge(train, order_type1_ydm, on='userid', how='left')
    test = pd.merge(test, order_type1_ydm, on='userid', how='left')

    util.log('act_ord_act_time_diff_window=' + str(act_ord_act_time_diff_window))
    window = act_ord_act_time_diff_window
    act_ord_act_time_diff_last_window = pd.read_csv('../data/output/feat/%s%d' % ('act_ord_act_time_diff_last_window', window))
    train = pd.merge(train, act_ord_act_time_diff_last_window, on='userid', how='left')
    test = pd.merge(test, act_ord_act_time_diff_last_window, on='userid', how='left')

    # util.log('act_ord_type1_act_time_diff_window=' + str(act_ord_type1_act_time_diff_window))
    # window = act_ord_type1_act_time_diff_window
    # act_ord_type1_act_time_diff_last_window = pd.read_csv('../data/output/feat/%s%d' % ('act_ord_type1_act_time_diff_last_window', window))
    # train = pd.merge(train, act_ord_type1_act_time_diff_last_window, on='userid', how='left')
    # test = pd.merge(test, act_ord_type1_act_time_diff_last_window, on='userid', how='left')

    util.log('action_sequence_time_diff_window=' + str(action_sequence_time_diff_window))
    window = action_sequence_time_diff_window
    # Local renamed so the window argument of the same name is not shadowed
    action_sequence_time_diff = pd.read_csv('../data/output/feat/%s%d' % ('action_sequence_time_diff_window', window))
    train = pd.merge(train, action_sequence_time_diff, on='userid', how='left')
    test = pd.merge(test, action_sequence_time_diff, on='userid', how='left')

    # action_sequence_time_stat_last123 = pd.read_csv('../data/output/feat/%s' % 'action_sequence_time_stat_last123')
    # train = pd.merge(train, action_sequence_time_stat_last123, on='userid', how='left')
    # test = pd.merge(test, action_sequence_time_stat_last123, on='userid', how='left')

    util.log('action_time_diff_234_56789_window=' + str(action_time_diff_234_56789_window))
    window = action_time_diff_234_56789_window
    action_time_diff_234_56789_last_window = pd.read_csv('../data/output/feat/%s%d' % ('action_time_diff_234_56789_last_window', window))
    train = pd.merge(train, action_time_diff_234_56789_last_window, on='userid', how='left')
    test = pd.merge(test, action_time_diff_234_56789_last_window, on='userid', how='left')

    # action_stat_last_every_type = pd.read_csv('../data/output/feat/%s' % 'action_stat_last_every_type')
    # train = pd.merge(train, action_stat_last_every_type, on='userid', how='left')
    # test = pd.merge(test, action_stat_last_every_type, on='userid', how='left')

    # act_ord_before_type1_stat = pd.read_csv('../data/output/feat/%s' % 'act_ord_before_type1_stat')
    # train = pd.merge(train, act_ord_before_type1_stat, on='userid', how='left')
    # test = pd.merge(test, act_ord_before_type1_stat, on='userid', how='left')

    action_time_diff_stat = pd.read_csv('../data/output/feat/%s' % 'action_time_diff_stat')
    train = pd.merge(train, action_time_diff_stat, on='userid', how='left')
    test = pd.merge(test, action_time_diff_stat, on='userid', how='left')

    util.log('action_time_diff_stat_window=' + str(action_time_diff_stat_window))
    window = action_time_diff_stat_window
    action_time_diff_stat_last_window = pd.read_csv('../data/output/feat/%s%d' % ('action_time_diff_stat_last_window', window))
    train = pd.merge(train, action_time_diff_stat_last_window, on='userid', how='left')
    test = pd.merge(test, action_time_diff_stat_last_window, on='userid', how='left')

    # action_time_last_on_every_type = pd.read_csv('../data/output/feat/%s' % 'action_time_last_on_every_type')
    # train = pd.merge(train, action_time_last_on_every_type, on='userid', how='left')
    # test = pd.merge(test, action_time_last_on_every_type, on='userid', how='left')

    # bjw: users who appear in the comment data but not in the order data are flagged as 1
    bjw_train = pd.read_csv('../data/output/feat/bjw/train_fea.csv')
    bjw_test = pd.read_csv('../data/output/feat/bjw/test_fea.csv')
    train = pd.merge(train, bjw_train, on='userid', how='left')
    test = pd.merge(test, bjw_test, on='userid', how='left')

    # Features open-sourced by others; I re-implemented part of them based on my own understanding
    tryy = pd.read_csv('../data/output/feat/%s' % 'try')
    train = pd.merge(train, tryy, on='userid', how='left')
    test = pd.merge(test, tryy, on='userid', how='left')

    # bjw's features
    bjw_train = pd.read_csv('../data/output/feat/bjw/all_features_train.csv').drop(['Unnamed: 0', 'orderType'], axis=1)
    bjw_train.columns = ['userid' if i == 0 else i for i in range(len(bjw_train.columns))]
    bjw_test = pd.read_csv('../data/output/feat/bjw/all_features_test.csv').drop(['Unnamed: 0'], axis=1)
    bjw_test.columns = ['userid' if i == 0 else i for i in range(len(bjw_test.columns))]
    train = pd.merge(train, bjw_train, on='userid', how='left')
    test = pd.merge(test, bjw_test, on='userid', how='left')

    #################################################################################################################

    # Only loaded to build cross features; dropped again after use
    window = 1
    for ttype in [1, 2, 3, 4, 5, 6, 7, 8, 9]:
        action_real_time_based_on_time_last_window_on_type = pd.read_csv('../data/output/feat/%s%d%s%d' % ('action_real_time_based_on_time_last_window', window, '_on_type', ttype))
        train = pd.merge(train, action_real_time_based_on_time_last_window_on_type, on='userid', how='left')
        test = pd.merge(test, action_real_time_based_on_time_last_window_on_type, on='userid', how='left')

    train, test = cross_feature(train, test)

    train, test = drop_duplicate_column(train, test)

    train_feature = train.drop(['orderType'], axis=1)
    train_label = train.orderType.values
    test_feature = test
    test_index = test.userid.values

    return train_feature, train_label, test_feature, test_index


# In[4]:

def cross_feature(train, test):
    util.log('Cross feature...')

    # Time difference between the latest action and the latest order
    train['act_last_time-ord_last_time'] = train['act_last_time'] - train['ord_last_time']
    train['act_last_time-ord_type0_time_max'] = train['act_last_time'] - train['ord_type0_time_max']
    train['act_last_time-ord_type1_time_max'] = train['act_last_time'] - train['ord_type1_time_max']
    test['act_last_time-ord_last_time'] = test['act_last_time'] - test['ord_last_time']
    test['act_last_time-ord_type0_time_max'] = test['act_last_time'] - test['ord_type0_time_max']
    test['act_last_time-ord_type1_time_max'] = test['act_last_time'] - test['ord_type1_time_max']

    # Time difference between the earliest action and the earliest order
    train['act_first_time-ord_first_time'] = train['act_first_time'] - train['ord_first_time']
    train['act_first_time-ord_type0_time_min'] = train['act_first_time'] - train['ord_type0_time_min']
    train['act_first_time-ord_type1_time_min'] = train['act_first_time'] - train['ord_type1_time_min']
    test['act_first_time-ord_first_time'] = test['act_first_time'] - test['ord_first_time']
    test['act_first_time-ord_type0_time_min'] = test['act_first_time'] - test['ord_type0_time_min']
    test['act_first_time-ord_type1_time_min'] = test['act_first_time'] - test['ord_type1_time_min']

    # Time difference between the latest action and the latest action of each type,
    # plus between the earliest action and the earliest action of each type
    for ttype in [1, 2, 3, 4, 5, 6, 7, 8, 9]:
        train['act_last_time-act_time(rank_1)(window_1)(type_%d)' % ttype] = train['act_last_time'] - train['act_time(rank_1)(window_1)(type_%d)' % ttype]
        train['act_first_time-act_time(rank_1)(window_1)(type_%d)' % ttype] = train['act_first_time'] - train['act_time(rank_1)(window_1)(type_%d)' % ttype]
        test['act_last_time-act_time(rank_1)(window_1)(type_%d)' % ttype] = test['act_last_time'] - test['act_time(rank_1)(window_1)(type_%d)' % ttype]
        test['act_first_time-act_time(rank_1)(window_1)(type_%d)' % ttype] = test['act_first_time'] - test['act_time(rank_1)(window_1)(type_%d)' % ttype]
        train = train.drop(['act_time(rank_1)(window_1)(type_%d)' % ttype], axis=1)
        test = test.drop(['act_time(rank_1)(window_1)(type_%d)' % ttype], axis=1)

    # Whether the user has ever ordered the boutique service * time of the latest action
    tmp = train['ord_num(type_1)'].copy()
    tmp[tmp > 1] = 1
    tmp = pd.get_dummies(tmp.fillna(-1))
    tmp.columns = ['has_ord_serv_nan', 'has_ord_serv_no', 'has_ord_serv_yes']
    train = pd.concat([train, tmp.mul(train['act_last_time'], axis=0)], axis=1)
    tmp = test['ord_num(type_1)'].copy()
    tmp[tmp > 1] = 1
    tmp = pd.get_dummies(tmp.fillna(-1))
    tmp.columns = ['has_ord_serv_nan', 'has_ord_serv_no', 'has_ord_serv_yes']
    test = pd.concat([test, tmp.mul(test['act_last_time'], axis=0)], axis=1)

    # Whether the user has ever ordered the boutique service * number of actions of each type
    tmp = train['ord_num(type_1)'].copy()
    tmp[tmp > 1] = 1
    tmp = pd.get_dummies(tmp.fillna(-1))
    tmp.columns = ['has_ord_serv_nan', 'has_ord_serv_no', 'has_ord_serv_yes']
    for ttype in [1, 2, 3, 4, 5, 6, 7, 8, 9]:
        train = train.join(tmp.mul(train['act_num(type_%d)' % ttype], axis=0), rsuffix='*act_num(type_%d)' % ttype)
    tmp = test['ord_num(type_1)'].copy()
    tmp[tmp > 1] = 1
    tmp = pd.get_dummies(tmp.fillna(-1))
    tmp.columns = ['has_ord_serv_nan', 'has_ord_serv_no', 'has_ord_serv_yes']
    for ttype in [1, 2, 3, 4, 5, 6, 7, 8, 9]:
        test = test.join(tmp.mul(test['act_num(type_%d)' % ttype], axis=0), rsuffix='*act_num(type_%d)' % ttype)

    # # Time difference between the latest order and the latest action of each type,
    # # plus between the earliest order and the earliest action of each type (all/0/1)
    # for ttype in [1, 2, 3, 4, 5, 6, 7, 8, 9]:
    #     train['ord_last_time-act_time(rank_1)(window_1)(type_%d)' % ttype] = train['ord_last_time'] - train['act_time(rank_1)(window_1)(type_%d)' % ttype]
    #     train['ord_type0_time_max-act_time(rank_1)(window_1)(type_%d)' % ttype] = train['ord_type0_time_max'] - train['act_time(rank_1)(window_1)(type_%d)' % ttype]
    #     train['ord_type1_time_max-act_time(rank_1)(window_1)(type_%d)' % ttype] = train['ord_type1_time_max'] - train['act_time(rank_1)(window_1)(type_%d)' % ttype]
    #     train['ord_first_time-act_time(rank_1)(window_1)(type_%d)' % ttype] = train['ord_first_time'] - train['act_time(rank_1)(window_1)(type_%d)' % ttype]
    #     train['ord_type0_time_min-act_time(rank_1)(window_1)(type_%d)' % ttype] = train['ord_type0_time_min'] - train['act_time(rank_1)(window_1)(type_%d)' % ttype]
    #     train['ord_type1_time_min-act_time(rank_1)(window_1)(type_%d)' % ttype] = train['ord_type1_time_min'] - train['act_time(rank_1)(window_1)(type_%d)' % ttype]
    #     test['ord_last_time-act_time(rank_1)(window_1)(type_%d)' % ttype] = test['ord_last_time'] - test['act_time(rank_1)(window_1)(type_%d)' % ttype]
    #     test['ord_type0_time_max-act_time(rank_1)(window_1)(type_%d)' % ttype] = test['ord_type0_time_max'] - test['act_time(rank_1)(window_1)(type_%d)' % ttype]
    #     test['ord_type1_time_max-act_time(rank_1)(window_1)(type_%d)' % ttype] = test['ord_type1_time_max'] - test['act_time(rank_1)(window_1)(type_%d)' % ttype]
    #     test['ord_first_time-act_time(rank_1)(window_1)(type_%d)' % ttype] = test['ord_first_time'] - test['act_time(rank_1)(window_1)(type_%d)' % ttype]
    #     test['ord_type0_time_min-act_time(rank_1)(window_1)(type_%d)' % ttype] = test['ord_type0_time_min'] - test['act_time(rank_1)(window_1)(type_%d)' % ttype]
    #     test['ord_type1_time_min-act_time(rank_1)(window_1)(type_%d)' % ttype] = test['ord_type1_time_min'] - test['act_time(rank_1)(window_1)(type_%d)' % ttype]

    return train, test


# In[5]:

def drop_duplicate_column(train, test):
    util.log('Drop duplicate column...')

    train = train.drop(['act_type(rank_1)(window6)'], axis=1)  # window9
    test = test.drop(['act_type(rank_1)(window6)'], axis=1)

    return train, test


# In[6]:

def lgb_cv(train_feature, train_label, params, folds, rounds):
    start = time.time()
    print(train_feature.columns)
    dtrain = lgb.Dataset(train_feature, label=train_label)
    num_round = rounds
    print('run cv: round: ' + str(rounds))
    res = lgb.cv(params, dtrain, num_round, nfold=folds, verbose_eval=20, early_stopping_rounds=100)
    elapsed = time.time() - start
    print('Time used:', elapsed, 's')
    return len(res['auc-mean']), res['auc-mean'][len(res['auc-mean']) - 1]


def lgb_predict(train_feature, train_label, test_feature, rounds, params):
    dtrain = lgb.Dataset(train_feature, label=train_label)
    valid_sets = [dtrain]
    num_round = rounds
    model = lgb.train(params, dtrain, num_round, valid_sets, verbose_eval=50)
    predict = model.predict(test_feature)
    return model, predict


def store_result(test_index, pred, name):
    result = pd.DataFrame({'userid': test_index, 'orderType': pred})
    result.to_csv('../data/output/sub/' + name + '.csv', index=False, columns=['userid', 'orderType'])
    return result


# In[7]:

train_feature, train_label, test_feature, test_index = merge_feature(6, 6, 3, 6, 6, 6, 6, 6, 6, 3)
print(train_feature.shape, train_label.shape, test_feature.shape)


# In[8]:

config = {
    'rounds': 10000,
    'folds': 5
}

params_lgb = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'min_sum_hessian_in_leaf': 0.1,
    'learning_rate': 0.01,
    'verbosity': 2,
    'tree_learner': 'feature',
    'num_leaves': 128,
    'feature_fraction': 0.75,
    'bagging_fraction': 0.9,
    'bagging_freq': 1,
    'num_threads': 16,
    'seed': 7
}


# In[10]:

iterations, best_score = lgb_cv(train_feature, train_label, params_lgb, config['folds'], config['rounds'])


# In[11]:

# Average the predictions of four models trained with different seeds
preds = 0
for s in range(7, 11):
    params_lgb['seed'] = s
    model, pred = lgb_predict(train_feature, train_label, test_feature, iterations, params_lgb)
    preds += pred
preds /= 4


# In[12]:

res = store_result(test_index, preds, '20180210-lgb-%f(r%d)' % (best_score, iterations))


# In[13]:

# Print features sorted by gain importance
print("\n".join(("%s: %.2f" % x) for x in sorted(zip(train_feature.columns, model.feature_importance("gain")), key=lambda x: x[1], reverse=True)))


# In[ ]:




# In[ ]:

######################################### blending #########################################


# In[ ]:

test1 = pd.read_csv('../data/output/sub/bjw/result_addUserid_0125_1.csv')
test2 = pd.read_csv('../data/output/sub/20180203-lgb-0.966497(r1843).csv')
test3 = pd.read_csv('../data/output/sub/shawn_lgb_local9641_online9646.csv')
test4 = pd.read_csv('../data/output/sub/ym/lz96490.csv')
testa = pd.merge(test1, test2, on='userid', how='left')
testb = pd.merge(test3, test4, on='userid', how='left')
test = pd.merge(testa, testb, on='userid', how='left')


# In[ ]:

# Weighted blend; the pandas merge suffixes map back to the four files:
# orderType_x_x = test1, orderType_y_x = test2, orderType_x_y = test3, orderType_y_y = test4
test['orderType'] = 0.5 * test['orderType_x_x'] + 0.3 * test['orderType_y_x'] + 0.1 * test['orderType_x_y'] + 0.1 * test['orderType_y_y']


# In[ ]:

test[['userid', 'orderType']].to_csv('../data/output/sub/blend/20180203-0.5bjw+0.3+0.1+0.1ym.csv', index=False)

--------------------------------------------------------------------------------
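A note on the blending cells at the end of model.py: the `orderType_x_x`, `orderType_y_x`, `orderType_x_y`, and `orderType_y_y` columns are the suffixes pandas generates for the chained merges, and they map back to `test1` through `test4` in that order, which is what the 0.5/0.3/0.1/0.1 weights rely on. Below is a minimal, illustrative sketch of the same weighted blend with explicit column names; the `blend` helper is mine, not part of this repo:

```
import pandas as pd
from functools import reduce

def blend(subs):
    # subs: list of (csv_path, weight) pairs; each CSV has userid and orderType columns
    parts = []
    for i, (path, w) in enumerate(subs):
        sub = pd.read_csv(path)[['userid', 'orderType']]
        sub['p%d' % i] = w * sub.pop('orderType')  # pre-weight each submission
        parts.append(sub)
    merged = reduce(lambda a, b: pd.merge(a, b, on='userid', how='left'), parts)
    merged['orderType'] = merged.drop('userid', axis=1).sum(axis=1)
    return merged[['userid', 'orderType']]

result = blend([
    ('../data/output/sub/bjw/result_addUserid_0125_1.csv', 0.5),
    ('../data/output/sub/20180203-lgb-0.966497(r1843).csv', 0.3),
    ('../data/output/sub/shawn_lgb_local9641_online9646.csv', 0.1),
    ('../data/output/sub/ym/lz96490.csv', 0.1),
])
```

Assuming every file covers the same set of userids, this should reproduce the `20180203-0.5bjw+0.3+0.1+0.1ym.csv` blend above.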
/src/feat.py:
--------------------------------------------------------------------------------

# coding: utf-8

# In[ ]:

from __future__ import division
import numpy as np
import pandas as pd
from sklearn import preprocessing
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
import time
import datetime
import sys
import math
import warnings
warnings.filterwarnings('ignore')
import util


# In[ ]:

def get_user_profile_feature(df):
    df = df.copy()

    # Label-encode the categorical profile columns
    mydf = df[['userid']]
    le = preprocessing.LabelEncoder()
    mydf['gender'] = le.fit_transform(df['gender'])

    mydf['province'] = le.fit_transform(df['province'])

    mydf['age'] = le.fit_transform(df['age'])

    return mydf


# In[ ]:

def get_user_comment_feature(df):
    df = df.copy()

    mydf = df[['userid']].drop_duplicates().reset_index(drop=True)

    # Sum and count of each user's comment ratings
    com_rating = df.groupby('userid')['rating'].agg(['sum', 'count']).reset_index()
    com_rating.columns = [i if i == 'userid' else 'com_rating_' + i for i in com_rating.columns]

    mydf = pd.merge(mydf, com_rating, on='userid', how='left')

    return mydf


# In[ ]:

def get_order_history_feature(df):
    df = df.copy()

    mydf = df[['userid']].drop_duplicates().reset_index(drop=True)

    # Counts and rates of type-0 and type-1 orders, plus the total order count
    ord_hist_ord = df.groupby('userid')['orderType'].agg(['sum', 'count']).reset_index()
    ord_hist_ord.columns = ['userid', 'ord_num(type_1)', 'ord_num']
    ord_hist_ord['ord_num(type_0)'] = ord_hist_ord['ord_num'] - ord_hist_ord['ord_num(type_1)']
    ord_hist_ord['ord_rate(type_1)'] = ord_hist_ord['ord_num(type_1)'] / ord_hist_ord['ord_num']
    ord_hist_ord['ord_rate(type_0)'] = ord_hist_ord['ord_num(type_0)'] / ord_hist_ord['ord_num']

    # Counts of city, country, continent
    addr_count = df.groupby('userid')[['city', 'country', 'continent']].count().reset_index()
    addr_count.columns = ['userid', 'city_num', 'country_num', 'continent_num']

    # Counts of city, country, continent for type-1 orders
    addr_count_pos = df[df['orderType'] == 1].groupby('userid')[['city', 'country', 'continent']].count().reset_index()
    addr_count_pos.columns = ['userid', 'city_num(type_1)', 'country_num(type_1)', 'continent_num(type_1)']

    # Number of orders per country
    lb = preprocessing.LabelBinarizer()
    tmp = lb.fit_transform(df['country'])
    tmp_col = ['country_' + str(i) for i in range(tmp.shape[1])]
    tmp = pd.DataFrame(tmp, columns=tmp_col)
    tmp['userid'] = df['userid'].values
    country = tmp.groupby('userid')[tmp_col].agg(['sum']).reset_index()

    # Number of orders per continent
    lb = preprocessing.LabelBinarizer()
    tmp = lb.fit_transform(df['continent'])
    tmp_col = ['continent_' + str(i) for i in range(tmp.shape[1])]
    tmp = pd.DataFrame(tmp, columns=tmp_col)
    tmp['userid'] = df['userid'].values
    continent = tmp.groupby('userid')[tmp_col].agg(['sum']).reset_index()

    # The last order
    last_ord = df.groupby('userid').apply(lambda x: x.sort_values('orderTime', ascending=False).head(1)).reset_index(drop=True)[['userid', 'orderid', 'orderTime', 'orderType']]
    last_ord.columns = ['userid', 'ord_last_id', 'ord_last_time', 'ord_last_type']

    # The first order
    first_ord = df.groupby('userid').apply(lambda x: x.sort_values('orderTime', ascending=True).head(1)).reset_index(drop=True)[['userid', 'orderid', 'orderTime', 'orderType']]
    first_ord.columns = ['userid', 'ord_first_id', 'ord_first_time', 'ord_first_type']

    # Statistics of order times for type 0 and type 1 separately
    for t in [0, 1]:
        ord_time_stat = df[df['orderType'] == t].groupby('userid')['orderTime'].agg([min, max, np.ptp, np.mean, np.median, np.std]).reset_index()
        ord_time_stat.columns = [i if i == 'userid' else 'ord_type%d_time_%s' % (t, i) for i in ord_time_stat.columns]
        mydf = pd.merge(mydf, ord_time_stat, on='userid', how='left')

    mydf = pd.merge(mydf, ord_hist_ord, on='userid', how='left')
    mydf = pd.merge(mydf, addr_count, on='userid', how='left')
    mydf = pd.merge(mydf, addr_count_pos, on='userid', how='left')
    mydf = pd.merge(mydf, country, on='userid', how='left')
    mydf = pd.merge(mydf, continent, on='userid', how='left')
    mydf = pd.merge(mydf, last_ord, on='userid', how='left')
    mydf = pd.merge(mydf, first_ord, on='userid', how='left')

    return mydf


# In[ ]:

def get_order_history_last_w_feature(df):
    df = df.copy()

    mydf = df[['userid']].drop_duplicates().reset_index(drop=True)

    # Statistics over the last w orders
    for w in [2, 3, 4]:
        util.log(w)

        last_order = df.groupby('userid').apply(lambda x: x.sort_values('orderTime', ascending=False).head(w)).reset_index(drop=True)[['userid', 'orderTime', 'orderType']]
        last_order.columns = ['userid', 'ord_last_time', 'ord_last_type']

        ord_last_time_stat = last_order.groupby('userid')['ord_last_time'].agg([min, max, np.ptp, np.mean, np.median, np.std]).reset_index()
        ord_last_time_stat.columns = [i if i == 'userid' else 'ord_last%d_time_%s' % (w, i) for i in ord_last_time_stat.columns]

        ord_last_type_stat = last_order.groupby('userid')['ord_last_type'].agg(['count', sum]).reset_index()
        ord_last_type_stat.columns = [i if i == 'userid' else 'ord_last%d_type_%s' % (w, i) for i in ord_last_type_stat.columns]

        mydf = pd.merge(mydf, ord_last_time_stat, on='userid', how='left')
        mydf = pd.merge(mydf, ord_last_type_stat, on='userid', how='left')

    return mydf


# In[ ]:

def get_action_type_feature(df):
    df = df.copy()

    mydf = df[['userid']].drop_duplicates().reset_index(drop=True)

    # Number of actions and of distinct action types per user
    act_num = df.groupby(['userid', 'actionType']).size().reset_index().groupby('userid')[0].agg([sum, len]).reset_index()
    act_num.columns = ['userid', 'act_num', 'act_type_num']

    # Count of each action type
    act_type_num = df.groupby(['userid', 'actionType']).size().unstack().reset_index()
    act_type_num.columns = [i if i == 'userid' else 'act_num(type_' + str(i) + ')' for i in act_type_num.columns]

    mydf = pd.merge(mydf, act_num, on='userid', how='left')
    mydf = pd.merge(mydf, act_type_num, on='userid', how='left')

    return mydf


# In[ ]:

def get_action_type_based_on_time_feature(df):
    df = df.copy()

    mydf = df[['userid']].drop_duplicates().reset_index(drop=True)

    # Type of the most recent action
    act_last_type = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(1)).reset_index(drop=True)[['userid', 'actionType']]
    act_last_type.columns = ['userid', 'act_last_type']

    # Type of the earliest action
    act_first_type = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=True).head(1)).reset_index(drop=True)[['userid', 'actionType']]
    act_first_type.columns = ['userid', 'act_first_type']

    mydf = pd.merge(mydf, act_last_type, on='userid', how='left')
    mydf = pd.merge(mydf, act_first_type, on='userid', how='left')

    return mydf


# In[ ]:

def get_action_type_based_on_time_last_window_feature(df, window):
    df = df.copy()

    mydf = df[['userid']].drop_duplicates().reset_index(drop=True)

    tmp = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)
    tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method='first', ascending=False).astype(int)

    # Differences of consecutive action types within the window
    act_type = tmp.pivot(index='userid', columns='act_time_rank', values='actionType')
    act_type = act_type[act_type.columns[::-1]]
    act_type_diff = act_type.diff(1, axis=1)
    act_type_diff = act_type_diff.iloc[:, 1:].reset_index()
    act_type_diff.columns = [i if i == 'userid' else 'act_type_diff(' + str(i) + '-' + str(i + 1) + ')(window_' + str(window) + ')' for i in act_type_diff.columns]

    mydf = pd.merge(mydf, act_type_diff, on='userid', how='left')

    return mydf


# In[ ]:

def get_action_type_num_based_on_time_last_window_feature(df, window):
    df = df.copy()

    mydf = df[['userid']].drop_duplicates().reset_index(drop=True)

    tmp = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)
    tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method='first', ascending=False).astype(int)

    # Count of each action type within the window
    act_num_in_window = tmp.groupby(['userid', 'actionType']).size().unstack().reset_index()
    act_num_in_window.columns = [i if i == 'userid' else 'act_num(type_' + str(i) + ')(window_' + str(window) + ')' for i in act_num_in_window.columns]

    mydf = pd.merge(mydf, act_num_in_window, on='userid', how='left')

    return mydf


# In[ ]:

def get_action_type_rate_based_on_time_last_window_feature(df, window):
    df = df.copy()

    mydf = df[['userid']].drop_duplicates().reset_index(drop=True)

    tmp = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)
    tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method='first', ascending=False).astype(int)

    # Column-wise rate of each action type
    act_column_rate_in_window = tmp.groupby(['userid', 'actionType']).size().unstack().apply(lambda x: x / np.sum(x)).reset_index()
    act_column_rate_in_window.columns = [i if i == 'userid' else 'act_column_rate(type_' + str(i) + ')(window_' + str(window) + ')' for i in act_column_rate_in_window.columns]

    # Row-wise rate of each action type
    act_row_rate_in_window = tmp.groupby(['userid', 'actionType']).size().unstack().apply((lambda x: x / np.sum(x)), axis=1).reset_index()
    act_row_rate_in_window.columns = [i if i == 'userid' else 'act_row_rate(type_' + str(i) + ')(window_' + str(window) + ')' for i in act_row_rate_in_window.columns]

    mydf = pd.merge(mydf, act_column_rate_in_window, on='userid', how='left')
    mydf = pd.merge(mydf, act_row_rate_in_window, on='userid', how='left')

    return mydf


# In[ ]:

def get_action_type_row_stat_based_on_time_last_window_feature(df, window):
    df = df.copy()

    mydf = df[['userid']].drop_duplicates().reset_index(drop=True)

    tmp = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)
    tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method='first', ascending=False).astype(int)

    # Most recent type values + row-wise statistics over them
    act_type = tmp.pivot(index='userid', columns='act_time_rank', values='actionType')
    act_type.columns = ['act_type(rank_' + str(i) + ')(window' + str(window) + ')' for i in act_type.columns]
    base_cols = act_type.columns
    for i in ['min', 'max', 'mean', 'median', 'std', 'sum', np.ptp]:
        name = i if isinstance(i, str) else i.__name__
        # Compute each statistic over the rank columns only, so stat columns
        # added earlier in the loop do not contaminate later statistics
        act_type['act_row_type_' + name + '(window_' + str(window) + ')'] = act_type[base_cols].apply(i, axis=1)
    act_type = act_type.reset_index()

    mydf = pd.merge(mydf, act_type, on='userid', how='left')

    return mydf


# In[ ]:

def get_action_num_based_on_time_last_window_feature(df, window):
    df = df.copy()

    mydf = df[['userid']].drop_duplicates().reset_index(drop=True)

    tmp = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)

    # Number of actions within the window
    act_num = tmp.groupby('userid').size().reset_index()
    act_num.columns = ['userid', 'act_num(window_%d)' % window]

    mydf = pd.merge(mydf, act_num, on='userid', how='left')

    return mydf


# In[ ]:

# NOTE: this redefines the function of the same name above; here it counts the
# number of distinct action types in the window rather than per-type counts
def get_action_type_num_based_on_time_last_window_feature(df, window):
    df = df.copy()

    mydf = df[['userid']].drop_duplicates().reset_index(drop=True)

    tmp = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)

    # Number of distinct action types within the window
    act_type_num = tmp.groupby(['userid', 'actionType']).size().reset_index().groupby('userid')[0].agg([len]).reset_index()
    act_type_num.columns = ['userid', 'act_type_num(window_%d)' % window]

    mydf = pd.merge(mydf, act_type_num, on='userid', how='left')

    return mydf


# In[ ]:

def get_action_time_based_on_time_feature(df):
    df = df.copy()

    mydf = df[['userid']].drop_duplicates().reset_index(drop=True)

    # Time of the most recent action
    act_last_time = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(1)).reset_index(drop=True)[['userid', 'actionTime']]
    act_last_time.columns = ['userid', 'act_last_time']

    # Time of the earliest action
    act_first_time = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=True).head(1)).reset_index(drop=True)[['userid', 'actionTime']]
    act_first_time.columns = ['userid', 'act_first_time']

    mydf = pd.merge(mydf, act_last_time, on='userid', how='left')
    mydf = pd.merge(mydf, act_first_time, on='userid', how='left')

    mydf['act_time_last-first'] = mydf['act_last_time'] - mydf['act_first_time']

    return mydf


# In[ ]:

def get_action_time_based_on_time_last_window_feature(df, window):
    df = df.copy()

    mydf = df[['userid']].drop_duplicates().reset_index(drop=True)

    tmp = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)
    tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method='first', ascending=False).astype(int)

    # Differences of consecutive action times within the window
    act_time = tmp.pivot(index='userid', columns='act_time_rank', values='actionTime')
    act_time = act_time[act_time.columns[::-1]]
    act_time_diff = act_time.diff(1, axis=1)
    act_time_diff = act_time_diff.iloc[:, 1:].reset_index()
    act_time_diff.columns = [i if i == 'userid' else 'act_time_diff(' + str(i) + '-' + str(i + 1) + ')(window_' + str(window) + ')' for i in act_time_diff.columns]

    mydf = pd.merge(mydf, act_time_diff, on='userid', how='left')

    return mydf


# In[ ]:

def get_action_time_row_stat_based_on_time_last_window_feature(df, window):
    df = df.copy()

    mydf = df[['userid']].drop_duplicates().reset_index(drop=True)

    tmp = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)
    tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method='first', ascending=False).astype(int)

    # Most recent time values + row-wise statistics over them
    act_time = tmp.pivot(index='userid', columns='act_time_rank', values='actionTime')
    act_time.columns = ['act_time(rank_' + str(i) + ')(window' + str(window) + ')' for i in act_time.columns]
    base_cols = act_time.columns
    for i in ['min', 'max', 'mean', 'median', 'std', 'sum', np.ptp]:
        name = i if isinstance(i, str) else i.__name__
        # As above, compute each statistic over the rank columns only
        act_time['act_row_time_' + name + '(window_' + str(window) + ')'] = act_time[base_cols].apply(i, axis=1)
    act_time = act_time.reset_index()

    mydf = pd.merge(mydf, act_time, on='userid', how='left')

    return mydf


# In[ ]:

def get_action_time_diff2_based_on_time_last_window_feature(df, window):
    df = df.copy()

    mydf = df[['userid']].drop_duplicates().reset_index(drop=True)

    tmp = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)
    tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method='first', ascending=False).astype(int)

    # Differences of action times two steps apart
    act_time = tmp.pivot(index='userid', columns='act_time_rank', values='actionTime')
    act_time = act_time[act_time.columns[::-1]]
    act_time_diff2 = act_time.diff(2, axis=1)  # need test
    act_time_diff2 = act_time_diff2.iloc[:, 2:].reset_index()
    act_time_diff2.columns = [i if i == 'userid' else 'act_time_diff2(' + str(i) + '-' + str(i + 2) + ')(window_' + str(window) + ')' for i in act_time_diff2.columns]

    mydf = pd.merge(mydf, act_time_diff2, on='userid', how='left')

    return mydf


# In[ ]:

def get_action_time_based_on_time_last_window_on_type_feature(df, window, ttype):
    df = df.copy()

    mydf = df[['userid']].drop_duplicates().reset_index(drop=True)

    tmp = df[df['actionType'] == ttype].groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)
    tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method='first', ascending=False).astype(int)

    # Differences of action times for the given type
    act_time = tmp.pivot(index='userid', columns='act_time_rank', values='actionTime')
    act_time = act_time[act_time.columns[::-1]]
    act_time_diff = act_time.diff(1, axis=1)
    act_time_diff = act_time_diff.iloc[:, 1:].reset_index()
    act_time_diff.columns = [i if i == 'userid' else 'act_time_diff(%d-%d)(window_%d)(type_%d)' % (i, i + 1, window, ttype) for i in act_time_diff.columns]

    mydf = pd.merge(mydf, act_time_diff, on='userid', how='left')

    return mydf


# In[ ]:

def get_action_time_2order_based_on_time_last_window_feature(df, window):
    df = df.copy()

    mydf = df[['userid']].drop_duplicates().reset_index(drop=True)

    tmp = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)
    tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method='first', ascending=False).astype(int)

    # Second-order differences of the action times
    act_time = tmp.pivot(index='userid', columns='act_time_rank', values='actionTime')
    act_time = act_time[act_time.columns[::-1]]
    act_time_diff_2order = act_time.diff(1, axis=1).diff(1, axis=1)
    act_time_diff_2order = act_time_diff_2order.iloc[:, 2:].reset_index()
    act_time_diff_2order.columns = [i if i == 'userid' else 'act_time_diff_2order(%d-%d)(window_%d)' % (i, i + 1, window) for i in act_time_diff_2order.columns]

    mydf = pd.merge(mydf, act_time_diff_2order, on='userid', how='left')

    return mydf


# In[ ]:

def get_action_real_time_based_on_time_last_window_on_type_feature(df, window, ttype):
    df = df.copy()

    mydf = df[['userid']].drop_duplicates().reset_index(drop=True)

    tmp = df[df['actionType'] == ttype].groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)
    tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method='first', ascending=False).astype(int)

    # Most recent time values of actions of the given type
    act_time = tmp.pivot(index='userid', columns='act_time_rank', values='actionTime').reset_index()
    act_time.columns = [i if i == 'userid' else 'act_time(rank_%d)(window_%d)(type_%d)' % (i, window, ttype) for i in act_time.columns]

    mydf = pd.merge(mydf, act_time, on='userid', how='left')

    return mydf


# In[ ]:

def get_act_ord_time_diff_feature(act, oord):
    act = act.copy()
    oord = oord.copy()

    mydf = oord[['userid']].drop_duplicates().reset_index(drop=True)

    ord_time = oord.groupby('userid')['orderTime'].max().reset_index()
    act = pd.merge(act, ord_time, on='userid', how='left')  # fillna?
    # Number of actions that happened after the user's last order
    act['act_time-ord_time'] = act['actionTime'] - act['orderTime']
    act_ord_time_diff = act[act['act_time-ord_time'] > 0].groupby('userid').size().reset_index()
    act_ord_time_diff.columns = ['userid', 'act_ord_time_diff_gt0_count']

    mydf = pd.merge(mydf, act_ord_time_diff, on='userid', how='left')

    return mydf


# In[ ]:

def get_order_last_order_ydm_feature(df):
    df = df.copy()

    mydf = df[['userid']].drop_duplicates().reset_index(drop=True)

    # Year/month/day of the most recent order; merged on userid rather than
    # assigned by position, so rows cannot be misaligned
    tmp = df.groupby('userid').apply(lambda x: x.sort_values('orderTime', ascending=False).head(1)).reset_index(drop=True)
    tmp['ord_last_ord_year'] = pd.to_datetime(tmp['orderTime'], unit='s').dt.year
    tmp['ord_last_ord_month'] = pd.to_datetime(tmp['orderTime'], unit='s').dt.month
    tmp['ord_last_ord_day'] = pd.to_datetime(tmp['orderTime'], unit='s').dt.day
    mydf = pd.merge(mydf, tmp[['userid', 'ord_last_ord_year', 'ord_last_ord_month', 'ord_last_ord_day']], on='userid', how='left')

    return mydf


# In[ ]:

def get_order_type1_ydm_feature(df):
    df = df.copy()

    mydf = df[['userid']].drop_duplicates().reset_index(drop=True)

    # Year/month/day of the most recent type-1 order; merged on userid rather
    # than assigned by position, since not every user has a type-1 order
    tmp = df[df['orderType'] == 1].groupby('userid').apply(lambda x: x.sort_values('orderTime', ascending=False).head(1)).reset_index(drop=True)
    tmp['ord_last_type1_year'] = pd.to_datetime(tmp['orderTime'], unit='s').dt.year
    tmp['ord_last_type1_month'] = pd.to_datetime(tmp['orderTime'], unit='s').dt.month
    tmp['ord_last_type1_day'] = pd.to_datetime(tmp['orderTime'], unit='s').dt.day
    mydf = pd.merge(mydf, tmp[['userid', 'ord_last_type1_year', 'ord_last_type1_month', 'ord_last_type1_day']], on='userid', how='left')

    # Year/month/day of the earliest type-1 order
    tmp = df[df['orderType'] == 1].groupby('userid').apply(lambda x: x.sort_values('orderTime', ascending=True).head(1)).reset_index(drop=True)
    tmp['ord_first_type1_year'] = pd.to_datetime(tmp['orderTime'], unit='s').dt.year
    tmp['ord_first_type1_month'] = pd.to_datetime(tmp['orderTime'], unit='s').dt.month
    tmp['ord_first_type1_day'] = pd.to_datetime(tmp['orderTime'], unit='s').dt.day
    mydf = pd.merge(mydf, tmp[['userid', 'ord_first_type1_year', 'ord_first_type1_month', 'ord_first_type1_day']], on='userid', how='left')

    return mydf


# In[ ]:

def get_act_ord_act_time_diff_last_window_feature(act, oord, window):
    act = act.copy()
    oord = oord.copy()

    mydf = oord[['userid']].drop_duplicates().reset_index(drop=True)

    ord_time = oord.groupby('userid')['orderTime'].max().reset_index()
    act = pd.merge(act, ord_time, on='userid', how='left')

    df = act[act['actionTime'] < act['orderTime']]

    tmp = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)
    tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method='first', ascending=False).astype(int)

    # Differences of action times before the last order
    act_time = tmp.pivot(index='userid', columns='act_time_rank', values='actionTime')
    act_time = act_time[act_time.columns[::-1]]
    act_time_diff = act_time.diff(1, axis=1)
    act_time_diff = act_time_diff.iloc[:, 1:].reset_index()
    act_time_diff.columns = [i if i == 'userid' else 'act_ord_act_time_diff(' + str(i) + '-' + str(i + 1) + ')(window_' + str(window) + ')' for i in act_time_diff.columns]

    mydf = pd.merge(mydf, act_time_diff, on='userid', how='left')

    return mydf


# In[ ]:

def get_act_ord_type1_act_time_diff_last_window_feature(act, oord, window):
    act = act.copy()
    oord = oord.copy()

    mydf = oord[['userid']].drop_duplicates().reset_index(drop=True)

    ord_time = oord[oord['orderType'] == 1].groupby('userid')['orderTime'].max().reset_index()
    act = pd.merge(act, ord_time, on='userid', how='left')

    df = act[act['actionTime'] < act['orderTime']]

    tmp = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)
    tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method='first', ascending=False).astype(int)

    # Differences of action times before the last boutique (type-1) order
    act_time = tmp.pivot(index='userid', columns='act_time_rank', values='actionTime')
    act_time = act_time[act_time.columns[::-1]]
    act_time_diff = act_time.diff(1, axis=1)
    act_time_diff = act_time_diff.iloc[:, 1:].reset_index()
    act_time_diff.columns = [i if i == 'userid' else 'act_ord_type1_act_time_diff(' + str(i) + '-' + str(i + 1) + ')(window_' + str(window) + ')' for i in act_time_diff.columns]

    mydf = pd.merge(mydf, act_time_diff, on='userid', how='left')

    return mydf


# In[ ]:

def get_action_sequence_time_diff_feature(df):
    df = df.sort_values(by=['userid', 'actionTime'], ascending=[True, False]).copy().reset_index(drop=True)

    mydf = df[['userid']].drop_duplicates().reset_index(drop=True)

    df['actionTimee'] = pd.to_datetime(df['actionTime'], unit='s')
    df['actionTimeDiff'] = df['actionTime'].diff()

    # Assign a session id per user: since rows are sorted by time descending,
    # gaps within 10 minutes (600 s) show up as diffs in [-600, 0] and keep the
    # row in the same session
    counter = 1
    last_userid = df.iloc[0, 0]
    seq_list = []
    for i, r in df[['userid', 'actionTimeDiff']].iterrows():
        if i % 500000 == 0:
            util.log(i)
        if r.userid != last_userid:
            counter = 1
            seq_list.append(counter)
            last_userid = r.userid
        elif ((-600 <= r.actionTimeDiff <= 0) or math.isnan(r.actionTimeDiff)) and r.userid == last_userid:
            seq_list.append(counter)
        else:
            counter += 1
            seq_list.append(counter)
    df['actionSeq'] = pd.Series(seq_list)

    # Sessions are 10-minute blocks (actions less than 10 minutes apart belong to
    # the same block); compute the time difference between consecutive blocks
    seq_time_max = df.groupby(['userid', 'actionSeq'])['actionTime'].max().unstack()
    seq_time_diff = seq_time_max.diff(1, axis=1)
    for window in [2, 3, 4, 5, 6, 7, 10, 15]:
        tmp = seq_time_diff.iloc[:, 1:(window + 1)]
        tmp.columns = ['act_seq_time_diff(%d-%d)(window_%d)' % (i, i - 1, window) for i in tmp.columns]
        tmp = tmp.reset_index()
        data = pd.merge(mydf, tmp, on='userid', how='left')
        util.log('window=%d' % window)
        data.to_csv('../data/output/feat/%s%d' % ('action_sequence_time_diff_window', window), index=False)


# In[ ]:

def get_action_sequence_time_stat_feature(df):
    df = df.sort_values(by=['userid', 'actionTime'], ascending=[True, False]).copy().reset_index(drop=True)

    mydf = df[['userid']].drop_duplicates().reset_index(drop=True)

    df['actionTimee'] = pd.to_datetime(df['actionTime'], unit='s')
    df['actionTimeDiff'] = df['actionTime'].diff()

    # Same sessionization as in get_action_sequence_time_diff_feature
    counter = 1
    last_userid = df.iloc[0, 0]
    seq_list = []
    for i, r in df[['userid', 'actionTimeDiff']].iterrows():
        if i % 500000 == 0:
            util.log(i)
        if r.userid != last_userid:
            counter = 1
            seq_list.append(counter)
            last_userid = r.userid
        elif ((-600 <= r.actionTimeDiff <= 0) or math.isnan(r.actionTimeDiff)) and r.userid == last_userid:
            seq_list.append(counter)
        else:
            counter += 1
            seq_list.append(counter)
    df['actionSeq'] = pd.Series(seq_list)

    time_stat = df[(df['actionSeq'] == 1) |
(df['actionSeq'] == 2) | (df['actionSeq'] == 3)].groupby(['userid', 'actionSeq'])['actionTime'].agg([min, max, np.mean, np.median, np.ptp, np.std, 'count']).unstack().reset_index() 639 | time_stat.columns = ['userid' if i[0] == 'userid' else 'act_seq_time_stat_%s_last%d' % (i[0], i[1]) for i in time_stat.columns] 640 | 641 | time_stat.to_csv('../data/output/feat/%s' % ('action_sequence_time_stat_last123'), index=False) 642 | 643 | 644 | # In[ ]: 645 | 646 | def get_action_time_diff_234_56789_last_window_feature(df, window): 647 | df = df.copy() 648 | 649 | mydf = df[['userid']].drop_duplicates().reset_index(drop=True) 650 | 651 | # 234 类型的 action 的 time 的差值 652 | tmp = df[df['actionType'].isin([2, 3, 4])].groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True) 653 | tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method = 'first', ascending=False).astype(int) 654 | act_time = tmp.pivot('userid', 'act_time_rank', 'actionTime') 655 | act_time = act_time[act_time.columns[::-1]] 656 | act_time_diff_234 = act_time.diff(1, axis=1) 657 | act_time_diff_234 = act_time_diff_234.iloc[:, 1:].reset_index() 658 | act_time_diff_234.columns = [i if i == 'userid' else 'act_time_diff_234(' + str(i) + '-' + str(i + 1) + ')(window_' + str(window) + ')' for i in act_time_diff_234.columns] 659 | 660 | # 56789 类型的 action 的 time 的差值 661 | tmp = df[df['actionType'].isin([5, 6, 7, 8, 9])].groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True) 662 | tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method = 'first', ascending=False).astype(int) 663 | act_time = tmp.pivot('userid', 'act_time_rank', 'actionTime') 664 | act_time = act_time[act_time.columns[::-1]] 665 | act_time_diff_56789 = act_time.diff(1, axis=1) 666 | act_time_diff_56789 = act_time_diff_56789.iloc[:, 1:].reset_index() 667 | act_time_diff_56789.columns = [i if i == 'userid' else 'act_time_diff_56789(' + str(i) + '-' + str(i + 1) + ')(window_' + str(window) + ')' for i in act_time_diff_56789.columns] 668 | 669 | mydf = pd.merge(mydf, act_time_diff_234, on='userid', how='left') 670 | mydf = pd.merge(mydf, act_time_diff_56789, on='userid', how='left') 671 | 672 | return mydf 673 | 674 | 675 | # In[ ]: 676 | 677 | def get_action_stat_last_every_type_feature(df): 678 | df = df.copy() 679 | 680 | mydf = df[['userid']].drop_duplicates().reset_index(drop=True) 681 | 682 | # 离最近的 123456789 的 action 的时间的统计 683 | for t in range(1, 10): 684 | tmp = df[df['actionType'] == t].groupby('userid')['actionTime'].agg([min, max, np.ptp, np.std, np.mean, np.median, 'count']).reset_index() 685 | tmp.columns = [i if i == 'userid' else 'act_time_%s(type_%d)' % (i, t) for i in tmp.columns] 686 | 687 | mydf = pd.merge(mydf, tmp, on='userid', how='left') 688 | 689 | return mydf 690 | 691 | 692 | # In[ ]: 693 | 694 | def get_act_ord_before_type1_stat_feature(act, oord): 695 | act = act.copy() 696 | oord = oord.copy() 697 | 698 | mydf = oord[['userid']].drop_duplicates().reset_index(drop=True) 699 | 700 | ord_time = oord[oord['orderType'] == 1].groupby('userid')['orderTime'].max().reset_index() 701 | act = pd.merge(act, ord_time, on='userid', how='left') 702 | 703 | df = act[act['actionTime'] < act['orderTime']] 704 | 705 | act_time_stat = df.groupby('userid')['actionTime'].agg([min, max, np.ptp, np.std, np.mean, np.median, 'count']).reset_index() 706 | act_time_stat.columns = [i if i == 'userid' else 
'act_ord_before_type1_act_time_%s' % i for i in act_time_stat.columns] 707 | 708 | act_type_size = mydf.copy() 709 | for t in range(1, 10): 710 | tmp = df[df['actionType'] == t].groupby('userid').size().reset_index() 711 | tmp.columns = ['userid', 'act_ord_before_type1_act_type_size(type_%d)' % t] 712 | act_type_size = pd.merge(act_type_size, tmp, on='userid', how='left') 713 | 714 | mydf = pd.merge(mydf, act_time_stat, on='userid', how='left') 715 | mydf = pd.merge(mydf, act_type_size, on='userid', how='left') 716 | 717 | return mydf 718 | 719 | 720 | # In[ ]: 721 | 722 | def get_action_time_diff_stat_feature(df): 723 | df = df.copy() 724 | 725 | mydf = df[['userid']].drop_duplicates().reset_index(drop=True) 726 | 727 | df = df.sort_values(['userid', 'actionTime']).reset_index(drop=True).copy() 728 | df['actionTimeDiff'] = df['actionTime'].diff(1) 729 | df = df.groupby('userid').apply(lambda x: x.iloc[1:, :]).reset_index(drop=True) 730 | 731 | act_time_diff_stat = df.groupby('userid')['actionTimeDiff'].agg([min, max, np.mean, np.median, np.std, sum]).reset_index() 732 | act_time_diff_stat.columns = [i if i == 'userid' else 'act_time_diff_%s' % i for i in act_time_diff_stat.columns] 733 | 734 | mydf = pd.merge(mydf, act_time_diff_stat, on='userid', how='left') 735 | 736 | return mydf 737 | 738 | 739 | # In[ ]: 740 | 741 | def get_action_time_diff_stat_last_window_feature(df, window): 742 | df = df.copy() 743 | 744 | mydf = df[['userid']].drop_duplicates().reset_index(drop=True) 745 | 746 | df = df.sort_values(['userid', 'actionTime']).reset_index(drop=True).copy() 747 | df['actionTimeDiff'] = df['actionTime'].diff(1) 748 | df = df.groupby('userid').apply(lambda x: x.iloc[1:, :]).reset_index(drop=True) 749 | 750 | tmp = df.groupby('userid').apply(lambda x: x.iloc[:-window, :]).reset_index(drop=True) 751 | act_time_diff_stat = tmp.groupby('userid')['actionTimeDiff'].agg([min, max, np.mean, np.median, np.std, sum]).reset_index() 752 | act_time_diff_stat.columns = [i if i == 'userid' else 'act_time_diff_%s(window_%d)' % (i, window) for i in act_time_diff_stat.columns] 753 | 754 | mydf = pd.merge(mydf, act_time_diff_stat, on='userid', how='left') 755 | 756 | return mydf 757 | 758 | 759 | # In[ ]: 760 | 761 | def get_action_time_last_on_every_type_feature(df): 762 | df = df.copy() 763 | 764 | mydf = df[['userid']].drop_duplicates().reset_index(drop=True) 765 | 766 | df = df.sort_values(['userid', 'actionTime'], ascending=[True, False]).reset_index(drop=True).copy() 767 | for t in range(1, 10): 768 | act_time = df[df['actionType'] == t].groupby('userid').apply(lambda x: x.head(1)).reset_index(drop=True) 769 | act_time = act_time[['userid', 'actionTime']] 770 | act_time.columns = ['userid', 'act_time_last(type_%d)' % t] 771 | 772 | mydf = pd.merge(mydf, act_time, on='userid', how='left') 773 | 774 | return mydf 775 | 776 | 777 | # In[ ]: 778 | 779 | def get_try_feat(df): 780 | df = df.copy() 781 | 782 | mydf = df[['userid']].drop_duplicates().reset_index(drop=True) 783 | 784 | df = df.sort_values(['userid', 'actionTime'], ascending=[True, False]).reset_index(drop=True).copy() 785 | 786 | last_5 = df[df.actionType == 5].drop_duplicates(subset=['userid']) 787 | last_6 = df[df.actionType == 6].drop_duplicates(subset=['userid']) 788 | time_gap_last56 = pd.merge(last_5, last_6, on='userid', how='outer') 789 | time_gap_last56['time_gap_last56'] = time_gap_last56.actionTime_y - time_gap_last56.actionTime_x 790 | mydf = pd.merge(mydf, time_gap_last56[['userid', 'time_gap_last56']], on='userid', 
how='left') 791 | 792 | tmp = df[df['actionType'] == 5].groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(2)).reset_index(drop=True) 793 | tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method = 'first', ascending=False).astype(int) 794 | act_time = tmp.pivot('userid', 'act_time_rank', 'actionTime') 795 | act_time = act_time[act_time.columns[::-1]] 796 | act_time_diff = act_time.diff(1, axis=1) 797 | act_time_diff = act_time_diff.iloc[:, 1:].reset_index() 798 | act_time_diff.columns = [i if i == 'userid' else 'act_time_diff(%d-%d)(window_%d)(type_%d)' % (i, i+1, 2, 5) for i in act_time_diff.columns] 799 | mydf = pd.merge(mydf, act_time_diff, on='userid', how='left') 800 | 801 | last_6 = df[df.actionType == 6].drop_duplicates(subset=['userid']) 802 | last_7 = df[df.actionType == 7].drop_duplicates(subset=['userid']) 803 | time_gap_last67 = pd.merge(last_6, last_7, on='userid', how='outer') 804 | time_gap_last67['time_gap_last67'] = time_gap_last67.actionTime_y - time_gap_last67.actionTime_x 805 | mydf = pd.merge(mydf, time_gap_last67[['userid', 'time_gap_last67']], on='userid', how='left') 806 | 807 | df['actionDate'] = pd.to_datetime(df['actionTime'], unit='s') 808 | df = pd.merge(df, df.drop_duplicates(subset=['userid'])[['userid', 'actionDate']], on='userid', how='left') 809 | df['lastDay'] = df.actionDate_x.dt.day == df.actionDate_y.dt.day 810 | last_day = df[df.lastDay].groupby('userid')['lastDay'].size().reset_index() 811 | last_day_5 = df[df.lastDay & (df.actionType == 5)].groupby('userid')['lastDay'].size().reset_index() 812 | tmp = pd.merge(last_day, last_day_5, on='userid', how='left') 813 | tmp['last_day_rate(type_5)'] = tmp.lastDay_y / tmp.lastDay_x 814 | mydf = pd.merge(mydf, tmp[['userid', 'last_day_rate(type_5)']], on='userid', how='left') 815 | 816 | last_time = df.drop_duplicates(subset=['userid'])[['userid', 'actionTime']] 817 | last_time.columns = ['userid', 'last_time'] 818 | mydf = pd.merge(mydf, last_time, on='userid', how='left') 819 | 820 | last_4 = df[df.actionType == 4].drop_duplicates(subset=['userid']) 821 | last_5 = df[df.actionType == 5].drop_duplicates(subset=['userid']) 822 | time_gap_last45 = pd.merge(last_4, last_5, on='userid', how='outer') 823 | time_gap_last45['time_gap_last45'] = time_gap_last45.actionTime_y - time_gap_last45.actionTime_x 824 | mydf = pd.merge(mydf, time_gap_last45[['userid', 'time_gap_last45']], on='userid', how='left') 825 | 826 | last_1 = df[df.actionType == 1].drop_duplicates(subset=['userid']) 827 | last = df.drop_duplicates(subset=['userid']) 828 | time_gap_last1 = pd.merge(last_1, last, on='userid', how='outer') 829 | time_gap_last1['time_gap_last1'] = time_gap_last1.actionTime_y - time_gap_last1.actionTime_x 830 | mydf = pd.merge(mydf, time_gap_last1[['userid', 'time_gap_last1']], on='userid', how='left') 831 | 832 | last_5 = df[df.actionType == 5].drop_duplicates(subset=['userid']) 833 | last = df.drop_duplicates(subset=['userid']) 834 | time_gap_last5 = pd.merge(last_5, last, on='userid', how='outer') 835 | time_gap_last5['time_gap_last5'] = time_gap_last5.actionTime_y - time_gap_last5.actionTime_x 836 | mydf = pd.merge(mydf, time_gap_last5[['userid', 'time_gap_last5']], on='userid', how='left') 837 | 838 | last_6 = df[df.actionType == 6].drop_duplicates(subset=['userid']) 839 | last = df.drop_duplicates(subset=['userid']) 840 | time_gap_last6 = pd.merge(last_6, last, on='userid', how='outer') 841 | time_gap_last6['time_gap_last6'] = time_gap_last6.actionTime_y - 
time_gap_last6.actionTime_x 842 | mydf = pd.merge(mydf, time_gap_last6[['userid', 'time_gap_last6']], on='userid', how='left') 843 | 844 | tmp = df[df.actionType.isin([5, 6])].copy() 845 | tmp['actionTimeDiff'] = tmp['actionTime'].diff(1) 846 | tmp = tmp.groupby('userid').apply(lambda x: x.iloc[1:, :]).reset_index(drop=True) 847 | act_time_diff_stat = tmp.groupby('userid')['actionTimeDiff'].agg([min, max, np.mean, np.median, np.std, sum]).reset_index() 848 | act_time_diff_stat.columns = [i if i == 'userid' else 'act_time_diff_56_%s' % i for i in act_time_diff_stat.columns] 849 | mydf = pd.merge(mydf, act_time_diff_stat, on='userid', how='left') 850 | 851 | return mydf 852 | 853 | 854 | # In[ ]: 855 | 856 | action_tr = pd.read_csv('../data/input/train/action_train.csv') # 用户行为数据 857 | order_future_tr = pd.read_csv('../data/input/train/orderFuture_train.csv') # 待预测数据 858 | order_history_tr = pd.read_csv('../data/input/train/orderHistory_train.csv') # 用户历史订单数据 859 | user_comment_tr = pd.read_csv('../data/input/train/userComment_train.csv') # 用户评论数据 860 | user_profile_tr = pd.read_csv('../data/input/train/userProfile_train.csv') # 用户个人信息 861 | 862 | action_te = pd.read_csv('../data/input/test/action_test.csv') 863 | order_future_te = pd.read_csv('../data/input/test/orderFuture_test.csv') 864 | order_history_te = pd.read_csv('../data/input/test/orderHistory_test.csv') 865 | user_comment_te = pd.read_csv('../data/input/test/userComment_test.csv') 866 | user_profile_te = pd.read_csv('../data/input/test/userProfile_test.csv') 867 | 868 | action = pd.concat([action_tr, action_te], axis=0).reset_index(drop=True) 869 | order_history = pd.concat([order_history_tr, order_history_te], axis=0).reset_index(drop=True) 870 | user_comment = pd.concat([user_comment_tr, user_comment_te], axis=0).reset_index(drop=True) 871 | user_profile = pd.concat([user_profile_tr, user_profile_te], axis=0).reset_index(drop=True) 872 | 873 | 874 | # In[ ]: 875 | 876 | user_profile_feat = get_user_profile_feature(user_profile) 877 | user_profile_feat.to_csv('../data/output/feat/%s' % 'user_profile', index=False) 878 | 879 | 880 | # In[ ]: 881 | 882 | user_comment_feat = get_user_comment_feature(user_comment) 883 | user_comment_feat.to_csv('../data/output/feat/%s' % 'user_comment', index=False) 884 | 885 | 886 | # In[ ]: 887 | 888 | order_history_feat = get_order_history_feature(order_history) 889 | order_history_feat.to_csv('../data/output/feat/%s' % 'order_history', index=False) 890 | 891 | 892 | # In[ ]: 893 | 894 | order_history_last_w_feat = get_order_history_last_w_feature(order_history) 895 | order_history_last_w_feat.to_csv('../data/output/feat/%s' % 'order_history_last_w', index=False) 896 | 897 | 898 | # In[ ]: 899 | 900 | action_type_feat = get_action_type_feature(action) 901 | action_type_feat.to_csv('../data/output/feat/%s' % 'action_type', index=False) 902 | 903 | 904 | # In[ ]: 905 | 906 | action_type_based_on_time_feat = get_action_type_based_on_time_feature(action) 907 | action_type_based_on_time_feat.to_csv('../data/output/feat/%s' % 'action_type_based_on_time', index=False) 908 | 909 | 910 | # In[ ]: 911 | 912 | for window in [3,4,5,6,7]: 913 | util.log(window) 914 | action_type_based_on_time_last_window_feat = get_action_type_based_on_time_last_window_feature(action, window) 915 | action_type_based_on_time_last_window_feat.to_csv('../data/output/feat/%s%d' % ('action_type_based_on_time_last_window', window), index=False) 916 | 917 | 918 | # In[ ]: 919 | 920 | for window in [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 
12, 13, 14, 15, 20]: 921 | util.log(window) 922 | action_type_num_based_on_time_last_window_feat = get_action_type_num_based_on_time_last_window_feature(action, window) 923 | action_type_num_based_on_time_last_window_feat.to_csv('../data/output/feat/%s%d' % ('action_type_num_based_on_time_last_window', window), index=False) 924 | 925 | 926 | # In[ ]: 927 | 928 | for window in [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 20]: 929 | util.log(window) 930 | action_type_rate_based_on_time_last_window_feat = get_action_type_rate_based_on_time_last_window_feature(action, window) 931 | action_type_rate_based_on_time_last_window_feat.to_csv('../data/output/feat/%s%d' % ('action_type_rate_based_on_time_last_window', window), index=False) 932 | 933 | 934 | # In[ ]: 935 | 936 | for window in [6]: 937 | util.log(window) 938 | action_type_row_stat_based_on_time_last_window_feat = get_action_type_row_stat_based_on_time_last_window_feature(action, window) 939 | action_type_row_stat_based_on_time_last_window_feat.to_csv('../data/output/feat/%s%d' % ('action_type_row_stat_based_on_time_last_window', window), index=False) 940 | 941 | 942 | # In[ ]: 943 | 944 | for window in [4, 7, 13, 17, 20, 25, 30]: 945 | util.log(window) 946 | action_num_based_on_time_last_window_feat = get_action_num_based_on_time_last_window_feature(action, window) 947 | action_num_based_on_time_last_window_feat.to_csv('../data/output/feat/%s%d' % ('action_num_based_on_time_last_window', window), index=False) 948 | 949 | 950 | # In[ ]: 951 | 952 | for window in [4, 7, 13, 17, 20, 25, 30]: 953 | util.log(window) 954 | action_type_num_based_on_time_last_window_feat = get_action_type_num_based_on_time_last_window_feature(action, window) 955 | action_type_num_based_on_time_last_window_feat.to_csv('../data/output/feat/%s%d' % ('action_type_num_based_on_time_last_window', window), index=False) 956 | 957 | 958 | # In[ ]: 959 | 960 | action_time_based_on_time_feat = get_action_time_based_on_time_feature(action) 961 | action_time_based_on_time_feat.to_csv('../data/output/feat/%s' % 'action_time_based_on_time', index=False) 962 | 963 | 964 | # In[ ]: 965 | 966 | for window in [6]: 967 | util.log(window) 968 | action_time_based_on_time_last_window_feat = get_action_time_based_on_time_last_window_feature(action, window) 969 | action_time_based_on_time_last_window_feat.to_csv('../data/output/feat/%s%d' % ('action_time_based_on_time_last_window', window), index=False) 970 | 971 | 972 | # In[ ]: 973 | 974 | for window in [3, 6, 10, 14]: 975 | util.log(window) 976 | action_time_row_stat_based_on_time_last_window_feat = get_action_time_row_stat_based_on_time_last_window_feature(action, window) 977 | action_time_row_stat_based_on_time_last_window_feat.to_csv('../data/output/feat/%s%d' % ('action_time_row_stat_based_on_time_last_window', window), index=False) 978 | 979 | 980 | # In[ ]: 981 | 982 | for window in [3, 4, 5, 6, 7, 8]: 983 | util.log(window) 984 | action_time_diff2_based_on_time_last_window_feat = get_action_time_diff2_based_on_time_last_window_feature(action, window) 985 | action_time_diff2_based_on_time_last_window_feat.to_csv('../data/output/feat/%s%d' % ('action_time_diff2_based_on_time_last_window', window), index=False) 986 | 987 | 988 | # In[ ]: 989 | 990 | for ttype in [1,5,6,7,8,9]: 991 | for window in [6]: 992 | util.log('type=%d window=%d' % (ttype, window)) 993 | action_time_based_on_time_last_window_on_type_feat = get_action_time_based_on_time_last_window_on_type_feature(action, window, ttype) 994 | 
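        # (Added note, not the author's.) The '%s%d%s%d' pattern below writes
        # one feature file per (window, type) pair, e.g.
        # '../data/output/feat/action_time_based_on_time_last_window6_on_type1'.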
action_time_based_on_time_last_window_on_type_feat.to_csv('../data/output/feat/%s%d%s%d' % ('action_time_based_on_time_last_window', window, '_on_type', ttype), index=False) 995 | 996 | 997 | # In[ ]: 998 | 999 | for window in [3, 4, 5, 6, 7, 8, 9, 10]: 1000 | util.log(window) 1001 | action_time_2order_based_on_time_last_window_feat = get_action_time_2order_based_on_time_last_window_feature(action, window) 1002 | action_time_2order_based_on_time_last_window_feat.to_csv('../data/output/feat/%s%d' % ('action_time_2order_based_on_time_last_window', window), index=False) 1003 | 1004 | 1005 | # In[ ]: 1006 | 1007 | for ttype in [1,5,6,7,8,9]: 1008 | for window in [4, 7, 10]: 1009 | util.log('type=%d window=%d' % (ttype, window)) 1010 | action_real_time_based_on_time_last_window_on_type_feat = get_action_real_time_based_on_time_last_window_on_type_feature(action, window, ttype) 1011 | action_real_time_based_on_time_last_window_on_type_feat.to_csv('../data/output/feat/%s%d%s%d' % ('action_real_time_based_on_time_last_window', window, '_on_type', ttype), index=False) 1012 | 1013 | 1014 | # In[ ]: 1015 | 1016 | act_ord_time_diff_feat = get_act_ord_time_diff_feature(action, order_history) 1017 | act_ord_time_diff_feat.to_csv('../data/output/feat/%s' % 'action_order_time_diff', index=False) 1018 | 1019 | 1020 | # In[ ]: 1021 | 1022 | order_last_order_ydm_feat = get_order_last_order_ydm_feature(order_history) 1023 | order_last_order_ydm_feat.to_csv('../data/output/feat/%s' % 'order_last_order_ydm', index=False) 1024 | 1025 | 1026 | # In[ ]: 1027 | 1028 | order_type1_ydm_feat = get_order_type1_ydm_feature(order_history) 1029 | order_type1_ydm_feat.to_csv('../data/output/feat/%s' % 'order_type1_ydm', index=False) 1030 | 1031 | 1032 | # In[ ]: 1033 | 1034 | for window in [7,8,10,11]: 1035 | util.log(window) 1036 | act_ord_act_time_diff_last_window_feat = get_act_ord_act_time_diff_last_window_feature(action, order_history, window) 1037 | act_ord_act_time_diff_last_window_feat.to_csv('../data/output/feat/%s%d' % ('act_ord_act_time_diff_last_window', window), index=False) 1038 | 1039 | 1040 | # In[ ]: 1041 | 1042 | for window in [2,4]: 1043 | util.log(window) 1044 | act_ord_type1_act_time_diff_last_window_feat = get_act_ord_type1_act_time_diff_last_window_feature(action, order_history, window) 1045 | act_ord_type1_act_time_diff_last_window_feat.to_csv('../data/output/feat/%s%d' % ('act_ord_type1_act_time_diff_last_window', window), index=False) 1046 | 1047 | 1048 | # In[ ]: 1049 | 1050 | get_action_sequence_time_diff_feature(action) 1051 | 1052 | 1053 | # In[ ]: 1054 | 1055 | get_action_sequence_time_stat_feature(action) 1056 | 1057 | 1058 | # In[ ]: 1059 | 1060 | for window in [6]: 1061 | util.log(window) 1062 | action_time_diff_234_56789_last_window_feat = get_action_time_diff_234_56789_last_window_feature(action, window) 1063 | action_time_diff_234_56789_last_window_feat.to_csv('../data/output/feat/%s%d' % ('action_time_diff_234_56789_last_window', window), index=False) 1064 | 1065 | 1066 | # In[ ]: 1067 | 1068 | action_stat_last_every_type_feat = get_action_stat_last_every_type_feature(action) 1069 | action_stat_last_every_type_feat.to_csv('../data/output/feat/%s' % 'action_stat_last_every_type', index=False) 1070 | 1071 | 1072 | # In[ ]: 1073 | 1074 | act_ord_before_type1_stat_feat = get_act_ord_before_type1_stat_feature(action, order_history) 1075 | act_ord_before_type1_stat_feat.to_csv('../data/output/feat/%s' % 'act_ord_before_type1_stat', index=False) 1076 | 1077 | 1078 | # In[ ]: 1079 | 1080 | 
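# (Added sanity check, not part of the original pipeline.) The next cell is
# tagged "untest"; on a toy frame the logic can be verified by hand: a single
# user with actionTime [10, 25, 85] has first-order diffs [15, 60], so the
# aggregated row should carry act_time_diff_min == 15, act_time_diff_max == 60
# and act_time_diff_sum == 75. A minimal check, assuming the functions above
# are already defined in this session:
#
# toy = pd.DataFrame({'userid': [1, 1, 1],
#                     'actionType': [1, 5, 6],
#                     'actionTime': [10, 25, 85]})
# check = get_action_time_diff_stat_feature(toy)
# assert check['act_time_diff_min'].iloc[0] == 15
# assert check['act_time_diff_max'].iloc[0] == 60
#
# Note also that get_action_time_diff_stat_last_window_feature (invoked in the
# loop two cells below) slices x.iloc[:-window, :], i.e. it aggregates the
# diffs *excluding* each user's most recent `window` rows; if stats over only
# the last `window` diffs were intended, x.iloc[-window:, :] would be the
# usual slice.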
action_time_diff_stat_feat = get_action_time_diff_stat_feature(action) # untest 1081 | action_time_diff_stat_feat.to_csv('../data/output/feat/%s' % 'action_time_diff_stat', index=False) 1082 | 1083 | 1084 | # In[ ]: 1085 | 1086 | for window in [3, 4, 5, 6, 7, 8, 9]: 1087 | util.log(window) 1088 | action_time_diff_stat_last_window_feat = get_action_time_diff_stat_last_window_feature(action, window) 1089 | action_time_diff_stat_last_window_feat.to_csv('../data/output/feat/%s%d' % ('action_time_diff_stat_last_window', window), index=False) 1090 | 1091 | 1092 | # In[ ]: 1093 | 1094 | action_time_last_on_every_type_feat = get_action_time_last_on_every_type_feature(action) 1095 | action_time_last_on_every_type_feat.to_csv('../data/output/feat/%s' % 'action_time_last_on_every_type', index=False) 1096 | 1097 | 1098 | # In[ ]: 1099 | 1100 | try_feat = get_try_feat(action) 1101 | try_feat.to_csv('../data/output/feat/%s' % 'try', index=False) 1102 | 1103 | -------------------------------------------------------------------------------- /src/feat.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from __future__ import division\n", 12 | "import numpy as np\n", 13 | "import pandas as pd\n", 14 | "from sklearn import preprocessing\n", 15 | "import xgboost as xgb\n", 16 | "import lightgbm as lgb\n", 17 | "import catboost as cb\n", 18 | "import time\n", 19 | "import datetime\n", 20 | "import sys\n", 21 | "import math\n", 22 | "import warnings\n", 23 | "warnings.filterwarnings('ignore')\n", 24 | "import util" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": { 31 | "collapsed": true 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "def get_user_profile_feature(df):\n", 36 | " df = df.copy()\n", 37 | "\n", 38 | " mydf = df[['userid']]\n", 39 | " le = preprocessing.LabelEncoder()\n", 40 | " mydf['gender'] = le.fit_transform(df['gender'])\n", 41 | "\n", 42 | " mydf['province'] = le.fit_transform(df['province'])\n", 43 | "\n", 44 | " mydf['age'] = le.fit_transform(df['age'])\n", 45 | "\n", 46 | " return mydf" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": { 53 | "collapsed": true 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "def get_user_comment_feature(df):\n", 58 | " df = df.copy()\n", 59 | " \n", 60 | " mydf = df[['userid']].drop_duplicates().reset_index(drop=True)\n", 61 | "\n", 62 | " com_rating = df.groupby('userid')['rating'].agg(['sum', 'count']).reset_index()\n", 63 | " com_rating.columns = [i if i == 'userid' else 'com_rating_' + i for i in com_rating.columns]\n", 64 | "\n", 65 | " mydf = pd.merge(mydf, com_rating, on='userid', how='left')\n", 66 | " \n", 67 | " return mydf" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": { 74 | "collapsed": true 75 | }, 76 | "outputs": [], 77 | "source": [ 78 | "def get_order_history_feature(df):\n", 79 | " df = df.copy()\n", 80 | "\n", 81 | " mydf = df[['userid']].drop_duplicates().reset_index(drop=True)\n", 82 | "\n", 83 | " # type 为 0 和 1 的订单的数量和比率 + 总订单数\n", 84 | " ord_hist_ord = df.groupby('userid')['orderType'].agg(['sum', 'count']).reset_index()\n", 85 | " ord_hist_ord.columns = ['userid', 'ord_num(type_1)', 'ord_num']\n", 86 | " ord_hist_ord['ord_num(type_0)'] = ord_hist_ord['ord_num'] - 
ord_hist_ord['ord_num(type_1)']\n", 87 | " ord_hist_ord['ord_rate(type_1)'] = ord_hist_ord['ord_num(type_1)'] / ord_hist_ord['ord_num']\n", 88 | " ord_hist_ord['ord_rate(type_0)'] = ord_hist_ord['ord_num(type_0)'] / ord_hist_ord['ord_num']\n", 89 | "\n", 90 | " # city, country, continent 的数量\n", 91 | " addr_count = df.groupby('userid')['city', 'country', 'continent'].count().reset_index()\n", 92 | " addr_count.columns = ['userid', 'city_num', 'country_num', 'continent_num']\n", 93 | "\n", 94 | " # type 为 1 的 city, country, continent 的数量\n", 95 | " addr_count_pos = df[df['orderType'] == 1].groupby('userid')['city', 'country', 'continent'].count().reset_index()\n", 96 | " addr_count_pos.columns = ['userid', 'city_num(type_1)', 'country_num(type_1)', 'continent_num(type_1)']\n", 97 | "\n", 98 | " # 每个 country 的订单数量\n", 99 | " lb = preprocessing.LabelBinarizer()\n", 100 | " tmp = lb.fit_transform(df['country'])\n", 101 | " tmp_col = ['country_' + str(i) for i in range(tmp.shape[1])]\n", 102 | " tmp = pd.DataFrame(tmp, columns=tmp_col)\n", 103 | " tmp['userid'] = df['userid'].values\n", 104 | " country = tmp.groupby('userid')[tmp_col].agg(['sum']).reset_index()\n", 105 | "\n", 106 | " # 每个 continent 的订单数量\n", 107 | " lb = preprocessing.LabelBinarizer()\n", 108 | " tmp = lb.fit_transform(df['continent'])\n", 109 | " tmp_col = ['continent_' + str(i) for i in range(tmp.shape[1])]\n", 110 | " tmp = pd.DataFrame(tmp, columns=tmp_col)\n", 111 | " tmp['userid'] = df['userid'].values\n", 112 | " continent = tmp.groupby('userid')[tmp_col].agg(['sum']).reset_index()\n", 113 | " \n", 114 | " # 最后一次的 order\n", 115 | " last_ord = df.groupby('userid').apply(lambda x: x.sort_values('orderTime', ascending=False).head(1)).reset_index(drop=True)[['userid', 'orderid', 'orderTime', 'orderType']]\n", 116 | " last_ord.columns = ['userid', 'ord_last_id', 'ord_last_time', 'ord_last_type']\n", 117 | " \n", 118 | " # 第一次的 order\n", 119 | " first_ord = df.groupby('userid').apply(lambda x: x.sort_values('orderTime', ascending=True).head(1)).reset_index(drop=True)[['userid', 'orderid', 'orderTime', 'orderType']]\n", 120 | " first_ord.columns = ['userid', 'ord_first_id', 'ord_first_time', 'ord_first_type']\n", 121 | " \n", 122 | " # type 分别为 0/1 的订单的时间的统计\n", 123 | " for t in [0, 1]:\n", 124 | " ord_time_stat = df[df['orderType'] == t].groupby('userid')['orderTime'].agg([min, max, np.ptp, np.mean, np.median, np.std]).reset_index()\n", 125 | " ord_time_stat.columns = [i if i == 'userid' else 'ord_type%d_time_%s' % (t, i) for i in ord_time_stat.columns]\n", 126 | " mydf = pd.merge(mydf, ord_time_stat, on='userid', how='left')\n", 127 | " \n", 128 | " mydf = pd.merge(mydf, ord_hist_ord, on='userid', how='left')\n", 129 | " mydf = pd.merge(mydf, addr_count, on='userid', how='left')\n", 130 | " mydf = pd.merge(mydf, addr_count_pos, on='userid', how='left')\n", 131 | " mydf = pd.merge(mydf, country, on='userid', how='left')\n", 132 | " mydf = pd.merge(mydf, continent, on='userid', how='left')\n", 133 | " mydf = pd.merge(mydf, last_ord, on='userid', how='left')\n", 134 | " mydf = pd.merge(mydf, first_ord, on='userid', how='left')\n", 135 | " \n", 136 | " return mydf" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": { 143 | "collapsed": true 144 | }, 145 | "outputs": [], 146 | "source": [ 147 | "def get_order_history_last_w_feature(df):\n", 148 | " df = df.copy()\n", 149 | "\n", 150 | " mydf = df[['userid']].drop_duplicates().reset_index(drop=True)\n", 151 | "\n", 152 | " # 最后 w 
次订单的统计\n", 153 | " for w in [2, 3, 4]:\n", 154 | " util.log(w)\n", 155 | " \n", 156 | " last_order = df.groupby('userid').apply(lambda x: x.sort_values('orderTime', ascending=False).head(w)).reset_index(drop=True)[['userid', 'orderTime', 'orderType']]\n", 157 | " last_order.columns = ['userid', 'ord_last_time', 'ord_last_type']\n", 158 | " \n", 159 | " ord_last_time_stat = last_order.groupby('userid')['ord_last_time'].agg([min, max, np.ptp, np.mean, np.median, np.std]).reset_index()\n", 160 | " ord_last_time_stat.columns = [i if i == 'userid' else 'ord_last%d_time_%s' % (w, i) for i in ord_last_time_stat.columns]\n", 161 | " \n", 162 | " ord_last_type_stat = last_order.groupby('userid')['ord_last_type'].agg(['count', sum]).reset_index()\n", 163 | " ord_last_type_stat.columns = [i if i == 'userid' else 'ord_last%d_type_%s' % (w, i) for i in ord_last_type_stat.columns]\n", 164 | " \n", 165 | " mydf = pd.merge(mydf, ord_last_time_stat, on='userid', how='left')\n", 166 | " mydf = pd.merge(mydf, ord_last_type_stat, on='userid', how='left')\n", 167 | "\n", 168 | " return mydf" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": { 175 | "collapsed": true 176 | }, 177 | "outputs": [], 178 | "source": [ 179 | "def get_action_type_feature(df):\n", 180 | " df = df.copy()\n", 181 | "\n", 182 | " mydf = df[['userid']].drop_duplicates().reset_index(drop=True)\n", 183 | "\n", 184 | " # 每个用户的 action 和 actionType 的数量\n", 185 | " act_num = df.groupby(['userid', 'actionType']).size().reset_index().groupby('userid')[0].agg([sum, len]).reset_index()\n", 186 | " act_num.columns = ['userid', 'act_num', 'act_type_num']\n", 187 | "\n", 188 | " # 每个类别的数量\n", 189 | " act_type_num = df.groupby(['userid', 'actionType']).size().unstack().reset_index()\n", 190 | " act_type_num.columns = [i if i == 'userid' else 'act_num(type_' + str(i) + ')' for i in act_type_num.columns]\n", 191 | "\n", 192 | " mydf = pd.merge(mydf, act_num, on='userid', how='left')\n", 193 | " mydf = pd.merge(mydf, act_type_num, on='userid', how='left')\n", 194 | "\n", 195 | " return mydf" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": { 202 | "collapsed": true 203 | }, 204 | "outputs": [], 205 | "source": [ 206 | "def get_action_type_based_on_time_feature(df):\n", 207 | " df = df.copy()\n", 208 | "\n", 209 | " mydf = df[['userid']].drop_duplicates().reset_index(drop=True)\n", 210 | "\n", 211 | " # 最近的一次 action 的 type\n", 212 | " act_last_type = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(1)).reset_index(drop=True)[['userid', 'actionType']]\n", 213 | " act_last_type.columns = ['userid', 'act_last_type']\n", 214 | " \n", 215 | " # 最早的一次 action 的 type\n", 216 | " act_first_type = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=True).head(1)).reset_index(drop=True)[['userid', 'actionType']]\n", 217 | " act_first_type.columns = ['userid', 'act_first_type']\n", 218 | "\n", 219 | " mydf = pd.merge(mydf, act_last_type, on='userid', how='left')\n", 220 | " mydf = pd.merge(mydf, act_first_type, on='userid', how='left')\n", 221 | "\n", 222 | " return mydf" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "metadata": { 229 | "collapsed": true 230 | }, 231 | "outputs": [], 232 | "source": [ 233 | "def get_action_type_based_on_time_last_window_feature(df, window):\n", 234 | " df = df.copy()\n", 235 | "\n", 236 | " mydf = 
df[['userid']].drop_duplicates().reset_index(drop=True)\n", 237 | "\n", 238 | " tmp = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)\n", 239 | " tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method = 'first', ascending=False).astype(int)\n", 240 | "\n", 241 | " # type 的差值\n", 242 | " act_type = tmp.pivot('userid', 'act_time_rank', 'actionType')\n", 243 | " act_type = act_type[act_type.columns[::-1]]\n", 244 | " act_type_diff = act_type.diff(1, axis=1)\n", 245 | " act_type_diff = act_type_diff.iloc[:, 1:].reset_index()\n", 246 | " act_type_diff.columns = [i if i == 'userid' else 'act_type_diff(' + str(i) + '-' + str(i + 1) + ')(window_' + str(window) + ')' for i in act_type_diff.columns]\n", 247 | "\n", 248 | " mydf = pd.merge(mydf, act_type_diff, on='userid', how='left')\n", 249 | "\n", 250 | " return mydf" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": null, 256 | "metadata": { 257 | "collapsed": true 258 | }, 259 | "outputs": [], 260 | "source": [ 261 | "def get_action_type_num_based_on_time_last_window_feature(df, window):\n", 262 | " df = df.copy()\n", 263 | "\n", 264 | " mydf = df[['userid']].drop_duplicates().reset_index(drop=True)\n", 265 | "\n", 266 | " tmp = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)\n", 267 | " tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method = 'first', ascending=False).astype(int)\n", 268 | "\n", 269 | " # 每个类别的数量\n", 270 | " act_num_in_window = tmp.groupby(['userid', 'actionType']).size().unstack().reset_index()\n", 271 | " act_num_in_window.columns = [i if i == 'userid' else 'act_num(type_' + str(i) + ')(window_' + str(window) + ')' for i in act_num_in_window.columns]\n", 272 | " \n", 273 | " mydf = pd.merge(mydf, act_num_in_window, on='userid', how='left')\n", 274 | "\n", 275 | " return mydf" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "metadata": { 282 | "collapsed": true 283 | }, 284 | "outputs": [], 285 | "source": [ 286 | "def get_action_type_rate_based_on_time_last_window_feature(df, window):\n", 287 | " df = df.copy()\n", 288 | "\n", 289 | " mydf = df[['userid']].drop_duplicates().reset_index(drop=True)\n", 290 | "\n", 291 | " tmp = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)\n", 292 | " tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method = 'first', ascending=False).astype(int)\n", 293 | "\n", 294 | " # 每个类别的列级别的比率\n", 295 | " act_column_rate_in_window = tmp.groupby(['userid', 'actionType']).size().unstack().apply(lambda x: x / np.sum(x)).reset_index()\n", 296 | " act_column_rate_in_window.columns = [i if i == 'userid' else 'act_column_rate(type_' + str(i) + ')(window_' + str(window) + ')' for i in act_column_rate_in_window.columns]\n", 297 | "\n", 298 | " # 每个类别的行级别的比率\n", 299 | " act_row_rate_in_window = tmp.groupby(['userid', 'actionType']).size().unstack().apply((lambda x: x / np.sum(x)), axis=1).reset_index()\n", 300 | " act_row_rate_in_window.columns = [i if i == 'userid' else 'act_row_rate(type_' + str(i) + ')(window_' + str(window) + ')' for i in act_row_rate_in_window.columns]\n", 301 | " \n", 302 | " mydf = pd.merge(mydf, act_column_rate_in_window, on='userid', how='left')\n", 303 | " mydf = pd.merge(mydf, act_row_rate_in_window, on='userid', how='left')\n", 304 | "\n", 305 | " return mydf" 
306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "metadata": { 312 | "collapsed": true 313 | }, 314 | "outputs": [], 315 | "source": [ 316 | "def get_action_type_row_stat_based_on_time_last_window_feature(df, window):\n", 317 | " df = df.copy()\n", 318 | "\n", 319 | " mydf = df[['userid']].drop_duplicates().reset_index(drop=True)\n", 320 | "\n", 321 | " tmp = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)\n", 322 | " tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method = 'first', ascending=False).astype(int)\n", 323 | "\n", 324 | " # 最近的 type 值 + 行级别的统计值\n", 325 | " act_type = tmp.pivot('userid', 'act_time_rank', 'actionType')\n", 326 | " act_type.columns = ['act_type(rank_' + str(i) + ')(window' + str(window) + ')' for i in act_type.columns]\n", 327 | " for i in ['min', 'max', 'mean', 'median', 'std', 'sum', np.ptp]:\n", 328 | " act_type['act_row_type_' + i + '(window_' + str(window) + ')' if type(i) == str else 'act_row_type_' + i.func_name + '(window_' + str(window) + ')'] = act_type.apply(i, axis=1)\n", 329 | " act_type = act_type.reset_index()\n", 330 | " \n", 331 | " mydf = pd.merge(mydf, act_type, on='userid', how='left')\n", 332 | "\n", 333 | " return mydf" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "metadata": { 340 | "collapsed": true 341 | }, 342 | "outputs": [], 343 | "source": [ 344 | "def get_action_num_based_on_time_last_window_feature(df, window):\n", 345 | " df = df.copy()\n", 346 | "\n", 347 | " mydf = df[['userid']].drop_duplicates().reset_index(drop=True)\n", 348 | "\n", 349 | " tmp = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)\n", 350 | "\n", 351 | " # action 的数量\n", 352 | " act_num = tmp.groupby('userid').size().reset_index()\n", 353 | " act_num.columns = ['userid', 'act_num(window_%d)' % window]\n", 354 | " \n", 355 | " mydf = pd.merge(mydf, act_num, on='userid', how='left')\n", 356 | "\n", 357 | " return mydf" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": null, 363 | "metadata": { 364 | "collapsed": true 365 | }, 366 | "outputs": [], 367 | "source": [ 368 | "def get_action_type_num_based_on_time_last_window_feature(df, window):\n", 369 | " df = df.copy()\n", 370 | "\n", 371 | " mydf = df[['userid']].drop_duplicates().reset_index(drop=True)\n", 372 | "\n", 373 | " tmp = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)\n", 374 | "\n", 375 | " # type 的数量\n", 376 | " act_type_num = tmp.groupby(['userid', 'actionType']).size().reset_index().groupby('userid')[0].agg([len]).reset_index()\n", 377 | " act_type_num.columns = ['userid', 'act_type_num(window_%d)' % window]\n", 378 | " \n", 379 | " mydf = pd.merge(mydf, act_type_num, on='userid', how='left')\n", 380 | "\n", 381 | " return mydf" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": null, 387 | "metadata": { 388 | "collapsed": true 389 | }, 390 | "outputs": [], 391 | "source": [ 392 | "def get_action_time_based_on_time_feature(df):\n", 393 | " df = df.copy()\n", 394 | "\n", 395 | " mydf = df[['userid']].drop_duplicates().reset_index(drop=True)\n", 396 | "\n", 397 | " # 最近的一次 action 的 time\n", 398 | " act_last_time = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(1)).reset_index(drop=True)[['userid', 
'actionTime']]\n", 399 | " act_last_time.columns = ['userid', 'act_last_time']\n", 400 | " \n", 401 | " # 最早的一次 action 的 time\n", 402 | " act_first_time = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=True).head(1)).reset_index(drop=True)[['userid', 'actionTime']]\n", 403 | " act_first_time.columns = ['userid', 'act_first_time']\n", 404 | " \n", 405 | " mydf = pd.merge(mydf, act_last_time, on='userid', how='left')\n", 406 | " mydf = pd.merge(mydf, act_first_time, on='userid', how='left')\n", 407 | " \n", 408 | " mydf['act_time_last-first'] = mydf['act_last_time'] - mydf['act_first_time']\n", 409 | "\n", 410 | " return mydf" 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": null, 416 | "metadata": { 417 | "collapsed": true 418 | }, 419 | "outputs": [], 420 | "source": [ 421 | "def get_action_time_based_on_time_last_window_feature(df, window):\n", 422 | " df = df.copy()\n", 423 | "\n", 424 | " mydf = df[['userid']].drop_duplicates().reset_index(drop=True)\n", 425 | " \n", 426 | " tmp = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)\n", 427 | " tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method = 'first', ascending=False).astype(int)\n", 428 | " \n", 429 | " # time 的差值\n", 430 | " act_time = tmp.pivot('userid', 'act_time_rank', 'actionTime')\n", 431 | " act_time = act_time[act_time.columns[::-1]]\n", 432 | " act_time_diff = act_time.diff(1, axis=1)\n", 433 | " act_time_diff = act_time_diff.iloc[:, 1:].reset_index()\n", 434 | " act_time_diff.columns = [i if i == 'userid' else 'act_time_diff(' + str(i) + '-' + str(i + 1) + ')(window_' + str(window) + ')' for i in act_time_diff.columns]\n", 435 | "\n", 436 | " mydf = pd.merge(mydf, act_time_diff, on='userid', how='left')\n", 437 | " \n", 438 | " return mydf" 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "execution_count": null, 444 | "metadata": { 445 | "collapsed": true 446 | }, 447 | "outputs": [], 448 | "source": [ 449 | "def get_action_time_row_stat_based_on_time_last_window_feature(df, window):\n", 450 | " df = df.copy()\n", 451 | "\n", 452 | " mydf = df[['userid']].drop_duplicates().reset_index(drop=True)\n", 453 | " \n", 454 | " tmp = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)\n", 455 | " tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method = 'first', ascending=False).astype(int)\n", 456 | " \n", 457 | " # 最近的 time 值 + 行级别的统计值\n", 458 | " act_time = tmp.pivot('userid', 'act_time_rank', 'actionTime')\n", 459 | " act_time.columns = ['act_time(rank_' + str(i) + ')(window' + str(window) + ')' for i in act_time.columns]\n", 460 | " for i in ['min', 'max', 'mean', 'median', 'std', 'sum', np.ptp]:\n", 461 | " act_time['act_row_time_' + i + '(window_' + str(window) + ')' if type(i) == str else 'act_row_time_' + i.func_name + '(window_' + str(window) + ')'] = act_time.apply(i, axis=1)\n", 462 | " act_time = act_time.reset_index()\n", 463 | "\n", 464 | " mydf = pd.merge(mydf, act_time, on='userid', how='left')\n", 465 | " \n", 466 | " return mydf" 467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": null, 472 | "metadata": { 473 | "collapsed": true 474 | }, 475 | "outputs": [], 476 | "source": [ 477 | "def get_action_time_diff2_based_on_time_last_window_feature(df, window):\n", 478 | " df = df.copy()\n", 479 | "\n", 480 | " mydf = 
df[['userid']].drop_duplicates().reset_index(drop=True)\n", 481 | " \n", 482 | " tmp = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)\n", 483 | " tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method = 'first', ascending=False).astype(int)\n", 484 | " \n", 485 | " # time 的差值\n", 486 | " act_time = tmp.pivot('userid', 'act_time_rank', 'actionTime')\n", 487 | " act_time = act_time[act_time.columns[::-1]]\n", 488 | " act_time_diff2 = act_time.diff(2, axis=1) # need test\n", 489 | " act_time_diff2 = act_time_diff2.iloc[:, 2:].reset_index()\n", 490 | " act_time_diff2.columns = [i if i == 'userid' else 'act_time_diff2(' + str(i) + '-' + str(i + 2) + ')(window_' + str(window) + ')' for i in act_time_diff2.columns]\n", 491 | "\n", 492 | " mydf = pd.merge(mydf, act_time_diff2, on='userid', how='left')\n", 493 | " \n", 494 | " return mydf" 495 | ] 496 | }, 497 | { 498 | "cell_type": "code", 499 | "execution_count": null, 500 | "metadata": { 501 | "collapsed": true 502 | }, 503 | "outputs": [], 504 | "source": [ 505 | "def get_action_time_based_on_time_last_window_on_type_feature(df, window, ttype):\n", 506 | " df = df.copy()\n", 507 | "\n", 508 | " mydf = df[['userid']].drop_duplicates().reset_index(drop=True)\n", 509 | " \n", 510 | " tmp = df[df['actionType'] == ttype].groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)\n", 511 | " tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method = 'first', ascending=False).astype(int)\n", 512 | " \n", 513 | " # 特定 type 的 action 的 time 的差值\n", 514 | " act_time = tmp.pivot('userid', 'act_time_rank', 'actionTime')\n", 515 | " act_time = act_time[act_time.columns[::-1]]\n", 516 | " act_time_diff = act_time.diff(1, axis=1)\n", 517 | " act_time_diff = act_time_diff.iloc[:, 1:].reset_index()\n", 518 | " act_time_diff.columns = [i if i == 'userid' else 'act_time_diff(%d-%d)(window_%d)(type_%d)' % (i, i+1, window, ttype) for i in act_time_diff.columns]\n", 519 | "\n", 520 | " mydf = pd.merge(mydf, act_time_diff, on='userid', how='left')\n", 521 | " \n", 522 | " return mydf" 523 | ] 524 | }, 525 | { 526 | "cell_type": "code", 527 | "execution_count": null, 528 | "metadata": { 529 | "collapsed": true 530 | }, 531 | "outputs": [], 532 | "source": [ 533 | "def get_action_time_2order_based_on_time_last_window_feature(df, window):\n", 534 | " df = df.copy()\n", 535 | "\n", 536 | " mydf = df[['userid']].drop_duplicates().reset_index(drop=True)\n", 537 | " \n", 538 | " tmp = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)\n", 539 | " tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method = 'first', ascending=False).astype(int)\n", 540 | " \n", 541 | " # 特定 type 的 action 的 time 的差值\n", 542 | " act_time = tmp.pivot('userid', 'act_time_rank', 'actionTime')\n", 543 | " act_time = act_time[act_time.columns[::-1]]\n", 544 | " act_time_diff_2order = act_time.diff(1, axis=1).diff(1, axis=1)\n", 545 | " act_time_diff_2order = act_time_diff_2order.iloc[:, 2:].reset_index()\n", 546 | " act_time_diff_2order.columns = [i if i == 'userid' else 'act_time_diff_2order(%d-%d)(window_%d)' % (i, i+1, window) for i in act_time_diff_2order.columns]\n", 547 | "\n", 548 | " mydf = pd.merge(mydf, act_time_diff_2order, on='userid', how='left')\n", 549 | " \n", 550 | " return mydf" 551 | ] 552 | }, 553 | { 554 | "cell_type": "code", 555 | 
"execution_count": null, 556 | "metadata": { 557 | "collapsed": true 558 | }, 559 | "outputs": [], 560 | "source": [ 561 | "def get_action_real_time_based_on_time_last_window_on_type_feature(df, window, ttype):\n", 562 | " df = df.copy()\n", 563 | "\n", 564 | " mydf = df[['userid']].drop_duplicates().reset_index(drop=True)\n", 565 | " \n", 566 | " tmp = df[df['actionType'] == ttype].groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)\n", 567 | " tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method = 'first', ascending=False).astype(int)\n", 568 | " \n", 569 | " # 特定的 type 的 action 的最近的 time 值\n", 570 | " act_time = tmp.pivot('userid', 'act_time_rank', 'actionTime').reset_index()\n", 571 | " act_time.columns = [i if i == 'userid' else 'act_time(rank_%d)(window_%d)(type_%d)' % (i, window, ttype) for i in act_time.columns]\n", 572 | "\n", 573 | " mydf = pd.merge(mydf, act_time, on='userid', how='left')\n", 574 | " \n", 575 | " return mydf" 576 | ] 577 | }, 578 | { 579 | "cell_type": "code", 580 | "execution_count": null, 581 | "metadata": { 582 | "collapsed": true 583 | }, 584 | "outputs": [], 585 | "source": [ 586 | "def get_act_ord_time_diff_feature(act, oord):\n", 587 | " act = act.copy()\n", 588 | " oord = oord.copy()\n", 589 | "\n", 590 | " mydf = oord[['userid']].drop_duplicates().reset_index(drop=True)\n", 591 | "\n", 592 | " ord_time = oord.groupby('userid')['orderTime'].max().reset_index()\n", 593 | " act = pd.merge(act, ord_time, on='userid', how='left') # fillna?\n", 594 | " act['act_time-ord_time'] = act['actionTime'] - act['orderTime']\n", 595 | " act_ord_time_diff = act[act['act_time-ord_time'] > 0].groupby('userid').size().reset_index()\n", 596 | " act_ord_time_diff.columns = ['userid', 'act_ord_time_diff_gt0_count']\n", 597 | "\n", 598 | " mydf = pd.merge(mydf, act_ord_time_diff, on='userid', how='left')\n", 599 | " \n", 600 | " return mydf" 601 | ] 602 | }, 603 | { 604 | "cell_type": "code", 605 | "execution_count": null, 606 | "metadata": { 607 | "collapsed": true 608 | }, 609 | "outputs": [], 610 | "source": [ 611 | "def get_order_last_order_ydm_feature(df):\n", 612 | " df = df.copy()\n", 613 | "\n", 614 | " mydf = df[['userid']].drop_duplicates().reset_index(drop=True)\n", 615 | "\n", 616 | " tmp = df.groupby('userid').apply(lambda x: x.sort_values('orderTime', ascending=False).head(1)).reset_index(drop=True)\n", 617 | "\n", 618 | " mydf['ord_last_ord_year'] = pd.to_datetime(tmp['orderTime'], unit='s').dt.year\n", 619 | " mydf['ord_last_ord_month'] = pd.to_datetime(tmp['orderTime'], unit='s').dt.month\n", 620 | " mydf['ord_last_ord_day'] = pd.to_datetime(tmp['orderTime'], unit='s').dt.day\n", 621 | "\n", 622 | " return mydf" 623 | ] 624 | }, 625 | { 626 | "cell_type": "code", 627 | "execution_count": null, 628 | "metadata": { 629 | "collapsed": true 630 | }, 631 | "outputs": [], 632 | "source": [ 633 | "def get_order_type1_ydm_feature(df):\n", 634 | " df = df.copy()\n", 635 | "\n", 636 | " mydf = df[['userid']].drop_duplicates().reset_index(drop=True)\n", 637 | "\n", 638 | " # 最近一次的 type 为 1 的订单的年月日\n", 639 | " tmp = df[df['orderType'] == 1].groupby('userid').apply(lambda x: x.sort_values('orderTime', ascending=False).head(1)).reset_index(drop=True)\n", 640 | " mydf['ord_last_type1_year'] = pd.to_datetime(tmp['orderTime'], unit='s').dt.year\n", 641 | " mydf['ord_last_type1_month'] = pd.to_datetime(tmp['orderTime'], unit='s').dt.month\n", 642 | " mydf['ord_last_type1_day'] = 
pd.to_datetime(tmp['orderTime'], unit='s').dt.day\n", 643 | " \n", 644 | " # 最早一次的 type 为 1 的订单的年月日\n", 645 | " tmp = df[df['orderType'] == 1].groupby('userid').apply(lambda x: x.sort_values('orderTime', ascending=True).head(1)).reset_index(drop=True)\n", 646 | " mydf['ord_first_type1_year'] = pd.to_datetime(tmp['orderTime'], unit='s').dt.year\n", 647 | " mydf['ord_first_type1_month'] = pd.to_datetime(tmp['orderTime'], unit='s').dt.month\n", 648 | " mydf['ord_first_type1_day'] = pd.to_datetime(tmp['orderTime'], unit='s').dt.day\n", 649 | "\n", 650 | " return mydf" 651 | ] 652 | }, 653 | { 654 | "cell_type": "code", 655 | "execution_count": null, 656 | "metadata": { 657 | "collapsed": true 658 | }, 659 | "outputs": [], 660 | "source": [ 661 | "def get_act_ord_act_time_diff_last_window_feature(act, oord, window):\n", 662 | " act = act.copy()\n", 663 | " oord = oord.copy()\n", 664 | "\n", 665 | " mydf = oord[['userid']].drop_duplicates().reset_index(drop=True)\n", 666 | "\n", 667 | " ord_time = oord.groupby('userid')['orderTime'].max().reset_index()\n", 668 | " act = pd.merge(act, ord_time, on='userid', how='left')\n", 669 | "\n", 670 | " df = act[act['actionTime'] < act['orderTime']]\n", 671 | "\n", 672 | " tmp = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)\n", 673 | " tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method = 'first', ascending=False).astype(int)\n", 674 | "\n", 675 | " # 最后一次订单之前的 action 的 time 的差值\n", 676 | " act_time = tmp.pivot('userid', 'act_time_rank', 'actionTime')\n", 677 | " act_time = act_time[act_time.columns[::-1]]\n", 678 | " act_time_diff = act_time.diff(1, axis=1)\n", 679 | " act_time_diff = act_time_diff.iloc[:, 1:].reset_index()\n", 680 | " act_time_diff.columns = [i if i == 'userid' else 'act_ord_act_time_diff(' + str(i) + '-' + str(i + 1) + ')(window_' + str(window) + ')' for i in act_time_diff.columns]\n", 681 | "\n", 682 | " mydf = pd.merge(mydf, act_time_diff, on='userid', how='left')\n", 683 | " \n", 684 | " return mydf" 685 | ] 686 | }, 687 | { 688 | "cell_type": "code", 689 | "execution_count": null, 690 | "metadata": { 691 | "collapsed": true 692 | }, 693 | "outputs": [], 694 | "source": [ 695 | "def get_act_ord_type1_act_time_diff_last_window_feature(act, oord, window):\n", 696 | " act = act.copy()\n", 697 | " oord = oord.copy()\n", 698 | "\n", 699 | " mydf = oord[['userid']].drop_duplicates().reset_index(drop=True)\n", 700 | "\n", 701 | " ord_time = oord[oord['orderType'] == 1].groupby('userid')['orderTime'].max().reset_index()\n", 702 | " act = pd.merge(act, ord_time, on='userid', how='left')\n", 703 | "\n", 704 | " df = act[act['actionTime'] < act['orderTime']]\n", 705 | "\n", 706 | " tmp = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)\n", 707 | " tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method = 'first', ascending=False).astype(int)\n", 708 | "\n", 709 | " # 最后一次精品订单之前的 action 的 time 的差值\n", 710 | " act_time = tmp.pivot('userid', 'act_time_rank', 'actionTime')\n", 711 | " act_time = act_time[act_time.columns[::-1]]\n", 712 | " act_time_diff = act_time.diff(1, axis=1)\n", 713 | " act_time_diff = act_time_diff.iloc[:, 1:].reset_index()\n", 714 | " act_time_diff.columns = [i if i == 'userid' else 'act_ord_type1_act_time_diff(' + str(i) + '-' + str(i + 1) + ')(window_' + str(window) + ')' for i in act_time_diff.columns]\n", 715 | "\n", 716 | " mydf = pd.merge(mydf, 
act_time_diff, on='userid', how='left')\n", 717 | " \n", 718 | " return mydf" 719 | ] 720 | }, 721 | { 722 | "cell_type": "code", 723 | "execution_count": null, 724 | "metadata": { 725 | "collapsed": true 726 | }, 727 | "outputs": [], 728 | "source": [ 729 | "def get_action_sequence_time_diff_feature(df):\n", 730 | " df = df.sort_values(by=['userid', 'actionTime'], ascending=[True, False]).copy().reset_index(drop=True)\n", 731 | "\n", 732 | " mydf = df[['userid']].drop_duplicates().reset_index(drop=True)\n", 733 | "\n", 734 | " df['actionTimee'] = pd.to_datetime(df['actionTime'], unit='s')\n", 735 | " df['actionTimeDiff'] = df['actionTime'].diff()\n", 736 | " # session-id pass: rows are sorted newest first per user, so two rows within 10 minutes (diff in [-600, 0]) share an id\n", 737 | " counter = 1\n", 738 | " last_userid = df.iloc[0, 0]\n", 739 | " seq_list = []\n", 740 | " for i, r in df[['userid', 'actionTimeDiff']].iterrows():\n", 741 | " if i % 500000 == 0:\n", 742 | " util.log(i)\n", 743 | " if r.userid != last_userid:\n", 744 | " counter = 1\n", 745 | " seq_list.append(counter)\n", 746 | " last_userid = r.userid\n", 747 | " elif (r.actionTimeDiff <= 0 and r.actionTimeDiff >= -600 or math.isnan(r.actionTimeDiff)) and r.userid == last_userid:\n", 748 | " seq_list.append(counter)\n", 749 | " else:\n", 750 | " counter += 1\n", 751 | " seq_list.append(counter)\n", 752 | " df['actionSeq'] = pd.Series(seq_list)\n", 753 | " \n", 754 | " # block actions by 10-minute gaps (actions less than 10 minutes apart form one block); time diffs between the blocks\n", 755 | " seq_time_max = df.groupby(['userid', 'actionSeq'])['actionTime'].max().unstack()\n", 756 | " seq_time_diff = seq_time_max.diff(1, axis=1)\n", 757 | " for window in [2,3,4,5,6,7,10,15]:\n", 758 | " tmp = seq_time_diff.iloc[:, 1:(window+1)]\n", 759 | " tmp.columns = ['act_seq_time_diff(%d-%d)(window_%d)' % (i, i-1, window) for i in tmp.columns]\n", 760 | " tmp = tmp.reset_index()\n", 761 | " data = pd.merge(mydf, tmp, on='userid', how='left')\n", 762 | " util.log('window=%d' % window)\n", 763 | " data.to_csv('../data/output/feat/%s%d' % ('action_sequence_time_diff_window', window), index=False)" 764 | ] 765 | }, 766 | { 767 | "cell_type": "code", 768 | "execution_count": null, 769 | "metadata": { 770 | "collapsed": true 771 | }, 772 | "outputs": [], 773 | "source": [ 774 | "def get_action_sequence_time_stat_feature(df):\n", 775 | " df = df.sort_values(by=['userid', 'actionTime'], ascending=[True, False]).copy().reset_index(drop=True)\n", 776 | "\n", 777 | " mydf = df[['userid']].drop_duplicates().reset_index(drop=True)\n", 778 | "\n", 779 | " df['actionTimee'] = pd.to_datetime(df['actionTime'], unit='s')\n", 780 | " df['actionTimeDiff'] = df['actionTime'].diff()\n", 781 | " # same 10-minute session-id pass as in get_action_sequence_time_diff_feature\n", 782 | " counter = 1\n", 783 | " last_userid = df.iloc[0, 0]\n", 784 | " seq_list = []\n", 785 | " for i, r in df[['userid', 'actionTimeDiff']].iterrows():\n", 786 | " if i % 500000 == 0:\n", 787 | " util.log(i)\n", 788 | " if r.userid != last_userid:\n", 789 | " counter = 1\n", 790 | " seq_list.append(counter)\n", 791 | " last_userid = r.userid\n", 792 | " elif (r.actionTimeDiff <= 0 and r.actionTimeDiff >= -600 or math.isnan(r.actionTimeDiff)) and r.userid == last_userid:\n", 793 | " seq_list.append(counter)\n", 794 | " else:\n", 795 | " counter += 1\n", 796 | " seq_list.append(counter)\n", 797 | " df['actionSeq'] = pd.Series(seq_list)\n", 798 | " \n", 799 | " time_stat = df[(df['actionSeq'] == 1) | (df['actionSeq'] == 2) | (df['actionSeq'] == 3)].groupby(['userid', 'actionSeq'])['actionTime'].agg([min, max, np.mean, np.median, np.ptp, np.std, 'count']).unstack().reset_index()\n", 800 | " time_stat.columns = ['userid' if i[0] == 'userid' else 
'act_seq_time_stat_%s_last%d' % (i[0], i[1]) for i in time_stat.columns]\n", 801 | " \n", 802 | " time_stat.to_csv('../data/output/feat/%s' % ('action_sequence_time_stat_last123'), index=False)" 803 | ] 804 | }, 805 | { 806 | "cell_type": "code", 807 | "execution_count": null, 808 | "metadata": { 809 | "collapsed": true 810 | }, 811 | "outputs": [], 812 | "source": [ 813 | "def get_action_time_diff_234_56789_last_window_feature(df, window):\n", 814 | " df = df.copy()\n", 815 | "\n", 816 | " mydf = df[['userid']].drop_duplicates().reset_index(drop=True)\n", 817 | " \n", 818 | " # diffs of the action times for actionType 2/3/4\n", 819 | " tmp = df[df['actionType'].isin([2, 3, 4])].groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)\n", 820 | " tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method = 'first', ascending=False).astype(int)\n", 821 | " act_time = tmp.pivot('userid', 'act_time_rank', 'actionTime')\n", 822 | " act_time = act_time[act_time.columns[::-1]]\n", 823 | " act_time_diff_234 = act_time.diff(1, axis=1)\n", 824 | " act_time_diff_234 = act_time_diff_234.iloc[:, 1:].reset_index()\n", 825 | " act_time_diff_234.columns = [i if i == 'userid' else 'act_time_diff_234(' + str(i) + '-' + str(i + 1) + ')(window_' + str(window) + ')' for i in act_time_diff_234.columns]\n", 826 | " \n", 827 | " # diffs of the action times for actionType 5/6/7/8/9\n", 828 | " tmp = df[df['actionType'].isin([5, 6, 7, 8, 9])].groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)\n", 829 | " tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method = 'first', ascending=False).astype(int)\n", 830 | " act_time = tmp.pivot('userid', 'act_time_rank', 'actionTime')\n", 831 | " act_time = act_time[act_time.columns[::-1]]\n", 832 | " act_time_diff_56789 = act_time.diff(1, axis=1)\n", 833 | " act_time_diff_56789 = act_time_diff_56789.iloc[:, 1:].reset_index()\n", 834 | " act_time_diff_56789.columns = [i if i == 'userid' else 'act_time_diff_56789(' + str(i) + '-' + str(i + 1) + ')(window_' + str(window) + ')' for i in act_time_diff_56789.columns]\n", 835 | "\n", 836 | " mydf = pd.merge(mydf, act_time_diff_234, on='userid', how='left')\n", 837 | " mydf = pd.merge(mydf, act_time_diff_56789, on='userid', how='left')\n", 838 | " \n", 839 | " return mydf" 840 | ] 841 | }, 842 | { 843 | "cell_type": "code", 844 | "execution_count": null, 845 | "metadata": { 846 | "collapsed": true 847 | }, 848 | "outputs": [], 849 | "source": [ 850 | "def get_action_stat_last_every_type_feature(df):\n", 851 | " df = df.copy()\n", 852 | "\n", 853 | " mydf = df[['userid']].drop_duplicates().reset_index(drop=True)\n", 854 | "\n", 855 | " # stats of the action times for each actionType 1-9\n", 856 | " for t in range(1, 10):\n", 857 | " tmp = df[df['actionType'] == t].groupby('userid')['actionTime'].agg([min, max, np.ptp, np.std, np.mean, np.median, 'count']).reset_index()\n", 858 | " tmp.columns = [i if i == 'userid' else 'act_time_%s(type_%d)' % (i, t) for i in tmp.columns]\n", 859 | " \n", 860 | " mydf = pd.merge(mydf, tmp, on='userid', how='left')\n", 861 | "\n", 862 | " return mydf" 863 | ] 864 | }, 865 | { 866 | "cell_type": "code", 867 | "execution_count": null, 868 | "metadata": { 869 | "collapsed": true 870 | }, 871 | "outputs": [], 872 | "source": [ 873 | "def get_act_ord_before_type1_stat_feature(act, oord):\n", 874 | " act = act.copy()\n", 875 | " oord = oord.copy()\n", 876 | "\n", 877 | " mydf = 
oord[['userid']].drop_duplicates().reset_index(drop=True)\n", 878 | "\n", 879 | " ord_time = oord[oord['orderType'] == 1].groupby('userid')['orderTime'].max().reset_index()\n", 880 | " act = pd.merge(act, ord_time, on='userid', how='left')\n", 881 | "\n", 882 | " df = act[act['actionTime'] < act['orderTime']]\n", 883 | "\n", 884 | " act_time_stat = df.groupby('userid')['actionTime'].agg([min, max, np.ptp, np.std, np.mean, np.median, 'count']).reset_index()\n", 885 | " act_time_stat.columns = [i if i == 'userid' else 'act_ord_before_type1_act_time_%s' % i for i in act_time_stat.columns]\n", 886 | " \n", 887 | " act_type_size = mydf.copy()\n", 888 | " for t in range(1, 10):\n", 889 | " tmp = df[df['actionType'] == t].groupby('userid').size().reset_index()\n", 890 | " tmp.columns = ['userid', 'act_ord_before_type1_act_type_size(type_%d)' % t]\n", 891 | " act_type_size = pd.merge(act_type_size, tmp, on='userid', how='left')\n", 892 | "\n", 893 | " mydf = pd.merge(mydf, act_time_stat, on='userid', how='left')\n", 894 | " mydf = pd.merge(mydf, act_type_size, on='userid', how='left')\n", 895 | " \n", 896 | " return mydf" 897 | ] 898 | }, 899 | { 900 | "cell_type": "code", 901 | "execution_count": null, 902 | "metadata": { 903 | "collapsed": true 904 | }, 905 | "outputs": [], 906 | "source": [ 907 | "def get_action_time_diff_stat_feature(df):\n", 908 | " df = df.copy()\n", 909 | "\n", 910 | " mydf = df[['userid']].drop_duplicates().reset_index(drop=True)\n", 911 | "\n", 912 | " df = df.sort_values(['userid', 'actionTime']).reset_index(drop=True).copy()\n", 913 | " df['actionTimeDiff'] = df['actionTime'].diff(1)\n", 914 | " df = df.groupby('userid').apply(lambda x: x.iloc[1:, :]).reset_index(drop=True)\n", 915 | "\n", 916 | " act_time_diff_stat = df.groupby('userid')['actionTimeDiff'].agg([min, max, np.mean, np.median, np.std, sum]).reset_index()\n", 917 | " act_time_diff_stat.columns = [i if i == 'userid' else 'act_time_diff_%s' % i for i in act_time_diff_stat.columns]\n", 918 | "\n", 919 | " mydf = pd.merge(mydf, act_time_diff_stat, on='userid', how='left')\n", 920 | " \n", 921 | " return mydf" 922 | ] 923 | }, 924 | { 925 | "cell_type": "code", 926 | "execution_count": null, 927 | "metadata": { 928 | "collapsed": true 929 | }, 930 | "outputs": [], 931 | "source": [ 932 | "def get_action_time_diff_stat_last_window_feature(df, window):\n", 933 | " df = df.copy()\n", 934 | "\n", 935 | " mydf = df[['userid']].drop_duplicates().reset_index(drop=True)\n", 936 | "\n", 937 | " df = df.sort_values(['userid', 'actionTime']).reset_index(drop=True).copy()\n", 938 | " df['actionTimeDiff'] = df['actionTime'].diff(1)\n", 939 | " df = df.groupby('userid').apply(lambda x: x.iloc[1:, :]).reset_index(drop=True)\n", 940 | " \n", 941 | " tmp = df.groupby('userid').apply(lambda x: x.iloc[:-window, :]).reset_index(drop=True)\n", 942 | " act_time_diff_stat = tmp.groupby('userid')['actionTimeDiff'].agg([min, max, np.mean, np.median, np.std, sum]).reset_index()\n", 943 | " act_time_diff_stat.columns = [i if i == 'userid' else 'act_time_diff_%s(window_%d)' % (i, window) for i in act_time_diff_stat.columns]\n", 944 | "\n", 945 | " mydf = pd.merge(mydf, act_time_diff_stat, on='userid', how='left')\n", 946 | " \n", 947 | " return mydf" 948 | ] 949 | }, 950 | { 951 | "cell_type": "code", 952 | "execution_count": null, 953 | "metadata": { 954 | "collapsed": true 955 | }, 956 | "outputs": [], 957 | "source": [ 958 | "def get_action_time_last_on_every_type_feature(df):\n", 959 | " df = df.copy()\n", 960 | "\n", 961 | " mydf = 
df[['userid']].drop_duplicates().reset_index(drop=True)\n", 962 | "\n", 963 | " df = df.sort_values(['userid', 'actionTime'], ascending=[True, False]).reset_index(drop=True).copy()\n", 964 | " for t in range(1, 10):\n", 965 | " act_time = df[df['actionType'] == t].groupby('userid').apply(lambda x: x.head(1)).reset_index(drop=True)\n", 966 | " act_time = act_time[['userid', 'actionTime']]\n", 967 | " act_time.columns = ['userid', 'act_time_last(type_%d)' % t]\n", 968 | " \n", 969 | " mydf = pd.merge(mydf, act_time, on='userid', how='left')\n", 970 | " \n", 971 | " return mydf" 972 | ] 973 | }, 974 | { 975 | "cell_type": "code", 976 | "execution_count": null, 977 | "metadata": { 978 | "collapsed": true 979 | }, 980 | "outputs": [], 981 | "source": [ 982 | "def get_try_feat(df):\n", 983 | " df = df.copy()\n", 984 | " \n", 985 | " mydf = df[['userid']].drop_duplicates().reset_index(drop=True)\n", 986 | "\n", 987 | " df = df.sort_values(['userid', 'actionTime'], ascending=[True, False]).reset_index(drop=True).copy()\n", 988 | " \n", 989 | " last_5 = df[df.actionType == 5].drop_duplicates(subset=['userid'])\n", 990 | " last_6 = df[df.actionType == 6].drop_duplicates(subset=['userid'])\n", 991 | " time_gap_last56 = pd.merge(last_5, last_6, on='userid', how='outer')\n", 992 | " time_gap_last56['time_gap_last56'] = time_gap_last56.actionTime_y - time_gap_last56.actionTime_x\n", 993 | " mydf = pd.merge(mydf, time_gap_last56[['userid', 'time_gap_last56']], on='userid', how='left')\n", 994 | "\n", 995 | " tmp = df[df['actionType'] == 5].groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(2)).reset_index(drop=True)\n", 996 | " tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method = 'first', ascending=False).astype(int)\n", 997 | " act_time = tmp.pivot('userid', 'act_time_rank', 'actionTime')\n", 998 | " act_time = act_time[act_time.columns[::-1]]\n", 999 | " act_time_diff = act_time.diff(1, axis=1)\n", 1000 | " act_time_diff = act_time_diff.iloc[:, 1:].reset_index()\n", 1001 | " act_time_diff.columns = [i if i == 'userid' else 'act_time_diff(%d-%d)(window_%d)(type_%d)' % (i, i+1, 2, 5) for i in act_time_diff.columns]\n", 1002 | " mydf = pd.merge(mydf, act_time_diff, on='userid', how='left')\n", 1003 | "\n", 1004 | " last_6 = df[df.actionType == 6].drop_duplicates(subset=['userid'])\n", 1005 | " last_7 = df[df.actionType == 7].drop_duplicates(subset=['userid'])\n", 1006 | " time_gap_last67 = pd.merge(last_6, last_7, on='userid', how='outer')\n", 1007 | " time_gap_last67['time_gap_last67'] = time_gap_last67.actionTime_y - time_gap_last67.actionTime_x\n", 1008 | " mydf = pd.merge(mydf, time_gap_last67[['userid', 'time_gap_last67']], on='userid', how='left')\n", 1009 | "\n", 1010 | " df['actionDate'] = pd.to_datetime(df['actionTime'], unit='s')\n", 1011 | " df = pd.merge(df, df.drop_duplicates(subset=['userid'])[['userid', 'actionDate']], on='userid', how='left')\n", 1012 | " df['lastDay'] = df.actionDate_x.dt.day == df.actionDate_y.dt.day\n", 1013 | " last_day = df[df.lastDay].groupby('userid')['lastDay'].size().reset_index()\n", 1014 | " last_day_5 = df[df.lastDay & (df.actionType == 5)].groupby('userid')['lastDay'].size().reset_index()\n", 1015 | " tmp = pd.merge(last_day, last_day_5, on='userid', how='left')\n", 1016 | " tmp['last_day_rate(type_5)'] = tmp.lastDay_y / tmp.lastDay_x\n", 1017 | " mydf = pd.merge(mydf, tmp[['userid', 'last_day_rate(type_5)']], on='userid', how='left')\n", 1018 | "\n", 1019 | " last_time = 
df.drop_duplicates(subset=['userid'])[['userid', 'actionTime']]\n", 1020 | " last_time.columns = ['userid', 'last_time']\n", 1021 | " mydf = pd.merge(mydf, last_time, on='userid', how='left')\n", 1022 | "\n", 1023 | " last_4 = df[df.actionType == 4].drop_duplicates(subset=['userid'])\n", 1024 | " last_5 = df[df.actionType == 5].drop_duplicates(subset=['userid'])\n", 1025 | " time_gap_last45 = pd.merge(last_4, last_5, on='userid', how='outer')\n", 1026 | " time_gap_last45['time_gap_last45'] = time_gap_last45.actionTime_y - time_gap_last45.actionTime_x\n", 1027 | " mydf = pd.merge(mydf, time_gap_last45[['userid', 'time_gap_last45']], on='userid', how='left')\n", 1028 | "\n", 1029 | " last_1 = df[df.actionType == 1].drop_duplicates(subset=['userid'])\n", 1030 | " last = df.drop_duplicates(subset=['userid'])\n", 1031 | " time_gap_last1 = pd.merge(last_1, last, on='userid', how='outer')\n", 1032 | " time_gap_last1['time_gap_last1'] = time_gap_last1.actionTime_y - time_gap_last1.actionTime_x\n", 1033 | " mydf = pd.merge(mydf, time_gap_last1[['userid', 'time_gap_last1']], on='userid', how='left')\n", 1034 | "\n", 1035 | " last_5 = df[df.actionType == 5].drop_duplicates(subset=['userid'])\n", 1036 | " last = df.drop_duplicates(subset=['userid'])\n", 1037 | " time_gap_last5 = pd.merge(last_5, last, on='userid', how='outer')\n", 1038 | " time_gap_last5['time_gap_last5'] = time_gap_last5.actionTime_y - time_gap_last5.actionTime_x\n", 1039 | " mydf = pd.merge(mydf, time_gap_last5[['userid', 'time_gap_last5']], on='userid', how='left')\n", 1040 | "\n", 1041 | " last_6 = df[df.actionType == 6].drop_duplicates(subset=['userid'])\n", 1042 | " last = df.drop_duplicates(subset=['userid'])\n", 1043 | " time_gap_last6 = pd.merge(last_6, last, on='userid', how='outer')\n", 1044 | " time_gap_last6['time_gap_last6'] = time_gap_last6.actionTime_y - time_gap_last6.actionTime_x\n", 1045 | " mydf = pd.merge(mydf, time_gap_last6[['userid', 'time_gap_last6']], on='userid', how='left')\n", 1046 | "\n", 1047 | " tmp = df[df.actionType.isin([5, 6])].copy()\n", 1048 | " tmp['actionTimeDiff'] = tmp['actionTime'].diff(1)\n", 1049 | " tmp = tmp.groupby('userid').apply(lambda x: x.iloc[1:, :]).reset_index(drop=True)\n", 1050 | " act_time_diff_stat = tmp.groupby('userid')['actionTimeDiff'].agg([min, max, np.mean, np.median, np.std, sum]).reset_index()\n", 1051 | " act_time_diff_stat.columns = [i if i == 'userid' else 'act_time_diff_56_%s' % i for i in act_time_diff_stat.columns]\n", 1052 | " mydf = pd.merge(mydf, act_time_diff_stat, on='userid', how='left')\n", 1053 | " \n", 1054 | " return mydf" 1055 | ] 1056 | }, 1057 | { 1058 | "cell_type": "code", 1059 | "execution_count": null, 1060 | "metadata": { 1061 | "collapsed": true 1062 | }, 1063 | "outputs": [], 1064 | "source": [ 1065 | "action_tr = pd.read_csv('../data/input/train/action_train.csv') # user action data\n", 1066 | "order_future_tr = pd.read_csv('../data/input/train/orderFuture_train.csv') # data to predict\n", 1067 | "order_history_tr = pd.read_csv('../data/input/train/orderHistory_train.csv') # user order history data\n", 1068 | "user_comment_tr = pd.read_csv('../data/input/train/userComment_train.csv') # user comment data\n", 1069 | "user_profile_tr = pd.read_csv('../data/input/train/userProfile_train.csv') # user profile data\n", 1070 | "\n", 1071 | "action_te = pd.read_csv('../data/input/test/action_test.csv')\n", 1072 | "order_future_te = pd.read_csv('../data/input/test/orderFuture_test.csv')\n", 1073 | "order_history_te = pd.read_csv('../data/input/test/orderHistory_test.csv')\n", 1074 | "user_comment_te = 
pd.read_csv('../data/input/test/userComment_test.csv')\n", 1075 | "user_profile_te = pd.read_csv('../data/input/test/userProfile_test.csv')\n", 1076 | "\n", 1077 | "action = pd.concat([action_tr, action_te], axis=0).reset_index(drop=True)\n", 1078 | "order_history = pd.concat([order_history_tr, order_history_te], axis=0).reset_index(drop=True)\n", 1079 | "user_comment = pd.concat([user_comment_tr, user_comment_te], axis=0).reset_index(drop=True)\n", 1080 | "user_profile = pd.concat([user_profile_tr, user_profile_te], axis=0).reset_index(drop=True)" 1081 | ] 1082 | }, 1083 | { 1084 | "cell_type": "code", 1085 | "execution_count": null, 1086 | "metadata": { 1087 | "collapsed": true 1088 | }, 1089 | "outputs": [], 1090 | "source": [ 1091 | "user_profile_feat = get_user_profile_feature(user_profile)\n", 1092 | "user_profile_feat.to_csv('../data/output/feat/%s' % 'user_profile', index=False)" 1093 | ] 1094 | }, 1095 | { 1096 | "cell_type": "code", 1097 | "execution_count": null, 1098 | "metadata": { 1099 | "collapsed": true 1100 | }, 1101 | "outputs": [], 1102 | "source": [ 1103 | "user_comment_feat = get_user_comment_feature(user_comment)\n", 1104 | "user_comment_feat.to_csv('../data/output/feat/%s' % 'user_comment', index=False)" 1105 | ] 1106 | }, 1107 | { 1108 | "cell_type": "code", 1109 | "execution_count": null, 1110 | "metadata": { 1111 | "collapsed": true 1112 | }, 1113 | "outputs": [], 1114 | "source": [ 1115 | "order_history_feat = get_order_history_feature(order_history)\n", 1116 | "order_history_feat.to_csv('../data/output/feat/%s' % 'order_history', index=False)" 1117 | ] 1118 | }, 1119 | { 1120 | "cell_type": "code", 1121 | "execution_count": null, 1122 | "metadata": { 1123 | "collapsed": true 1124 | }, 1125 | "outputs": [], 1126 | "source": [ 1127 | "order_history_last_w_feat = get_order_history_last_w_feature(order_history)\n", 1128 | "order_history_last_w_feat.to_csv('../data/output/feat/%s' % 'order_history_last_w', index=False)" 1129 | ] 1130 | }, 1131 | { 1132 | "cell_type": "code", 1133 | "execution_count": null, 1134 | "metadata": { 1135 | "collapsed": true 1136 | }, 1137 | "outputs": [], 1138 | "source": [ 1139 | "action_type_feat = get_action_type_feature(action)\n", 1140 | "action_type_feat.to_csv('../data/output/feat/%s' % 'action_type', index=False)" 1141 | ] 1142 | }, 1143 | { 1144 | "cell_type": "code", 1145 | "execution_count": null, 1146 | "metadata": { 1147 | "collapsed": true 1148 | }, 1149 | "outputs": [], 1150 | "source": [ 1151 | "action_type_based_on_time_feat = get_action_type_based_on_time_feature(action)\n", 1152 | "action_type_based_on_time_feat.to_csv('../data/output/feat/%s' % 'action_type_based_on_time', index=False)" 1153 | ] 1154 | }, 1155 | { 1156 | "cell_type": "code", 1157 | "execution_count": null, 1158 | "metadata": { 1159 | "collapsed": true, 1160 | "scrolled": true 1161 | }, 1162 | "outputs": [], 1163 | "source": [ 1164 | "for window in [3,4,5,6,7]:\n", 1165 | " util.log(window)\n", 1166 | " action_type_based_on_time_last_window_feat = get_action_type_based_on_time_last_window_feature(action, window)\n", 1167 | " action_type_based_on_time_last_window_feat.to_csv('../data/output/feat/%s%d' % ('action_type_based_on_time_last_window', window), index=False)" 1168 | ] 1169 | }, 1170 | { 1171 | "cell_type": "code", 1172 | "execution_count": null, 1173 | "metadata": { 1174 | "collapsed": true, 1175 | "scrolled": true 1176 | }, 1177 | "outputs": [], 1178 | "source": [ 1179 | "for window in [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 20]:\n", 
1180 | " util.log(window)\n", 1181 | " action_type_num_based_on_time_last_window_feat = get_action_type_num_based_on_time_last_window_feature(action, window)\n", 1182 | " action_type_num_based_on_time_last_window_feat.to_csv('../data/output/feat/%s%d' % ('action_type_num_based_on_time_last_window', window), index=False)" 1183 | ] 1184 | }, 1185 | { 1186 | "cell_type": "code", 1187 | "execution_count": null, 1188 | "metadata": { 1189 | "collapsed": true 1190 | }, 1191 | "outputs": [], 1192 | "source": [ 1193 | "for window in [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 20]:\n", 1194 | " util.log(window)\n", 1195 | " action_type_rate_based_on_time_last_window_feat = get_action_type_rate_based_on_time_last_window_feature(action, window)\n", 1196 | " action_type_rate_based_on_time_last_window_feat.to_csv('../data/output/feat/%s%d' % ('action_type_rate_based_on_time_last_window', window), index=False)" 1197 | ] 1198 | }, 1199 | { 1200 | "cell_type": "code", 1201 | "execution_count": null, 1202 | "metadata": { 1203 | "collapsed": true 1204 | }, 1205 | "outputs": [], 1206 | "source": [ 1207 | "for window in [6]:\n", 1208 | " util.log(window)\n", 1209 | " action_type_row_stat_based_on_time_last_window_feat = get_action_type_row_stat_based_on_time_last_window_feature(action, window)\n", 1210 | " action_type_row_stat_based_on_time_last_window_feat.to_csv('../data/output/feat/%s%d' % ('action_type_row_stat_based_on_time_last_window', window), index=False)" 1211 | ] 1212 | }, 1213 | { 1214 | "cell_type": "code", 1215 | "execution_count": null, 1216 | "metadata": { 1217 | "collapsed": true 1218 | }, 1219 | "outputs": [], 1220 | "source": [ 1221 | "for window in [4, 7, 13, 17, 20, 25, 30]:\n", 1222 | " util.log(window)\n", 1223 | " action_num_based_on_time_last_window_feat = get_action_num_based_on_time_last_window_feature(action, window)\n", 1224 | " action_num_based_on_time_last_window_feat.to_csv('../data/output/feat/%s%d' % ('action_num_based_on_time_last_window', window), index=False)" 1225 | ] 1226 | }, 1227 | { 1228 | "cell_type": "code", 1229 | "execution_count": null, 1230 | "metadata": { 1231 | "collapsed": true 1232 | }, 1233 | "outputs": [], 1234 | "source": [ 1235 | "for window in [4, 7, 13, 17, 20, 25, 30]:\n", 1236 | " util.log(window)\n", 1237 | " action_type_num_based_on_time_last_window_feat = get_action_type_num_based_on_time_last_window_feature(action, window)\n", 1238 | " action_type_num_based_on_time_last_window_feat.to_csv('../data/output/feat/%s%d' % ('action_type_num_based_on_time_last_window', window), index=False)" 1239 | ] 1240 | }, 1241 | { 1242 | "cell_type": "code", 1243 | "execution_count": null, 1244 | "metadata": { 1245 | "collapsed": true 1246 | }, 1247 | "outputs": [], 1248 | "source": [ 1249 | "action_time_based_on_time_feat = get_action_time_based_on_time_feature(action)\n", 1250 | "action_time_based_on_time_feat.to_csv('../data/output/feat/%s' % 'action_time_based_on_time', index=False)" 1251 | ] 1252 | }, 1253 | { 1254 | "cell_type": "code", 1255 | "execution_count": null, 1256 | "metadata": { 1257 | "collapsed": true 1258 | }, 1259 | "outputs": [], 1260 | "source": [ 1261 | "for window in [6]:\n", 1262 | " util.log(window)\n", 1263 | " action_time_based_on_time_last_window_feat = get_action_time_based_on_time_last_window_feature(action, window)\n", 1264 | " action_time_based_on_time_last_window_feat.to_csv('../data/output/feat/%s%d' % ('action_time_based_on_time_last_window', window), index=False)" 1265 | ] 1266 | }, 1267 | { 1268 | "cell_type": "code", 1269 
| "execution_count": null, 1270 | "metadata": { 1271 | "collapsed": true 1272 | }, 1273 | "outputs": [], 1274 | "source": [ 1275 | "for window in [3, 6, 10, 14]:\n", 1276 | " util.log(window)\n", 1277 | " action_time_row_stat_based_on_time_last_window_feat = get_action_time_row_stat_based_on_time_last_window_feature(action, window)\n", 1278 | " action_time_row_stat_based_on_time_last_window_feat.to_csv('../data/output/feat/%s%d' % ('action_time_row_stat_based_on_time_last_window', window), index=False)" 1279 | ] 1280 | }, 1281 | { 1282 | "cell_type": "code", 1283 | "execution_count": null, 1284 | "metadata": { 1285 | "collapsed": true 1286 | }, 1287 | "outputs": [], 1288 | "source": [ 1289 | "for window in [3, 4, 5, 6, 7, 8]:\n", 1290 | " util.log(window)\n", 1291 | " action_time_diff2_based_on_time_last_window_feat = get_action_time_diff2_based_on_time_last_window_feature(action, window)\n", 1292 | " action_time_diff2_based_on_time_last_window_feat.to_csv('../data/output/feat/%s%d' % ('action_time_diff2_based_on_time_last_window', window), index=False)" 1293 | ] 1294 | }, 1295 | { 1296 | "cell_type": "code", 1297 | "execution_count": null, 1298 | "metadata": { 1299 | "collapsed": true 1300 | }, 1301 | "outputs": [], 1302 | "source": [ 1303 | "for ttype in [1,5,6,7,8,9]:\n", 1304 | " for window in [6]:\n", 1305 | " util.log('type=%d window=%d' % (ttype, window))\n", 1306 | " action_time_based_on_time_last_window_on_type_feat = get_action_time_based_on_time_last_window_on_type_feature(action, window, ttype)\n", 1307 | " action_time_based_on_time_last_window_on_type_feat.to_csv('../data/output/feat/%s%d%s%d' % ('action_time_based_on_time_last_window', window, '_on_type', ttype), index=False)" 1308 | ] 1309 | }, 1310 | { 1311 | "cell_type": "code", 1312 | "execution_count": null, 1313 | "metadata": { 1314 | "collapsed": true 1315 | }, 1316 | "outputs": [], 1317 | "source": [ 1318 | "for window in [3, 4, 5, 6, 7, 8, 9, 10]:\n", 1319 | " util.log(window)\n", 1320 | " action_time_2order_based_on_time_last_window_feat = get_action_time_2order_based_on_time_last_window_feature(action, window)\n", 1321 | " action_time_2order_based_on_time_last_window_feat.to_csv('../data/output/feat/%s%d' % ('action_time_2order_based_on_time_last_window', window), index=False)" 1322 | ] 1323 | }, 1324 | { 1325 | "cell_type": "code", 1326 | "execution_count": null, 1327 | "metadata": { 1328 | "collapsed": true 1329 | }, 1330 | "outputs": [], 1331 | "source": [ 1332 | "for ttype in [1,5,6,7,8,9]:\n", 1333 | " for window in [4, 7, 10]:\n", 1334 | " util.log('type=%d window=%d' % (ttype, window))\n", 1335 | " action_real_time_based_on_time_last_window_on_type_feat = get_action_real_time_based_on_time_last_window_on_type_feature(action, window, ttype)\n", 1336 | " action_real_time_based_on_time_last_window_on_type_feat.to_csv('../data/output/feat/%s%d%s%d' % ('action_real_time_based_on_time_last_window', window, '_on_type', ttype), index=False)" 1337 | ] 1338 | }, 1339 | { 1340 | "cell_type": "code", 1341 | "execution_count": null, 1342 | "metadata": { 1343 | "collapsed": true 1344 | }, 1345 | "outputs": [], 1346 | "source": [ 1347 | "act_ord_time_diff_feat = get_act_ord_time_diff_feature(action, order_history)\n", 1348 | "act_ord_time_diff_feat.to_csv('../data/output/feat/%s' % 'action_order_time_diff', index=False)" 1349 | ] 1350 | }, 1351 | { 1352 | "cell_type": "code", 1353 | "execution_count": null, 1354 | "metadata": { 1355 | "collapsed": true 1356 | }, 1357 | "outputs": [], 1358 | "source": [ 1359 | 
"order_last_order_ydm_feat = get_order_last_order_ydm_feature(order_history)\n", 1360 | "order_last_order_ydm_feat.to_csv('../data/output/feat/%s' % 'order_last_order_ydm', index=False)" 1361 | ] 1362 | }, 1363 | { 1364 | "cell_type": "code", 1365 | "execution_count": null, 1366 | "metadata": { 1367 | "collapsed": true 1368 | }, 1369 | "outputs": [], 1370 | "source": [ 1371 | "order_type1_ydm_feat = get_order_type1_ydm_feature(order_history)\n", 1372 | "order_type1_ydm_feat.to_csv('../data/output/feat/%s' % 'order_type1_ydm', index=False)" 1373 | ] 1374 | }, 1375 | { 1376 | "cell_type": "code", 1377 | "execution_count": null, 1378 | "metadata": { 1379 | "collapsed": true 1380 | }, 1381 | "outputs": [], 1382 | "source": [ 1383 | "for window in [7,8,10,11]:\n", 1384 | " util.log(window)\n", 1385 | " act_ord_act_time_diff_last_window_feat = get_act_ord_act_time_diff_last_window_feature(action, order_history, window)\n", 1386 | " act_ord_act_time_diff_last_window_feat.to_csv('../data/output/feat/%s%d' % ('act_ord_act_time_diff_last_window', window), index=False)" 1387 | ] 1388 | }, 1389 | { 1390 | "cell_type": "code", 1391 | "execution_count": null, 1392 | "metadata": { 1393 | "collapsed": true 1394 | }, 1395 | "outputs": [], 1396 | "source": [ 1397 | "for window in [2,4]:\n", 1398 | " util.log(window)\n", 1399 | " act_ord_type1_act_time_diff_last_window_feat = get_act_ord_type1_act_time_diff_last_window_feature(action, order_history, window)\n", 1400 | " act_ord_type1_act_time_diff_last_window_feat.to_csv('../data/output/feat/%s%d' % ('act_ord_type1_act_time_diff_last_window', window), index=False)" 1401 | ] 1402 | }, 1403 | { 1404 | "cell_type": "code", 1405 | "execution_count": null, 1406 | "metadata": { 1407 | "collapsed": true 1408 | }, 1409 | "outputs": [], 1410 | "source": [ 1411 | "get_action_sequence_time_diff_feature(action)" 1412 | ] 1413 | }, 1414 | { 1415 | "cell_type": "code", 1416 | "execution_count": null, 1417 | "metadata": { 1418 | "collapsed": true 1419 | }, 1420 | "outputs": [], 1421 | "source": [ 1422 | "get_action_sequence_time_stat_feature(action)" 1423 | ] 1424 | }, 1425 | { 1426 | "cell_type": "code", 1427 | "execution_count": null, 1428 | "metadata": { 1429 | "collapsed": true 1430 | }, 1431 | "outputs": [], 1432 | "source": [ 1433 | "for window in [6]:\n", 1434 | " util.log(window)\n", 1435 | " action_time_diff_234_56789_last_window_feat = get_action_time_diff_234_56789_last_window_feature(action, window)\n", 1436 | " action_time_diff_234_56789_last_window_feat.to_csv('../data/output/feat/%s%d' % ('action_time_diff_234_56789_last_window', window), index=False)" 1437 | ] 1438 | }, 1439 | { 1440 | "cell_type": "code", 1441 | "execution_count": null, 1442 | "metadata": { 1443 | "collapsed": true 1444 | }, 1445 | "outputs": [], 1446 | "source": [ 1447 | "action_stat_last_every_type_feat = get_action_stat_last_every_type_feature(action)\n", 1448 | "action_stat_last_every_type_feat.to_csv('../data/output/feat/%s' % 'action_stat_last_every_type', index=False)" 1449 | ] 1450 | }, 1451 | { 1452 | "cell_type": "code", 1453 | "execution_count": null, 1454 | "metadata": { 1455 | "collapsed": true 1456 | }, 1457 | "outputs": [], 1458 | "source": [ 1459 | "act_ord_before_type1_stat_feat = get_act_ord_before_type1_stat_feature(action, order_history)\n", 1460 | "act_ord_before_type1_stat_feat.to_csv('../data/output/feat/%s' % 'act_ord_before_type1_stat', index=False)" 1461 | ] 1462 | }, 1463 | { 1464 | "cell_type": "code", 1465 | "execution_count": null, 1466 | "metadata": { 1467 | 
"collapsed": true 1468 | }, 1469 | "outputs": [], 1470 | "source": [ 1471 | "action_time_diff_stat_feat = get_action_time_diff_stat_feature(action) # untest\n", 1472 | "action_time_diff_stat_feat.to_csv('../data/output/feat/%s' % 'action_time_diff_stat', index=False)" 1473 | ] 1474 | }, 1475 | { 1476 | "cell_type": "code", 1477 | "execution_count": null, 1478 | "metadata": { 1479 | "collapsed": true 1480 | }, 1481 | "outputs": [], 1482 | "source": [ 1483 | "for window in [3, 4, 5, 6, 7, 8, 9]:\n", 1484 | " util.log(window)\n", 1485 | " action_time_diff_stat_last_window_feat = get_action_time_diff_stat_last_window_feature(action, window)\n", 1486 | " action_time_diff_stat_last_window_feat.to_csv('../data/output/feat/%s%d' % ('action_time_diff_stat_last_window', window), index=False)" 1487 | ] 1488 | }, 1489 | { 1490 | "cell_type": "code", 1491 | "execution_count": null, 1492 | "metadata": { 1493 | "collapsed": true 1494 | }, 1495 | "outputs": [], 1496 | "source": [ 1497 | "action_time_last_on_every_type_feat = get_action_time_last_on_every_type_feature(action)\n", 1498 | "action_time_last_on_every_type_feat.to_csv('../data/output/feat/%s' % 'action_time_last_on_every_type', index=False)" 1499 | ] 1500 | }, 1501 | { 1502 | "cell_type": "code", 1503 | "execution_count": null, 1504 | "metadata": { 1505 | "collapsed": true 1506 | }, 1507 | "outputs": [], 1508 | "source": [ 1509 | "try_feat = get_try_feat(action)\n", 1510 | "try_feat.to_csv('../data/output/feat/%s' % 'try', index=False)" 1511 | ] 1512 | } 1513 | ], 1514 | "metadata": { 1515 | "kernelspec": { 1516 | "display_name": "Python [default]", 1517 | "language": "python", 1518 | "name": "python2" 1519 | }, 1520 | "language_info": { 1521 | "codemirror_mode": { 1522 | "name": "ipython", 1523 | "version": 2 1524 | }, 1525 | "file_extension": ".py", 1526 | "mimetype": "text/x-python", 1527 | "name": "python", 1528 | "nbconvert_exporter": "python", 1529 | "pygments_lexer": "ipython2", 1530 | "version": "2.7.13" 1531 | } 1532 | }, 1533 | "nbformat": 4, 1534 | "nbformat_minor": 1 1535 | } 1536 | -------------------------------------------------------------------------------- /src/model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "C:\\ProgramData\\Anaconda2\\lib\\site-packages\\sklearn\\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. 
This module will be removed in 0.20.\n", 13 | " \"This module will be removed in 0.20.\", DeprecationWarning)\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "from __future__ import division\n", 19 | "import numpy as np\n", 20 | "import pandas as pd\n", 21 | "from sklearn import preprocessing\n", 22 | "import xgboost as xgb\n", 23 | "import lightgbm as lgb\n", 24 | "import catboost as cb\n", 25 | "import time\n", 26 | "import datetime\n", 27 | "import warnings\n", 28 | "warnings.filterwarnings('ignore')\n", 29 | "import util" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 3, 35 | "metadata": { 36 | "collapsed": true 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "def merge_feature(\n", 41 | " act_type_window,\n", 42 | " act_type_num_window,\n", 43 | " act_type_rate_window,\n", 44 | " act_type_row_stat_window,\n", 45 | " act_time_window,\n", 46 | " act_time_1type_window,\n", 47 | " act_ord_act_time_diff_window,\n", 48 | " action_sequence_time_diff_window,\n", 49 | " action_time_diff_234_56789_window,\n", 50 | " action_time_diff_stat_window\n", 51 | "):\n", 52 | " util.log('Merge feature...')\n", 53 | " \n", 54 | " order_future_tr = pd.read_csv('../data/input/train/orderFuture_train.csv')\n", 55 | " order_future_te = pd.read_csv('../data/input/test/orderFuture_test.csv')\n", 56 | "\n", 57 | " user_profile = pd.read_csv('../data/output/feat/%s' % 'user_profile')\n", 58 | " train = pd.merge(order_future_tr, user_profile, on='userid', how='left')\n", 59 | " test = pd.merge(order_future_te, user_profile, on='userid', how='left')\n", 60 | " \n", 61 | " user_comment = pd.read_csv('../data/output/feat/%s' % 'user_comment')\n", 62 | " train = pd.merge(train, user_comment, on='userid', how='left')\n", 63 | " test = pd.merge(test, user_comment, on='userid', how='left')\n", 64 | " \n", 65 | " order_history = pd.read_csv('../data/output/feat/%s' % 'order_history')\n", 66 | " train = pd.merge(train, order_history, on='userid', how='left')\n", 67 | " test = pd.merge(test, order_history, on='userid', how='left')\n", 68 | " \n", 69 | "# order_history_last_w = pd.read_csv('../data/output/feat/%s' % 'order_history_last_w')\n", 70 | "# train = pd.merge(train, order_history_last_w, on='userid', how='left')\n", 71 | "# test = pd.merge(test, order_history_last_w, on='userid', how='left')\n", 72 | " \n", 73 | " action_type = pd.read_csv('../data/output/feat/%s' % 'action_type')\n", 74 | " train = pd.merge(train, action_type, on='userid', how='left')\n", 75 | " test = pd.merge(test, action_type, on='userid', how='left')\n", 76 | " \n", 77 | " action_type_based_on_time = pd.read_csv('../data/output/feat/%s' % 'action_type_based_on_time')\n", 78 | " train = pd.merge(train, action_type_based_on_time, on='userid', how='left')\n", 79 | " test = pd.merge(test, action_type_based_on_time, on='userid', how='left')\n", 80 | " \n", 81 | " util.log('act_type_window=' + str(act_type_window))\n", 82 | " window = act_type_window\n", 83 | " action_type_based_on_time_last_window = pd.read_csv('../data/output/feat/%s%d' % ('action_type_based_on_time_last_window', window))\n", 84 | " train = pd.merge(train, action_type_based_on_time_last_window, on='userid', how='left')\n", 85 | " test = pd.merge(test, action_type_based_on_time_last_window, on='userid', how='left')\n", 86 | " \n", 87 | " util.log('act_type_num_window=' + str(act_type_num_window))\n", 88 | " window = act_type_num_window\n", 89 | " action_type_num_based_on_time_last_window = pd.read_csv('../data/output/feat/%s%d' % 
('action_type_num_based_on_time_last_window', window))\n", 90 | " train = pd.merge(train, action_type_num_based_on_time_last_window, on='userid', how='left')\n", 91 | " test = pd.merge(test, action_type_num_based_on_time_last_window, on='userid', how='left')\n", 92 | " \n", 93 | " util.log('act_type_rate_window=' + str(act_type_rate_window))\n", 94 | " window = act_type_rate_window\n", 95 | " action_type_rate_based_on_time_last_window = pd.read_csv('../data/output/feat/%s%d' % ('action_type_rate_based_on_time_last_window', window))\n", 96 | " train = pd.merge(train, action_type_rate_based_on_time_last_window, on='userid', how='left')\n", 97 | " test = pd.merge(test, action_type_rate_based_on_time_last_window, on='userid', how='left')\n", 98 | " \n", 99 | " util.log('act_type_row_stat_window=' + str(act_type_row_stat_window))\n", 100 | " window = act_type_row_stat_window\n", 101 | " action_type_row_stat_based_on_time_last_window_feat = pd.read_csv('../data/output/feat/%s%d' % ('action_type_row_stat_based_on_time_last_window', window))\n", 102 | " train = pd.merge(train, action_type_row_stat_based_on_time_last_window_feat, on='userid', how='left')\n", 103 | " test = pd.merge(test, action_type_row_stat_based_on_time_last_window_feat, on='userid', how='left')\n", 104 | " \n", 105 | "# util.log('action_num_window=' + str(action_num_window))\n", 106 | "# window = action_num_window\n", 107 | "# action_num_based_on_time_last_window = pd.read_csv('../data/output/feat/%s%d' % ('action_num_based_on_time_last_window', window))\n", 108 | "# train = pd.merge(train, action_num_based_on_time_last_window, on='userid', how='left')\n", 109 | "# test = pd.merge(test, action_num_based_on_time_last_window, on='userid', how='left')\n", 110 | "\n", 111 | "# util.log('action_type_num_window=' + str(action_type_num_window))\n", 112 | "# window = action_type_num_window\n", 113 | "# action_type_num_based_on_time_last_window = pd.read_csv('../data/output/feat/%s%d' % ('action_type_num_based_on_time_last_window', window))\n", 114 | "# train = pd.merge(train, action_type_num_based_on_time_last_window, on='userid', how='left')\n", 115 | "# test = pd.merge(test, action_type_num_based_on_time_last_window, on='userid', how='left')\n", 116 | "\n", 117 | " action_time_based_on_time = pd.read_csv('../data/output/feat/%s' % 'action_time_based_on_time')\n", 118 | " train = pd.merge(train, action_time_based_on_time, on='userid', how='left')\n", 119 | " test = pd.merge(test, action_time_based_on_time, on='userid', how='left')\n", 120 | " \n", 121 | " util.log('act_time_window=' + str(act_time_window))\n", 122 | " window = act_time_window\n", 123 | " action_time_based_on_time_last_window = pd.read_csv('../data/output/feat/%s%d' % ('action_time_based_on_time_last_window', window))\n", 124 | " train = pd.merge(train, action_time_based_on_time_last_window, on='userid', how='left')\n", 125 | " test = pd.merge(test, action_time_based_on_time_last_window, on='userid', how='left')\n", 126 | " \n", 127 | "# util.log('act_time_row_stat_window=' + str(act_time_row_stat_window))\n", 128 | "# window = act_time_row_stat_window\n", 129 | "# action_time_row_stat_based_on_time_last_window = pd.read_csv('../data/output/feat/%s%d' % ('action_time_row_stat_based_on_time_last_window', window))\n", 130 | "# train = pd.merge(train, action_time_row_stat_based_on_time_last_window, on='userid', how='left')\n", 131 | "# test = pd.merge(test, action_time_row_stat_based_on_time_last_window, on='userid', how='left')\n", 132 | " \n", 133 | "# 
util.log('action_time_diff2_window=' + str(action_time_diff2_window))\n", 134 | "# window = action_time_diff2_window\n", 135 | "# action_time_diff2_based_on_time_last_window = pd.read_csv('../data/output/feat/%s%d' % ('action_time_diff2_based_on_time_last_window', window))\n", 136 | "# train = pd.merge(train, action_time_diff2_based_on_time_last_window, on='userid', how='left')\n", 137 | "# test = pd.merge(test, action_time_diff2_based_on_time_last_window, on='userid', how='left')\n", 138 | "\n", 139 | " util.log('act_time_1type_window=%d' % act_time_1type_window)\n", 140 | " window = act_time_1type_window\n", 141 | " for ttype in [1, 5, 6, 7, 8, 9]:\n", 142 | " action_time_based_on_time_last_window_on_type = pd.read_csv('../data/output/feat/%s%d%s%d' % ('action_time_based_on_time_last_window', window, '_on_type', ttype))\n", 143 | " train = pd.merge(train, action_time_based_on_time_last_window_on_type, on='userid', how='left')\n", 144 | " test = pd.merge(test, action_time_based_on_time_last_window_on_type, on='userid', how='left')\n", 145 | " \n", 146 | "# util.log('action_time_2order_window=' + str(action_time_2order_window))\n", 147 | "# window = action_time_2order_window\n", 148 | "# action_time_2order_based_on_time_last_window = pd.read_csv('../data/output/feat/%s%d' % ('action_time_2order_based_on_time_last_window', window))\n", 149 | "# train = pd.merge(train, action_time_2order_based_on_time_last_window, on='userid', how='left')\n", 150 | "# test = pd.merge(test, action_time_2order_based_on_time_last_window, on='userid', how='left')\n", 151 | "\n", 152 | "# util.log('act_real_time_1type_window=%d' % act_real_time_1type_window)\n", 153 | "# window = act_real_time_1type_window\n", 154 | "# for ttype in [1, 5, 6, 7, 8, 9]:\n", 155 | "# action_real_time_based_on_time_last_window_on_type = pd.read_csv('../data/output/feat/%s%d%s%d' % ('action_real_time_based_on_time_last_window', window, '_on_type', ttype))\n", 156 | "# train = pd.merge(train, action_real_time_based_on_time_last_window_on_type, on='userid', how='left')\n", 157 | "# test = pd.merge(test, action_real_time_based_on_time_last_window_on_type, on='userid', how='left')\n", 158 | "\n", 159 | "# action_order_time_diff = pd.read_csv('../data/output/feat/%s' % 'action_order_time_diff')\n", 160 | "# train = pd.merge(train, action_order_time_diff, on='userid', how='left')\n", 161 | "# test = pd.merge(test, action_order_time_diff, on='userid', how='left')\n", 162 | "\n", 163 | "# order_last_order_ydm = pd.read_csv('../data/output/feat/%s' % 'order_last_order_ydm')\n", 164 | "# train = pd.merge(train, order_last_order_ydm, on='userid', how='left')\n", 165 | "# test = pd.merge(test, order_last_order_ydm, on='userid', how='left')\n", 166 | "\n", 167 | " order_type1_ydm = pd.read_csv('../data/output/feat/%s' % 'order_type1_ydm')\n", 168 | " train = pd.merge(train, order_type1_ydm, on='userid', how='left')\n", 169 | " test = pd.merge(test, order_type1_ydm, on='userid', how='left')\n", 170 | "\n", 171 | " util.log('act_ord_act_time_diff_window=' + str(act_ord_act_time_diff_window))\n", 172 | " window = act_ord_act_time_diff_window\n", 173 | " act_ord_act_time_diff_last_window = pd.read_csv('../data/output/feat/%s%d' % ('act_ord_act_time_diff_last_window', window))\n", 174 | " train = pd.merge(train, act_ord_act_time_diff_last_window, on='userid', how='left')\n", 175 | " test = pd.merge(test, act_ord_act_time_diff_last_window, on='userid', how='left')\n", 176 | "\n", 177 | "# util.log('act_ord_type1_act_time_diff_window=' + 
str(act_ord_type1_act_time_diff_window))\n", 178 | "# window = act_ord_type1_act_time_diff_window\n", 179 | "# act_ord_type1_act_time_diff_last_window = pd.read_csv('../data/output/feat/%s%d' % ('act_ord_type1_act_time_diff_last_window', window))\n", 180 | "# train = pd.merge(train, act_ord_type1_act_time_diff_last_window, on='userid', how='left')\n", 181 | "# test = pd.merge(test, act_ord_type1_act_time_diff_last_window, on='userid', how='left')\n", 182 | "\n", 183 | " util.log('action_sequence_time_diff_window=' + str(action_sequence_time_diff_window))\n", 184 | " window = action_sequence_time_diff_window\n", 185 | " action_sequence_time_diff_window = pd.read_csv('../data/output/feat/%s%d' % ('action_sequence_time_diff_window', window))\n", 186 | " train = pd.merge(train, action_sequence_time_diff_window, on='userid', how='left')\n", 187 | " test = pd.merge(test, action_sequence_time_diff_window, on='userid', how='left')\n", 188 | "\n", 189 | "# action_sequence_time_stat_last123 = pd.read_csv('../data/output/feat/%s' % 'action_sequence_time_stat_last123')\n", 190 | "# train = pd.merge(train, action_sequence_time_stat_last123, on='userid', how='left')\n", 191 | "# test = pd.merge(test, action_sequence_time_stat_last123, on='userid', how='left')\n", 192 | "\n", 193 | " util.log('action_time_diff_234_56789_window=' + str(action_time_diff_234_56789_window))\n", 194 | " window = action_time_diff_234_56789_window\n", 195 | " action_time_diff_234_56789_last_window = pd.read_csv('../data/output/feat/%s%d' % ('action_time_diff_234_56789_last_window', window))\n", 196 | " train = pd.merge(train, action_time_diff_234_56789_last_window, on='userid', how='left')\n", 197 | " test = pd.merge(test, action_time_diff_234_56789_last_window, on='userid', how='left')\n", 198 | " \n", 199 | "# action_stat_last_every_type = pd.read_csv('../data/output/feat/%s' % 'action_stat_last_every_type')\n", 200 | "# train = pd.merge(train, action_stat_last_every_type, on='userid', how='left')\n", 201 | "# test = pd.merge(test, action_stat_last_every_type, on='userid', how='left')\n", 202 | "\n", 203 | "# act_ord_before_type1_stat = pd.read_csv('../data/output/feat/%s' % 'act_ord_before_type1_stat')\n", 204 | "# train = pd.merge(train, act_ord_before_type1_stat, on='userid', how='left')\n", 205 | "# test = pd.merge(test, act_ord_before_type1_stat, on='userid', how='left')\n", 206 | "\n", 207 | " action_time_diff_stat = pd.read_csv('../data/output/feat/%s' % 'action_time_diff_stat')\n", 208 | " train = pd.merge(train, action_time_diff_stat, on='userid', how='left')\n", 209 | " test = pd.merge(test, action_time_diff_stat, on='userid', how='left')\n", 210 | "\n", 211 | " util.log('action_time_diff_stat_window=' + str(action_time_diff_stat_window))\n", 212 | " window = action_time_diff_stat_window\n", 213 | " action_time_diff_stat_last_window = pd.read_csv('../data/output/feat/%s%d' % ('action_time_diff_stat_last_window', window))\n", 214 | " train = pd.merge(train, action_time_diff_stat_last_window, on='userid', how='left')\n", 215 | " test = pd.merge(test, action_time_diff_stat_last_window, on='userid', how='left')\n", 216 | " \n", 217 | "# action_time_last_on_every_type = pd.read_csv('../data/output/feat/%s' % 'action_time_last_on_every_type')\n", 218 | "# train = pd.merge(train, action_time_last_on_every_type, on='userid', how='left')\n", 219 | "# test = pd.merge(test, action_time_last_on_every_type, on='userid', how='left')\n", 220 | "\n", 221 | " # bjw: 1 if the userid appears in the comment data but not in the order data\n", 222 | " bjw_train = 
pd.read_csv('../data/output/feat/bjw/train_fea.csv')\n", 223 | " bjw_test = pd.read_csv('../data/output/feat/bjw/test_fea.csv')\n", 224 | " train = pd.merge(train, bjw_train, on='userid', how='left')\n", 225 | " test = pd.merge(test, bjw_test, on='userid', how='left')\n", 226 | " \n", 227 | " # features open-sourced by other teams, partially re-implemented based on my own understanding\n", 228 | " tryy = pd.read_csv('../data/output/feat/%s' % 'try')\n", 229 | " train = pd.merge(train, tryy, on='userid', how='left')\n", 230 | " test = pd.merge(test, tryy, on='userid', how='left')\n", 231 | " \n", 232 | " # bjw's features\n", 233 | " bjw_train = pd.read_csv('../data/output/feat/bjw/all_features_train.csv').drop(['Unnamed: 0', 'orderType'], axis=1)\n", 234 | " bjw_train.columns = ['userid' if i == 0 else i for i in range(len(bjw_train.columns))]\n", 235 | " bjw_test = pd.read_csv('../data/output/feat/bjw/all_features_test.csv').drop(['Unnamed: 0'], axis=1)\n", 236 | " bjw_test.columns = ['userid' if i == 0 else i for i in range(len(bjw_test.columns))]\n", 237 | " train = pd.merge(train, bjw_train, on='userid', how='left')\n", 238 | " test = pd.merge(test, bjw_test, on='userid', how='left')\n", 239 | " \n", 240 | "#################################################################################################################\n", 241 | " \n", 242 | " # used only to build cross features; dropped again after use\n", 243 | " window = 1\n", 244 | " for ttype in [1, 2, 3, 4, 5, 6, 7, 8, 9]:\n", 245 | " action_real_time_based_on_time_last_window_on_type = pd.read_csv('../data/output/feat/%s%d%s%d' % ('action_real_time_based_on_time_last_window', window, '_on_type', ttype))\n", 246 | " train = pd.merge(train, action_real_time_based_on_time_last_window_on_type, on='userid', how='left')\n", 247 | " test = pd.merge(test, action_real_time_based_on_time_last_window_on_type, on='userid', how='left')\n", 248 | "\n", 249 | " train, test = cross_feature(train, test)\n", 250 | " \n", 251 | " train, test = drop_duplicate_column(train, test)\n", 252 | " \n", 253 | " train_feature = train.drop(['orderType'], axis = 1)\n", 254 | " train_label = train.orderType.values\n", 255 | " test_feature = test\n", 256 | " test_index = test.userid.values\n", 257 | " \n", 258 | " return train_feature, train_label, test_feature, test_index" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": 4, 264 | "metadata": { 265 | "collapsed": true 266 | }, 267 | "outputs": [], 268 | "source": [ 269 | "def cross_feature(train, test):\n", 270 | " util.log('Cross feature...')\n", 271 | " \n", 272 | " # time gaps between the latest action and the latest order\n", 273 | " train['act_last_time-ord_last_time'] = train['act_last_time'] - train['ord_last_time']\n", 274 | " train['act_last_time-ord_type0_time_max'] = train['act_last_time'] - train['ord_type0_time_max']\n", 275 | " train['act_last_time-ord_type1_time_max'] = train['act_last_time'] - train['ord_type1_time_max']\n", 276 | " test['act_last_time-ord_last_time'] = test['act_last_time'] - test['ord_last_time']\n", 277 | " test['act_last_time-ord_type0_time_max'] = test['act_last_time'] - test['ord_type0_time_max']\n", 278 | " test['act_last_time-ord_type1_time_max'] = test['act_last_time'] - test['ord_type1_time_max']\n", 279 | " \n", 280 | " # time gaps between the earliest action and the earliest order\n", 281 | " train['act_first_time-ord_first_time'] = train['act_first_time'] - train['ord_first_time']\n", 282 | " train['act_first_time-ord_type0_time_min'] = train['act_first_time'] - train['ord_type0_time_min']\n", 283 | " train['act_first_time-ord_type1_time_min'] = train['act_first_time'] - train['ord_type1_time_min']\n", 284 | " 
test['act_first_time-ord_first_time'] = test['act_first_time'] - test['ord_first_time']\n", 285 | " test['act_first_time-ord_type0_time_min'] = test['act_first_time'] - test['ord_type0_time_min']\n", 286 | " test['act_first_time-ord_type1_time_min'] = test['act_first_time'] - test['ord_type1_time_min']\n", 287 | " \n", 288 | " # time gap between the latest action and the latest action of each type + the same for the earliest action\n", 289 | " for ttype in [1, 2, 3, 4, 5, 6, 7, 8, 9]:\n", 290 | " train['act_last_time-act_time(rank_1)(window_1)(type_%d)' % ttype] = train['act_last_time'] - train['act_time(rank_1)(window_1)(type_%d)' % ttype]\n", 291 | " train['act_first_time-act_time(rank_1)(window_1)(type_%d)' % ttype] = train['act_first_time'] - train['act_time(rank_1)(window_1)(type_%d)' % ttype]\n", 292 | " test['act_last_time-act_time(rank_1)(window_1)(type_%d)' % ttype] = test['act_last_time'] - test['act_time(rank_1)(window_1)(type_%d)' % ttype]\n", 293 | " test['act_first_time-act_time(rank_1)(window_1)(type_%d)' % ttype] = test['act_first_time'] - test['act_time(rank_1)(window_1)(type_%d)' % ttype]\n", 294 | " train = train.drop(['act_time(rank_1)(window_1)(type_%d)' % ttype], axis=1)\n", 295 | " test = test.drop(['act_time(rank_1)(window_1)(type_%d)' % ttype], axis=1)\n", 296 | "\n", 297 | " # whether the user has ordered the premium service * time of the latest action\n", 298 | " tmp = train['ord_num(type_1)'].copy()\n", 299 | " tmp[tmp > 1] = 1\n", 300 | " tmp = pd.get_dummies(tmp.fillna(-1))\n", 301 | " tmp.columns = ['has_ord_serv_nan', 'has_ord_serv_no', 'has_ord_serv_yes']\n", 302 | " train = pd.concat([train, tmp.mul(train['act_last_time'], axis=0)], axis=1)\n", 303 | " tmp = test['ord_num(type_1)'].copy()\n", 304 | " tmp[tmp > 1] = 1\n", 305 | " tmp = pd.get_dummies(tmp.fillna(-1))\n", 306 | " tmp.columns = ['has_ord_serv_nan', 'has_ord_serv_no', 'has_ord_serv_yes']\n", 307 | " test = pd.concat([test, tmp.mul(test['act_last_time'], axis=0)], axis=1)\n", 308 | " \n", 309 | " # whether the user has ordered the premium service * number of actions of each type\n", 310 | " tmp = train['ord_num(type_1)'].copy()\n", 311 | " tmp[tmp > 1] = 1\n", 312 | " tmp = pd.get_dummies(tmp.fillna(-1))\n", 313 | " tmp.columns = ['has_ord_serv_nan', 'has_ord_serv_no', 'has_ord_serv_yes']\n", 314 | " for ttype in [1, 2, 3, 4, 5, 6, 7, 8, 9]:\n", 315 | " train = train.join(tmp.mul(train['act_num(type_%d)' % ttype], axis=0), rsuffix='*act_num(type_%d)' % ttype)\n", 316 | " tmp = test['ord_num(type_1)'].copy()\n", 317 | " tmp[tmp > 1] = 1\n", 318 | " tmp = pd.get_dummies(tmp.fillna(-1))\n", 319 | " tmp.columns = ['has_ord_serv_nan', 'has_ord_serv_no', 'has_ord_serv_yes']\n", 320 | " for ttype in [1, 2, 3, 4, 5, 6, 7, 8, 9]:\n", 321 | " test = test.join(tmp.mul(test['act_num(type_%d)' % ttype], axis=0), rsuffix='*act_num(type_%d)' % ttype)\n", 322 | " \n", 323 | "# # time gap between the latest order and the latest action of each type + the same for the earliest order (all/0/1)\n", 324 | "# for ttype in [1, 2, 3, 4, 5, 6, 7, 8, 9]:\n", 325 | "# train['ord_last_time-act_time(rank_1)(window_1)(type_%d)' % ttype] = train['ord_last_time'] - train['act_time(rank_1)(window_1)(type_%d)' % ttype]\n", 326 | "# train['ord_type0_time_max-act_time(rank_1)(window_1)(type_%d)' % ttype] = train['ord_type0_time_max'] - train['act_time(rank_1)(window_1)(type_%d)' % ttype]\n", 327 | "# train['ord_type1_time_max-act_time(rank_1)(window_1)(type_%d)' % ttype] = train['ord_type1_time_max'] - train['act_time(rank_1)(window_1)(type_%d)' % ttype]\n", 328 | "# train['ord_first_time-act_time(rank_1)(window_1)(type_%d)' % ttype] = train['ord_first_time'] - 
train['act_time(rank_1)(window_1)(type_%d)' % ttype]\n", 329 | "# train['ord_type0_time_min-act_time(rank_1)(window_1)(type_%d)' % ttype] = train['ord_type0_time_min'] - train['act_time(rank_1)(window_1)(type_%d)' % ttype]\n", 330 | "# train['ord_type1_time_min-act_time(rank_1)(window_1)(type_%d)' % ttype] = train['ord_type1_time_min'] - train['act_time(rank_1)(window_1)(type_%d)' % ttype]\n", 331 | "# test['ord_last_time-act_time(rank_1)(window_1)(type_%d)' % ttype] = test['ord_last_time'] - test['act_time(rank_1)(window_1)(type_%d)' % ttype]\n", 332 | "# test['ord_type0_time_max-act_time(rank_1)(window_1)(type_%d)' % ttype] = test['ord_type0_time_max'] - test['act_time(rank_1)(window_1)(type_%d)' % ttype]\n", 333 | "# test['ord_type1_time_max-act_time(rank_1)(window_1)(type_%d)' % ttype] = test['ord_type1_time_max'] - test['act_time(rank_1)(window_1)(type_%d)' % ttype]\n", 334 | "# test['ord_first_time-act_time(rank_1)(window_1)(type_%d)' % ttype] = test['ord_first_time'] - test['act_time(rank_1)(window_1)(type_%d)' % ttype]\n", 335 | "# test['ord_type0_time_min-act_time(rank_1)(window_1)(type_%d)' % ttype] = test['ord_type0_time_min'] - test['act_time(rank_1)(window_1)(type_%d)' % ttype]\n", 336 | "# test['ord_type1_time_min-act_time(rank_1)(window_1)(type_%d)' % ttype] = test['ord_type1_time_min'] - test['act_time(rank_1)(window_1)(type_%d)' % ttype]\n", 337 | " \n", 338 | " return train, test" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": 5, 344 | "metadata": { 345 | "collapsed": true 346 | }, 347 | "outputs": [], 348 | "source": [ 349 | "def drop_duplicate_column(train, test):\n", 350 | " util.log('Drop duplicate column...')\n", 351 | " \n", 352 | " train = train.drop(['act_type(rank_1)(window6)'], axis=1) # window9\n", 353 | " test = test.drop(['act_type(rank_1)(window6)'], axis=1)\n", 354 | " \n", 355 | " return train, test" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": 6, 361 | "metadata": { 362 | "collapsed": true 363 | }, 364 | "outputs": [], 365 | "source": [ 366 | "def lgb_cv(train_feature, train_label, params, folds, rounds):\n", 367 | " start = time.clock()\n", 368 | " print train_feature.columns\n", 369 | " dtrain = lgb.Dataset(train_feature, label=train_label)\n", 370 | " num_round = rounds\n", 371 | " print 'run cv: ' + 'round: ' + str(rounds)\n", 372 | " res = lgb.cv(params, dtrain, num_round, nfold=folds, verbose_eval=20, early_stopping_rounds=100)\n", 373 | " elapsed = (time.clock() - start)\n", 374 | " print 'Time used:', elapsed, 's'\n", 375 | " return len(res['auc-mean']), res['auc-mean'][len(res['auc-mean']) - 1]\n", 376 | "\n", 377 | "\n", 378 | "def lgb_predict(train_feature, train_label, test_feature, rounds, params):\n", 379 | " dtrain = lgb.Dataset(train_feature, label=train_label)\n", 380 | " valid_sets = [dtrain]\n", 381 | " num_round = rounds\n", 382 | " model = lgb.train(params, dtrain, num_round, valid_sets, verbose_eval=50)\n", 383 | " predict = model.predict(test_feature)\n", 384 | " return model, predict\n", 385 | "\n", 386 | "\n", 387 | "def store_result(test_index, pred, name):\n", 388 | " result = pd.DataFrame({'userid': test_index, 'orderType': pred})\n", 389 | " result.to_csv('../data/output/sub/' + name + '.csv', index=0, columns=['userid', 'orderType'])\n", 390 | " return result" 391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": 7, 396 | "metadata": {}, 397 | "outputs": [ 398 | { 399 | "name": "stdout", 400 | "output_type": "stream", 401 | "text": [ 402 | 
"2018-02-10 11:57:35 Merge feature...\n", 403 | "2018-02-10 11:57:36 act_type_window=6\n", 404 | "2018-02-10 11:57:36 act_type_num_window=6\n", 405 | "2018-02-10 11:57:36 act_type_rate_window=3\n", 406 | "2018-02-10 11:57:36 act_type_row_stat_window=6\n", 407 | "2018-02-10 11:57:36 act_time_window=6\n", 408 | "2018-02-10 11:57:36 act_time_1type_window=6\n", 409 | "2018-02-10 11:57:37 act_ord_act_time_diff_window=6\n", 410 | "2018-02-10 11:57:37 action_sequence_time_diff_window=6\n", 411 | "2018-02-10 11:57:37 action_time_diff_234_56789_window=6\n", 412 | "2018-02-10 11:57:38 action_time_diff_stat_window=3\n", 413 | "2018-02-10 11:57:42 Cross feature...\n", 414 | "2018-02-10 11:57:47 Drop duplicate column...\n", 415 | "(40307, 599) (40307L,) (10076, 599)\n" 416 | ] 417 | } 418 | ], 419 | "source": [ 420 | "train_feature, train_label, test_feature, test_index = merge_feature(6, 6, 3, 6, 6, 6, 6, 6, 6, 3)\n", 421 | "print train_feature.shape, train_label.shape, test_feature.shape" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": 8, 427 | "metadata": { 428 | "collapsed": true 429 | }, 430 | "outputs": [], 431 | "source": [ 432 | "config = {\n", 433 | " 'rounds': 10000,\n", 434 | " 'folds': 5\n", 435 | "}\n", 436 | "\n", 437 | "params_lgb = {\n", 438 | " 'task': 'train',\n", 439 | " 'boosting_type': 'gbdt',\n", 440 | " 'objective': 'binary',\n", 441 | " 'metric': 'auc',\n", 442 | " 'min_sum_hessian_in_leaf': 0.1,\n", 443 | " 'learning_rate': 0.01,\n", 444 | " 'verbosity': 2,\n", 445 | " 'tree_learner': 'feature',\n", 446 | " 'num_leaves': 128,\n", 447 | " 'feature_fraction': 0.75,\n", 448 | " 'bagging_fraction': 0.9,\n", 449 | " 'bagging_freq': 1,\n", 450 | " 'num_threads': 16,\n", 451 | " 'seed': 7\n", 452 | "}" 453 | ] 454 | }, 455 | { 456 | "cell_type": "code", 457 | "execution_count": 10, 458 | "metadata": { 459 | "scrolled": true 460 | }, 461 | "outputs": [ 462 | { 463 | "name": "stdout", 464 | "output_type": "stream", 465 | "text": [ 466 | "Index([ u'userid',\n", 467 | " u'gender',\n", 468 | " u'province',\n", 469 | " u'age',\n", 470 | " u'com_rating_sum',\n", 471 | " u'com_rating_count',\n", 472 | " u'ord_type0_time_min',\n", 473 | " u'ord_type0_time_max',\n", 474 | " u'ord_type0_time_ptp',\n", 475 | " u'ord_type0_time_mean',\n", 476 | " ...\n", 477 | " u'has_ord_serv_yes*act_num(type_6)',\n", 478 | " u'has_ord_serv_nan*act_num(type_7)',\n", 479 | " u'has_ord_serv_no*act_num(type_7)',\n", 480 | " u'has_ord_serv_yes*act_num(type_7)',\n", 481 | " u'has_ord_serv_nan*act_num(type_8)',\n", 482 | " u'has_ord_serv_no*act_num(type_8)',\n", 483 | " u'has_ord_serv_yes*act_num(type_8)',\n", 484 | " u'has_ord_serv_nan*act_num(type_9)',\n", 485 | " u'has_ord_serv_no*act_num(type_9)',\n", 486 | " u'has_ord_serv_yes*act_num(type_9)'],\n", 487 | " dtype='object', length=599)\n", 488 | "run cv: round: 10000\n", 489 | "[20]\tcv_agg's auc: 0.92985 + 0.00347325\n", 490 | "[40]\tcv_agg's auc: 0.937074 + 0.00334338\n", 491 | "[60]\tcv_agg's auc: 0.939803 + 0.00342068\n", 492 | "[80]\tcv_agg's auc: 0.942163 + 0.00369461\n", 493 | "[100]\tcv_agg's auc: 0.944753 + 0.00353578\n", 494 | "[120]\tcv_agg's auc: 0.946343 + 0.00347801\n", 495 | "[140]\tcv_agg's auc: 0.948005 + 0.00343983\n", 496 | "[160]\tcv_agg's auc: 0.94951 + 0.00354529\n", 497 | "[180]\tcv_agg's auc: 0.950927 + 0.0035433\n", 498 | "[200]\tcv_agg's auc: 0.952201 + 0.00358382\n", 499 | "[220]\tcv_agg's auc: 0.953184 + 0.00362432\n", 500 | "[240]\tcv_agg's auc: 0.954438 + 0.00351397\n", 501 | "[260]\tcv_agg's auc: 0.955437 
+ 0.00352917\n", 502 | "[280]\tcv_agg's auc: 0.956519 + 0.00349788\n", 503 | "[300]\tcv_agg's auc: 0.957527 + 0.00343062\n", 504 | "[320]\tcv_agg's auc: 0.958456 + 0.00334904\n", 505 | "[340]\tcv_agg's auc: 0.959236 + 0.00324421\n", 506 | "[360]\tcv_agg's auc: 0.960058 + 0.00316942\n", 507 | "[380]\tcv_agg's auc: 0.96071 + 0.00313101\n", 508 | "[400]\tcv_agg's auc: 0.96131 + 0.00305216\n", 509 | "[420]\tcv_agg's auc: 0.961842 + 0.00303876\n", 510 | "[440]\tcv_agg's auc: 0.962355 + 0.00306587\n", 511 | "[460]\tcv_agg's auc: 0.962823 + 0.00307233\n", 512 | "[480]\tcv_agg's auc: 0.963272 + 0.00308154\n", 513 | "[500]\tcv_agg's auc: 0.963682 + 0.00306173\n", 514 | "[520]\tcv_agg's auc: 0.964027 + 0.00304902\n", 515 | "[540]\tcv_agg's auc: 0.964375 + 0.00305624\n", 516 | "[560]\tcv_agg's auc: 0.964683 + 0.00307713\n", 517 | "[580]\tcv_agg's auc: 0.964964 + 0.00307794\n", 518 | "[600]\tcv_agg's auc: 0.965264 + 0.00307881\n", 519 | "[620]\tcv_agg's auc: 0.965557 + 0.00307066\n", 520 | "[640]\tcv_agg's auc: 0.965813 + 0.00304611\n", 521 | "[660]\tcv_agg's auc: 0.966083 + 0.00304641\n", 522 | "[680]\tcv_agg's auc: 0.966309 + 0.00301581\n", 523 | "[700]\tcv_agg's auc: 0.966539 + 0.0030049\n", 524 | "[720]\tcv_agg's auc: 0.966745 + 0.00296058\n", 525 | "[740]\tcv_agg's auc: 0.96696 + 0.00292508\n", 526 | "[760]\tcv_agg's auc: 0.967143 + 0.00291436\n", 527 | "[780]\tcv_agg's auc: 0.967317 + 0.00288889\n", 528 | "[800]\tcv_agg's auc: 0.967494 + 0.00287349\n", 529 | "[820]\tcv_agg's auc: 0.967665 + 0.00286154\n", 530 | "[840]\tcv_agg's auc: 0.967805 + 0.00284129\n", 531 | "[860]\tcv_agg's auc: 0.967929 + 0.00283724\n", 532 | "[880]\tcv_agg's auc: 0.968058 + 0.0028292\n", 533 | "[900]\tcv_agg's auc: 0.96821 + 0.00280956\n", 534 | "[920]\tcv_agg's auc: 0.968321 + 0.00280509\n", 535 | "[940]\tcv_agg's auc: 0.96843 + 0.00279607\n", 536 | "[960]\tcv_agg's auc: 0.968546 + 0.002785\n", 537 | "[980]\tcv_agg's auc: 0.968645 + 0.00277864\n", 538 | "[1000]\tcv_agg's auc: 0.96872 + 0.00278066\n", 539 | "[1020]\tcv_agg's auc: 0.968792 + 0.00277067\n", 540 | "[1040]\tcv_agg's auc: 0.968876 + 0.00276013\n", 541 | "[1060]\tcv_agg's auc: 0.968933 + 0.00276024\n", 542 | "[1080]\tcv_agg's auc: 0.969008 + 0.00272818\n", 543 | "[1100]\tcv_agg's auc: 0.969076 + 0.00272411\n", 544 | "[1120]\tcv_agg's auc: 0.969146 + 0.00270865\n", 545 | "[1140]\tcv_agg's auc: 0.969206 + 0.00269515\n", 546 | "[1160]\tcv_agg's auc: 0.96926 + 0.002696\n", 547 | "[1180]\tcv_agg's auc: 0.969323 + 0.00268535\n", 548 | "[1200]\tcv_agg's auc: 0.969387 + 0.00267229\n", 549 | "[1220]\tcv_agg's auc: 0.969441 + 0.00267172\n", 550 | "[1240]\tcv_agg's auc: 0.969482 + 0.00267564\n", 551 | "[1260]\tcv_agg's auc: 0.969523 + 0.00267744\n", 552 | "[1280]\tcv_agg's auc: 0.969565 + 0.00265628\n", 553 | "[1300]\tcv_agg's auc: 0.969616 + 0.00265951\n", 554 | "[1320]\tcv_agg's auc: 0.969652 + 0.00264378\n", 555 | "[1340]\tcv_agg's auc: 0.969683 + 0.00265488\n", 556 | "[1360]\tcv_agg's auc: 0.969716 + 0.00265775\n", 557 | "[1380]\tcv_agg's auc: 0.969763 + 0.00265908\n", 558 | "[1400]\tcv_agg's auc: 0.969788 + 0.00266174\n", 559 | "[1420]\tcv_agg's auc: 0.969816 + 0.002664\n", 560 | "[1440]\tcv_agg's auc: 0.969844 + 0.00266299\n", 561 | "[1460]\tcv_agg's auc: 0.969869 + 0.00266542\n", 562 | "[1480]\tcv_agg's auc: 0.96991 + 0.00266125\n", 563 | "[1500]\tcv_agg's auc: 0.969927 + 0.0026605\n", 564 | "[1520]\tcv_agg's auc: 0.969946 + 0.00265666\n", 565 | "[1540]\tcv_agg's auc: 0.969976 + 0.00266038\n", 566 | "[1560]\tcv_agg's auc: 0.969992 + 0.00266306\n", 567 | 
"[1580]\tcv_agg's auc: 0.970015 + 0.00267018\n", 568 | "[1600]\tcv_agg's auc: 0.970022 + 0.00266937\n", 569 | "[1620]\tcv_agg's auc: 0.970038 + 0.00266377\n", 570 | "[1640]\tcv_agg's auc: 0.970062 + 0.00265507\n", 571 | "[1660]\tcv_agg's auc: 0.970068 + 0.00264258\n", 572 | "[1680]\tcv_agg's auc: 0.970086 + 0.00263613\n", 573 | "[1700]\tcv_agg's auc: 0.970088 + 0.00263336\n", 574 | "[1720]\tcv_agg's auc: 0.970101 + 0.00262677\n", 575 | "[1740]\tcv_agg's auc: 0.97011 + 0.00262187\n", 576 | "[1760]\tcv_agg's auc: 0.970129 + 0.00262488\n", 577 | "[1780]\tcv_agg's auc: 0.970143 + 0.00262872\n", 578 | "[1800]\tcv_agg's auc: 0.970163 + 0.00261796\n", 579 | "[1820]\tcv_agg's auc: 0.970163 + 0.00261577\n", 580 | "[1840]\tcv_agg's auc: 0.970164 + 0.00261959\n", 581 | "[1860]\tcv_agg's auc: 0.970173 + 0.00262113\n", 582 | "[1880]\tcv_agg's auc: 0.970184 + 0.00261985\n", 583 | "[1900]\tcv_agg's auc: 0.970195 + 0.00262071\n", 584 | "[1920]\tcv_agg's auc: 0.970207 + 0.0026204\n", 585 | "[1940]\tcv_agg's auc: 0.970221 + 0.00262299\n", 586 | "[1960]\tcv_agg's auc: 0.970224 + 0.00262606\n", 587 | "[1980]\tcv_agg's auc: 0.970226 + 0.00262837\n", 588 | "[2000]\tcv_agg's auc: 0.970227 + 0.00262799\n", 589 | "[2020]\tcv_agg's auc: 0.970243 + 0.00262799\n", 590 | "[2040]\tcv_agg's auc: 0.970252 + 0.00263339\n", 591 | "[2060]\tcv_agg's auc: 0.970263 + 0.00262685\n", 592 | "[2080]\tcv_agg's auc: 0.970272 + 0.00261628\n", 593 | "[2100]\tcv_agg's auc: 0.970289 + 0.00260981\n", 594 | "[2120]\tcv_agg's auc: 0.970292 + 0.00261098\n", 595 | "[2140]\tcv_agg's auc: 0.970291 + 0.00261651\n", 596 | "[2160]\tcv_agg's auc: 0.97029 + 0.00261132\n", 597 | "[2180]\tcv_agg's auc: 0.970303 + 0.00260396\n", 598 | "[2200]\tcv_agg's auc: 0.970303 + 0.00260545\n", 599 | "[2220]\tcv_agg's auc: 0.970302 + 0.00260331\n", 600 | "[2240]\tcv_agg's auc: 0.970304 + 0.00259132\n", 601 | "[2260]\tcv_agg's auc: 0.970313 + 0.00258719\n", 602 | "[2280]\tcv_agg's auc: 0.97032 + 0.00258453\n", 603 | "[2300]\tcv_agg's auc: 0.970321 + 0.00258521\n", 604 | "[2320]\tcv_agg's auc: 0.97032 + 0.00258072\n", 605 | "[2340]\tcv_agg's auc: 0.970327 + 0.00258175\n", 606 | "[2360]\tcv_agg's auc: 0.970327 + 0.00258231\n", 607 | "[2380]\tcv_agg's auc: 0.970324 + 0.00257717\n", 608 | "[2400]\tcv_agg's auc: 0.970319 + 0.00258236\n", 609 | "[2420]\tcv_agg's auc: 0.970319 + 0.00257521\n", 610 | "[2440]\tcv_agg's auc: 0.970319 + 0.00256878\n", 611 | "Time used: 607.245468312 s\n" 612 | ] 613 | } 614 | ], 615 | "source": [ 616 | "iterations, best_score = lgb_cv(train_feature, train_label, params_lgb, config['folds'], config['rounds'])" 617 | ] 618 | }, 619 | { 620 | "cell_type": "code", 621 | "execution_count": 11, 622 | "metadata": { 623 | "scrolled": true 624 | }, 625 | "outputs": [ 626 | { 627 | "name": "stdout", 628 | "output_type": "stream", 629 | "text": [ 630 | "[50]\ttraining's auc: 0.943533\n", 631 | "[100]\ttraining's auc: 0.950872\n", 632 | "[150]\ttraining's auc: 0.955854\n", 633 | "[200]\ttraining's auc: 0.959148\n", 634 | "[250]\ttraining's auc: 0.962417\n", 635 | "[300]\ttraining's auc: 0.965764\n", 636 | "[350]\ttraining's auc: 0.968347\n", 637 | "[400]\ttraining's auc: 0.970468\n", 638 | "[450]\ttraining's auc: 0.972457\n", 639 | "[500]\ttraining's auc: 0.974226\n", 640 | "[550]\ttraining's auc: 0.975788\n", 641 | "[600]\ttraining's auc: 0.977252\n", 642 | "[650]\ttraining's auc: 0.978582\n", 643 | "[700]\ttraining's auc: 0.97979\n", 644 | "[750]\ttraining's auc: 0.980914\n", 645 | "[800]\ttraining's auc: 0.981934\n", 646 | "[850]\ttraining's auc: 
0.982887\n", 647 | "[900]\ttraining's auc: 0.983756\n", 648 | "[950]\ttraining's auc: 0.984568\n", 649 | "[1000]\ttraining's auc: 0.985332\n", 650 | "[1050]\ttraining's auc: 0.986045\n", 651 | "[1100]\ttraining's auc: 0.986688\n", 652 | "[1150]\ttraining's auc: 0.987315\n", 653 | "[1200]\ttraining's auc: 0.987904\n", 654 | "[1250]\ttraining's auc: 0.988446\n", 655 | "[1300]\ttraining's auc: 0.988976\n", 656 | "[1350]\ttraining's auc: 0.989463\n", 657 | "[1400]\ttraining's auc: 0.98994\n", 658 | "[1450]\ttraining's auc: 0.990374\n", 659 | "[1500]\ttraining's auc: 0.990802\n", 660 | "[1550]\ttraining's auc: 0.991203\n", 661 | "[1600]\ttraining's auc: 0.991564\n", 662 | "[1650]\ttraining's auc: 0.991903\n", 663 | "[1700]\ttraining's auc: 0.992221\n", 664 | "[1750]\ttraining's auc: 0.992537\n", 665 | "[1800]\ttraining's auc: 0.992855\n", 666 | "[1850]\ttraining's auc: 0.993147\n", 667 | "[1900]\ttraining's auc: 0.993422\n", 668 | "[1950]\ttraining's auc: 0.993685\n", 669 | "[2000]\ttraining's auc: 0.993928\n", 670 | "[2050]\ttraining's auc: 0.994164\n", 671 | "[2100]\ttraining's auc: 0.994403\n", 672 | "[2150]\ttraining's auc: 0.994626\n", 673 | "[2200]\ttraining's auc: 0.994846\n", 674 | "[2250]\ttraining's auc: 0.995046\n", 675 | "[2300]\ttraining's auc: 0.995233\n", 676 | "[50]\ttraining's auc: 0.944121\n", 677 | "[100]\ttraining's auc: 0.950113\n", 678 | "[150]\ttraining's auc: 0.954538\n", 679 | "[200]\ttraining's auc: 0.958038\n", 680 | "[250]\ttraining's auc: 0.961905\n", 681 | "[300]\ttraining's auc: 0.965661\n", 682 | "[350]\ttraining's auc: 0.96851\n", 683 | "[400]\ttraining's auc: 0.970647\n", 684 | "[450]\ttraining's auc: 0.97257\n", 685 | "[500]\ttraining's auc: 0.974361\n", 686 | "[550]\ttraining's auc: 0.975971\n", 687 | "[600]\ttraining's auc: 0.977432\n", 688 | "[650]\ttraining's auc: 0.978727\n", 689 | "[700]\ttraining's auc: 0.979893\n", 690 | "[750]\ttraining's auc: 0.980962\n", 691 | "[800]\ttraining's auc: 0.982016\n", 692 | "[850]\ttraining's auc: 0.982955\n", 693 | "[900]\ttraining's auc: 0.983835\n", 694 | "[950]\ttraining's auc: 0.984645\n", 695 | "[1000]\ttraining's auc: 0.985404\n", 696 | "[1050]\ttraining's auc: 0.986127\n", 697 | "[1100]\ttraining's auc: 0.986803\n", 698 | "[1150]\ttraining's auc: 0.987408\n", 699 | "[1200]\ttraining's auc: 0.987996\n", 700 | "[1250]\ttraining's auc: 0.988534\n", 701 | "[1300]\ttraining's auc: 0.989056\n", 702 | "[1350]\ttraining's auc: 0.98956\n", 703 | "[1400]\ttraining's auc: 0.990015\n", 704 | "[1450]\ttraining's auc: 0.990433\n", 705 | "[1500]\ttraining's auc: 0.990864\n", 706 | "[1550]\ttraining's auc: 0.991236\n", 707 | "[1600]\ttraining's auc: 0.991607\n", 708 | "[1650]\ttraining's auc: 0.991958\n", 709 | "[1700]\ttraining's auc: 0.992295\n", 710 | "[1750]\ttraining's auc: 0.992604\n", 711 | "[1800]\ttraining's auc: 0.992904\n", 712 | "[1850]\ttraining's auc: 0.993195\n", 713 | "[1900]\ttraining's auc: 0.993474\n", 714 | "[1950]\ttraining's auc: 0.993725\n", 715 | "[2000]\ttraining's auc: 0.993981\n", 716 | "[2050]\ttraining's auc: 0.994216\n", 717 | "[2100]\ttraining's auc: 0.994441\n", 718 | "[2150]\ttraining's auc: 0.994666\n", 719 | "[2200]\ttraining's auc: 0.994868\n", 720 | "[2250]\ttraining's auc: 0.99506\n", 721 | "[2300]\ttraining's auc: 0.995261\n", 722 | "[50]\ttraining's auc: 0.943344\n", 723 | "[100]\ttraining's auc: 0.950687\n", 724 | "[150]\ttraining's auc: 0.955448\n", 725 | "[200]\ttraining's auc: 0.958878\n", 726 | "[250]\ttraining's auc: 0.962563\n", 727 | "[300]\ttraining's auc: 0.96581\n", 728 | 
"[350]\ttraining's auc: 0.968427\n", 729 | "[400]\ttraining's auc: 0.970552\n", 730 | "[450]\ttraining's auc: 0.972443\n", 731 | "[500]\ttraining's auc: 0.974272\n", 732 | "[550]\ttraining's auc: 0.975921\n", 733 | "[600]\ttraining's auc: 0.977345\n", 734 | "[650]\ttraining's auc: 0.978615\n", 735 | "[700]\ttraining's auc: 0.979787\n", 736 | "[750]\ttraining's auc: 0.980919\n", 737 | "[800]\ttraining's auc: 0.981933\n", 738 | "[850]\ttraining's auc: 0.982875\n", 739 | "[900]\ttraining's auc: 0.983745\n", 740 | "[950]\ttraining's auc: 0.984559\n", 741 | "[1000]\ttraining's auc: 0.985313\n", 742 | "[1050]\ttraining's auc: 0.986013\n", 743 | "[1100]\ttraining's auc: 0.986682\n", 744 | "[1150]\ttraining's auc: 0.987308\n", 745 | "[1200]\ttraining's auc: 0.987906\n", 746 | "[1250]\ttraining's auc: 0.98846\n", 747 | "[1300]\ttraining's auc: 0.988989\n", 748 | "[1350]\ttraining's auc: 0.989478\n", 749 | "[1400]\ttraining's auc: 0.989943\n", 750 | "[1450]\ttraining's auc: 0.990387\n", 751 | "[1500]\ttraining's auc: 0.990801\n", 752 | "[1550]\ttraining's auc: 0.991205\n", 753 | "[1600]\ttraining's auc: 0.99156\n", 754 | "[1650]\ttraining's auc: 0.991911\n", 755 | "[1700]\ttraining's auc: 0.992244\n", 756 | "[1750]\ttraining's auc: 0.992559\n", 757 | "[1800]\ttraining's auc: 0.992854\n", 758 | "[1850]\ttraining's auc: 0.993143\n", 759 | "[1900]\ttraining's auc: 0.993427\n", 760 | "[1950]\ttraining's auc: 0.99369\n", 761 | "[2000]\ttraining's auc: 0.993945\n", 762 | "[2050]\ttraining's auc: 0.994184\n", 763 | "[2100]\ttraining's auc: 0.994413\n", 764 | "[2150]\ttraining's auc: 0.99463\n", 765 | "[2200]\ttraining's auc: 0.994838\n", 766 | "[2250]\ttraining's auc: 0.995043\n", 767 | "[2300]\ttraining's auc: 0.995236\n", 768 | "[50]\ttraining's auc: 0.942483\n", 769 | "[100]\ttraining's auc: 0.950969\n", 770 | "[150]\ttraining's auc: 0.955091\n", 771 | "[200]\ttraining's auc: 0.958299\n", 772 | "[250]\ttraining's auc: 0.962131\n", 773 | "[300]\ttraining's auc: 0.965567\n", 774 | "[350]\ttraining's auc: 0.968425\n", 775 | "[400]\ttraining's auc: 0.970545\n", 776 | "[450]\ttraining's auc: 0.972499\n", 777 | "[500]\ttraining's auc: 0.974226\n", 778 | "[550]\ttraining's auc: 0.975819\n", 779 | "[600]\ttraining's auc: 0.977303\n", 780 | "[650]\ttraining's auc: 0.978605\n", 781 | "[700]\ttraining's auc: 0.979826\n", 782 | "[750]\ttraining's auc: 0.980953\n", 783 | "[800]\ttraining's auc: 0.981973\n", 784 | "[850]\ttraining's auc: 0.982899\n", 785 | "[900]\ttraining's auc: 0.983789\n", 786 | "[950]\ttraining's auc: 0.984592\n", 787 | "[1000]\ttraining's auc: 0.985353\n", 788 | "[1050]\ttraining's auc: 0.98608\n", 789 | "[1100]\ttraining's auc: 0.986717\n", 790 | "[1150]\ttraining's auc: 0.987334\n", 791 | "[1200]\ttraining's auc: 0.987908\n", 792 | "[1250]\ttraining's auc: 0.988463\n", 793 | "[1300]\ttraining's auc: 0.98898\n", 794 | "[1350]\ttraining's auc: 0.989468\n", 795 | "[1400]\ttraining's auc: 0.98992\n", 796 | "[1450]\ttraining's auc: 0.990363\n", 797 | "[1500]\ttraining's auc: 0.990792\n", 798 | "[1550]\ttraining's auc: 0.99119\n", 799 | "[1600]\ttraining's auc: 0.991556\n", 800 | "[1650]\ttraining's auc: 0.991905\n", 801 | "[1700]\ttraining's auc: 0.992245\n", 802 | "[1750]\ttraining's auc: 0.992558\n", 803 | "[1800]\ttraining's auc: 0.99285\n", 804 | "[1850]\ttraining's auc: 0.99313\n", 805 | "[1900]\ttraining's auc: 0.993402\n", 806 | "[1950]\ttraining's auc: 0.993669\n", 807 | "[2000]\ttraining's auc: 0.993918\n", 808 | "[2050]\ttraining's auc: 0.994163\n", 809 | "[2100]\ttraining's auc: 
0.994396\n", 810 | "[2150]\ttraining's auc: 0.994608\n", 811 | "[2200]\ttraining's auc: 0.994833\n", 812 | "[2250]\ttraining's auc: 0.995031\n", 813 | "[2300]\ttraining's auc: 0.995223\n" 814 | ] 815 | } 816 | ], 817 | "source": [ 818 | "preds = 0\n", 819 | "for s in range(7, 11):\n", 820 | " params_lgb['seed'] = s\n", 821 | " model, pred = lgb_predict(train_feature, train_label, test_feature, iterations, params_lgb)\n", 822 | " preds += pred\n", 823 | "preds /= 4" 824 | ] 825 | }, 826 | { 827 | "cell_type": "code", 828 | "execution_count": 12, 829 | "metadata": { 830 | "collapsed": true 831 | }, 832 | "outputs": [], 833 | "source": [ 834 | "res = store_result(test_index, preds, '20180210-lgb-%f(r%d)' % (best_score, iterations))" 835 | ] 836 | }, 837 | { 838 | "cell_type": "code", 839 | "execution_count": 13, 840 | "metadata": {}, 841 | "outputs": [ 842 | { 843 | "name": "stdout", 844 | "output_type": "stream", 845 | "text": [ 846 | "ord_type1_time_min: 158788.50\n", 847 | "205: 119339.52\n", 848 | "192: 74425.39\n", 849 | "ord_type1_time_max: 71079.32\n", 850 | "last_day_rate(type_5): 35889.30\n", 851 | "act_time_diff(1-2)(window_6)(type_5): 33401.18\n", 852 | "act_last_type: 32289.74\n", 853 | "time_gap_last67: 31290.77\n", 854 | "time_gap_last56: 29933.51\n", 855 | "ord_type1_time_ptp: 29750.38\n", 856 | "ord_type1_time_mean: 23039.05\n", 857 | "27: 22502.02\n", 858 | "3: 21347.94\n", 859 | "13: 21197.36\n", 860 | "26: 20224.40\n", 861 | "25: 17148.59\n", 862 | "act_time_diff(1-2)(window_2)(type_5): 16612.83\n", 863 | "act_time_diff_56_max: 15666.05\n", 864 | "29: 15019.18\n", 865 | "ord_type1_time_median: 14009.13\n", 866 | "act_last_time-act_time(rank_1)(window_1)(type_4): 12531.98\n", 867 | "act_seq_time_diff(2-1)(window_6): 10592.32\n", 868 | "time_gap_last6: 10263.90\n", 869 | "30: 10063.68\n", 870 | "267: 10022.36\n", 871 | "time_gap_last1: 9354.54\n", 872 | "time_gap_last5: 8264.33\n", 873 | "act_time_diff_56789(2-3)(window_6): 7738.38\n", 874 | "act_time_diff(1-2)(window_6): 7738.33\n", 875 | "177: 7633.40\n", 876 | "264: 7547.36\n", 877 | "147: 7252.66\n", 878 | "has_ord_serv_nan: 6669.21\n", 879 | "act_time_diff(1-2)(window_6)(type_6): 6662.99\n", 880 | "47: 6109.28\n", 881 | "act_time_diff_56789(1-2)(window_6): 6105.44\n", 882 | "act_last_time: 6023.46\n", 883 | "261: 5941.93\n", 884 | "154: 5774.97\n", 885 | "16: 5362.74\n", 886 | "24: 5337.05\n", 887 | "140: 5322.88\n", 888 | "46: 5286.71\n", 889 | "260: 5243.96\n", 890 | "time_gap_last45: 5087.27\n", 891 | "204: 4967.24\n", 892 | "10: 4915.41\n", 893 | "act_time_diff(2-3)(window_6)(type_6): 4660.70\n", 894 | "act_ord_act_time_diff(1-2)(window_6): 4628.47\n", 895 | "56: 4623.05\n", 896 | "userid: 4575.01\n", 897 | "14: 4557.92\n", 898 | "280: 4342.64\n", 899 | "has_ord_serv_nan*act_num(type_8): 4241.34\n", 900 | "act_time_diff(3-4)(window_6)(type_6): 4156.29\n", 901 | "206: 4144.02\n", 902 | "act_first_time: 4091.94\n", 903 | "263: 3807.42\n", 904 | "act_last_time-act_time(rank_1)(window_1)(type_6): 3778.78\n", 905 | "148: 3764.90\n", 906 | "216: 3739.09\n", 907 | "act_num(type_6)(window_6): 3724.32\n", 908 | "ord_num(type_1): 3689.06\n", 909 | "act_time_diff(2-3)(window_6): 3635.89\n", 910 | "15: 3596.67\n", 911 | "191: 3585.44\n", 912 | "('country_27', 'sum'): 3500.75\n", 913 | "158: 3480.06\n", 914 | "139: 3439.16\n", 915 | "199: 3343.28\n", 916 | "159: 3308.34\n", 917 | "176: 3298.60\n", 918 | "act_last_time-ord_last_time: 3259.07\n", 919 | "act_time_diff_56_median: 3183.56\n", 920 | "act_type_diff(1-2)(window_6): 
3141.00\n", 921 | "act_time_diff_min: 3139.59\n", 922 | "act_time_diff_56789(3-4)(window_6): 3014.31\n", 923 | "55: 2942.95\n", 924 | "act_seq_time_diff(3-2)(window_6): 2889.98\n", 925 | "com_rating_sum: 2869.91\n", 926 | "291: 2868.15\n", 927 | "286: 2793.73\n", 928 | "act_time_diff(1-2)(window_6)(type_1): 2780.36\n", 929 | "134: 2778.35\n", 930 | "149: 2768.18\n", 931 | "act_time_diff_56789(4-5)(window_6): 2752.34\n", 932 | "132: 2705.49\n", 933 | "act_time_diff(3-4)(window_6)(type_5): 2695.59\n", 934 | "143: 2682.05\n", 935 | "city_num(type_1): 2661.58\n", 936 | "45: 2644.15\n", 937 | "197: 2626.26\n", 938 | "17: 2577.90\n", 939 | "262: 2556.67\n", 940 | "has_ord_serv_nan*act_num(type_6): 2464.99\n", 941 | "185: 2454.82\n", 942 | "act_last_time-ord_type0_time_max: 2452.02\n", 943 | "act_time_diff(2-3)(window_6)(type_1): 2439.18\n", 944 | "28: 2394.69\n", 945 | "act_time_diff(4-5)(window_6)(type_6): 2393.97\n", 946 | "31: 2389.98\n", 947 | "160: 2369.23\n", 948 | "has_ord_serv_nan*act_num(type_1): 2346.15\n", 949 | "196: 2327.92\n", 950 | "157: 2327.35\n", 951 | "145: 2300.42\n", 952 | "53: 2261.73\n", 953 | "42: 2219.45\n", 954 | "265: 2200.27\n", 955 | "193: 2194.49\n", 956 | "34: 2139.11\n", 957 | "60: 2136.58\n", 958 | "act_last_time-act_time(rank_1)(window_1)(type_1): 2122.49\n", 959 | "last_time: 2109.67\n", 960 | "33: 2061.19\n", 961 | "198: 2059.85\n", 962 | "act_time_diff_median: 2052.37\n", 963 | "act_time_diff_234(3-4)(window_6): 2004.49\n", 964 | "act_ord_act_time_diff(2-3)(window_6): 1988.31\n", 965 | "act_num(type_6): 1968.70\n", 966 | "220: 1929.72\n", 967 | "144: 1911.66\n", 968 | "act_time_diff(4-5)(window_6)(type_5): 1892.98\n", 969 | "act_time_diff(5-6)(window_6)(type_6): 1880.65\n", 970 | "act_time_diff_56_std: 1858.45\n", 971 | "act_seq_time_diff(4-3)(window_6): 1811.92\n", 972 | "act_seq_time_diff(7-6)(window_6): 1810.81\n", 973 | "156: 1808.93\n", 974 | "270: 1800.24\n", 975 | "act_seq_time_diff(5-4)(window_6): 1796.10\n", 976 | "296: 1783.64\n", 977 | "has_ord_serv_nan*act_num(type_5): 1771.57\n", 978 | "23: 1761.05\n", 979 | "106: 1695.97\n", 980 | "act_time_diff(3-4)(window_6)(type_1): 1680.84\n", 981 | "142: 1676.34\n", 982 | "301: 1659.14\n", 983 | "act_time_diff(2-3)(window_6)(type_5): 1641.21\n", 984 | "act_last_time-act_time(rank_1)(window_1)(type_7): 1632.01\n", 985 | "act_seq_time_diff(6-5)(window_6): 1607.94\n", 986 | "act_time_diff_56_sum: 1583.83\n", 987 | "285: 1568.50\n", 988 | "11: 1567.28\n", 989 | "61: 1565.68\n", 990 | "province: 1540.42\n", 991 | "act_time_diff_median(window_3): 1538.59\n", 992 | "288: 1536.18\n", 993 | "act_first_time-act_time(rank_1)(window_1)(type_7): 1519.74\n", 994 | "283: 1501.60\n", 995 | "184: 1481.73\n", 996 | "in_comment_not_in_order: 1467.84\n", 997 | "276: 1446.86\n", 998 | "act_row_type_sum(window_6): 1436.19\n", 999 | "146: 1430.59\n", 1000 | "266: 1411.94\n", 1001 | "act_time_diff_56_mean: 1396.26\n", 1002 | "137: 1390.03\n", 1003 | "74: 1382.75\n", 1004 | "289: 1375.64\n", 1005 | "act_time_diff(5-6)(window_6)(type_5): 1361.74\n", 1006 | "act_time_diff_56789(5-6)(window_6): 1351.64\n", 1007 | "act_time_diff_min(window_3): 1347.88\n", 1008 | "act_first_time-act_time(rank_1)(window_1)(type_1): 1343.85\n", 1009 | "302: 1342.18\n", 1010 | "161: 1335.74\n", 1011 | "act_column_rate(type_6)(window_3): 1322.73\n", 1012 | "act_last_time-act_time(rank_1)(window_1)(type_5): 1316.36\n", 1013 | "284: 1311.53\n", 1014 | "ord_type0_time_min: 1309.10\n", 1015 | "act_time_diff(5-6)(window_6): 1301.12\n", 1016 | "155: 1293.31\n", 
1017 | "act_type_diff(2-3)(window_6): 1279.50\n", 1018 | "275: 1270.86\n", 1019 | "act_num(type_4): 1265.26\n", 1020 | "287: 1251.95\n", 1021 | "act_time_diff_56_min: 1242.87\n", 1022 | "act_time_diff(4-5)(window_6): 1239.71\n", 1023 | "has_ord_serv_no: 1237.97\n", 1024 | "298: 1237.77\n", 1025 | "act_num(type_5)(window_6): 1207.33\n", 1026 | "ord_type0_time_max: 1200.36\n", 1027 | "act_time_diff_mean: 1196.56\n", 1028 | "act_time_diff(3-4)(window_6): 1192.79\n", 1029 | "has_ord_serv_no*act_num(type_6): 1146.12\n", 1030 | "act_time_diff_mean(window_3): 1128.77\n", 1031 | "act_num(type_1): 1122.70\n", 1032 | "278: 1107.60\n", 1033 | "act_row_type_ptp(window_6): 1092.74\n", 1034 | "act_first_time-act_time(rank_1)(window_1)(type_6): 1091.17\n", 1035 | "19: 1084.73\n", 1036 | "act_first_time-act_time(rank_1)(window_1)(type_5): 1081.12\n", 1037 | "152: 1075.11\n", 1038 | "57: 1068.34\n", 1039 | "299: 1067.17\n", 1040 | "297: 1054.28\n", 1041 | "7: 1045.67\n", 1042 | "290: 1032.36\n", 1043 | "292: 1027.20\n", 1044 | "act_row_type_std(window_6): 997.76\n", 1045 | "294: 992.39\n", 1046 | "186: 986.84\n", 1047 | "279: 970.18\n", 1048 | "153: 960.88\n", 1049 | "188: 957.32\n", 1050 | "act_last_time-act_time(rank_1)(window_1)(type_8): 956.99\n", 1051 | "271: 938.97\n", 1052 | "com_rating_count: 926.12\n", 1053 | "act_num(type_5): 911.37\n", 1054 | "281: 894.21\n", 1055 | "187: 887.71\n", 1056 | "act_column_rate(type_4)(window_3): 887.37\n", 1057 | "has_ord_serv_no*act_num(type_8): 868.85\n", 1058 | "35: 861.87\n", 1059 | "274: 855.11\n", 1060 | "282: 847.87\n", 1061 | "act_row_type_mean(window_6): 836.86\n", 1062 | "71: 824.62\n", 1063 | "ord_first_time: 821.98\n", 1064 | "138: 820.06\n", 1065 | "act_row_type_median(window_6): 815.67\n", 1066 | "act_time_diff_std(window_3): 812.03\n", 1067 | "act_time_diff(1-2)(window_6)(type_8): 801.78\n", 1068 | "277: 793.23\n", 1069 | "act_time_diff(4-5)(window_6)(type_1): 789.31\n", 1070 | "300: 782.75\n", 1071 | "has_ord_serv_no*act_num(type_1): 755.37\n", 1072 | "295: 753.92\n", 1073 | "70: 753.10\n", 1074 | "act_first_time-act_time(rank_1)(window_1)(type_8): 749.44\n", 1075 | "act_row_rate(type_6)(window_3): 749.17\n", 1076 | "gender: 739.94\n", 1077 | "293: 738.97\n", 1078 | "act_time_diff(5-6)(window_6)(type_1): 737.55\n", 1079 | "59: 717.40\n", 1080 | "ord_type0_time_std: 716.11\n", 1081 | "act_first_type: 705.41\n", 1082 | "51: 675.96\n", 1083 | "act_time_diff_234(2-3)(window_6): 671.28\n", 1084 | "act_time_diff_std: 669.50\n", 1085 | "190: 664.22\n", 1086 | "act_time_diff_sum: 663.76\n", 1087 | "act_time_diff_max(window_3): 661.46\n", 1088 | "act_row_rate(type_5)(window_3): 660.11\n", 1089 | "act_first_time-act_time(rank_1)(window_1)(type_4): 649.98\n", 1090 | "58: 648.62\n", 1091 | "act_type_num: 645.06\n", 1092 | "act_time_diff_234(1-2)(window_6): 645.05\n", 1093 | "272: 625.98\n", 1094 | "act_first_time-ord_type0_time_min: 607.96\n", 1095 | "189: 605.92\n", 1096 | "211: 602.81\n", 1097 | "act_time_diff_sum(window_3): 601.53\n", 1098 | "ord_last_time: 600.09\n", 1099 | "32: 596.78\n", 1100 | "act_row_rate(type_4)(window_3): 596.26\n", 1101 | "act_type_diff(3-4)(window_6): 595.63\n", 1102 | "8: 593.52\n", 1103 | "act_time_diff_234(4-5)(window_6): 586.77\n", 1104 | "act_type(rank_2)(window6): 586.45\n", 1105 | "act_first_time-ord_first_time: 578.54\n", 1106 | "act_num(type_4)(window_6): 574.12\n", 1107 | "act_time_last-first: 562.82\n", 1108 | "273: 561.08\n", 1109 | "act_column_rate(type_5)(window_3): 559.66\n", 1110 | 
"act_last_time-act_time(rank_1)(window_1)(type_2): 558.87\n", 1111 | "act_num(type_1)(window_6): 553.07\n", 1112 | "act_ord_act_time_diff(5-6)(window_6): 548.27\n", 1113 | "act_ord_act_time_diff(4-5)(window_6): 544.69\n", 1114 | "act_type(rank_4)(window6): 541.88\n", 1115 | "act_first_time-act_time(rank_1)(window_1)(type_2): 541.67\n", 1116 | "act_time_diff_max: 522.10\n", 1117 | "136: 499.26\n", 1118 | "ord_last_id: 490.98\n", 1119 | "age: 489.96\n", 1120 | "21: 465.80\n", 1121 | "200: 462.40\n", 1122 | "214: 450.18\n", 1123 | "151: 435.26\n", 1124 | "ord_type0_time_median: 430.44\n", 1125 | "208: 429.74\n", 1126 | "act_type_diff(5-6)(window_6): 423.72\n", 1127 | "act_row_type_max(window_6): 422.26\n", 1128 | "162: 420.51\n", 1129 | "act_num: 416.66\n", 1130 | "43: 405.37\n", 1131 | "52: 399.57\n", 1132 | "('continent_3', 'sum'): 396.81\n", 1133 | "167: 378.37\n", 1134 | "36: 376.55\n", 1135 | "114: 368.49\n", 1136 | "act_type_diff(4-5)(window_6): 341.62\n", 1137 | "2: 340.75\n", 1138 | "67: 334.79\n", 1139 | "ord_type0_time_mean: 328.52\n", 1140 | "20: 327.96\n", 1141 | "('country_34', 'sum'): 322.97\n", 1142 | "has_ord_serv_no*act_num(type_4): 320.20\n", 1143 | "54: 317.03\n", 1144 | "209: 303.39\n", 1145 | "act_row_rate(type_7)(window_3): 300.52\n", 1146 | "135: 294.93\n", 1147 | "act_num(type_2): 291.27\n", 1148 | "act_row_type_min(window_6): 284.56\n", 1149 | "1: 283.16\n", 1150 | "48: 278.25\n", 1151 | "268: 271.34\n", 1152 | "141: 268.71\n", 1153 | "act_ord_act_time_diff(3-4)(window_6): 264.33\n", 1154 | "act_time_diff_234(5-6)(window_6): 258.60\n", 1155 | "133: 251.06\n", 1156 | "210: 250.90\n", 1157 | "act_time_diff(5-6)(window_6)(type_9): 250.62\n", 1158 | "150: 237.32\n", 1159 | "act_num(type_2)(window_6): 236.80\n", 1160 | "country_num(type_1): 224.02\n", 1161 | "38: 223.78\n", 1162 | "37: 222.92\n", 1163 | "act_type(rank_3)(window6): 222.74\n", 1164 | "4: 222.04\n", 1165 | "('country_25', 'sum'): 220.33\n", 1166 | "203: 217.87\n", 1167 | "act_num(type_8)(window_6): 216.83\n", 1168 | "act_first_time-act_time(rank_1)(window_1)(type_9): 215.64\n", 1169 | "has_ord_serv_nan*act_num(type_2): 212.86\n", 1170 | "215: 208.83\n", 1171 | "act_type(rank_6)(window6): 208.78\n", 1172 | "303: 206.01\n", 1173 | "act_time_diff(1-2)(window_6)(type_7): 204.16\n", 1174 | "22: 200.08\n", 1175 | "226: 199.44\n", 1176 | "93: 199.07\n", 1177 | "ord_first_id: 196.04\n", 1178 | "act_first_time-act_time(rank_1)(window_1)(type_3): 193.07\n", 1179 | "has_ord_serv_no*act_num(type_9): 188.52\n", 1180 | "act_num(type_3): 186.01\n", 1181 | "has_ord_serv_nan*act_num(type_7): 183.59\n", 1182 | "has_ord_serv_no*act_num(type_7): 177.61\n", 1183 | "has_ord_serv_nan*act_num(type_3): 170.42\n", 1184 | "9: 169.50\n", 1185 | "has_ord_serv_nan*act_num(type_4): 166.98\n", 1186 | "has_ord_serv_nan*act_num(type_9): 159.77\n", 1187 | "ord_type0_time_ptp: 156.36\n", 1188 | "('continent_0', 'sum'): 150.92\n", 1189 | "act_time_diff(2-3)(window_6)(type_7): 150.12\n", 1190 | "has_ord_serv_no*act_num(type_5): 149.97\n", 1191 | "act_type(rank_5)(window6): 146.70\n", 1192 | "act_time_diff(2-3)(window_6)(type_8): 138.56\n", 1193 | "49: 135.74\n", 1194 | "119: 133.37\n", 1195 | "('country_3', 'sum'): 131.53\n", 1196 | "180: 131.08\n", 1197 | "5: 128.63\n", 1198 | "12: 128.51\n", 1199 | "68: 127.47\n", 1200 | "224: 114.75\n", 1201 | "6: 113.36\n", 1202 | "act_time_diff(1-2)(window_6)(type_9): 110.68\n", 1203 | "act_num(type_8): 110.32\n", 1204 | "act_row_rate(type_1)(window_3): 108.95\n", 1205 | "18: 107.46\n", 1206 | "183: 
105.46\n", 1207 | "act_last_time-act_time(rank_1)(window_1)(type_3): 103.01\n", 1208 | "act_column_rate(type_7)(window_3): 101.83\n", 1209 | "202: 101.53\n", 1210 | "act_num(type_7)(window_6): 99.50\n", 1211 | "39: 99.31\n", 1212 | "has_ord_serv_yes*act_num(type_1): 98.20\n", 1213 | "182: 97.19\n", 1214 | "44: 96.11\n", 1215 | "40: 95.63\n", 1216 | "('continent_4', 'sum'): 94.07\n", 1217 | "257: 92.09\n", 1218 | "201: 90.53\n", 1219 | "243: 88.92\n", 1220 | "ord_rate(type_1): 87.99\n", 1221 | "50: 85.77\n", 1222 | "('country_39', 'sum'): 85.33\n", 1223 | "245: 84.97\n", 1224 | "('continent_1', 'sum'): 81.00\n", 1225 | "181: 78.63\n", 1226 | "173: 75.30\n", 1227 | "238: 68.02\n", 1228 | "act_column_rate(type_1)(window_3): 67.25\n", 1229 | "78: 65.97\n", 1230 | "269: 64.41\n", 1231 | "104: 59.83\n", 1232 | "80: 59.32\n", 1233 | "act_num(type_7): 57.99\n", 1234 | "act_last_time-act_time(rank_1)(window_1)(type_9): 55.26\n", 1235 | "('country_33', 'sum'): 50.74\n", 1236 | "169: 49.31\n", 1237 | "act_num(type_3)(window_6): 48.61\n", 1238 | "219: 44.39\n", 1239 | "continent_num(type_1): 43.54\n", 1240 | "178: 39.29\n", 1241 | "41: 37.74\n", 1242 | "113: 37.51\n", 1243 | "179: 36.72\n", 1244 | "ord_num(type_0): 35.74\n", 1245 | "253: 34.60\n", 1246 | "108: 33.63\n", 1247 | "ord_first_type1_month: 32.27\n", 1248 | "has_ord_serv_yes*act_num(type_7): 31.59\n", 1249 | "act_time_diff(3-4)(window_6)(type_8): 31.04\n", 1250 | "172: 30.67\n", 1251 | "has_ord_serv_no*act_num(type_2): 29.88\n", 1252 | "('country_46', 'sum'): 26.48\n", 1253 | "85: 26.33\n", 1254 | "act_time_diff(4-5)(window_6)(type_9): 25.18\n", 1255 | "234: 22.98\n", 1256 | "act_time_diff(5-6)(window_6)(type_7): 22.55\n", 1257 | "act_row_rate(type_8)(window_3): 21.55\n", 1258 | "218: 20.47\n", 1259 | "222: 19.76\n", 1260 | "has_ord_serv_yes*act_num(type_6): 18.07\n", 1261 | "241: 17.94\n", 1262 | "act_num(type_9): 16.99\n", 1263 | "170: 16.87\n", 1264 | "212: 16.59\n", 1265 | "217: 15.31\n", 1266 | "has_ord_serv_yes*act_num(type_8): 14.42\n", 1267 | "166: 12.65\n", 1268 | "ord_last_type1_day: 12.60\n", 1269 | "('country_1', 'sum'): 12.27\n", 1270 | "has_ord_serv_no*act_num(type_3): 11.49\n", 1271 | "131: 11.25\n", 1272 | "country_num: 9.67\n", 1273 | "171: 9.17\n", 1274 | "ord_last_type1_month: 7.72\n", 1275 | "164: 7.51\n", 1276 | "act_row_rate(type_3)(window_3): 7.04\n", 1277 | "213: 6.87\n", 1278 | "207: 6.40\n", 1279 | "city_num: 6.37\n", 1280 | "act_time_diff(4-5)(window_6)(type_8): 6.03\n", 1281 | "act_time_diff(3-4)(window_6)(type_9): 5.84\n", 1282 | "ord_first_type1_day: 5.59\n", 1283 | "act_column_rate(type_3)(window_3): 5.11\n", 1284 | "225: 4.75\n", 1285 | "('country_50', 'sum'): 4.26\n", 1286 | "76: 3.79\n", 1287 | "ord_num: 3.70\n", 1288 | "act_time_diff(2-3)(window_6)(type_9): 3.44\n", 1289 | "('country_42', 'sum'): 3.22\n", 1290 | "continent_num: 3.03\n", 1291 | "194: 2.68\n", 1292 | "63: 2.68\n", 1293 | "act_time_diff(4-5)(window_6)(type_7): 2.65\n", 1294 | "has_ord_serv_yes*act_num(type_5): 2.03\n", 1295 | "228: 1.51\n", 1296 | "ord_type1_time_std: 0.00\n", 1297 | "ord_rate(type_0): 0.00\n", 1298 | "('country_0', 'sum'): 0.00\n", 1299 | "('country_2', 'sum'): 0.00\n", 1300 | "('country_4', 'sum'): 0.00\n", 1301 | "('country_5', 'sum'): 0.00\n", 1302 | "('country_6', 'sum'): 0.00\n", 1303 | "('country_7', 'sum'): 0.00\n", 1304 | "('country_8', 'sum'): 0.00\n", 1305 | "('country_9', 'sum'): 0.00\n", 1306 | "('country_10', 'sum'): 0.00\n", 1307 | "('country_11', 'sum'): 0.00\n", 1308 | "('country_12', 'sum'): 0.00\n", 1309 | 
"('country_13', 'sum'): 0.00\n", 1310 | "('country_14', 'sum'): 0.00\n", 1311 | "('country_15', 'sum'): 0.00\n", 1312 | "('country_16', 'sum'): 0.00\n", 1313 | "('country_17', 'sum'): 0.00\n", 1314 | "('country_18', 'sum'): 0.00\n", 1315 | "('country_19', 'sum'): 0.00\n", 1316 | "('country_20', 'sum'): 0.00\n", 1317 | "('country_21', 'sum'): 0.00\n", 1318 | "('country_22', 'sum'): 0.00\n", 1319 | "('country_23', 'sum'): 0.00\n", 1320 | "('country_24', 'sum'): 0.00\n", 1321 | "('country_26', 'sum'): 0.00\n", 1322 | "('country_28', 'sum'): 0.00\n", 1323 | "('country_29', 'sum'): 0.00\n", 1324 | "('country_30', 'sum'): 0.00\n", 1325 | "('country_31', 'sum'): 0.00\n", 1326 | "('country_32', 'sum'): 0.00\n", 1327 | "('country_35', 'sum'): 0.00\n", 1328 | "('country_36', 'sum'): 0.00\n", 1329 | "('country_37', 'sum'): 0.00\n", 1330 | "('country_38', 'sum'): 0.00\n", 1331 | "('country_40', 'sum'): 0.00\n", 1332 | "('country_41', 'sum'): 0.00\n", 1333 | "('country_43', 'sum'): 0.00\n", 1334 | "('country_44', 'sum'): 0.00\n", 1335 | "('country_45', 'sum'): 0.00\n", 1336 | "('country_47', 'sum'): 0.00\n", 1337 | "('country_48', 'sum'): 0.00\n", 1338 | "('country_49', 'sum'): 0.00\n", 1339 | "('continent_2', 'sum'): 0.00\n", 1340 | "('continent_5', 'sum'): 0.00\n", 1341 | "ord_last_type: 0.00\n", 1342 | "ord_first_type: 0.00\n", 1343 | "act_num(type_9)(window_6): 0.00\n", 1344 | "act_column_rate(type_2)(window_3): 0.00\n", 1345 | "act_column_rate(type_8)(window_3): 0.00\n", 1346 | "act_column_rate(type_9)(window_3): 0.00\n", 1347 | "act_row_rate(type_2)(window_3): 0.00\n", 1348 | "act_row_rate(type_9)(window_3): 0.00\n", 1349 | "act_time_diff(3-4)(window_6)(type_7): 0.00\n", 1350 | "act_time_diff(5-6)(window_6)(type_8): 0.00\n", 1351 | "ord_last_type1_year: 0.00\n", 1352 | "ord_first_type1_year: 0.00\n", 1353 | "62: 0.00\n", 1354 | "64: 0.00\n", 1355 | "65: 0.00\n", 1356 | "66: 0.00\n", 1357 | "69: 0.00\n", 1358 | "72: 0.00\n", 1359 | "73: 0.00\n", 1360 | "75: 0.00\n", 1361 | "77: 0.00\n", 1362 | "79: 0.00\n", 1363 | "81: 0.00\n", 1364 | "82: 0.00\n", 1365 | "83: 0.00\n", 1366 | "84: 0.00\n", 1367 | "86: 0.00\n", 1368 | "87: 0.00\n", 1369 | "88: 0.00\n", 1370 | "89: 0.00\n", 1371 | "90: 0.00\n", 1372 | "91: 0.00\n", 1373 | "92: 0.00\n", 1374 | "94: 0.00\n", 1375 | "95: 0.00\n", 1376 | "96: 0.00\n", 1377 | "97: 0.00\n", 1378 | "98: 0.00\n", 1379 | "99: 0.00\n", 1380 | "100: 0.00\n", 1381 | "101: 0.00\n", 1382 | "102: 0.00\n", 1383 | "103: 0.00\n", 1384 | "105: 0.00\n", 1385 | "107: 0.00\n", 1386 | "109: 0.00\n", 1387 | "110: 0.00\n", 1388 | "111: 0.00\n", 1389 | "112: 0.00\n", 1390 | "115: 0.00\n", 1391 | "116: 0.00\n", 1392 | "117: 0.00\n", 1393 | "118: 0.00\n", 1394 | "120: 0.00\n", 1395 | "121: 0.00\n", 1396 | "122: 0.00\n", 1397 | "123: 0.00\n", 1398 | "124: 0.00\n", 1399 | "125: 0.00\n", 1400 | "126: 0.00\n", 1401 | "127: 0.00\n", 1402 | "128: 0.00\n", 1403 | "129: 0.00\n", 1404 | "130: 0.00\n", 1405 | "163: 0.00\n", 1406 | "165: 0.00\n", 1407 | "168: 0.00\n", 1408 | "174: 0.00\n", 1409 | "175: 0.00\n", 1410 | "195: 0.00\n", 1411 | "221: 0.00\n", 1412 | "223: 0.00\n", 1413 | "227: 0.00\n", 1414 | "229: 0.00\n", 1415 | "230: 0.00\n", 1416 | "231: 0.00\n", 1417 | "232: 0.00\n", 1418 | "233: 0.00\n", 1419 | "235: 0.00\n", 1420 | "236: 0.00\n", 1421 | "237: 0.00\n", 1422 | "239: 0.00\n", 1423 | "240: 0.00\n", 1424 | "242: 0.00\n", 1425 | "244: 0.00\n", 1426 | "246: 0.00\n", 1427 | "247: 0.00\n", 1428 | "248: 0.00\n", 1429 | "249: 0.00\n", 1430 | "250: 0.00\n", 1431 | "251: 0.00\n", 1432 | "252: 
0.00\n", 1433 | "254: 0.00\n", 1434 | "255: 0.00\n", 1435 | "256: 0.00\n", 1436 | "258: 0.00\n", 1437 | "259: 0.00\n", 1438 | "act_last_time-ord_type1_time_max: 0.00\n", 1439 | "act_first_time-ord_type1_time_min: 0.00\n", 1440 | "has_ord_serv_yes: 0.00\n", 1441 | "has_ord_serv_yes*act_num(type_2): 0.00\n", 1442 | "has_ord_serv_yes*act_num(type_3): 0.00\n", 1443 | "has_ord_serv_yes*act_num(type_4): 0.00\n", 1444 | "has_ord_serv_yes*act_num(type_9): 0.00\n" 1445 | ] 1446 | } 1447 | ], 1448 | "source": [ 1449 | "print(\"\\n\".join((\"%s: %.2f\" % x) for x in sorted(zip(train_feature.columns, model.feature_importance(\"gain\")), key=lambda x: x[1], reverse=True)))" 1450 | ] 1451 | }, 1452 | { 1453 | "cell_type": "code", 1454 | "execution_count": null, 1455 | "metadata": { 1456 | "collapsed": true 1457 | }, 1458 | "outputs": [], 1459 | "source": [] 1460 | }, 1461 | { 1462 | "cell_type": "code", 1463 | "execution_count": null, 1464 | "metadata": { 1465 | "collapsed": true 1466 | }, 1467 | "outputs": [], 1468 | "source": [ 1469 | "######################################### blending #########################################" 1470 | ] 1471 | }, 1472 | { 1473 | "cell_type": "code", 1474 | "execution_count": null, 1475 | "metadata": { 1476 | "collapsed": true 1477 | }, 1478 | "outputs": [], 1479 | "source": [ 1480 | "test1 = pd.read_csv('../data/output/sub/bjw/result_addUserid_0125_1.csv')\n", 1481 | "test2 = pd.read_csv('../data/output/sub/20180203-lgb-0.966497(r1843).csv')\n", 1482 | "test3 = pd.read_csv('../data/output/sub/shawn_lgb_local9641_online9646.csv')\n", 1483 | "test4 = pd.read_csv('../data/output/sub/ym/lz96490.csv')\n", 1484 | "testa = pd.merge(test1, test2, on='userid', how='left')\n", 1485 | "testb = pd.merge(test3, test4, on='userid', how='left')\n", 1486 | "test = pd.merge(testa, testb, on='userid', how='left')" 1487 | ] 1488 | }, 1489 | { 1490 | "cell_type": "code", 1491 | "execution_count": null, 1492 | "metadata": { 1493 | "collapsed": true 1494 | }, 1495 | "outputs": [], 1496 | "source": [ 1497 | "test['orderType'] = 0.5 * test['orderType_x_x'] + 0.3 * test['orderType_y_x'] + 0.1 * test['orderType_x_y'] + 0.1 * test['orderType_y_y']" 1498 | ] 1499 | }, 1500 | { 1501 | "cell_type": "code", 1502 | "execution_count": null, 1503 | "metadata": { 1504 | "collapsed": true, 1505 | "scrolled": true 1506 | }, 1507 | "outputs": [], 1508 | "source": [ 1509 | "test[['userid','orderType']].to_csv('../data/output/sub/blend/20180203-0.5bjw+0.3+0.1+0.1ym.csv',index=False)" 1510 | ] 1511 | } 1512 | ], 1513 | "metadata": { 1514 | "kernelspec": { 1515 | "display_name": "Python [default]", 1516 | "language": "python", 1517 | "name": "python2" 1518 | }, 1519 | "language_info": { 1520 | "codemirror_mode": { 1521 | "name": "ipython", 1522 | "version": 2 1523 | }, 1524 | "file_extension": ".py", 1525 | "mimetype": "text/x-python", 1526 | "name": "python", 1527 | "nbconvert_exporter": "python", 1528 | "pygments_lexer": "ipython2", 1529 | "version": "2.7.13" 1530 | } 1531 | }, 1532 | "nbformat": 4, 1533 | "nbformat_minor": 1 1534 | } 1535 | --------------------------------------------------------------------------------