├── src
│   ├── util.pyc
│   ├── util.py
│   ├── model.py
│   ├── feat.py
│   ├── feat.ipynb
│   └── model.ipynb
├── .gitignore
├── data
│   ├── input
│   │   └── README.md
│   └── output
│       └── README.md
└── README.md

/src/util.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShawnyXiao/2018-ICC-TravelService/HEAD/src/util.pyc
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.idea
data/input/test
data/input/train
data/output/sub
data/output/feat
data/output/feat_imp
src/.ipynb_checkpoints
--------------------------------------------------------------------------------
/src/util.py:
--------------------------------------------------------------------------------
# coding=utf-8
import time


def log(stri):
    # Print a message prefixed with the current timestamp.
    now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print(str(now) + ' ' + str(stri))
--------------------------------------------------------------------------------
/data/input/README.md:
--------------------------------------------------------------------------------
# Input data

The files are too large to be uploaded to GitHub. If you need the data, feel free to open an issue to contact me. The directory structure is as follows:

```
input
│  README.md
│
├─test
│      action_test.csv
│      orderFuture_test.csv
│      orderHistory_test.csv
│      userComment_test.csv
│      userProfile_test.csv
│
└─train
        action_train.csv
        orderFuture_train.csv
        orderHistory_train.csv
        userComment_train.csv
        userProfile_train.csv
```
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# 2018-ICC-精品旅行服务成单预测

This is one of the problems of the 2nd Smart China Cup (ICC): **[精品旅行服务成单预测 (Boutique Travel Service Order Prediction)](http://www.dcjingsai.com/common/cmpt/%E7%B2%BE%E5%93%81%E6%97%85%E8%A1%8C%E6%9C%8D%E5%8A%A1%E6%88%90%E5%8D%95%E9%A2%84%E6%B5%8B_%E7%AB%9E%E8%B5%9B%E4%BF%A1%E6%81%AF.html)**. I entered quite late, roughly in the final stage of the competition. My team, "魂斗罗", ultimately won an **Excellence Award**, ranking **14th/1135**. This competition is a good place to learn quite a few **tricks for mining features from sequence data**.

## Tricks for mining features from sequence data

1. Since a user's most recent actions most likely reflect their current intent, I keep the types and timestamps of the last k actions as k features each;
2. Compute statistics over the last k action types and timestamps to build windowed statistical features;
3. Compute statistics over all action types and timestamps to build global statistical features;
4. Take first- and second-order differences of the time series and keep the latest k difference values as k features;
5. Compute statistics over those k first- and second-order difference values;
6. Compute statistics over all first- and second-order difference values to build global statistical features;
7. Group by action type, then take first- and second-order differences of the time series and keep the latest k difference values as k features;
8. ...

And so on. Experiments showed that the difference features are very effective; they were the key to improving the score in this competition. A minimal sketch of a few of these tricks follows.
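The sketch below makes tricks 1, 4, and 5 concrete. It assumes an action log with columns `userid` and `actionTime`, as in `action_train.csv`; the function name and output column names are illustrative only and do not exactly match those used in `src/feat.py`:

```
import pandas as pd

def last_k_diff_features(actions, k=6):
    # Trick 1: keep each user's k most recent actions
    recent = (actions.sort_values(['userid', 'actionTime'])
                     .groupby('userid').tail(k).copy())
    # Rank 1 = most recent action
    recent['rank'] = (recent.groupby('userid')['actionTime']
                            .rank(method='first', ascending=False).astype(int))
    wide = recent.pivot(index='userid', columns='rank', values='actionTime')
    wide = wide[wide.columns[::-1]]            # oldest -> newest, as in src/feat.py
    # Trick 4: first- and second-order differences of the timestamps
    diff1 = wide.diff(1, axis=1).iloc[:, 1:]
    diff2 = diff1.diff(1, axis=1).iloc[:, 1:]
    feats = pd.DataFrame(index=wide.index)
    for name, d in [('diff1', diff1), ('diff2', diff2)]:
        for rank in d.columns:
            feats['act_time_%s(rank_%d)' % (name, rank)] = d[rank]
        # Trick 5: statistics over the k difference values
        feats['act_time_%s_mean' % name] = d.mean(axis=1)
        feats['act_time_%s_std' % name] = d.std(axis=1)
    return feats.reset_index()
```

`src/feat.py` below implements the same idea once per window size, per difference order, and per action type, writing each feature group to `data/output/feat/` for `src/model.py` to merge.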
## Hey!

If you have any thoughts, for example you found a bug somewhere, you think my explanation of some method is incorrect or not thorough enough, or you have a more creative take, feel free to open an issue or a pull request at any time, or discuss it with me directly! And if you could star or fork this project to encourage me, someone who has just stepped into data mining, I would be deeply grateful~
--------------------------------------------------------------------------------
/data/output/README.md:
--------------------------------------------------------------------------------
# Output data

The files are too large to be uploaded to GitHub. If you need the data, feel free to open an issue to contact me. The directory structure is as follows:

```
output
│  README.md
│
├─feat
│  │  action_num_based_on_time_last_window13
│  │  action_num_based_on_time_last_window17
│  │  action_num_based_on_time_last_window20
│  │  action_num_based_on_time_last_window25
│  │  action_num_based_on_time_last_window30
│  │  action_num_based_on_time_last_window4
│  │  action_num_based_on_time_last_window7
│  │  action_order_time_diff
│  │  action_real_time_based_on_time_last_window10_on_type1
│  │  action_real_time_based_on_time_last_window10_on_type5
│  │  action_real_time_based_on_time_last_window10_on_type6
│  │  action_real_time_based_on_time_last_window10_on_type7
│  │  action_real_time_based_on_time_last_window10_on_type8
│  │  action_real_time_based_on_time_last_window10_on_type9
│  │  action_real_time_based_on_time_last_window1_on_type1
│  │  action_real_time_based_on_time_last_window1_on_type2
│  │  action_real_time_based_on_time_last_window1_on_type3
│  │  action_real_time_based_on_time_last_window1_on_type4
│  │  action_real_time_based_on_time_last_window1_on_type5
│  │  action_real_time_based_on_time_last_window1_on_type6
│  │  action_real_time_based_on_time_last_window1_on_type7
│  │  action_real_time_based_on_time_last_window1_on_type8
│  │  action_real_time_based_on_time_last_window1_on_type9
│  │  action_real_time_based_on_time_last_window4_on_type1
│  │  action_real_time_based_on_time_last_window4_on_type5
│  │  action_real_time_based_on_time_last_window4_on_type6
│  │  action_real_time_based_on_time_last_window4_on_type7
│  │  action_real_time_based_on_time_last_window4_on_type8
│  │  action_real_time_based_on_time_last_window4_on_type9
│  │  action_real_time_based_on_time_last_window7_on_type1
│  │  action_real_time_based_on_time_last_window7_on_type5
│  │  action_real_time_based_on_time_last_window7_on_type6
│  │  action_real_time_based_on_time_last_window7_on_type7
│  │  action_real_time_based_on_time_last_window7_on_type8
│  │  action_real_time_based_on_time_last_window7_on_type9
│  │  action_sequence_time_diff_window10
│  │  action_sequence_time_diff_window11
│  │  action_sequence_time_diff_window12
│  │  action_sequence_time_diff_window15
│  │  action_sequence_time_diff_window2
│  │  action_sequence_time_diff_window3
│  │  action_sequence_time_diff_window4
│  │  action_sequence_time_diff_window5
│  │  action_sequence_time_diff_window6
│  │  action_sequence_time_diff_window7
│  │  action_sequence_time_diff_window8
│  │  action_sequence_time_diff_window9
│  │  action_sequence_time_stat_last123
│  │  action_stat_last_every_type
│  │  action_time_2order_based_on_time_last_window10
│  │  action_time_2order_based_on_time_last_window3
│  │  action_time_2order_based_on_time_last_window4
│  │  action_time_2order_based_on_time_last_window5
│  │  action_time_2order_based_on_time_last_window6
│  │  action_time_2order_based_on_time_last_window7
│  │  action_time_2order_based_on_time_last_window8
│  │  action_time_2order_based_on_time_last_window9
│  │  action_time_based_on_time
│  │  action_time_based_on_time_last_window10
│  │  action_time_based_on_time_last_window11
│  │  action_time_based_on_time_last_window12
│  │  action_time_based_on_time_last_window15
│  │  action_time_based_on_time_last_window15_on_type1
│  │  action_time_based_on_time_last_window15_on_type2
│  │  action_time_based_on_time_last_window15_on_type3
│  │  action_time_based_on_time_last_window15_on_type4
│  │  action_time_based_on_time_last_window15_on_type5
│  │  action_time_based_on_time_last_window15_on_type6
│  │  action_time_based_on_time_last_window15_on_type7
│  │  action_time_based_on_time_last_window15_on_type8
│  │  action_time_based_on_time_last_window15_on_type9
│  │  action_time_based_on_time_last_window3
│  │  action_time_based_on_time_last_window6
│  │  action_time_based_on_time_last_window6_on_type1
│  │  action_time_based_on_time_last_window6_on_type5
│  │  action_time_based_on_time_last_window6_on_type6
│  │  action_time_based_on_time_last_window6_on_type7
│  │  action_time_based_on_time_last_window6_on_type8
│  │  action_time_based_on_time_last_window6_on_type9
│  │  action_time_based_on_time_last_window7
│  │  action_time_based_on_time_last_window7_on_type1
│  │  action_time_based_on_time_last_window7_on_type2
│  │  action_time_based_on_time_last_window7_on_type3
│  │  action_time_based_on_time_last_window7_on_type4
│  │  action_time_based_on_time_last_window7_on_type5
│  │  action_time_based_on_time_last_window7_on_type6
│  │  action_time_based_on_time_last_window7_on_type7
│  │  action_time_based_on_time_last_window7_on_type8
│  │  action_time_based_on_time_last_window7_on_type9
│  │  action_time_based_on_time_last_window8
│  │  action_time_based_on_time_last_window9
│  │  action_time_diff2_based_on_time_last_window3
│  │  action_time_diff2_based_on_time_last_window4
│  │  action_time_diff2_based_on_time_last_window5
│  │  action_time_diff2_based_on_time_last_window6
│  │  action_time_diff2_based_on_time_last_window7
│  │  action_time_diff2_based_on_time_last_window8
│  │  action_time_diff_234_56789_last_window6
│  │  action_time_diff_stat
│  │  action_time_diff_stat_last_window3
│  │  action_time_diff_stat_last_window4
│  │  action_time_diff_stat_last_window5
│  │  action_time_diff_stat_last_window6
│  │  action_time_diff_stat_last_window7
│  │  action_time_diff_stat_last_window8
│  │  action_time_diff_stat_last_window9
│  │  action_time_last_on_every_type
│  │  action_time_row_stat_based_on_time_last_window10
│  │  action_time_row_stat_based_on_time_last_window14
│  │  action_time_row_stat_based_on_time_last_window3
│  │  action_time_row_stat_based_on_time_last_window6
│  │  action_type
│  │  action_type_based_on_time
│  │  action_type_based_on_time_last_window3
│  │  action_type_based_on_time_last_window4
│  │  action_type_based_on_time_last_window5
│  │  action_type_based_on_time_last_window6
│  │  action_type_based_on_time_last_window7
│  │  action_type_num_based_on_time_last_window10
│  │  action_type_num_based_on_time_last_window11
│  │  action_type_num_based_on_time_last_window12
│  │  action_type_num_based_on_time_last_window13
│  │  action_type_num_based_on_time_last_window14
│  │  action_type_num_based_on_time_last_window15
│  │  action_type_num_based_on_time_last_window17
│  │  action_type_num_based_on_time_last_window2
│  │  action_type_num_based_on_time_last_window20
│  │  action_type_num_based_on_time_last_window25
│  │  action_type_num_based_on_time_last_window3
│  │  action_type_num_based_on_time_last_window30
│  │  action_type_num_based_on_time_last_window4
│  │  action_type_num_based_on_time_last_window5
│  │  action_type_num_based_on_time_last_window6
│  │  action_type_num_based_on_time_last_window7
│  │  action_type_num_based_on_time_last_window8
│  │  action_type_num_based_on_time_last_window9
│  │  action_type_rate_based_on_time_last_window10
│  │  action_type_rate_based_on_time_last_window11
│  │  action_type_rate_based_on_time_last_window12
│  │  action_type_rate_based_on_time_last_window13
│  │  action_type_rate_based_on_time_last_window14
│  │  action_type_rate_based_on_time_last_window15
│  │  action_type_rate_based_on_time_last_window2
│  │  action_type_rate_based_on_time_last_window20
│  │  action_type_rate_based_on_time_last_window3
│  │  action_type_rate_based_on_time_last_window4
│  │  action_type_rate_based_on_time_last_window5
│  │  action_type_rate_based_on_time_last_window6
│  │  action_type_rate_based_on_time_last_window7
│  │  action_type_rate_based_on_time_last_window8
│  │  action_type_rate_based_on_time_last_window9
│  │  action_type_row_stat_based_on_time_last_window6
│  │  action_type_row_stat_based_on_time_last_window9
│  │  act_ord_act_time_diff_last_window10
│  │  act_ord_act_time_diff_last_window11
│  │  act_ord_act_time_diff_last_window12
│  │  act_ord_act_time_diff_last_window13
│  │  act_ord_act_time_diff_last_window14
│  │  act_ord_act_time_diff_last_window15
│  │  act_ord_act_time_diff_last_window3
│  │  act_ord_act_time_diff_last_window6
│  │  act_ord_act_time_diff_last_window7
│  │  act_ord_act_time_diff_last_window8
│  │  act_ord_act_time_diff_last_window9
│  │  act_ord_before_type1_stat
│  │  act_ord_type1_act_time_diff_last_window14
│  │  act_ord_type1_act_time_diff_last_window2
│  │  act_ord_type1_act_time_diff_last_window3
│  │  act_ord_type1_act_time_diff_last_window4
│  │  act_ord_type1_act_time_diff_last_window6
│  │  act_ord_type1_act_time_diff_last_window9
│  │  order_history
│  │  order_history_last_w
│  │  order_last_order_ydm
│  │  order_type1_ydm
│  │  try
│  │  user_comment
│  │  user_profile
│  │
│  ├─bjw
│  │      all_features_test.csv
│  │      all_features_train.csv
│  │      test_fea.csv
│  │      train_fea.csv
│  │
│  └─stack
│          lgb_prob_test(offline_0.966529).csv
│          lgb_prob_train(offline_0.966529).csv
│
├─feat_imp
│      importance-20180112-0.951112(r1200).csv
│      importance-20180114-0.958592(r1622).csv
│
└─sub
    │  20180112-xgb-0.951112(r1200).csv
    │  20180114-xgb-0.958592(r1622).csv
    │  20180121-lgb-0.961160(r1389).csv
    │  20180123-lgb-0.963202(r1648).csv
    │  20180123-xgb-0.961940(r1742).csv
    │  20180127-lgb-0.965033(r2186).csv
    │  20180128-lgb-0.966097(r1864).csv
    │  20180131-lgb-0.966333(r2566).csv
    │  20180201-lgb-0.966529(r2245).csv
    │  20180202-lgb-0.966497(r1843).csv
    │  20180203-lgb-0.966497(r1843).csv
    │  20180207-lgb-0.970125(r1700).csv
    │  20180210-lgb-0.970330(r2344).csv
    │  shawn_lgb_local9641_online9646.csv
    │  结果提交样例.csv
    │
    ├─bjw
    │      result_addUserid_0125_1.csv
    │
    ├─blend
    │      20180127-0.5+0.5-0.96619.csv
    │      20180128-0.5bjw+0.3+0.2-0.97009.csv
    │      20180128-0.65bjw+0.35-0.96969.csv
    │      20180128-0.6bjw+0.4-0.96979.csv
    │      20180128-0.7bjw+0.3-0.96950.csv
    │      20180203-0.5bjw+0.3+0.1+0.1ym-0.97070.csv
    │      20180203-0.6bjw+0.2+0.1+0.1ym-0.97070.csv
    │
    └─ym
            lz96490.csv
```
--------------------------------------------------------------------------------
/src/model.py:
--------------------------------------------------------------------------------

# coding: utf-8

# In[2]:

from __future__ import division, print_function
import numpy as np
import pandas as pd
from sklearn import preprocessing
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
import time
import datetime
import warnings
warnings.filterwarnings('ignore')
import util


# In[3]:

def merge_feature(
    act_type_window,
    act_type_num_window,
    act_type_rate_window,
    act_type_row_stat_window,
    act_time_window,
    act_time_1type_window,
    act_ord_act_time_diff_window,
    action_sequence_time_diff_window,
    action_time_diff_234_56789_window,
    action_time_diff_stat_window
):
    util.log('Merge feature...')

    order_future_tr = pd.read_csv('../data/input/train/orderFuture_train.csv')
    order_future_te = pd.read_csv('../data/input/test/orderFuture_test.csv')

    user_profile = pd.read_csv('../data/output/feat/%s' % 'user_profile')
    train = pd.merge(order_future_tr, user_profile, on='userid', how='left')
    test = pd.merge(order_future_te, user_profile, on='userid', how='left')

    user_comment = pd.read_csv('../data/output/feat/%s' % 'user_comment')
    train = pd.merge(train, user_comment, on='userid', how='left')
    test = pd.merge(test, user_comment, on='userid', how='left')

    order_history = pd.read_csv('../data/output/feat/%s' % 'order_history')
    train = pd.merge(train, order_history, on='userid', how='left')
    test = pd.merge(test, order_history, on='userid', how='left')

    # order_history_last_w = pd.read_csv('../data/output/feat/%s' % 'order_history_last_w')
    # train = pd.merge(train, order_history_last_w, on='userid', how='left')
    # test = pd.merge(test, order_history_last_w, on='userid', how='left')

    action_type = pd.read_csv('../data/output/feat/%s' % 'action_type')
    train = pd.merge(train, action_type, on='userid', how='left')
    test = pd.merge(test, action_type, on='userid', how='left')

    action_type_based_on_time = pd.read_csv('../data/output/feat/%s' % 'action_type_based_on_time')
    train = pd.merge(train, action_type_based_on_time, on='userid', how='left')
    test = pd.merge(test, action_type_based_on_time, on='userid', how='left')

    util.log('act_type_window=' + str(act_type_window))
    window = act_type_window
    action_type_based_on_time_last_window = pd.read_csv('../data/output/feat/%s%d' % ('action_type_based_on_time_last_window', window))
    train = pd.merge(train, action_type_based_on_time_last_window, on='userid', how='left')
    test = pd.merge(test, action_type_based_on_time_last_window, on='userid', how='left')

    util.log('act_type_num_window=' + str(act_type_num_window))
    window = act_type_num_window
    action_type_num_based_on_time_last_window = pd.read_csv('../data/output/feat/%s%d' % ('action_type_num_based_on_time_last_window', window))
    train = pd.merge(train, action_type_num_based_on_time_last_window, on='userid', how='left')
    test = pd.merge(test, action_type_num_based_on_time_last_window, on='userid', how='left')

    util.log('act_type_rate_window=' + str(act_type_rate_window))
    window = act_type_rate_window
    action_type_rate_based_on_time_last_window = pd.read_csv('../data/output/feat/%s%d' % ('action_type_rate_based_on_time_last_window', window))
    train = pd.merge(train, action_type_rate_based_on_time_last_window, on='userid', how='left')
    test = pd.merge(test, action_type_rate_based_on_time_last_window, on='userid', how='left')

    util.log('act_type_row_stat_window=' + str(act_type_row_stat_window))
    window = act_type_row_stat_window
    action_type_row_stat_based_on_time_last_window_feat = pd.read_csv('../data/output/feat/%s%d' % ('action_type_row_stat_based_on_time_last_window', window))
    train = pd.merge(train, action_type_row_stat_based_on_time_last_window_feat, on='userid', how='left')
    test = pd.merge(test, action_type_row_stat_based_on_time_last_window_feat, on='userid', how='left')

    # util.log('action_num_window=' + str(action_num_window))
    # window = action_num_window
    # action_num_based_on_time_last_window = pd.read_csv('../data/output/feat/%s%d' % ('action_num_based_on_time_last_window', window))
    # train = pd.merge(train, action_num_based_on_time_last_window, on='userid', how='left')
    # test = pd.merge(test, action_num_based_on_time_last_window, on='userid', how='left')

    # util.log('action_type_num_window=' + str(action_type_num_window))
    # window = action_type_num_window
    # action_type_num_based_on_time_last_window = pd.read_csv('../data/output/feat/%s%d' % ('action_type_num_based_on_time_last_window', window))
    # train = pd.merge(train, action_type_num_based_on_time_last_window, on='userid', how='left')
    # test = pd.merge(test, action_type_num_based_on_time_last_window, on='userid', how='left')

    action_time_based_on_time = pd.read_csv('../data/output/feat/%s' % 'action_time_based_on_time')
    train = pd.merge(train, action_time_based_on_time, on='userid', how='left')
    test = pd.merge(test, action_time_based_on_time, on='userid', how='left')

    util.log('act_time_window=' + str(act_time_window))
    window = act_time_window
    action_time_based_on_time_last_window = pd.read_csv('../data/output/feat/%s%d' % ('action_time_based_on_time_last_window', window))
    train = pd.merge(train, action_time_based_on_time_last_window, on='userid', how='left')
    test = pd.merge(test, action_time_based_on_time_last_window, on='userid', how='left')

    # util.log('act_time_row_stat_window=' + str(act_time_row_stat_window))
    # window = act_time_row_stat_window
    # action_time_row_stat_based_on_time_last_window = pd.read_csv('../data/output/feat/%s%d' % ('action_time_row_stat_based_on_time_last_window', window))
    # train = pd.merge(train, action_time_row_stat_based_on_time_last_window, on='userid', how='left')
    # test = pd.merge(test, action_time_row_stat_based_on_time_last_window, on='userid', how='left')

    # util.log('action_time_diff2_window=' + str(action_time_diff2_window))
    # window = action_time_diff2_window
    # action_time_diff2_based_on_time_last_window = pd.read_csv('../data/output/feat/%s%d' % ('action_time_diff2_based_on_time_last_window', window))
    # train = pd.merge(train, action_time_diff2_based_on_time_last_window, on='userid', how='left')
    # test = pd.merge(test, action_time_diff2_based_on_time_last_window, on='userid', how='left')

    util.log('act_time_1type_window=%d' % act_time_1type_window)
    window = act_time_1type_window
    for ttype in [1, 5, 6, 7, 8, 9]:
        action_time_based_on_time_last_window_on_type = pd.read_csv('../data/output/feat/%s%d%s%d' % ('action_time_based_on_time_last_window', window, '_on_type', ttype))
        train = pd.merge(train, action_time_based_on_time_last_window_on_type, on='userid', how='left')
        test = pd.merge(test, action_time_based_on_time_last_window_on_type, on='userid', how='left')

    # util.log('action_time_2order_window=' + str(action_time_2order_window))
    # window = action_time_2order_window
    # action_time_2order_based_on_time_last_window = pd.read_csv('../data/output/feat/%s%d' % ('action_time_2order_based_on_time_last_window', window))
    # train = pd.merge(train, action_time_2order_based_on_time_last_window, on='userid', how='left')
    # test = pd.merge(test, action_time_2order_based_on_time_last_window, on='userid', how='left')

    # util.log('act_real_time_1type_window=%d' % act_real_time_1type_window)
    # window = act_real_time_1type_window
    # for ttype in [1, 5, 6, 7, 8, 9]:
    #     action_real_time_based_on_time_last_window_on_type = pd.read_csv('../data/output/feat/%s%d%s%d' % ('action_real_time_based_on_time_last_window', window, '_on_type', ttype))
    #     train = pd.merge(train, action_real_time_based_on_time_last_window_on_type, on='userid', how='left')
    #     test = pd.merge(test, action_real_time_based_on_time_last_window_on_type, on='userid', how='left')

    # action_order_time_diff = pd.read_csv('../data/output/feat/%s' % 'action_order_time_diff')
    # train = pd.merge(train, action_order_time_diff, on='userid', how='left')
    # test = pd.merge(test, action_order_time_diff, on='userid', how='left')

    # order_last_order_ydm = pd.read_csv('../data/output/feat/%s' % 'order_last_order_ydm')
    # train = pd.merge(train, order_last_order_ydm, on='userid', how='left')
    # test = pd.merge(test, order_last_order_ydm, on='userid', how='left')

    order_type1_ydm = pd.read_csv('../data/output/feat/%s' % 'order_type1_ydm')
    train = pd.merge(train, order_type1_ydm, on='userid', how='left')
    test = pd.merge(test, order_type1_ydm, on='userid', how='left')

    util.log('act_ord_act_time_diff_window=' + str(act_ord_act_time_diff_window))
    window = act_ord_act_time_diff_window
    act_ord_act_time_diff_last_window = pd.read_csv('../data/output/feat/%s%d' % ('act_ord_act_time_diff_last_window', window))
    train = pd.merge(train, act_ord_act_time_diff_last_window, on='userid', how='left')
    test = pd.merge(test, act_ord_act_time_diff_last_window, on='userid', how='left')

    # util.log('act_ord_type1_act_time_diff_window=' + str(act_ord_type1_act_time_diff_window))
    # window = act_ord_type1_act_time_diff_window
    # act_ord_type1_act_time_diff_last_window = pd.read_csv('../data/output/feat/%s%d' % ('act_ord_type1_act_time_diff_last_window', window))
    # train = pd.merge(train, act_ord_type1_act_time_diff_last_window, on='userid', how='left')
    # test = pd.merge(test, act_ord_type1_act_time_diff_last_window, on='userid', how='left')

    util.log('action_sequence_time_diff_window=' + str(action_sequence_time_diff_window))
    window = action_sequence_time_diff_window
    # Local renamed so the window argument of the same name is not shadowed
    action_sequence_time_diff = pd.read_csv('../data/output/feat/%s%d' % ('action_sequence_time_diff_window', window))
    train = pd.merge(train, action_sequence_time_diff, on='userid', how='left')
    test = pd.merge(test, action_sequence_time_diff, on='userid', how='left')

    # action_sequence_time_stat_last123 = pd.read_csv('../data/output/feat/%s' % 'action_sequence_time_stat_last123')
    # train = pd.merge(train, action_sequence_time_stat_last123, on='userid', how='left')
    # test = pd.merge(test, action_sequence_time_stat_last123, on='userid', how='left')

    util.log('action_time_diff_234_56789_window=' + str(action_time_diff_234_56789_window))
    window = action_time_diff_234_56789_window
    action_time_diff_234_56789_last_window = pd.read_csv('../data/output/feat/%s%d' % ('action_time_diff_234_56789_last_window', window))
    train = pd.merge(train, action_time_diff_234_56789_last_window, on='userid', how='left')
    test = pd.merge(test, action_time_diff_234_56789_last_window, on='userid', how='left')

    # action_stat_last_every_type = pd.read_csv('../data/output/feat/%s' % 'action_stat_last_every_type')
    # train = pd.merge(train, action_stat_last_every_type, on='userid', how='left')
    # test = pd.merge(test, action_stat_last_every_type, on='userid', how='left')

    # act_ord_before_type1_stat = pd.read_csv('../data/output/feat/%s' % 'act_ord_before_type1_stat')
    # train = pd.merge(train, act_ord_before_type1_stat, on='userid', how='left')
    # test = pd.merge(test, act_ord_before_type1_stat, on='userid', how='left')

    action_time_diff_stat = pd.read_csv('../data/output/feat/%s' % 'action_time_diff_stat')
    train = pd.merge(train, action_time_diff_stat, on='userid', how='left')
    test = pd.merge(test, action_time_diff_stat, on='userid', how='left')

    util.log('action_time_diff_stat_window=' + str(action_time_diff_stat_window))
    window = action_time_diff_stat_window
    action_time_diff_stat_last_window = pd.read_csv('../data/output/feat/%s%d' % ('action_time_diff_stat_last_window', window))
    train = pd.merge(train, action_time_diff_stat_last_window, on='userid', how='left')
    test = pd.merge(test, action_time_diff_stat_last_window, on='userid', how='left')

    # action_time_last_on_every_type = pd.read_csv('../data/output/feat/%s' % 'action_time_last_on_every_type')
    # train = pd.merge(train, action_time_last_on_every_type, on='userid', how='left')
    # test = pd.merge(test, action_time_last_on_every_type, on='userid', how='left')

    # bjw: users who appear in the comment data but not in the order data are flagged as 1
    bjw_train = pd.read_csv('../data/output/feat/bjw/train_fea.csv')
    bjw_test = pd.read_csv('../data/output/feat/bjw/test_fea.csv')
    train = pd.merge(train, bjw_train, on='userid', how='left')
    test = pd.merge(test, bjw_test, on='userid', how='left')

    # Features open-sourced by others; I re-implemented part of them based on my own understanding
    tryy = pd.read_csv('../data/output/feat/%s' % 'try')
    train = pd.merge(train, tryy, on='userid', how='left')
    test = pd.merge(test, tryy, on='userid', how='left')

    # bjw's features
    bjw_train = pd.read_csv('../data/output/feat/bjw/all_features_train.csv').drop(['Unnamed: 0', 'orderType'], axis=1)
    bjw_train.columns = ['userid' if i == 0 else i for i in range(len(bjw_train.columns))]
    bjw_test = pd.read_csv('../data/output/feat/bjw/all_features_test.csv').drop(['Unnamed: 0'], axis=1)
    bjw_test.columns = ['userid' if i == 0 else i for i in range(len(bjw_test.columns))]
    train = pd.merge(train, bjw_train, on='userid', how='left')
    test = pd.merge(test, bjw_test, on='userid', how='left')

    #################################################################################################################

    # Only loaded to build cross features; dropped again after use
    window = 1
    for ttype in [1, 2, 3, 4, 5, 6, 7, 8, 9]:
        action_real_time_based_on_time_last_window_on_type = pd.read_csv('../data/output/feat/%s%d%s%d' % ('action_real_time_based_on_time_last_window', window, '_on_type', ttype))
        train = pd.merge(train, action_real_time_based_on_time_last_window_on_type, on='userid', how='left')
        test = pd.merge(test, action_real_time_based_on_time_last_window_on_type, on='userid', how='left')

    train, test = cross_feature(train, test)

    train, test = drop_duplicate_column(train, test)

    train_feature = train.drop(['orderType'], axis=1)
    train_label = train.orderType.values
    test_feature = test
    test_index = test.userid.values

    return train_feature, train_label, test_feature, test_index


# In[4]:

def cross_feature(train, test):
    util.log('Cross feature...')

    # Time difference between the latest action and the latest order
    train['act_last_time-ord_last_time'] = train['act_last_time'] - train['ord_last_time']
    train['act_last_time-ord_type0_time_max'] = train['act_last_time'] - train['ord_type0_time_max']
    train['act_last_time-ord_type1_time_max'] = train['act_last_time'] - train['ord_type1_time_max']
    test['act_last_time-ord_last_time'] = test['act_last_time'] - test['ord_last_time']
    test['act_last_time-ord_type0_time_max'] = test['act_last_time'] - test['ord_type0_time_max']
    test['act_last_time-ord_type1_time_max'] = test['act_last_time'] - test['ord_type1_time_max']

    # Time difference between the earliest action and the earliest order
    train['act_first_time-ord_first_time'] = train['act_first_time'] - train['ord_first_time']
    train['act_first_time-ord_type0_time_min'] = train['act_first_time'] - train['ord_type0_time_min']
    train['act_first_time-ord_type1_time_min'] = train['act_first_time'] - train['ord_type1_time_min']
    test['act_first_time-ord_first_time'] = test['act_first_time'] - test['ord_first_time']
    test['act_first_time-ord_type0_time_min'] = test['act_first_time'] - test['ord_type0_time_min']
    test['act_first_time-ord_type1_time_min'] = test['act_first_time'] - test['ord_type1_time_min']

    # Time difference between the latest action and the latest action of each type,
    # plus between the earliest action and the earliest action of each type
    for ttype in [1, 2, 3, 4, 5, 6, 7, 8, 9]:
        train['act_last_time-act_time(rank_1)(window_1)(type_%d)' % ttype] = train['act_last_time'] - train['act_time(rank_1)(window_1)(type_%d)' % ttype]
        train['act_first_time-act_time(rank_1)(window_1)(type_%d)' % ttype] = train['act_first_time'] - train['act_time(rank_1)(window_1)(type_%d)' % ttype]
        test['act_last_time-act_time(rank_1)(window_1)(type_%d)' % ttype] = test['act_last_time'] - test['act_time(rank_1)(window_1)(type_%d)' % ttype]
        test['act_first_time-act_time(rank_1)(window_1)(type_%d)' % ttype] = test['act_first_time'] - test['act_time(rank_1)(window_1)(type_%d)' % ttype]
        train = train.drop(['act_time(rank_1)(window_1)(type_%d)' % ttype], axis=1)
        test = test.drop(['act_time(rank_1)(window_1)(type_%d)' % ttype], axis=1)

    # Whether the user has ever ordered the boutique service * time of the latest action
    tmp = train['ord_num(type_1)'].copy()
    tmp[tmp > 1] = 1
    tmp = pd.get_dummies(tmp.fillna(-1))
    tmp.columns = ['has_ord_serv_nan', 'has_ord_serv_no', 'has_ord_serv_yes']
    train = pd.concat([train, tmp.mul(train['act_last_time'], axis=0)], axis=1)
    tmp = test['ord_num(type_1)'].copy()
    tmp[tmp > 1] = 1
    tmp = pd.get_dummies(tmp.fillna(-1))
    tmp.columns = ['has_ord_serv_nan', 'has_ord_serv_no', 'has_ord_serv_yes']
    test = pd.concat([test, tmp.mul(test['act_last_time'], axis=0)], axis=1)

    # Whether the user has ever ordered the boutique service * number of actions of each type
    tmp = train['ord_num(type_1)'].copy()
    tmp[tmp > 1] = 1
    tmp = pd.get_dummies(tmp.fillna(-1))
    tmp.columns = ['has_ord_serv_nan', 'has_ord_serv_no', 'has_ord_serv_yes']
    for ttype in [1, 2, 3, 4, 5, 6, 7, 8, 9]:
        train = train.join(tmp.mul(train['act_num(type_%d)' % ttype], axis=0), rsuffix='*act_num(type_%d)' % ttype)
    tmp = test['ord_num(type_1)'].copy()
    tmp[tmp > 1] = 1
    tmp = pd.get_dummies(tmp.fillna(-1))
    tmp.columns = ['has_ord_serv_nan', 'has_ord_serv_no', 'has_ord_serv_yes']
    for ttype in [1, 2, 3, 4, 5, 6, 7, 8, 9]:
        test = test.join(tmp.mul(test['act_num(type_%d)' % ttype], axis=0), rsuffix='*act_num(type_%d)' % ttype)

    # # Time difference between the latest order and the latest action of each type,
    # # plus between the earliest order and the earliest action of each type (all/0/1)
    # for ttype in [1, 2, 3, 4, 5, 6, 7, 8, 9]:
    #     train['ord_last_time-act_time(rank_1)(window_1)(type_%d)' % ttype] = train['ord_last_time'] - train['act_time(rank_1)(window_1)(type_%d)' % ttype]
    #     train['ord_type0_time_max-act_time(rank_1)(window_1)(type_%d)' % ttype] = train['ord_type0_time_max'] - train['act_time(rank_1)(window_1)(type_%d)' % ttype]
    #     train['ord_type1_time_max-act_time(rank_1)(window_1)(type_%d)' % ttype] = train['ord_type1_time_max'] - train['act_time(rank_1)(window_1)(type_%d)' % ttype]
    #     train['ord_first_time-act_time(rank_1)(window_1)(type_%d)' % ttype] = train['ord_first_time'] - train['act_time(rank_1)(window_1)(type_%d)' % ttype]
    #     train['ord_type0_time_min-act_time(rank_1)(window_1)(type_%d)' % ttype] = train['ord_type0_time_min'] - train['act_time(rank_1)(window_1)(type_%d)' % ttype]
    #     train['ord_type1_time_min-act_time(rank_1)(window_1)(type_%d)' % ttype] = train['ord_type1_time_min'] - train['act_time(rank_1)(window_1)(type_%d)' % ttype]
    #     test['ord_last_time-act_time(rank_1)(window_1)(type_%d)' % ttype] = test['ord_last_time'] - test['act_time(rank_1)(window_1)(type_%d)' % ttype]
    #     test['ord_type0_time_max-act_time(rank_1)(window_1)(type_%d)' % ttype] = test['ord_type0_time_max'] - test['act_time(rank_1)(window_1)(type_%d)' % ttype]
    #     test['ord_type1_time_max-act_time(rank_1)(window_1)(type_%d)' % ttype] = test['ord_type1_time_max'] - test['act_time(rank_1)(window_1)(type_%d)' % ttype]
    #     test['ord_first_time-act_time(rank_1)(window_1)(type_%d)' % ttype] = test['ord_first_time'] - test['act_time(rank_1)(window_1)(type_%d)' % ttype]
    #     test['ord_type0_time_min-act_time(rank_1)(window_1)(type_%d)' % ttype] = test['ord_type0_time_min'] - test['act_time(rank_1)(window_1)(type_%d)' % ttype]
    #     test['ord_type1_time_min-act_time(rank_1)(window_1)(type_%d)' % ttype] = test['ord_type1_time_min'] - test['act_time(rank_1)(window_1)(type_%d)' % ttype]

    return train, test


# In[5]:

def drop_duplicate_column(train, test):
    util.log('Drop duplicate column...')

    train = train.drop(['act_type(rank_1)(window6)'], axis=1)  # window9
    test = test.drop(['act_type(rank_1)(window6)'], axis=1)

    return train, test


# In[6]:

def lgb_cv(train_feature, train_label, params, folds, rounds):
    start = time.time()
    print(train_feature.columns)
    dtrain = lgb.Dataset(train_feature, label=train_label)
    num_round = rounds
    print('run cv: round: ' + str(rounds))
    res = lgb.cv(params, dtrain, num_round, nfold=folds, verbose_eval=20, early_stopping_rounds=100)
    elapsed = time.time() - start
    print('Time used:', elapsed, 's')
    return len(res['auc-mean']), res['auc-mean'][len(res['auc-mean']) - 1]


def lgb_predict(train_feature, train_label, test_feature, rounds, params):
    dtrain = lgb.Dataset(train_feature, label=train_label)
    valid_sets = [dtrain]
    num_round = rounds
    model = lgb.train(params, dtrain, num_round, valid_sets, verbose_eval=50)
    predict = model.predict(test_feature)
    return model, predict


def store_result(test_index, pred, name):
    result = pd.DataFrame({'userid': test_index, 'orderType': pred})
    result.to_csv('../data/output/sub/' + name + '.csv', index=False, columns=['userid', 'orderType'])
    return result


# In[7]:

train_feature, train_label, test_feature, test_index = merge_feature(6, 6, 3, 6, 6, 6, 6, 6, 6, 3)
print(train_feature.shape, train_label.shape, test_feature.shape)


# In[8]:

config = {
    'rounds': 10000,
    'folds': 5
}

params_lgb = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'min_sum_hessian_in_leaf': 0.1,
    'learning_rate': 0.01,
    'verbosity': 2,
    'tree_learner': 'feature',
    'num_leaves': 128,
    'feature_fraction': 0.75,
    'bagging_fraction': 0.9,
    'bagging_freq': 1,
    'num_threads': 16,
    'seed': 7
}


# In[10]:

iterations, best_score = lgb_cv(train_feature, train_label, params_lgb, config['folds'], config['rounds'])


# In[11]:

# Average the predictions of four models trained with different seeds
preds = 0
for s in range(7, 11):
    params_lgb['seed'] = s
    model, pred = lgb_predict(train_feature, train_label, test_feature, iterations, params_lgb)
    preds += pred
preds /= 4


# In[12]:

res = store_result(test_index, preds, '20180210-lgb-%f(r%d)' % (best_score, iterations))


# In[13]:

# Print features sorted by gain importance
print("\n".join(("%s: %.2f" % x) for x in sorted(zip(train_feature.columns, model.feature_importance("gain")), key=lambda x: x[1], reverse=True)))


# In[ ]:




# In[ ]:

######################################### blending #########################################


# In[ ]:

test1 = pd.read_csv('../data/output/sub/bjw/result_addUserid_0125_1.csv')
test2 = pd.read_csv('../data/output/sub/20180203-lgb-0.966497(r1843).csv')
test3 = pd.read_csv('../data/output/sub/shawn_lgb_local9641_online9646.csv')
test4 = pd.read_csv('../data/output/sub/ym/lz96490.csv')
testa = pd.merge(test1, test2, on='userid', how='left')
testb = pd.merge(test3, test4, on='userid', how='left')
test = pd.merge(testa, testb, on='userid', how='left')


# In[ ]:

# Weighted blend; the pandas merge suffixes map back to the four files:
# orderType_x_x = test1, orderType_y_x = test2, orderType_x_y = test3, orderType_y_y = test4
test['orderType'] = 0.5 * test['orderType_x_x'] + 0.3 * test['orderType_y_x'] + 0.1 * test['orderType_x_y'] + 0.1 * test['orderType_y_y']


# In[ ]:

test[['userid', 'orderType']].to_csv('../data/output/sub/blend/20180203-0.5bjw+0.3+0.1+0.1ym.csv', index=False)

--------------------------------------------------------------------------------
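A note on the blending cells at the end of model.py: the `orderType_x_x`, `orderType_y_x`, `orderType_x_y`, and `orderType_y_y` columns are the suffixes pandas generates for the chained merges, and they map back to `test1` through `test4` in that order, which is what the 0.5/0.3/0.1/0.1 weights rely on. Below is a minimal, illustrative sketch of the same weighted blend with explicit column names; the `blend` helper is mine, not part of this repo:

```
import pandas as pd
from functools import reduce

def blend(subs):
    # subs: list of (csv_path, weight) pairs; each CSV has userid and orderType columns
    parts = []
    for i, (path, w) in enumerate(subs):
        sub = pd.read_csv(path)[['userid', 'orderType']]
        sub['p%d' % i] = w * sub.pop('orderType')  # pre-weight each submission
        parts.append(sub)
    merged = reduce(lambda a, b: pd.merge(a, b, on='userid', how='left'), parts)
    merged['orderType'] = merged.drop('userid', axis=1).sum(axis=1)
    return merged[['userid', 'orderType']]

result = blend([
    ('../data/output/sub/bjw/result_addUserid_0125_1.csv', 0.5),
    ('../data/output/sub/20180203-lgb-0.966497(r1843).csv', 0.3),
    ('../data/output/sub/shawn_lgb_local9641_online9646.csv', 0.1),
    ('../data/output/sub/ym/lz96490.csv', 0.1),
])
```

Assuming every file covers the same set of userids, this should reproduce the `20180203-0.5bjw+0.3+0.1+0.1ym.csv` blend above.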
/src/feat.py:
--------------------------------------------------------------------------------

# coding: utf-8

# In[ ]:

from __future__ import division
import numpy as np
import pandas as pd
from sklearn import preprocessing
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
import time
import datetime
import sys
import math
import warnings
warnings.filterwarnings('ignore')
import util


# In[ ]:

def get_user_profile_feature(df):
    df = df.copy()

    # Label-encode the categorical profile columns
    mydf = df[['userid']]
    le = preprocessing.LabelEncoder()
    mydf['gender'] = le.fit_transform(df['gender'])

    mydf['province'] = le.fit_transform(df['province'])

    mydf['age'] = le.fit_transform(df['age'])

    return mydf


# In[ ]:

def get_user_comment_feature(df):
    df = df.copy()

    mydf = df[['userid']].drop_duplicates().reset_index(drop=True)

    # Sum and count of each user's comment ratings
    com_rating = df.groupby('userid')['rating'].agg(['sum', 'count']).reset_index()
    com_rating.columns = [i if i == 'userid' else 'com_rating_' + i for i in com_rating.columns]

    mydf = pd.merge(mydf, com_rating, on='userid', how='left')

    return mydf


# In[ ]:

def get_order_history_feature(df):
    df = df.copy()

    mydf = df[['userid']].drop_duplicates().reset_index(drop=True)

    # Counts and rates of type-0 and type-1 orders, plus the total order count
    ord_hist_ord = df.groupby('userid')['orderType'].agg(['sum', 'count']).reset_index()
    ord_hist_ord.columns = ['userid', 'ord_num(type_1)', 'ord_num']
    ord_hist_ord['ord_num(type_0)'] = ord_hist_ord['ord_num'] - ord_hist_ord['ord_num(type_1)']
    ord_hist_ord['ord_rate(type_1)'] = ord_hist_ord['ord_num(type_1)'] / ord_hist_ord['ord_num']
    ord_hist_ord['ord_rate(type_0)'] = ord_hist_ord['ord_num(type_0)'] / ord_hist_ord['ord_num']

    # Counts of city, country, continent
    addr_count = df.groupby('userid')[['city', 'country', 'continent']].count().reset_index()
    addr_count.columns = ['userid', 'city_num', 'country_num', 'continent_num']

    # Counts of city, country, continent for type-1 orders
    addr_count_pos = df[df['orderType'] == 1].groupby('userid')[['city', 'country', 'continent']].count().reset_index()
    addr_count_pos.columns = ['userid', 'city_num(type_1)', 'country_num(type_1)', 'continent_num(type_1)']

    # Number of orders per country
    lb = preprocessing.LabelBinarizer()
    tmp = lb.fit_transform(df['country'])
    tmp_col = ['country_' + str(i) for i in range(tmp.shape[1])]
    tmp = pd.DataFrame(tmp, columns=tmp_col)
    tmp['userid'] = df['userid'].values
    country = tmp.groupby('userid')[tmp_col].agg(['sum']).reset_index()

    # Number of orders per continent
    lb = preprocessing.LabelBinarizer()
    tmp = lb.fit_transform(df['continent'])
    tmp_col = ['continent_' + str(i) for i in range(tmp.shape[1])]
    tmp = pd.DataFrame(tmp, columns=tmp_col)
    tmp['userid'] = df['userid'].values
    continent = tmp.groupby('userid')[tmp_col].agg(['sum']).reset_index()

    # The last order
    last_ord = df.groupby('userid').apply(lambda x: x.sort_values('orderTime', ascending=False).head(1)).reset_index(drop=True)[['userid', 'orderid', 'orderTime', 'orderType']]
    last_ord.columns = ['userid', 'ord_last_id', 'ord_last_time', 'ord_last_type']

    # The first order
    first_ord = df.groupby('userid').apply(lambda x: x.sort_values('orderTime', ascending=True).head(1)).reset_index(drop=True)[['userid', 'orderid', 'orderTime', 'orderType']]
    first_ord.columns = ['userid', 'ord_first_id', 'ord_first_time', 'ord_first_type']

    # Statistics of order times for type 0 and type 1 separately
    for t in [0, 1]:
        ord_time_stat = df[df['orderType'] == t].groupby('userid')['orderTime'].agg([min, max, np.ptp, np.mean, np.median, np.std]).reset_index()
        ord_time_stat.columns = [i if i == 'userid' else 'ord_type%d_time_%s' % (t, i) for i in ord_time_stat.columns]
        mydf = pd.merge(mydf, ord_time_stat, on='userid', how='left')

    mydf = pd.merge(mydf, ord_hist_ord, on='userid', how='left')
    mydf = pd.merge(mydf, addr_count, on='userid', how='left')
    mydf = pd.merge(mydf, addr_count_pos, on='userid', how='left')
    mydf = pd.merge(mydf, country, on='userid', how='left')
    mydf = pd.merge(mydf, continent, on='userid', how='left')
    mydf = pd.merge(mydf, last_ord, on='userid', how='left')
    mydf = pd.merge(mydf, first_ord, on='userid', how='left')

    return mydf


# In[ ]:

def get_order_history_last_w_feature(df):
    df = df.copy()

    mydf = df[['userid']].drop_duplicates().reset_index(drop=True)

    # Statistics over the last w orders
    for w in [2, 3, 4]:
        util.log(w)

        last_order = df.groupby('userid').apply(lambda x: x.sort_values('orderTime', ascending=False).head(w)).reset_index(drop=True)[['userid', 'orderTime', 'orderType']]
        last_order.columns = ['userid', 'ord_last_time', 'ord_last_type']

        ord_last_time_stat = last_order.groupby('userid')['ord_last_time'].agg([min, max, np.ptp, np.mean, np.median, np.std]).reset_index()
        ord_last_time_stat.columns = [i if i == 'userid' else 'ord_last%d_time_%s' % (w, i) for i in ord_last_time_stat.columns]

        ord_last_type_stat = last_order.groupby('userid')['ord_last_type'].agg(['count', sum]).reset_index()
        ord_last_type_stat.columns = [i if i == 'userid' else 'ord_last%d_type_%s' % (w, i) for i in ord_last_type_stat.columns]

        mydf = pd.merge(mydf, ord_last_time_stat, on='userid', how='left')
        mydf = pd.merge(mydf, ord_last_type_stat, on='userid', how='left')

    return mydf


# In[ ]:

def get_action_type_feature(df):
    df = df.copy()

    mydf = df[['userid']].drop_duplicates().reset_index(drop=True)

    # Number of actions and of distinct action types per user
    act_num = df.groupby(['userid', 'actionType']).size().reset_index().groupby('userid')[0].agg([sum, len]).reset_index()
    act_num.columns = ['userid', 'act_num', 'act_type_num']

    # Count of each action type
    act_type_num = df.groupby(['userid', 'actionType']).size().unstack().reset_index()
    act_type_num.columns = [i if i == 'userid' else 'act_num(type_' + str(i) + ')' for i in act_type_num.columns]

    mydf = pd.merge(mydf, act_num, on='userid', how='left')
    mydf = pd.merge(mydf, act_type_num, on='userid', how='left')

    return mydf


# In[ ]:

def get_action_type_based_on_time_feature(df):
    df = df.copy()

    mydf = df[['userid']].drop_duplicates().reset_index(drop=True)

    # Type of the most recent action
    act_last_type = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(1)).reset_index(drop=True)[['userid', 'actionType']]
    act_last_type.columns = ['userid', 'act_last_type']

    # Type of the earliest action
    act_first_type = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=True).head(1)).reset_index(drop=True)[['userid', 'actionType']]
    act_first_type.columns = ['userid', 'act_first_type']

    mydf = pd.merge(mydf, act_last_type, on='userid', how='left')
    mydf = pd.merge(mydf, act_first_type, on='userid', how='left')

    return mydf


# In[ ]:

def get_action_type_based_on_time_last_window_feature(df, window):
    df = df.copy()

    mydf = df[['userid']].drop_duplicates().reset_index(drop=True)

    tmp = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)
    tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method='first', ascending=False).astype(int)

    # Differences of consecutive action types within the window
    act_type = tmp.pivot(index='userid', columns='act_time_rank', values='actionType')
    act_type = act_type[act_type.columns[::-1]]
    act_type_diff = act_type.diff(1, axis=1)
    act_type_diff = act_type_diff.iloc[:, 1:].reset_index()
    act_type_diff.columns = [i if i == 'userid' else 'act_type_diff(' + str(i) + '-' + str(i + 1) + ')(window_' + str(window) + ')' for i in act_type_diff.columns]

    mydf = pd.merge(mydf, act_type_diff, on='userid', how='left')

    return mydf


# In[ ]:

def get_action_type_num_based_on_time_last_window_feature(df, window):
    df = df.copy()

    mydf = df[['userid']].drop_duplicates().reset_index(drop=True)

    tmp = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)
    tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method='first', ascending=False).astype(int)

    # Count of each action type within the window
    act_num_in_window = tmp.groupby(['userid', 'actionType']).size().unstack().reset_index()
    act_num_in_window.columns = [i if i == 'userid' else 'act_num(type_' + str(i) + ')(window_' + str(window) + ')' for i in act_num_in_window.columns]

    mydf = pd.merge(mydf, act_num_in_window, on='userid', how='left')

    return mydf


# In[ ]:

def get_action_type_rate_based_on_time_last_window_feature(df, window):
    df = df.copy()

    mydf = df[['userid']].drop_duplicates().reset_index(drop=True)

    tmp = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)
    tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method='first', ascending=False).astype(int)

    # Column-wise rate of each action type
    act_column_rate_in_window = tmp.groupby(['userid', 'actionType']).size().unstack().apply(lambda x: x / np.sum(x)).reset_index()
    act_column_rate_in_window.columns = [i if i == 'userid' else 'act_column_rate(type_' + str(i) + ')(window_' + str(window) + ')' for i in act_column_rate_in_window.columns]

    # Row-wise rate of each action type
    act_row_rate_in_window = tmp.groupby(['userid', 'actionType']).size().unstack().apply((lambda x: x / np.sum(x)), axis=1).reset_index()
    act_row_rate_in_window.columns = [i if i == 'userid' else 'act_row_rate(type_' + str(i) + ')(window_' + str(window) + ')' for i in act_row_rate_in_window.columns]

    mydf = pd.merge(mydf, act_column_rate_in_window, on='userid', how='left')
    mydf = pd.merge(mydf, act_row_rate_in_window, on='userid', how='left')

    return mydf


# In[ ]:

def get_action_type_row_stat_based_on_time_last_window_feature(df, window):
    df = df.copy()

    mydf = df[['userid']].drop_duplicates().reset_index(drop=True)

    tmp = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)
    tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method='first', ascending=False).astype(int)

    # Most recent type values + row-wise statistics over them
    act_type = tmp.pivot(index='userid', columns='act_time_rank', values='actionType')
    act_type.columns = ['act_type(rank_' + str(i) + ')(window' + str(window) + ')' for i in act_type.columns]
    base_cols = act_type.columns
    for i in ['min', 'max', 'mean', 'median', 'std', 'sum', np.ptp]:
        name = i if isinstance(i, str) else i.__name__
        # Compute each statistic over the rank columns only, so stat columns
        # added earlier in the loop do not contaminate later statistics
        act_type['act_row_type_' + name + '(window_' + str(window) + ')'] = act_type[base_cols].apply(i, axis=1)
    act_type = act_type.reset_index()

    mydf = pd.merge(mydf, act_type, on='userid', how='left')

    return mydf


# In[ ]:

def get_action_num_based_on_time_last_window_feature(df, window):
    df = df.copy()

    mydf = df[['userid']].drop_duplicates().reset_index(drop=True)

    tmp = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)

    # Number of actions within the window
    act_num = tmp.groupby('userid').size().reset_index()
    act_num.columns = ['userid', 'act_num(window_%d)' % window]

    mydf = pd.merge(mydf, act_num, on='userid', how='left')

    return mydf


# In[ ]:

# NOTE: this redefines the function of the same name above; here it counts the
# number of distinct action types in the window rather than per-type counts
def get_action_type_num_based_on_time_last_window_feature(df, window):
    df = df.copy()

    mydf = df[['userid']].drop_duplicates().reset_index(drop=True)

    tmp = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)

    # Number of distinct action types within the window
    act_type_num = tmp.groupby(['userid', 'actionType']).size().reset_index().groupby('userid')[0].agg([len]).reset_index()
    act_type_num.columns = ['userid', 'act_type_num(window_%d)' % window]

    mydf = pd.merge(mydf, act_type_num, on='userid', how='left')

    return mydf


# In[ ]:

def get_action_time_based_on_time_feature(df):
    df = df.copy()

    mydf = df[['userid']].drop_duplicates().reset_index(drop=True)

    # Time of the most recent action
    act_last_time = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(1)).reset_index(drop=True)[['userid', 'actionTime']]
    act_last_time.columns = ['userid', 'act_last_time']

    # Time of the earliest action
    act_first_time = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=True).head(1)).reset_index(drop=True)[['userid', 'actionTime']]
    act_first_time.columns = ['userid', 'act_first_time']

    mydf = pd.merge(mydf, act_last_time, on='userid', how='left')
    mydf = pd.merge(mydf, act_first_time, on='userid', how='left')

    mydf['act_time_last-first'] = mydf['act_last_time'] - mydf['act_first_time']

    return mydf


# In[ ]:

def get_action_time_based_on_time_last_window_feature(df, window):
    df = df.copy()

    mydf = df[['userid']].drop_duplicates().reset_index(drop=True)

    tmp = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)
    tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method='first', ascending=False).astype(int)

    # Differences of consecutive action times within the window
    act_time = tmp.pivot(index='userid', columns='act_time_rank', values='actionTime')
    act_time = act_time[act_time.columns[::-1]]
    act_time_diff = act_time.diff(1, axis=1)
    act_time_diff = act_time_diff.iloc[:, 1:].reset_index()
    act_time_diff.columns = [i if i == 'userid' else 'act_time_diff(' + str(i) + '-' + str(i + 1) + ')(window_' + str(window) + ')' for i in act_time_diff.columns]

    mydf = pd.merge(mydf, act_time_diff, on='userid', how='left')

    return mydf


# In[ ]:

def get_action_time_row_stat_based_on_time_last_window_feature(df, window):
    df = df.copy()

    mydf = df[['userid']].drop_duplicates().reset_index(drop=True)

    tmp = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)
    tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method='first', ascending=False).astype(int)

    # Most recent time values + row-wise statistics over them
    act_time = tmp.pivot(index='userid', columns='act_time_rank', values='actionTime')
    act_time.columns = ['act_time(rank_' + str(i) + ')(window' + str(window) + ')' for i in act_time.columns]
    base_cols = act_time.columns
    for i in ['min', 'max', 'mean', 'median', 'std', 'sum', np.ptp]:
        name = i if isinstance(i, str) else i.__name__
        # As above, compute each statistic over the rank columns only
        act_time['act_row_time_' + name + '(window_' + str(window) + ')'] = act_time[base_cols].apply(i, axis=1)
    act_time = act_time.reset_index()

    mydf = pd.merge(mydf, act_time, on='userid', how='left')

    return mydf


# In[ ]:

def get_action_time_diff2_based_on_time_last_window_feature(df, window):
    df = df.copy()

    mydf = df[['userid']].drop_duplicates().reset_index(drop=True)

    tmp = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)
    tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method='first', ascending=False).astype(int)

    # Differences of action times two steps apart
    act_time = tmp.pivot(index='userid', columns='act_time_rank', values='actionTime')
    act_time = act_time[act_time.columns[::-1]]
    act_time_diff2 = act_time.diff(2, axis=1)  # need test
    act_time_diff2 = act_time_diff2.iloc[:, 2:].reset_index()
    act_time_diff2.columns = [i if i == 'userid' else 'act_time_diff2(' + str(i) + '-' + str(i + 2) + ')(window_' + str(window) + ')' for i in act_time_diff2.columns]

    mydf = pd.merge(mydf, act_time_diff2, on='userid', how='left')

    return mydf


# In[ ]:

def get_action_time_based_on_time_last_window_on_type_feature(df, window, ttype):
    df = df.copy()

    mydf = df[['userid']].drop_duplicates().reset_index(drop=True)

    tmp = df[df['actionType'] == ttype].groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)
    tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method='first', ascending=False).astype(int)

    # Differences of action times for the given type
    act_time = tmp.pivot(index='userid', columns='act_time_rank', values='actionTime')
    act_time = act_time[act_time.columns[::-1]]
    act_time_diff = act_time.diff(1, axis=1)
    act_time_diff = act_time_diff.iloc[:, 1:].reset_index()
    act_time_diff.columns = [i if i == 'userid' else 'act_time_diff(%d-%d)(window_%d)(type_%d)' % (i, i + 1, window, ttype) for i in act_time_diff.columns]

    mydf = pd.merge(mydf, act_time_diff, on='userid', how='left')

    return mydf


# In[ ]:

def get_action_time_2order_based_on_time_last_window_feature(df, window):
    df = df.copy()

    mydf = df[['userid']].drop_duplicates().reset_index(drop=True)

    tmp = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)
    tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method='first', ascending=False).astype(int)

    # Second-order differences of the action times
    act_time = tmp.pivot(index='userid', columns='act_time_rank', values='actionTime')
    act_time = act_time[act_time.columns[::-1]]
    act_time_diff_2order = act_time.diff(1, axis=1).diff(1, axis=1)
    act_time_diff_2order = act_time_diff_2order.iloc[:, 2:].reset_index()
    act_time_diff_2order.columns = [i if i == 'userid' else 'act_time_diff_2order(%d-%d)(window_%d)' % (i, i + 1, window) for i in act_time_diff_2order.columns]

    mydf = pd.merge(mydf, act_time_diff_2order, on='userid', how='left')

    return mydf


# In[ ]:

def get_action_real_time_based_on_time_last_window_on_type_feature(df, window, ttype):
    df = df.copy()

    mydf = df[['userid']].drop_duplicates().reset_index(drop=True)

    tmp = df[df['actionType'] == ttype].groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)
    tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method='first', ascending=False).astype(int)

    # Most recent time values of actions of the given type
    act_time = tmp.pivot(index='userid', columns='act_time_rank', values='actionTime').reset_index()
    act_time.columns = [i if i == 'userid' else 'act_time(rank_%d)(window_%d)(type_%d)' % (i, window, ttype) for i in act_time.columns]

    mydf = pd.merge(mydf, act_time, on='userid', how='left')

    return mydf


# In[ ]:

def get_act_ord_time_diff_feature(act, oord):
    act = act.copy()
    oord = oord.copy()

    mydf = oord[['userid']].drop_duplicates().reset_index(drop=True)

    ord_time = oord.groupby('userid')['orderTime'].max().reset_index()
    act = pd.merge(act, ord_time, on='userid', how='left')  # fillna?
    # Number of actions that happened after the user's last order
    act['act_time-ord_time'] = act['actionTime'] - act['orderTime']
    act_ord_time_diff = act[act['act_time-ord_time'] > 0].groupby('userid').size().reset_index()
    act_ord_time_diff.columns = ['userid', 'act_ord_time_diff_gt0_count']

    mydf = pd.merge(mydf, act_ord_time_diff, on='userid', how='left')

    return mydf


# In[ ]:

def get_order_last_order_ydm_feature(df):
    df = df.copy()

    mydf = df[['userid']].drop_duplicates().reset_index(drop=True)

    # Year/month/day of the most recent order; merged on userid rather than
    # assigned by position, so rows cannot be misaligned
    tmp = df.groupby('userid').apply(lambda x: x.sort_values('orderTime', ascending=False).head(1)).reset_index(drop=True)
    tmp['ord_last_ord_year'] = pd.to_datetime(tmp['orderTime'], unit='s').dt.year
    tmp['ord_last_ord_month'] = pd.to_datetime(tmp['orderTime'], unit='s').dt.month
    tmp['ord_last_ord_day'] = pd.to_datetime(tmp['orderTime'], unit='s').dt.day
    mydf = pd.merge(mydf, tmp[['userid', 'ord_last_ord_year', 'ord_last_ord_month', 'ord_last_ord_day']], on='userid', how='left')

    return mydf


# In[ ]:

def get_order_type1_ydm_feature(df):
    df = df.copy()

    mydf = df[['userid']].drop_duplicates().reset_index(drop=True)

    # Year/month/day of the most recent type-1 order; merged on userid rather
    # than assigned by position, since not every user has a type-1 order
    tmp = df[df['orderType'] == 1].groupby('userid').apply(lambda x: x.sort_values('orderTime', ascending=False).head(1)).reset_index(drop=True)
    tmp['ord_last_type1_year'] = pd.to_datetime(tmp['orderTime'], unit='s').dt.year
    tmp['ord_last_type1_month'] = pd.to_datetime(tmp['orderTime'], unit='s').dt.month
    tmp['ord_last_type1_day'] = pd.to_datetime(tmp['orderTime'], unit='s').dt.day
    mydf = pd.merge(mydf, tmp[['userid', 'ord_last_type1_year', 'ord_last_type1_month', 'ord_last_type1_day']], on='userid', how='left')

    # Year/month/day of the earliest type-1 order
    tmp = df[df['orderType'] == 1].groupby('userid').apply(lambda x: x.sort_values('orderTime', ascending=True).head(1)).reset_index(drop=True)
    tmp['ord_first_type1_year'] = pd.to_datetime(tmp['orderTime'], unit='s').dt.year
    tmp['ord_first_type1_month'] = pd.to_datetime(tmp['orderTime'], unit='s').dt.month
    tmp['ord_first_type1_day'] = pd.to_datetime(tmp['orderTime'], unit='s').dt.day
    mydf = pd.merge(mydf, tmp[['userid', 'ord_first_type1_year', 'ord_first_type1_month', 'ord_first_type1_day']], on='userid', how='left')

    return mydf


# In[ ]:

def get_act_ord_act_time_diff_last_window_feature(act, oord, window):
    act = act.copy()
    oord = oord.copy()

    mydf = oord[['userid']].drop_duplicates().reset_index(drop=True)

    ord_time = oord.groupby('userid')['orderTime'].max().reset_index()
    act = pd.merge(act, ord_time, on='userid', how='left')

    df = act[act['actionTime'] < act['orderTime']]

    tmp = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)
    tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method='first', ascending=False).astype(int)

    # Differences of action times before the last order
    act_time = tmp.pivot(index='userid', columns='act_time_rank', values='actionTime')
    act_time = act_time[act_time.columns[::-1]]
    act_time_diff = act_time.diff(1, axis=1)
    act_time_diff = act_time_diff.iloc[:, 1:].reset_index()
    act_time_diff.columns = [i if i == 'userid' else 'act_ord_act_time_diff(' + str(i) + '-' + str(i + 1) + ')(window_' + str(window) + ')' for i in act_time_diff.columns]

    mydf = pd.merge(mydf, act_time_diff, on='userid', how='left')

    return mydf


# In[ ]:

def get_act_ord_type1_act_time_diff_last_window_feature(act, oord, window):
    act = act.copy()
    oord = oord.copy()

    mydf = oord[['userid']].drop_duplicates().reset_index(drop=True)

    ord_time = oord[oord['orderType'] == 1].groupby('userid')['orderTime'].max().reset_index()
    act = pd.merge(act, ord_time, on='userid', how='left')

    df = act[act['actionTime'] < act['orderTime']]

    tmp = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)
    tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method='first', ascending=False).astype(int)

    # Differences of action times before the last boutique (type-1) order
    act_time = tmp.pivot(index='userid', columns='act_time_rank', values='actionTime')
    act_time = act_time[act_time.columns[::-1]]
    act_time_diff = act_time.diff(1, axis=1)
    act_time_diff = act_time_diff.iloc[:, 1:].reset_index()
    act_time_diff.columns = [i if i == 'userid' else 'act_ord_type1_act_time_diff(' + str(i) + '-' + str(i + 1) + ')(window_' + str(window) + ')' for i in act_time_diff.columns]

    mydf = pd.merge(mydf, act_time_diff, on='userid', how='left')

    return mydf


# In[ ]:

def get_action_sequence_time_diff_feature(df):
    df = df.sort_values(by=['userid', 'actionTime'], ascending=[True, False]).copy().reset_index(drop=True)

    mydf = df[['userid']].drop_duplicates().reset_index(drop=True)

    df['actionTimee'] = pd.to_datetime(df['actionTime'], unit='s')
    df['actionTimeDiff'] = df['actionTime'].diff()

    # Assign a session id per user: since rows are sorted by time descending,
    # gaps within 10 minutes (600 s) show up as diffs in [-600, 0] and keep the
    # row in the same session
    counter = 1
    last_userid = df.iloc[0, 0]
    seq_list = []
    for i, r in df[['userid', 'actionTimeDiff']].iterrows():
        if i % 500000 == 0:
            util.log(i)
        if r.userid != last_userid:
            counter = 1
            seq_list.append(counter)
            last_userid = r.userid
        elif ((-600 <= r.actionTimeDiff <= 0) or math.isnan(r.actionTimeDiff)) and r.userid == last_userid:
            seq_list.append(counter)
        else:
            counter += 1
            seq_list.append(counter)
    df['actionSeq'] = pd.Series(seq_list)

    # Sessions are 10-minute blocks (actions less than 10 minutes apart belong to
    # the same block); compute the time difference between consecutive blocks
    seq_time_max = df.groupby(['userid', 'actionSeq'])['actionTime'].max().unstack()
    seq_time_diff = seq_time_max.diff(1, axis=1)
    for window in [2, 3, 4, 5, 6, 7, 10, 15]:
        tmp = seq_time_diff.iloc[:, 1:(window + 1)]
        tmp.columns = ['act_seq_time_diff(%d-%d)(window_%d)' % (i, i - 1, window) for i in tmp.columns]
        tmp = tmp.reset_index()
        data = pd.merge(mydf, tmp, on='userid', how='left')
        util.log('window=%d' % window)
        data.to_csv('../data/output/feat/%s%d' % ('action_sequence_time_diff_window', window), index=False)


# In[ ]:

def get_action_sequence_time_stat_feature(df):
    df = df.sort_values(by=['userid', 'actionTime'], ascending=[True, False]).copy().reset_index(drop=True)

    mydf = df[['userid']].drop_duplicates().reset_index(drop=True)

    df['actionTimee'] = pd.to_datetime(df['actionTime'], unit='s')
    df['actionTimeDiff'] = df['actionTime'].diff()

    # Same sessionization as in get_action_sequence_time_diff_feature
    counter = 1
    last_userid = df.iloc[0, 0]
    seq_list = []
    for i, r in df[['userid', 'actionTimeDiff']].iterrows():
        if i % 500000 == 0:
            util.log(i)
        if r.userid != last_userid:
            counter = 1
            seq_list.append(counter)
            last_userid = r.userid
        elif ((-600 <= r.actionTimeDiff <= 0) or math.isnan(r.actionTimeDiff)) and r.userid == last_userid:
            seq_list.append(counter)
        else:
            counter += 1
            seq_list.append(counter)
    df['actionSeq'] = pd.Series(seq_list)

    time_stat = df[(df['actionSeq'] == 1) |
(df['actionSeq'] == 2) | (df['actionSeq'] == 3)].groupby(['userid', 'actionSeq'])['actionTime'].agg([min, max, np.mean, np.median, np.ptp, np.std, 'count']).unstack().reset_index() 639 | time_stat.columns = ['userid' if i[0] == 'userid' else 'act_seq_time_stat_%s_last%d' % (i[0], i[1]) for i in time_stat.columns] 640 | 641 | time_stat.to_csv('../data/output/feat/%s' % ('action_sequence_time_stat_last123'), index=False) 642 | 643 | 644 | # In[ ]: 645 | 646 | def get_action_time_diff_234_56789_last_window_feature(df, window): 647 | df = df.copy() 648 | 649 | mydf = df[['userid']].drop_duplicates().reset_index(drop=True) 650 | 651 | # 234 类型的 action 的 time 的差值 652 | tmp = df[df['actionType'].isin([2, 3, 4])].groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True) 653 | tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method = 'first', ascending=False).astype(int) 654 | act_time = tmp.pivot('userid', 'act_time_rank', 'actionTime') 655 | act_time = act_time[act_time.columns[::-1]] 656 | act_time_diff_234 = act_time.diff(1, axis=1) 657 | act_time_diff_234 = act_time_diff_234.iloc[:, 1:].reset_index() 658 | act_time_diff_234.columns = [i if i == 'userid' else 'act_time_diff_234(' + str(i) + '-' + str(i + 1) + ')(window_' + str(window) + ')' for i in act_time_diff_234.columns] 659 | 660 | # 56789 类型的 action 的 time 的差值 661 | tmp = df[df['actionType'].isin([5, 6, 7, 8, 9])].groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True) 662 | tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method = 'first', ascending=False).astype(int) 663 | act_time = tmp.pivot('userid', 'act_time_rank', 'actionTime') 664 | act_time = act_time[act_time.columns[::-1]] 665 | act_time_diff_56789 = act_time.diff(1, axis=1) 666 | act_time_diff_56789 = act_time_diff_56789.iloc[:, 1:].reset_index() 667 | act_time_diff_56789.columns = [i if i == 'userid' else 'act_time_diff_56789(' + str(i) + '-' + str(i + 1) + ')(window_' + str(window) + ')' for i in act_time_diff_56789.columns] 668 | 669 | mydf = pd.merge(mydf, act_time_diff_234, on='userid', how='left') 670 | mydf = pd.merge(mydf, act_time_diff_56789, on='userid', how='left') 671 | 672 | return mydf 673 | 674 | 675 | # In[ ]: 676 | 677 | def get_action_stat_last_every_type_feature(df): 678 | df = df.copy() 679 | 680 | mydf = df[['userid']].drop_duplicates().reset_index(drop=True) 681 | 682 | # 离最近的 123456789 的 action 的时间的统计 683 | for t in range(1, 10): 684 | tmp = df[df['actionType'] == t].groupby('userid')['actionTime'].agg([min, max, np.ptp, np.std, np.mean, np.median, 'count']).reset_index() 685 | tmp.columns = [i if i == 'userid' else 'act_time_%s(type_%d)' % (i, t) for i in tmp.columns] 686 | 687 | mydf = pd.merge(mydf, tmp, on='userid', how='left') 688 | 689 | return mydf 690 | 691 | 692 | # In[ ]: 693 | 694 | def get_act_ord_before_type1_stat_feature(act, oord): 695 | act = act.copy() 696 | oord = oord.copy() 697 | 698 | mydf = oord[['userid']].drop_duplicates().reset_index(drop=True) 699 | 700 | ord_time = oord[oord['orderType'] == 1].groupby('userid')['orderTime'].max().reset_index() 701 | act = pd.merge(act, ord_time, on='userid', how='left') 702 | 703 | df = act[act['actionTime'] < act['orderTime']] 704 | 705 | act_time_stat = df.groupby('userid')['actionTime'].agg([min, max, np.ptp, np.std, np.mean, np.median, 'count']).reset_index() 706 | act_time_stat.columns = [i if i == 'userid' else 
'act_ord_before_type1_act_time_%s' % i for i in act_time_stat.columns] 707 | 708 | act_type_size = mydf.copy() 709 | for t in range(1, 10): 710 | tmp = df[df['actionType'] == t].groupby('userid').size().reset_index() 711 | tmp.columns = ['userid', 'act_ord_before_type1_act_type_size(type_%d)' % t] 712 | act_type_size = pd.merge(act_type_size, tmp, on='userid', how='left') 713 | 714 | mydf = pd.merge(mydf, act_time_stat, on='userid', how='left') 715 | mydf = pd.merge(mydf, act_type_size, on='userid', how='left') 716 | 717 | return mydf 718 | 719 | 720 | # In[ ]: 721 | 722 | def get_action_time_diff_stat_feature(df): 723 | df = df.copy() 724 | 725 | mydf = df[['userid']].drop_duplicates().reset_index(drop=True) 726 | 727 | df = df.sort_values(['userid', 'actionTime']).reset_index(drop=True).copy() 728 | df['actionTimeDiff'] = df['actionTime'].diff(1) 729 | df = df.groupby('userid').apply(lambda x: x.iloc[1:, :]).reset_index(drop=True) 730 | 731 | act_time_diff_stat = df.groupby('userid')['actionTimeDiff'].agg([min, max, np.mean, np.median, np.std, sum]).reset_index() 732 | act_time_diff_stat.columns = [i if i == 'userid' else 'act_time_diff_%s' % i for i in act_time_diff_stat.columns] 733 | 734 | mydf = pd.merge(mydf, act_time_diff_stat, on='userid', how='left') 735 | 736 | return mydf 737 | 738 | 739 | # In[ ]: 740 | 741 | def get_action_time_diff_stat_last_window_feature(df, window): 742 | df = df.copy() 743 | 744 | mydf = df[['userid']].drop_duplicates().reset_index(drop=True) 745 | 746 | df = df.sort_values(['userid', 'actionTime']).reset_index(drop=True).copy() 747 | df['actionTimeDiff'] = df['actionTime'].diff(1) 748 | df = df.groupby('userid').apply(lambda x: x.iloc[1:, :]).reset_index(drop=True) 749 | 750 | tmp = df.groupby('userid').apply(lambda x: x.iloc[:-window, :]).reset_index(drop=True) 751 | act_time_diff_stat = tmp.groupby('userid')['actionTimeDiff'].agg([min, max, np.mean, np.median, np.std, sum]).reset_index() 752 | act_time_diff_stat.columns = [i if i == 'userid' else 'act_time_diff_%s(window_%d)' % (i, window) for i in act_time_diff_stat.columns] 753 | 754 | mydf = pd.merge(mydf, act_time_diff_stat, on='userid', how='left') 755 | 756 | return mydf 757 | 758 | 759 | # In[ ]: 760 | 761 | def get_action_time_last_on_every_type_feature(df): 762 | df = df.copy() 763 | 764 | mydf = df[['userid']].drop_duplicates().reset_index(drop=True) 765 | 766 | df = df.sort_values(['userid', 'actionTime'], ascending=[True, False]).reset_index(drop=True).copy() 767 | for t in range(1, 10): 768 | act_time = df[df['actionType'] == t].groupby('userid').apply(lambda x: x.head(1)).reset_index(drop=True) 769 | act_time = act_time[['userid', 'actionTime']] 770 | act_time.columns = ['userid', 'act_time_last(type_%d)' % t] 771 | 772 | mydf = pd.merge(mydf, act_time, on='userid', how='left') 773 | 774 | return mydf 775 | 776 | 777 | # In[ ]: 778 | 779 | def get_try_feat(df): 780 | df = df.copy() 781 | 782 | mydf = df[['userid']].drop_duplicates().reset_index(drop=True) 783 | 784 | df = df.sort_values(['userid', 'actionTime'], ascending=[True, False]).reset_index(drop=True).copy() 785 | 786 | last_5 = df[df.actionType == 5].drop_duplicates(subset=['userid']) 787 | last_6 = df[df.actionType == 6].drop_duplicates(subset=['userid']) 788 | time_gap_last56 = pd.merge(last_5, last_6, on='userid', how='outer') 789 | time_gap_last56['time_gap_last56'] = time_gap_last56.actionTime_y - time_gap_last56.actionTime_x 790 | mydf = pd.merge(mydf, time_gap_last56[['userid', 'time_gap_last56']], on='userid', 
how='left') 791 | 792 | tmp = df[df['actionType'] == 5].groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(2)).reset_index(drop=True) 793 | tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method = 'first', ascending=False).astype(int) 794 | act_time = tmp.pivot('userid', 'act_time_rank', 'actionTime') 795 | act_time = act_time[act_time.columns[::-1]] 796 | act_time_diff = act_time.diff(1, axis=1) 797 | act_time_diff = act_time_diff.iloc[:, 1:].reset_index() 798 | act_time_diff.columns = [i if i == 'userid' else 'act_time_diff(%d-%d)(window_%d)(type_%d)' % (i, i+1, 2, 5) for i in act_time_diff.columns] 799 | mydf = pd.merge(mydf, act_time_diff, on='userid', how='left') 800 | 801 | last_6 = df[df.actionType == 6].drop_duplicates(subset=['userid']) 802 | last_7 = df[df.actionType == 7].drop_duplicates(subset=['userid']) 803 | time_gap_last67 = pd.merge(last_6, last_7, on='userid', how='outer') 804 | time_gap_last67['time_gap_last67'] = time_gap_last67.actionTime_y - time_gap_last67.actionTime_x 805 | mydf = pd.merge(mydf, time_gap_last67[['userid', 'time_gap_last67']], on='userid', how='left') 806 | 807 | df['actionDate'] = pd.to_datetime(df['actionTime'], unit='s') 808 | df = pd.merge(df, df.drop_duplicates(subset=['userid'])[['userid', 'actionDate']], on='userid', how='left') 809 | df['lastDay'] = df.actionDate_x.dt.day == df.actionDate_y.dt.day 810 | last_day = df[df.lastDay].groupby('userid')['lastDay'].size().reset_index() 811 | last_day_5 = df[df.lastDay & (df.actionType == 5)].groupby('userid')['lastDay'].size().reset_index() 812 | tmp = pd.merge(last_day, last_day_5, on='userid', how='left') 813 | tmp['last_day_rate(type_5)'] = tmp.lastDay_y / tmp.lastDay_x 814 | mydf = pd.merge(mydf, tmp[['userid', 'last_day_rate(type_5)']], on='userid', how='left') 815 | 816 | last_time = df.drop_duplicates(subset=['userid'])[['userid', 'actionTime']] 817 | last_time.columns = ['userid', 'last_time'] 818 | mydf = pd.merge(mydf, last_time, on='userid', how='left') 819 | 820 | last_4 = df[df.actionType == 4].drop_duplicates(subset=['userid']) 821 | last_5 = df[df.actionType == 5].drop_duplicates(subset=['userid']) 822 | time_gap_last45 = pd.merge(last_4, last_5, on='userid', how='outer') 823 | time_gap_last45['time_gap_last45'] = time_gap_last45.actionTime_y - time_gap_last45.actionTime_x 824 | mydf = pd.merge(mydf, time_gap_last45[['userid', 'time_gap_last45']], on='userid', how='left') 825 | 826 | last_1 = df[df.actionType == 1].drop_duplicates(subset=['userid']) 827 | last = df.drop_duplicates(subset=['userid']) 828 | time_gap_last1 = pd.merge(last_1, last, on='userid', how='outer') 829 | time_gap_last1['time_gap_last1'] = time_gap_last1.actionTime_y - time_gap_last1.actionTime_x 830 | mydf = pd.merge(mydf, time_gap_last1[['userid', 'time_gap_last1']], on='userid', how='left') 831 | 832 | last_5 = df[df.actionType == 5].drop_duplicates(subset=['userid']) 833 | last = df.drop_duplicates(subset=['userid']) 834 | time_gap_last5 = pd.merge(last_5, last, on='userid', how='outer') 835 | time_gap_last5['time_gap_last5'] = time_gap_last5.actionTime_y - time_gap_last5.actionTime_x 836 | mydf = pd.merge(mydf, time_gap_last5[['userid', 'time_gap_last5']], on='userid', how='left') 837 | 838 | last_6 = df[df.actionType == 6].drop_duplicates(subset=['userid']) 839 | last = df.drop_duplicates(subset=['userid']) 840 | time_gap_last6 = pd.merge(last_6, last, on='userid', how='outer') 841 | time_gap_last6['time_gap_last6'] = time_gap_last6.actionTime_y - 
time_gap_last6.actionTime_x 842 | mydf = pd.merge(mydf, time_gap_last6[['userid', 'time_gap_last6']], on='userid', how='left') 843 | 844 | tmp = df[df.actionType.isin([5, 6])].copy() 845 | tmp['actionTimeDiff'] = tmp['actionTime'].diff(1) 846 | tmp = tmp.groupby('userid').apply(lambda x: x.iloc[1:, :]).reset_index(drop=True) 847 | act_time_diff_stat = tmp.groupby('userid')['actionTimeDiff'].agg([min, max, np.mean, np.median, np.std, sum]).reset_index() 848 | act_time_diff_stat.columns = [i if i == 'userid' else 'act_time_diff_56_%s' % i for i in act_time_diff_stat.columns] 849 | mydf = pd.merge(mydf, act_time_diff_stat, on='userid', how='left') 850 | 851 | return mydf 852 | 853 | 854 | # In[ ]: 855 | 856 | action_tr = pd.read_csv('../data/input/train/action_train.csv') # 用户行为数据 857 | order_future_tr = pd.read_csv('../data/input/train/orderFuture_train.csv') # 待预测数据 858 | order_history_tr = pd.read_csv('../data/input/train/orderHistory_train.csv') # 用户历史订单数据 859 | user_comment_tr = pd.read_csv('../data/input/train/userComment_train.csv') # 用户评论数据 860 | user_profile_tr = pd.read_csv('../data/input/train/userProfile_train.csv') # 用户个人信息 861 | 862 | action_te = pd.read_csv('../data/input/test/action_test.csv') 863 | order_future_te = pd.read_csv('../data/input/test/orderFuture_test.csv') 864 | order_history_te = pd.read_csv('../data/input/test/orderHistory_test.csv') 865 | user_comment_te = pd.read_csv('../data/input/test/userComment_test.csv') 866 | user_profile_te = pd.read_csv('../data/input/test/userProfile_test.csv') 867 | 868 | action = pd.concat([action_tr, action_te], axis=0).reset_index(drop=True) 869 | order_history = pd.concat([order_history_tr, order_history_te], axis=0).reset_index(drop=True) 870 | user_comment = pd.concat([user_comment_tr, user_comment_te], axis=0).reset_index(drop=True) 871 | user_profile = pd.concat([user_profile_tr, user_profile_te], axis=0).reset_index(drop=True) 872 | 873 | 874 | # In[ ]: 875 | 876 | user_profile_feat = get_user_profile_feature(user_profile) 877 | user_profile_feat.to_csv('../data/output/feat/%s' % 'user_profile', index=False) 878 | 879 | 880 | # In[ ]: 881 | 882 | user_comment_feat = get_user_comment_feature(user_comment) 883 | user_comment_feat.to_csv('../data/output/feat/%s' % 'user_comment', index=False) 884 | 885 | 886 | # In[ ]: 887 | 888 | order_history_feat = get_order_history_feature(order_history) 889 | order_history_feat.to_csv('../data/output/feat/%s' % 'order_history', index=False) 890 | 891 | 892 | # In[ ]: 893 | 894 | order_history_last_w_feat = get_order_history_last_w_feature(order_history) 895 | order_history_last_w_feat.to_csv('../data/output/feat/%s' % 'order_history_last_w', index=False) 896 | 897 | 898 | # In[ ]: 899 | 900 | action_type_feat = get_action_type_feature(action) 901 | action_type_feat.to_csv('../data/output/feat/%s' % 'action_type', index=False) 902 | 903 | 904 | # In[ ]: 905 | 906 | action_type_based_on_time_feat = get_action_type_based_on_time_feature(action) 907 | action_type_based_on_time_feat.to_csv('../data/output/feat/%s' % 'action_type_based_on_time', index=False) 908 | 909 | 910 | # In[ ]: 911 | 912 | for window in [3,4,5,6,7]: 913 | util.log(window) 914 | action_type_based_on_time_last_window_feat = get_action_type_based_on_time_last_window_feature(action, window) 915 | action_type_based_on_time_last_window_feat.to_csv('../data/output/feat/%s%d' % ('action_type_based_on_time_last_window', window), index=False) 916 | 917 | 918 | # In[ ]: 919 | 920 | for window in [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 
12, 13, 14, 15, 20]: 921 | util.log(window) 922 | action_type_num_based_on_time_last_window_feat = get_action_type_num_based_on_time_last_window_feature(action, window) 923 | action_type_num_based_on_time_last_window_feat.to_csv('../data/output/feat/%s%d' % ('action_type_num_based_on_time_last_window', window), index=False) 924 | 925 | 926 | # In[ ]: 927 | 928 | for window in [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 20]: 929 | util.log(window) 930 | action_type_rate_based_on_time_last_window_feat = get_action_type_rate_based_on_time_last_window_feature(action, window) 931 | action_type_rate_based_on_time_last_window_feat.to_csv('../data/output/feat/%s%d' % ('action_type_rate_based_on_time_last_window', window), index=False) 932 | 933 | 934 | # In[ ]: 935 | 936 | for window in [6]: 937 | util.log(window) 938 | action_type_row_stat_based_on_time_last_window_feat = get_action_type_row_stat_based_on_time_last_window_feature(action, window) 939 | action_type_row_stat_based_on_time_last_window_feat.to_csv('../data/output/feat/%s%d' % ('action_type_row_stat_based_on_time_last_window', window), index=False) 940 | 941 | 942 | # In[ ]: 943 | 944 | for window in [4, 7, 13, 17, 20, 25, 30]: 945 | util.log(window) 946 | action_num_based_on_time_last_window_feat = get_action_num_based_on_time_last_window_feature(action, window) 947 | action_num_based_on_time_last_window_feat.to_csv('../data/output/feat/%s%d' % ('action_num_based_on_time_last_window', window), index=False) 948 | 949 | 950 | # In[ ]: 951 | 952 | for window in [4, 7, 13, 17, 20, 25, 30]: 953 | util.log(window) 954 | action_type_num_based_on_time_last_window_feat = get_action_type_num_based_on_time_last_window_feature(action, window) 955 | action_type_num_based_on_time_last_window_feat.to_csv('../data/output/feat/%s%d' % ('action_type_num_based_on_time_last_window', window), index=False) 956 | 957 | 958 | # In[ ]: 959 | 960 | action_time_based_on_time_feat = get_action_time_based_on_time_feature(action) 961 | action_time_based_on_time_feat.to_csv('../data/output/feat/%s' % 'action_time_based_on_time', index=False) 962 | 963 | 964 | # In[ ]: 965 | 966 | for window in [6]: 967 | util.log(window) 968 | action_time_based_on_time_last_window_feat = get_action_time_based_on_time_last_window_feature(action, window) 969 | action_time_based_on_time_last_window_feat.to_csv('../data/output/feat/%s%d' % ('action_time_based_on_time_last_window', window), index=False) 970 | 971 | 972 | # In[ ]: 973 | 974 | for window in [3, 6, 10, 14]: 975 | util.log(window) 976 | action_time_row_stat_based_on_time_last_window_feat = get_action_time_row_stat_based_on_time_last_window_feature(action, window) 977 | action_time_row_stat_based_on_time_last_window_feat.to_csv('../data/output/feat/%s%d' % ('action_time_row_stat_based_on_time_last_window', window), index=False) 978 | 979 | 980 | # In[ ]: 981 | 982 | for window in [3, 4, 5, 6, 7, 8]: 983 | util.log(window) 984 | action_time_diff2_based_on_time_last_window_feat = get_action_time_diff2_based_on_time_last_window_feature(action, window) 985 | action_time_diff2_based_on_time_last_window_feat.to_csv('../data/output/feat/%s%d' % ('action_time_diff2_based_on_time_last_window', window), index=False) 986 | 987 | 988 | # In[ ]: 989 | 990 | for ttype in [1,5,6,7,8,9]: 991 | for window in [6]: 992 | util.log('type=%d window=%d' % (ttype, window)) 993 | action_time_based_on_time_last_window_on_type_feat = get_action_time_based_on_time_last_window_on_type_feature(action, window, ttype) 994 | 
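        # (Added note, not the author's.) The '%s%d%s%d' pattern below writes
        # one feature file per (window, type) pair, e.g.
        # '../data/output/feat/action_time_based_on_time_last_window6_on_type1'.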
action_time_based_on_time_last_window_on_type_feat.to_csv('../data/output/feat/%s%d%s%d' % ('action_time_based_on_time_last_window', window, '_on_type', ttype), index=False) 995 | 996 | 997 | # In[ ]: 998 | 999 | for window in [3, 4, 5, 6, 7, 8, 9, 10]: 1000 | util.log(window) 1001 | action_time_2order_based_on_time_last_window_feat = get_action_time_2order_based_on_time_last_window_feature(action, window) 1002 | action_time_2order_based_on_time_last_window_feat.to_csv('../data/output/feat/%s%d' % ('action_time_2order_based_on_time_last_window', window), index=False) 1003 | 1004 | 1005 | # In[ ]: 1006 | 1007 | for ttype in [1,5,6,7,8,9]: 1008 | for window in [4, 7, 10]: 1009 | util.log('type=%d window=%d' % (ttype, window)) 1010 | action_real_time_based_on_time_last_window_on_type_feat = get_action_real_time_based_on_time_last_window_on_type_feature(action, window, ttype) 1011 | action_real_time_based_on_time_last_window_on_type_feat.to_csv('../data/output/feat/%s%d%s%d' % ('action_real_time_based_on_time_last_window', window, '_on_type', ttype), index=False) 1012 | 1013 | 1014 | # In[ ]: 1015 | 1016 | act_ord_time_diff_feat = get_act_ord_time_diff_feature(action, order_history) 1017 | act_ord_time_diff_feat.to_csv('../data/output/feat/%s' % 'action_order_time_diff', index=False) 1018 | 1019 | 1020 | # In[ ]: 1021 | 1022 | order_last_order_ydm_feat = get_order_last_order_ydm_feature(order_history) 1023 | order_last_order_ydm_feat.to_csv('../data/output/feat/%s' % 'order_last_order_ydm', index=False) 1024 | 1025 | 1026 | # In[ ]: 1027 | 1028 | order_type1_ydm_feat = get_order_type1_ydm_feature(order_history) 1029 | order_type1_ydm_feat.to_csv('../data/output/feat/%s' % 'order_type1_ydm', index=False) 1030 | 1031 | 1032 | # In[ ]: 1033 | 1034 | for window in [7,8,10,11]: 1035 | util.log(window) 1036 | act_ord_act_time_diff_last_window_feat = get_act_ord_act_time_diff_last_window_feature(action, order_history, window) 1037 | act_ord_act_time_diff_last_window_feat.to_csv('../data/output/feat/%s%d' % ('act_ord_act_time_diff_last_window', window), index=False) 1038 | 1039 | 1040 | # In[ ]: 1041 | 1042 | for window in [2,4]: 1043 | util.log(window) 1044 | act_ord_type1_act_time_diff_last_window_feat = get_act_ord_type1_act_time_diff_last_window_feature(action, order_history, window) 1045 | act_ord_type1_act_time_diff_last_window_feat.to_csv('../data/output/feat/%s%d' % ('act_ord_type1_act_time_diff_last_window', window), index=False) 1046 | 1047 | 1048 | # In[ ]: 1049 | 1050 | get_action_sequence_time_diff_feature(action) 1051 | 1052 | 1053 | # In[ ]: 1054 | 1055 | get_action_sequence_time_stat_feature(action) 1056 | 1057 | 1058 | # In[ ]: 1059 | 1060 | for window in [6]: 1061 | util.log(window) 1062 | action_time_diff_234_56789_last_window_feat = get_action_time_diff_234_56789_last_window_feature(action, window) 1063 | action_time_diff_234_56789_last_window_feat.to_csv('../data/output/feat/%s%d' % ('action_time_diff_234_56789_last_window', window), index=False) 1064 | 1065 | 1066 | # In[ ]: 1067 | 1068 | action_stat_last_every_type_feat = get_action_stat_last_every_type_feature(action) 1069 | action_stat_last_every_type_feat.to_csv('../data/output/feat/%s' % 'action_stat_last_every_type', index=False) 1070 | 1071 | 1072 | # In[ ]: 1073 | 1074 | act_ord_before_type1_stat_feat = get_act_ord_before_type1_stat_feature(action, order_history) 1075 | act_ord_before_type1_stat_feat.to_csv('../data/output/feat/%s' % 'act_ord_before_type1_stat', index=False) 1076 | 1077 | 1078 | # In[ ]: 1079 | 1080 | 
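# (Added sanity check, not part of the original pipeline.) The next cell is
# tagged "untest"; on a toy frame the logic can be verified by hand: a single
# user with actionTime [10, 25, 85] has first-order diffs [15, 60], so the
# aggregated row should carry act_time_diff_min == 15, act_time_diff_max == 60
# and act_time_diff_sum == 75. A minimal check, assuming the functions above
# are already defined in this session:
#
# toy = pd.DataFrame({'userid': [1, 1, 1],
#                     'actionType': [1, 5, 6],
#                     'actionTime': [10, 25, 85]})
# check = get_action_time_diff_stat_feature(toy)
# assert check['act_time_diff_min'].iloc[0] == 15
# assert check['act_time_diff_max'].iloc[0] == 60
#
# Note also that get_action_time_diff_stat_last_window_feature (invoked in the
# loop two cells below) slices x.iloc[:-window, :], i.e. it aggregates the
# diffs *excluding* each user's most recent `window` rows; if stats over only
# the last `window` diffs were intended, x.iloc[-window:, :] would be the
# usual slice.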
action_time_diff_stat_feat = get_action_time_diff_stat_feature(action) # untest 1081 | action_time_diff_stat_feat.to_csv('../data/output/feat/%s' % 'action_time_diff_stat', index=False) 1082 | 1083 | 1084 | # In[ ]: 1085 | 1086 | for window in [3, 4, 5, 6, 7, 8, 9]: 1087 | util.log(window) 1088 | action_time_diff_stat_last_window_feat = get_action_time_diff_stat_last_window_feature(action, window) 1089 | action_time_diff_stat_last_window_feat.to_csv('../data/output/feat/%s%d' % ('action_time_diff_stat_last_window', window), index=False) 1090 | 1091 | 1092 | # In[ ]: 1093 | 1094 | action_time_last_on_every_type_feat = get_action_time_last_on_every_type_feature(action) 1095 | action_time_last_on_every_type_feat.to_csv('../data/output/feat/%s' % 'action_time_last_on_every_type', index=False) 1096 | 1097 | 1098 | # In[ ]: 1099 | 1100 | try_feat = get_try_feat(action) 1101 | try_feat.to_csv('../data/output/feat/%s' % 'try', index=False) 1102 | 1103 | -------------------------------------------------------------------------------- /src/feat.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from __future__ import division\n", 12 | "import numpy as np\n", 13 | "import pandas as pd\n", 14 | "from sklearn import preprocessing\n", 15 | "import xgboost as xgb\n", 16 | "import lightgbm as lgb\n", 17 | "import catboost as cb\n", 18 | "import time\n", 19 | "import datetime\n", 20 | "import sys\n", 21 | "import math\n", 22 | "import warnings\n", 23 | "warnings.filterwarnings('ignore')\n", 24 | "import util" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": { 31 | "collapsed": true 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "def get_user_profile_feature(df):\n", 36 | " df = df.copy()\n", 37 | "\n", 38 | " mydf = df[['userid']]\n", 39 | " le = preprocessing.LabelEncoder()\n", 40 | " mydf['gender'] = le.fit_transform(df['gender'])\n", 41 | "\n", 42 | " mydf['province'] = le.fit_transform(df['province'])\n", 43 | "\n", 44 | " mydf['age'] = le.fit_transform(df['age'])\n", 45 | "\n", 46 | " return mydf" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": { 53 | "collapsed": true 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "def get_user_comment_feature(df):\n", 58 | " df = df.copy()\n", 59 | " \n", 60 | " mydf = df[['userid']].drop_duplicates().reset_index(drop=True)\n", 61 | "\n", 62 | " com_rating = df.groupby('userid')['rating'].agg(['sum', 'count']).reset_index()\n", 63 | " com_rating.columns = [i if i == 'userid' else 'com_rating_' + i for i in com_rating.columns]\n", 64 | "\n", 65 | " mydf = pd.merge(mydf, com_rating, on='userid', how='left')\n", 66 | " \n", 67 | " return mydf" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": { 74 | "collapsed": true 75 | }, 76 | "outputs": [], 77 | "source": [ 78 | "def get_order_history_feature(df):\n", 79 | " df = df.copy()\n", 80 | "\n", 81 | " mydf = df[['userid']].drop_duplicates().reset_index(drop=True)\n", 82 | "\n", 83 | " # type 为 0 和 1 的订单的数量和比率 + 总订单数\n", 84 | " ord_hist_ord = df.groupby('userid')['orderType'].agg(['sum', 'count']).reset_index()\n", 85 | " ord_hist_ord.columns = ['userid', 'ord_num(type_1)', 'ord_num']\n", 86 | " ord_hist_ord['ord_num(type_0)'] = ord_hist_ord['ord_num'] - 
ord_hist_ord['ord_num(type_1)']\n", 87 | " ord_hist_ord['ord_rate(type_1)'] = ord_hist_ord['ord_num(type_1)'] / ord_hist_ord['ord_num']\n", 88 | " ord_hist_ord['ord_rate(type_0)'] = ord_hist_ord['ord_num(type_0)'] / ord_hist_ord['ord_num']\n", 89 | "\n", 90 | " # city, country, continent 的数量\n", 91 | " addr_count = df.groupby('userid')['city', 'country', 'continent'].count().reset_index()\n", 92 | " addr_count.columns = ['userid', 'city_num', 'country_num', 'continent_num']\n", 93 | "\n", 94 | " # type 为 1 的 city, country, continent 的数量\n", 95 | " addr_count_pos = df[df['orderType'] == 1].groupby('userid')['city', 'country', 'continent'].count().reset_index()\n", 96 | " addr_count_pos.columns = ['userid', 'city_num(type_1)', 'country_num(type_1)', 'continent_num(type_1)']\n", 97 | "\n", 98 | " # 每个 country 的订单数量\n", 99 | " lb = preprocessing.LabelBinarizer()\n", 100 | " tmp = lb.fit_transform(df['country'])\n", 101 | " tmp_col = ['country_' + str(i) for i in range(tmp.shape[1])]\n", 102 | " tmp = pd.DataFrame(tmp, columns=tmp_col)\n", 103 | " tmp['userid'] = df['userid'].values\n", 104 | " country = tmp.groupby('userid')[tmp_col].agg(['sum']).reset_index()\n", 105 | "\n", 106 | " # 每个 continent 的订单数量\n", 107 | " lb = preprocessing.LabelBinarizer()\n", 108 | " tmp = lb.fit_transform(df['continent'])\n", 109 | " tmp_col = ['continent_' + str(i) for i in range(tmp.shape[1])]\n", 110 | " tmp = pd.DataFrame(tmp, columns=tmp_col)\n", 111 | " tmp['userid'] = df['userid'].values\n", 112 | " continent = tmp.groupby('userid')[tmp_col].agg(['sum']).reset_index()\n", 113 | " \n", 114 | " # 最后一次的 order\n", 115 | " last_ord = df.groupby('userid').apply(lambda x: x.sort_values('orderTime', ascending=False).head(1)).reset_index(drop=True)[['userid', 'orderid', 'orderTime', 'orderType']]\n", 116 | " last_ord.columns = ['userid', 'ord_last_id', 'ord_last_time', 'ord_last_type']\n", 117 | " \n", 118 | " # 第一次的 order\n", 119 | " first_ord = df.groupby('userid').apply(lambda x: x.sort_values('orderTime', ascending=True).head(1)).reset_index(drop=True)[['userid', 'orderid', 'orderTime', 'orderType']]\n", 120 | " first_ord.columns = ['userid', 'ord_first_id', 'ord_first_time', 'ord_first_type']\n", 121 | " \n", 122 | " # type 分别为 0/1 的订单的时间的统计\n", 123 | " for t in [0, 1]:\n", 124 | " ord_time_stat = df[df['orderType'] == t].groupby('userid')['orderTime'].agg([min, max, np.ptp, np.mean, np.median, np.std]).reset_index()\n", 125 | " ord_time_stat.columns = [i if i == 'userid' else 'ord_type%d_time_%s' % (t, i) for i in ord_time_stat.columns]\n", 126 | " mydf = pd.merge(mydf, ord_time_stat, on='userid', how='left')\n", 127 | " \n", 128 | " mydf = pd.merge(mydf, ord_hist_ord, on='userid', how='left')\n", 129 | " mydf = pd.merge(mydf, addr_count, on='userid', how='left')\n", 130 | " mydf = pd.merge(mydf, addr_count_pos, on='userid', how='left')\n", 131 | " mydf = pd.merge(mydf, country, on='userid', how='left')\n", 132 | " mydf = pd.merge(mydf, continent, on='userid', how='left')\n", 133 | " mydf = pd.merge(mydf, last_ord, on='userid', how='left')\n", 134 | " mydf = pd.merge(mydf, first_ord, on='userid', how='left')\n", 135 | " \n", 136 | " return mydf" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": { 143 | "collapsed": true 144 | }, 145 | "outputs": [], 146 | "source": [ 147 | "def get_order_history_last_w_feature(df):\n", 148 | " df = df.copy()\n", 149 | "\n", 150 | " mydf = df[['userid']].drop_duplicates().reset_index(drop=True)\n", 151 | "\n", 152 | " # 最后 w 
次订单的统计\n", 153 | " for w in [2, 3, 4]:\n", 154 | " util.log(w)\n", 155 | " \n", 156 | " last_order = df.groupby('userid').apply(lambda x: x.sort_values('orderTime', ascending=False).head(w)).reset_index(drop=True)[['userid', 'orderTime', 'orderType']]\n", 157 | " last_order.columns = ['userid', 'ord_last_time', 'ord_last_type']\n", 158 | " \n", 159 | " ord_last_time_stat = last_order.groupby('userid')['ord_last_time'].agg([min, max, np.ptp, np.mean, np.median, np.std]).reset_index()\n", 160 | " ord_last_time_stat.columns = [i if i == 'userid' else 'ord_last%d_time_%s' % (w, i) for i in ord_last_time_stat.columns]\n", 161 | " \n", 162 | " ord_last_type_stat = last_order.groupby('userid')['ord_last_type'].agg(['count', sum]).reset_index()\n", 163 | " ord_last_type_stat.columns = [i if i == 'userid' else 'ord_last%d_type_%s' % (w, i) for i in ord_last_type_stat.columns]\n", 164 | " \n", 165 | " mydf = pd.merge(mydf, ord_last_time_stat, on='userid', how='left')\n", 166 | " mydf = pd.merge(mydf, ord_last_type_stat, on='userid', how='left')\n", 167 | "\n", 168 | " return mydf" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": { 175 | "collapsed": true 176 | }, 177 | "outputs": [], 178 | "source": [ 179 | "def get_action_type_feature(df):\n", 180 | " df = df.copy()\n", 181 | "\n", 182 | " mydf = df[['userid']].drop_duplicates().reset_index(drop=True)\n", 183 | "\n", 184 | " # 每个用户的 action 和 actionType 的数量\n", 185 | " act_num = df.groupby(['userid', 'actionType']).size().reset_index().groupby('userid')[0].agg([sum, len]).reset_index()\n", 186 | " act_num.columns = ['userid', 'act_num', 'act_type_num']\n", 187 | "\n", 188 | " # 每个类别的数量\n", 189 | " act_type_num = df.groupby(['userid', 'actionType']).size().unstack().reset_index()\n", 190 | " act_type_num.columns = [i if i == 'userid' else 'act_num(type_' + str(i) + ')' for i in act_type_num.columns]\n", 191 | "\n", 192 | " mydf = pd.merge(mydf, act_num, on='userid', how='left')\n", 193 | " mydf = pd.merge(mydf, act_type_num, on='userid', how='left')\n", 194 | "\n", 195 | " return mydf" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": { 202 | "collapsed": true 203 | }, 204 | "outputs": [], 205 | "source": [ 206 | "def get_action_type_based_on_time_feature(df):\n", 207 | " df = df.copy()\n", 208 | "\n", 209 | " mydf = df[['userid']].drop_duplicates().reset_index(drop=True)\n", 210 | "\n", 211 | " # 最近的一次 action 的 type\n", 212 | " act_last_type = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(1)).reset_index(drop=True)[['userid', 'actionType']]\n", 213 | " act_last_type.columns = ['userid', 'act_last_type']\n", 214 | " \n", 215 | " # 最早的一次 action 的 type\n", 216 | " act_first_type = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=True).head(1)).reset_index(drop=True)[['userid', 'actionType']]\n", 217 | " act_first_type.columns = ['userid', 'act_first_type']\n", 218 | "\n", 219 | " mydf = pd.merge(mydf, act_last_type, on='userid', how='left')\n", 220 | " mydf = pd.merge(mydf, act_first_type, on='userid', how='left')\n", 221 | "\n", 222 | " return mydf" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "metadata": { 229 | "collapsed": true 230 | }, 231 | "outputs": [], 232 | "source": [ 233 | "def get_action_type_based_on_time_last_window_feature(df, window):\n", 234 | " df = df.copy()\n", 235 | "\n", 236 | " mydf = 
df[['userid']].drop_duplicates().reset_index(drop=True)\n", 237 | "\n", 238 | " tmp = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)\n", 239 | " tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method = 'first', ascending=False).astype(int)\n", 240 | "\n", 241 | " # type 的差值\n", 242 | " act_type = tmp.pivot('userid', 'act_time_rank', 'actionType')\n", 243 | " act_type = act_type[act_type.columns[::-1]]\n", 244 | " act_type_diff = act_type.diff(1, axis=1)\n", 245 | " act_type_diff = act_type_diff.iloc[:, 1:].reset_index()\n", 246 | " act_type_diff.columns = [i if i == 'userid' else 'act_type_diff(' + str(i) + '-' + str(i + 1) + ')(window_' + str(window) + ')' for i in act_type_diff.columns]\n", 247 | "\n", 248 | " mydf = pd.merge(mydf, act_type_diff, on='userid', how='left')\n", 249 | "\n", 250 | " return mydf" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": null, 256 | "metadata": { 257 | "collapsed": true 258 | }, 259 | "outputs": [], 260 | "source": [ 261 | "def get_action_type_num_based_on_time_last_window_feature(df, window):\n", 262 | " df = df.copy()\n", 263 | "\n", 264 | " mydf = df[['userid']].drop_duplicates().reset_index(drop=True)\n", 265 | "\n", 266 | " tmp = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)\n", 267 | " tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method = 'first', ascending=False).astype(int)\n", 268 | "\n", 269 | " # 每个类别的数量\n", 270 | " act_num_in_window = tmp.groupby(['userid', 'actionType']).size().unstack().reset_index()\n", 271 | " act_num_in_window.columns = [i if i == 'userid' else 'act_num(type_' + str(i) + ')(window_' + str(window) + ')' for i in act_num_in_window.columns]\n", 272 | " \n", 273 | " mydf = pd.merge(mydf, act_num_in_window, on='userid', how='left')\n", 274 | "\n", 275 | " return mydf" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "metadata": { 282 | "collapsed": true 283 | }, 284 | "outputs": [], 285 | "source": [ 286 | "def get_action_type_rate_based_on_time_last_window_feature(df, window):\n", 287 | " df = df.copy()\n", 288 | "\n", 289 | " mydf = df[['userid']].drop_duplicates().reset_index(drop=True)\n", 290 | "\n", 291 | " tmp = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)\n", 292 | " tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method = 'first', ascending=False).astype(int)\n", 293 | "\n", 294 | " # 每个类别的列级别的比率\n", 295 | " act_column_rate_in_window = tmp.groupby(['userid', 'actionType']).size().unstack().apply(lambda x: x / np.sum(x)).reset_index()\n", 296 | " act_column_rate_in_window.columns = [i if i == 'userid' else 'act_column_rate(type_' + str(i) + ')(window_' + str(window) + ')' for i in act_column_rate_in_window.columns]\n", 297 | "\n", 298 | " # 每个类别的行级别的比率\n", 299 | " act_row_rate_in_window = tmp.groupby(['userid', 'actionType']).size().unstack().apply((lambda x: x / np.sum(x)), axis=1).reset_index()\n", 300 | " act_row_rate_in_window.columns = [i if i == 'userid' else 'act_row_rate(type_' + str(i) + ')(window_' + str(window) + ')' for i in act_row_rate_in_window.columns]\n", 301 | " \n", 302 | " mydf = pd.merge(mydf, act_column_rate_in_window, on='userid', how='left')\n", 303 | " mydf = pd.merge(mydf, act_row_rate_in_window, on='userid', how='left')\n", 304 | "\n", 305 | " return mydf" 
306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "metadata": { 312 | "collapsed": true 313 | }, 314 | "outputs": [], 315 | "source": [ 316 | "def get_action_type_row_stat_based_on_time_last_window_feature(df, window):\n", 317 | " df = df.copy()\n", 318 | "\n", 319 | " mydf = df[['userid']].drop_duplicates().reset_index(drop=True)\n", 320 | "\n", 321 | " tmp = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)\n", 322 | " tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method = 'first', ascending=False).astype(int)\n", 323 | "\n", 324 | " # 最近的 type 值 + 行级别的统计值\n", 325 | " act_type = tmp.pivot('userid', 'act_time_rank', 'actionType')\n", 326 | " act_type.columns = ['act_type(rank_' + str(i) + ')(window' + str(window) + ')' for i in act_type.columns]\n", 327 | " for i in ['min', 'max', 'mean', 'median', 'std', 'sum', np.ptp]:\n", 328 | " act_type['act_row_type_' + i + '(window_' + str(window) + ')' if type(i) == str else 'act_row_type_' + i.func_name + '(window_' + str(window) + ')'] = act_type.apply(i, axis=1)\n", 329 | " act_type = act_type.reset_index()\n", 330 | " \n", 331 | " mydf = pd.merge(mydf, act_type, on='userid', how='left')\n", 332 | "\n", 333 | " return mydf" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "metadata": { 340 | "collapsed": true 341 | }, 342 | "outputs": [], 343 | "source": [ 344 | "def get_action_num_based_on_time_last_window_feature(df, window):\n", 345 | " df = df.copy()\n", 346 | "\n", 347 | " mydf = df[['userid']].drop_duplicates().reset_index(drop=True)\n", 348 | "\n", 349 | " tmp = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)\n", 350 | "\n", 351 | " # action 的数量\n", 352 | " act_num = tmp.groupby('userid').size().reset_index()\n", 353 | " act_num.columns = ['userid', 'act_num(window_%d)' % window]\n", 354 | " \n", 355 | " mydf = pd.merge(mydf, act_num, on='userid', how='left')\n", 356 | "\n", 357 | " return mydf" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": null, 363 | "metadata": { 364 | "collapsed": true 365 | }, 366 | "outputs": [], 367 | "source": [ 368 | "def get_action_type_num_based_on_time_last_window_feature(df, window):\n", 369 | " df = df.copy()\n", 370 | "\n", 371 | " mydf = df[['userid']].drop_duplicates().reset_index(drop=True)\n", 372 | "\n", 373 | " tmp = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)\n", 374 | "\n", 375 | " # type 的数量\n", 376 | " act_type_num = tmp.groupby(['userid', 'actionType']).size().reset_index().groupby('userid')[0].agg([len]).reset_index()\n", 377 | " act_type_num.columns = ['userid', 'act_type_num(window_%d)' % window]\n", 378 | " \n", 379 | " mydf = pd.merge(mydf, act_type_num, on='userid', how='left')\n", 380 | "\n", 381 | " return mydf" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": null, 387 | "metadata": { 388 | "collapsed": true 389 | }, 390 | "outputs": [], 391 | "source": [ 392 | "def get_action_time_based_on_time_feature(df):\n", 393 | " df = df.copy()\n", 394 | "\n", 395 | " mydf = df[['userid']].drop_duplicates().reset_index(drop=True)\n", 396 | "\n", 397 | " # 最近的一次 action 的 time\n", 398 | " act_last_time = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(1)).reset_index(drop=True)[['userid', 
'actionTime']]\n", 399 | " act_last_time.columns = ['userid', 'act_last_time']\n", 400 | " \n", 401 | " # 最早的一次 action 的 time\n", 402 | " act_first_time = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=True).head(1)).reset_index(drop=True)[['userid', 'actionTime']]\n", 403 | " act_first_time.columns = ['userid', 'act_first_time']\n", 404 | " \n", 405 | " mydf = pd.merge(mydf, act_last_time, on='userid', how='left')\n", 406 | " mydf = pd.merge(mydf, act_first_time, on='userid', how='left')\n", 407 | " \n", 408 | " mydf['act_time_last-first'] = mydf['act_last_time'] - mydf['act_first_time']\n", 409 | "\n", 410 | " return mydf" 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": null, 416 | "metadata": { 417 | "collapsed": true 418 | }, 419 | "outputs": [], 420 | "source": [ 421 | "def get_action_time_based_on_time_last_window_feature(df, window):\n", 422 | " df = df.copy()\n", 423 | "\n", 424 | " mydf = df[['userid']].drop_duplicates().reset_index(drop=True)\n", 425 | " \n", 426 | " tmp = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)\n", 427 | " tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method = 'first', ascending=False).astype(int)\n", 428 | " \n", 429 | " # time 的差值\n", 430 | " act_time = tmp.pivot('userid', 'act_time_rank', 'actionTime')\n", 431 | " act_time = act_time[act_time.columns[::-1]]\n", 432 | " act_time_diff = act_time.diff(1, axis=1)\n", 433 | " act_time_diff = act_time_diff.iloc[:, 1:].reset_index()\n", 434 | " act_time_diff.columns = [i if i == 'userid' else 'act_time_diff(' + str(i) + '-' + str(i + 1) + ')(window_' + str(window) + ')' for i in act_time_diff.columns]\n", 435 | "\n", 436 | " mydf = pd.merge(mydf, act_time_diff, on='userid', how='left')\n", 437 | " \n", 438 | " return mydf" 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "execution_count": null, 444 | "metadata": { 445 | "collapsed": true 446 | }, 447 | "outputs": [], 448 | "source": [ 449 | "def get_action_time_row_stat_based_on_time_last_window_feature(df, window):\n", 450 | " df = df.copy()\n", 451 | "\n", 452 | " mydf = df[['userid']].drop_duplicates().reset_index(drop=True)\n", 453 | " \n", 454 | " tmp = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)\n", 455 | " tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method = 'first', ascending=False).astype(int)\n", 456 | " \n", 457 | " # 最近的 time 值 + 行级别的统计值\n", 458 | " act_time = tmp.pivot('userid', 'act_time_rank', 'actionTime')\n", 459 | " act_time.columns = ['act_time(rank_' + str(i) + ')(window' + str(window) + ')' for i in act_time.columns]\n", 460 | " for i in ['min', 'max', 'mean', 'median', 'std', 'sum', np.ptp]:\n", 461 | " act_time['act_row_time_' + i + '(window_' + str(window) + ')' if type(i) == str else 'act_row_time_' + i.func_name + '(window_' + str(window) + ')'] = act_time.apply(i, axis=1)\n", 462 | " act_time = act_time.reset_index()\n", 463 | "\n", 464 | " mydf = pd.merge(mydf, act_time, on='userid', how='left')\n", 465 | " \n", 466 | " return mydf" 467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": null, 472 | "metadata": { 473 | "collapsed": true 474 | }, 475 | "outputs": [], 476 | "source": [ 477 | "def get_action_time_diff2_based_on_time_last_window_feature(df, window):\n", 478 | " df = df.copy()\n", 479 | "\n", 480 | " mydf = 
df[['userid']].drop_duplicates().reset_index(drop=True)\n", 481 | " \n", 482 | " tmp = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)\n", 483 | " tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method = 'first', ascending=False).astype(int)\n", 484 | " \n", 485 | " # time 的差值\n", 486 | " act_time = tmp.pivot('userid', 'act_time_rank', 'actionTime')\n", 487 | " act_time = act_time[act_time.columns[::-1]]\n", 488 | " act_time_diff2 = act_time.diff(2, axis=1) # need test\n", 489 | " act_time_diff2 = act_time_diff2.iloc[:, 2:].reset_index()\n", 490 | " act_time_diff2.columns = [i if i == 'userid' else 'act_time_diff2(' + str(i) + '-' + str(i + 2) + ')(window_' + str(window) + ')' for i in act_time_diff2.columns]\n", 491 | "\n", 492 | " mydf = pd.merge(mydf, act_time_diff2, on='userid', how='left')\n", 493 | " \n", 494 | " return mydf" 495 | ] 496 | }, 497 | { 498 | "cell_type": "code", 499 | "execution_count": null, 500 | "metadata": { 501 | "collapsed": true 502 | }, 503 | "outputs": [], 504 | "source": [ 505 | "def get_action_time_based_on_time_last_window_on_type_feature(df, window, ttype):\n", 506 | " df = df.copy()\n", 507 | "\n", 508 | " mydf = df[['userid']].drop_duplicates().reset_index(drop=True)\n", 509 | " \n", 510 | " tmp = df[df['actionType'] == ttype].groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)\n", 511 | " tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method = 'first', ascending=False).astype(int)\n", 512 | " \n", 513 | " # 特定 type 的 action 的 time 的差值\n", 514 | " act_time = tmp.pivot('userid', 'act_time_rank', 'actionTime')\n", 515 | " act_time = act_time[act_time.columns[::-1]]\n", 516 | " act_time_diff = act_time.diff(1, axis=1)\n", 517 | " act_time_diff = act_time_diff.iloc[:, 1:].reset_index()\n", 518 | " act_time_diff.columns = [i if i == 'userid' else 'act_time_diff(%d-%d)(window_%d)(type_%d)' % (i, i+1, window, ttype) for i in act_time_diff.columns]\n", 519 | "\n", 520 | " mydf = pd.merge(mydf, act_time_diff, on='userid', how='left')\n", 521 | " \n", 522 | " return mydf" 523 | ] 524 | }, 525 | { 526 | "cell_type": "code", 527 | "execution_count": null, 528 | "metadata": { 529 | "collapsed": true 530 | }, 531 | "outputs": [], 532 | "source": [ 533 | "def get_action_time_2order_based_on_time_last_window_feature(df, window):\n", 534 | " df = df.copy()\n", 535 | "\n", 536 | " mydf = df[['userid']].drop_duplicates().reset_index(drop=True)\n", 537 | " \n", 538 | " tmp = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)\n", 539 | " tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method = 'first', ascending=False).astype(int)\n", 540 | " \n", 541 | " # 特定 type 的 action 的 time 的差值\n", 542 | " act_time = tmp.pivot('userid', 'act_time_rank', 'actionTime')\n", 543 | " act_time = act_time[act_time.columns[::-1]]\n", 544 | " act_time_diff_2order = act_time.diff(1, axis=1).diff(1, axis=1)\n", 545 | " act_time_diff_2order = act_time_diff_2order.iloc[:, 2:].reset_index()\n", 546 | " act_time_diff_2order.columns = [i if i == 'userid' else 'act_time_diff_2order(%d-%d)(window_%d)' % (i, i+1, window) for i in act_time_diff_2order.columns]\n", 547 | "\n", 548 | " mydf = pd.merge(mydf, act_time_diff_2order, on='userid', how='left')\n", 549 | " \n", 550 | " return mydf" 551 | ] 552 | }, 553 | { 554 | "cell_type": "code", 555 | 
"execution_count": null, 556 | "metadata": { 557 | "collapsed": true 558 | }, 559 | "outputs": [], 560 | "source": [ 561 | "def get_action_real_time_based_on_time_last_window_on_type_feature(df, window, ttype):\n", 562 | " df = df.copy()\n", 563 | "\n", 564 | " mydf = df[['userid']].drop_duplicates().reset_index(drop=True)\n", 565 | " \n", 566 | " tmp = df[df['actionType'] == ttype].groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)\n", 567 | " tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method = 'first', ascending=False).astype(int)\n", 568 | " \n", 569 | " # 特定的 type 的 action 的最近的 time 值\n", 570 | " act_time = tmp.pivot('userid', 'act_time_rank', 'actionTime').reset_index()\n", 571 | " act_time.columns = [i if i == 'userid' else 'act_time(rank_%d)(window_%d)(type_%d)' % (i, window, ttype) for i in act_time.columns]\n", 572 | "\n", 573 | " mydf = pd.merge(mydf, act_time, on='userid', how='left')\n", 574 | " \n", 575 | " return mydf" 576 | ] 577 | }, 578 | { 579 | "cell_type": "code", 580 | "execution_count": null, 581 | "metadata": { 582 | "collapsed": true 583 | }, 584 | "outputs": [], 585 | "source": [ 586 | "def get_act_ord_time_diff_feature(act, oord):\n", 587 | " act = act.copy()\n", 588 | " oord = oord.copy()\n", 589 | "\n", 590 | " mydf = oord[['userid']].drop_duplicates().reset_index(drop=True)\n", 591 | "\n", 592 | " ord_time = oord.groupby('userid')['orderTime'].max().reset_index()\n", 593 | " act = pd.merge(act, ord_time, on='userid', how='left') # fillna?\n", 594 | " act['act_time-ord_time'] = act['actionTime'] - act['orderTime']\n", 595 | " act_ord_time_diff = act[act['act_time-ord_time'] > 0].groupby('userid').size().reset_index()\n", 596 | " act_ord_time_diff.columns = ['userid', 'act_ord_time_diff_gt0_count']\n", 597 | "\n", 598 | " mydf = pd.merge(mydf, act_ord_time_diff, on='userid', how='left')\n", 599 | " \n", 600 | " return mydf" 601 | ] 602 | }, 603 | { 604 | "cell_type": "code", 605 | "execution_count": null, 606 | "metadata": { 607 | "collapsed": true 608 | }, 609 | "outputs": [], 610 | "source": [ 611 | "def get_order_last_order_ydm_feature(df):\n", 612 | " df = df.copy()\n", 613 | "\n", 614 | " mydf = df[['userid']].drop_duplicates().reset_index(drop=True)\n", 615 | "\n", 616 | " tmp = df.groupby('userid').apply(lambda x: x.sort_values('orderTime', ascending=False).head(1)).reset_index(drop=True)\n", 617 | "\n", 618 | " mydf['ord_last_ord_year'] = pd.to_datetime(tmp['orderTime'], unit='s').dt.year\n", 619 | " mydf['ord_last_ord_month'] = pd.to_datetime(tmp['orderTime'], unit='s').dt.month\n", 620 | " mydf['ord_last_ord_day'] = pd.to_datetime(tmp['orderTime'], unit='s').dt.day\n", 621 | "\n", 622 | " return mydf" 623 | ] 624 | }, 625 | { 626 | "cell_type": "code", 627 | "execution_count": null, 628 | "metadata": { 629 | "collapsed": true 630 | }, 631 | "outputs": [], 632 | "source": [ 633 | "def get_order_type1_ydm_feature(df):\n", 634 | " df = df.copy()\n", 635 | "\n", 636 | " mydf = df[['userid']].drop_duplicates().reset_index(drop=True)\n", 637 | "\n", 638 | " # 最近一次的 type 为 1 的订单的年月日\n", 639 | " tmp = df[df['orderType'] == 1].groupby('userid').apply(lambda x: x.sort_values('orderTime', ascending=False).head(1)).reset_index(drop=True)\n", 640 | " mydf['ord_last_type1_year'] = pd.to_datetime(tmp['orderTime'], unit='s').dt.year\n", 641 | " mydf['ord_last_type1_month'] = pd.to_datetime(tmp['orderTime'], unit='s').dt.month\n", 642 | " mydf['ord_last_type1_day'] = 
pd.to_datetime(tmp['orderTime'], unit='s').dt.day\n", 643 | " \n", 644 | " # 最早一次的 type 为 1 的订单的年月日\n", 645 | " tmp = df[df['orderType'] == 1].groupby('userid').apply(lambda x: x.sort_values('orderTime', ascending=True).head(1)).reset_index(drop=True)\n", 646 | " mydf['ord_first_type1_year'] = pd.to_datetime(tmp['orderTime'], unit='s').dt.year\n", 647 | " mydf['ord_first_type1_month'] = pd.to_datetime(tmp['orderTime'], unit='s').dt.month\n", 648 | " mydf['ord_first_type1_day'] = pd.to_datetime(tmp['orderTime'], unit='s').dt.day\n", 649 | "\n", 650 | " return mydf" 651 | ] 652 | }, 653 | { 654 | "cell_type": "code", 655 | "execution_count": null, 656 | "metadata": { 657 | "collapsed": true 658 | }, 659 | "outputs": [], 660 | "source": [ 661 | "def get_act_ord_act_time_diff_last_window_feature(act, oord, window):\n", 662 | " act = act.copy()\n", 663 | " oord = oord.copy()\n", 664 | "\n", 665 | " mydf = oord[['userid']].drop_duplicates().reset_index(drop=True)\n", 666 | "\n", 667 | " ord_time = oord.groupby('userid')['orderTime'].max().reset_index()\n", 668 | " act = pd.merge(act, ord_time, on='userid', how='left')\n", 669 | "\n", 670 | " df = act[act['actionTime'] < act['orderTime']]\n", 671 | "\n", 672 | " tmp = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)\n", 673 | " tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method = 'first', ascending=False).astype(int)\n", 674 | "\n", 675 | " # 最后一次订单之前的 action 的 time 的差值\n", 676 | " act_time = tmp.pivot('userid', 'act_time_rank', 'actionTime')\n", 677 | " act_time = act_time[act_time.columns[::-1]]\n", 678 | " act_time_diff = act_time.diff(1, axis=1)\n", 679 | " act_time_diff = act_time_diff.iloc[:, 1:].reset_index()\n", 680 | " act_time_diff.columns = [i if i == 'userid' else 'act_ord_act_time_diff(' + str(i) + '-' + str(i + 1) + ')(window_' + str(window) + ')' for i in act_time_diff.columns]\n", 681 | "\n", 682 | " mydf = pd.merge(mydf, act_time_diff, on='userid', how='left')\n", 683 | " \n", 684 | " return mydf" 685 | ] 686 | }, 687 | { 688 | "cell_type": "code", 689 | "execution_count": null, 690 | "metadata": { 691 | "collapsed": true 692 | }, 693 | "outputs": [], 694 | "source": [ 695 | "def get_act_ord_type1_act_time_diff_last_window_feature(act, oord, window):\n", 696 | " act = act.copy()\n", 697 | " oord = oord.copy()\n", 698 | "\n", 699 | " mydf = oord[['userid']].drop_duplicates().reset_index(drop=True)\n", 700 | "\n", 701 | " ord_time = oord[oord['orderType'] == 1].groupby('userid')['orderTime'].max().reset_index()\n", 702 | " act = pd.merge(act, ord_time, on='userid', how='left')\n", 703 | "\n", 704 | " df = act[act['actionTime'] < act['orderTime']]\n", 705 | "\n", 706 | " tmp = df.groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)\n", 707 | " tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method = 'first', ascending=False).astype(int)\n", 708 | "\n", 709 | " # 最后一次精品订单之前的 action 的 time 的差值\n", 710 | " act_time = tmp.pivot('userid', 'act_time_rank', 'actionTime')\n", 711 | " act_time = act_time[act_time.columns[::-1]]\n", 712 | " act_time_diff = act_time.diff(1, axis=1)\n", 713 | " act_time_diff = act_time_diff.iloc[:, 1:].reset_index()\n", 714 | " act_time_diff.columns = [i if i == 'userid' else 'act_ord_type1_act_time_diff(' + str(i) + '-' + str(i + 1) + ')(window_' + str(window) + ')' for i in act_time_diff.columns]\n", 715 | "\n", 716 | " mydf = pd.merge(mydf, 
act_time_diff, on='userid', how='left')\n", 717 | " \n", 718 | " return mydf" 719 | ] 720 | }, 721 | { 722 | "cell_type": "code", 723 | "execution_count": null, 724 | "metadata": { 725 | "collapsed": true 726 | }, 727 | "outputs": [], 728 | "source": [ 729 | "def get_action_sequence_time_diff_feature(df):\n", 730 | " df = df.sort_values(by=['userid', 'actionTime'], ascending=[True, False]).copy().reset_index(drop=True)\n", 731 | "\n", 732 | " mydf = df[['userid']].drop_duplicates().reset_index(drop=True)\n", 733 | "\n", 734 | " df['actionTimee'] = pd.to_datetime(df['actionTime'], unit='s')\n", 735 | " df['actionTimeDiff'] = df['actionTime'].diff()\n", 736 | " # session-id pass: rows are sorted newest first per user, so two rows within 10 minutes (diff in [-600, 0]) share an id\n", 737 | " counter = 1\n", 738 | " last_userid = df.iloc[0, 0]\n", 739 | " seq_list = []\n", 740 | " for i, r in df[['userid', 'actionTimeDiff']].iterrows():\n", 741 | " if i % 500000 == 0:\n", 742 | " util.log(i)\n", 743 | " if r.userid != last_userid:\n", 744 | " counter = 1\n", 745 | " seq_list.append(counter)\n", 746 | " last_userid = r.userid\n", 747 | " elif (r.actionTimeDiff <= 0 and r.actionTimeDiff >= -600 or math.isnan(r.actionTimeDiff)) and r.userid == last_userid:\n", 748 | " seq_list.append(counter)\n", 749 | " else:\n", 750 | " counter += 1\n", 751 | " seq_list.append(counter)\n", 752 | " df['actionSeq'] = pd.Series(seq_list)\n", 753 | " \n", 754 | " # block actions by 10-minute gaps (actions less than 10 minutes apart form one block); time diffs between the blocks\n", 755 | " seq_time_max = df.groupby(['userid', 'actionSeq'])['actionTime'].max().unstack()\n", 756 | " seq_time_diff = seq_time_max.diff(1, axis=1)\n", 757 | " for window in [2,3,4,5,6,7,10,15]:\n", 758 | " tmp = seq_time_diff.iloc[:, 1:(window+1)]\n", 759 | " tmp.columns = ['act_seq_time_diff(%d-%d)(window_%d)' % (i, i-1, window) for i in tmp.columns]\n", 760 | " tmp = tmp.reset_index()\n", 761 | " data = pd.merge(mydf, tmp, on='userid', how='left')\n", 762 | " util.log('window=%d' % window)\n", 763 | " data.to_csv('../data/output/feat/%s%d' % ('action_sequence_time_diff_window', window), index=False)" 764 | ] 765 | }, 766 | { 767 | "cell_type": "code", 768 | "execution_count": null, 769 | "metadata": { 770 | "collapsed": true 771 | }, 772 | "outputs": [], 773 | "source": [ 774 | "def get_action_sequence_time_stat_feature(df):\n", 775 | " df = df.sort_values(by=['userid', 'actionTime'], ascending=[True, False]).copy().reset_index(drop=True)\n", 776 | "\n", 777 | " mydf = df[['userid']].drop_duplicates().reset_index(drop=True)\n", 778 | "\n", 779 | " df['actionTimee'] = pd.to_datetime(df['actionTime'], unit='s')\n", 780 | " df['actionTimeDiff'] = df['actionTime'].diff()\n", 781 | " # same 10-minute session-id pass as in get_action_sequence_time_diff_feature\n", 782 | " counter = 1\n", 783 | " last_userid = df.iloc[0, 0]\n", 784 | " seq_list = []\n", 785 | " for i, r in df[['userid', 'actionTimeDiff']].iterrows():\n", 786 | " if i % 500000 == 0:\n", 787 | " util.log(i)\n", 788 | " if r.userid != last_userid:\n", 789 | " counter = 1\n", 790 | " seq_list.append(counter)\n", 791 | " last_userid = r.userid\n", 792 | " elif (r.actionTimeDiff <= 0 and r.actionTimeDiff >= -600 or math.isnan(r.actionTimeDiff)) and r.userid == last_userid:\n", 793 | " seq_list.append(counter)\n", 794 | " else:\n", 795 | " counter += 1\n", 796 | " seq_list.append(counter)\n", 797 | " df['actionSeq'] = pd.Series(seq_list)\n", 798 | " \n", 799 | " time_stat = df[(df['actionSeq'] == 1) | (df['actionSeq'] == 2) | (df['actionSeq'] == 3)].groupby(['userid', 'actionSeq'])['actionTime'].agg([min, max, np.mean, np.median, np.ptp, np.std, 'count']).unstack().reset_index()\n", 800 | " time_stat.columns = ['userid' if i[0] == 'userid' else 
'act_seq_time_stat_%s_last%d' % (i[0], i[1]) for i in time_stat.columns]\n", 801 | " \n", 802 | " time_stat.to_csv('../data/output/feat/%s' % ('action_sequence_time_stat_last123'), index=False)" 803 | ] 804 | }, 805 | { 806 | "cell_type": "code", 807 | "execution_count": null, 808 | "metadata": { 809 | "collapsed": true 810 | }, 811 | "outputs": [], 812 | "source": [ 813 | "def get_action_time_diff_234_56789_last_window_feature(df, window):\n", 814 | " df = df.copy()\n", 815 | "\n", 816 | " mydf = df[['userid']].drop_duplicates().reset_index(drop=True)\n", 817 | " \n", 818 | " # diffs of the action times for actionType 2/3/4\n", 819 | " tmp = df[df['actionType'].isin([2, 3, 4])].groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)\n", 820 | " tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method = 'first', ascending=False).astype(int)\n", 821 | " act_time = tmp.pivot('userid', 'act_time_rank', 'actionTime')\n", 822 | " act_time = act_time[act_time.columns[::-1]]\n", 823 | " act_time_diff_234 = act_time.diff(1, axis=1)\n", 824 | " act_time_diff_234 = act_time_diff_234.iloc[:, 1:].reset_index()\n", 825 | " act_time_diff_234.columns = [i if i == 'userid' else 'act_time_diff_234(' + str(i) + '-' + str(i + 1) + ')(window_' + str(window) + ')' for i in act_time_diff_234.columns]\n", 826 | " \n", 827 | " # diffs of the action times for actionType 5/6/7/8/9\n", 828 | " tmp = df[df['actionType'].isin([5, 6, 7, 8, 9])].groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(window)).reset_index(drop=True)\n", 829 | " tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method = 'first', ascending=False).astype(int)\n", 830 | " act_time = tmp.pivot('userid', 'act_time_rank', 'actionTime')\n", 831 | " act_time = act_time[act_time.columns[::-1]]\n", 832 | " act_time_diff_56789 = act_time.diff(1, axis=1)\n", 833 | " act_time_diff_56789 = act_time_diff_56789.iloc[:, 1:].reset_index()\n", 834 | " act_time_diff_56789.columns = [i if i == 'userid' else 'act_time_diff_56789(' + str(i) + '-' + str(i + 1) + ')(window_' + str(window) + ')' for i in act_time_diff_56789.columns]\n", 835 | "\n", 836 | " mydf = pd.merge(mydf, act_time_diff_234, on='userid', how='left')\n", 837 | " mydf = pd.merge(mydf, act_time_diff_56789, on='userid', how='left')\n", 838 | " \n", 839 | " return mydf" 840 | ] 841 | }, 842 | { 843 | "cell_type": "code", 844 | "execution_count": null, 845 | "metadata": { 846 | "collapsed": true 847 | }, 848 | "outputs": [], 849 | "source": [ 850 | "def get_action_stat_last_every_type_feature(df):\n", 851 | " df = df.copy()\n", 852 | "\n", 853 | " mydf = df[['userid']].drop_duplicates().reset_index(drop=True)\n", 854 | "\n", 855 | " # stats of the action times for each actionType 1-9\n", 856 | " for t in range(1, 10):\n", 857 | " tmp = df[df['actionType'] == t].groupby('userid')['actionTime'].agg([min, max, np.ptp, np.std, np.mean, np.median, 'count']).reset_index()\n", 858 | " tmp.columns = [i if i == 'userid' else 'act_time_%s(type_%d)' % (i, t) for i in tmp.columns]\n", 859 | " \n", 860 | " mydf = pd.merge(mydf, tmp, on='userid', how='left')\n", 861 | "\n", 862 | " return mydf" 863 | ] 864 | }, 865 | { 866 | "cell_type": "code", 867 | "execution_count": null, 868 | "metadata": { 869 | "collapsed": true 870 | }, 871 | "outputs": [], 872 | "source": [ 873 | "def get_act_ord_before_type1_stat_feature(act, oord):\n", 874 | " act = act.copy()\n", 875 | " oord = oord.copy()\n", 876 | "\n", 877 | " mydf = 
oord[['userid']].drop_duplicates().reset_index(drop=True)\n", 878 | "\n", 879 | " ord_time = oord[oord['orderType'] == 1].groupby('userid')['orderTime'].max().reset_index()\n", 880 | " act = pd.merge(act, ord_time, on='userid', how='left')\n", 881 | "\n", 882 | " df = act[act['actionTime'] < act['orderTime']]\n", 883 | "\n", 884 | " act_time_stat = df.groupby('userid')['actionTime'].agg([min, max, np.ptp, np.std, np.mean, np.median, 'count']).reset_index()\n", 885 | " act_time_stat.columns = [i if i == 'userid' else 'act_ord_before_type1_act_time_%s' % i for i in act_time_stat.columns]\n", 886 | " \n", 887 | " act_type_size = mydf.copy()\n", 888 | " for t in range(1, 10):\n", 889 | " tmp = df[df['actionType'] == t].groupby('userid').size().reset_index()\n", 890 | " tmp.columns = ['userid', 'act_ord_before_type1_act_type_size(type_%d)' % t]\n", 891 | " act_type_size = pd.merge(act_type_size, tmp, on='userid', how='left')\n", 892 | "\n", 893 | " mydf = pd.merge(mydf, act_time_stat, on='userid', how='left')\n", 894 | " mydf = pd.merge(mydf, act_type_size, on='userid', how='left')\n", 895 | " \n", 896 | " return mydf" 897 | ] 898 | }, 899 | { 900 | "cell_type": "code", 901 | "execution_count": null, 902 | "metadata": { 903 | "collapsed": true 904 | }, 905 | "outputs": [], 906 | "source": [ 907 | "def get_action_time_diff_stat_feature(df):\n", 908 | " df = df.copy()\n", 909 | "\n", 910 | " mydf = df[['userid']].drop_duplicates().reset_index(drop=True)\n", 911 | "\n", 912 | " df = df.sort_values(['userid', 'actionTime']).reset_index(drop=True).copy()\n", 913 | " df['actionTimeDiff'] = df['actionTime'].diff(1)\n", 914 | " df = df.groupby('userid').apply(lambda x: x.iloc[1:, :]).reset_index(drop=True)\n", 915 | "\n", 916 | " act_time_diff_stat = df.groupby('userid')['actionTimeDiff'].agg([min, max, np.mean, np.median, np.std, sum]).reset_index()\n", 917 | " act_time_diff_stat.columns = [i if i == 'userid' else 'act_time_diff_%s' % i for i in act_time_diff_stat.columns]\n", 918 | "\n", 919 | " mydf = pd.merge(mydf, act_time_diff_stat, on='userid', how='left')\n", 920 | " \n", 921 | " return mydf" 922 | ] 923 | }, 924 | { 925 | "cell_type": "code", 926 | "execution_count": null, 927 | "metadata": { 928 | "collapsed": true 929 | }, 930 | "outputs": [], 931 | "source": [ 932 | "def get_action_time_diff_stat_last_window_feature(df, window):\n", 933 | " df = df.copy()\n", 934 | "\n", 935 | " mydf = df[['userid']].drop_duplicates().reset_index(drop=True)\n", 936 | "\n", 937 | " df = df.sort_values(['userid', 'actionTime']).reset_index(drop=True).copy()\n", 938 | " df['actionTimeDiff'] = df['actionTime'].diff(1)\n", 939 | " df = df.groupby('userid').apply(lambda x: x.iloc[1:, :]).reset_index(drop=True)\n", 940 | " \n", 941 | " tmp = df.groupby('userid').apply(lambda x: x.iloc[:-window, :]).reset_index(drop=True)\n", 942 | " act_time_diff_stat = tmp.groupby('userid')['actionTimeDiff'].agg([min, max, np.mean, np.median, np.std, sum]).reset_index()\n", 943 | " act_time_diff_stat.columns = [i if i == 'userid' else 'act_time_diff_%s(window_%d)' % (i, window) for i in act_time_diff_stat.columns]\n", 944 | "\n", 945 | " mydf = pd.merge(mydf, act_time_diff_stat, on='userid', how='left')\n", 946 | " \n", 947 | " return mydf" 948 | ] 949 | }, 950 | { 951 | "cell_type": "code", 952 | "execution_count": null, 953 | "metadata": { 954 | "collapsed": true 955 | }, 956 | "outputs": [], 957 | "source": [ 958 | "def get_action_time_last_on_every_type_feature(df):\n", 959 | " df = df.copy()\n", 960 | "\n", 961 | " mydf = 
df[['userid']].drop_duplicates().reset_index(drop=True)\n", 962 | "\n", 963 | " df = df.sort_values(['userid', 'actionTime'], ascending=[True, False]).reset_index(drop=True).copy()\n", 964 | " for t in range(1, 10):\n", 965 | " act_time = df[df['actionType'] == t].groupby('userid').apply(lambda x: x.head(1)).reset_index(drop=True)\n", 966 | " act_time = act_time[['userid', 'actionTime']]\n", 967 | " act_time.columns = ['userid', 'act_time_last(type_%d)' % t]\n", 968 | " \n", 969 | " mydf = pd.merge(mydf, act_time, on='userid', how='left')\n", 970 | " \n", 971 | " return mydf" 972 | ] 973 | }, 974 | { 975 | "cell_type": "code", 976 | "execution_count": null, 977 | "metadata": { 978 | "collapsed": true 979 | }, 980 | "outputs": [], 981 | "source": [ 982 | "def get_try_feat(df):\n", 983 | " df = df.copy()\n", 984 | " \n", 985 | " mydf = df[['userid']].drop_duplicates().reset_index(drop=True)\n", 986 | "\n", 987 | " df = df.sort_values(['userid', 'actionTime'], ascending=[True, False]).reset_index(drop=True).copy()\n", 988 | " \n", 989 | " last_5 = df[df.actionType == 5].drop_duplicates(subset=['userid'])\n", 990 | " last_6 = df[df.actionType == 6].drop_duplicates(subset=['userid'])\n", 991 | " time_gap_last56 = pd.merge(last_5, last_6, on='userid', how='outer')\n", 992 | " time_gap_last56['time_gap_last56'] = time_gap_last56.actionTime_y - time_gap_last56.actionTime_x\n", 993 | " mydf = pd.merge(mydf, time_gap_last56[['userid', 'time_gap_last56']], on='userid', how='left')\n", 994 | "\n", 995 | " tmp = df[df['actionType'] == 5].groupby('userid').apply(lambda x: x.sort_values('actionTime', ascending=False).head(2)).reset_index(drop=True)\n", 996 | " tmp['act_time_rank'] = tmp.groupby('userid')['actionTime'].rank(method = 'first', ascending=False).astype(int)\n", 997 | " act_time = tmp.pivot('userid', 'act_time_rank', 'actionTime')\n", 998 | " act_time = act_time[act_time.columns[::-1]]\n", 999 | " act_time_diff = act_time.diff(1, axis=1)\n", 1000 | " act_time_diff = act_time_diff.iloc[:, 1:].reset_index()\n", 1001 | " act_time_diff.columns = [i if i == 'userid' else 'act_time_diff(%d-%d)(window_%d)(type_%d)' % (i, i+1, 2, 5) for i in act_time_diff.columns]\n", 1002 | " mydf = pd.merge(mydf, act_time_diff, on='userid', how='left')\n", 1003 | "\n", 1004 | " last_6 = df[df.actionType == 6].drop_duplicates(subset=['userid'])\n", 1005 | " last_7 = df[df.actionType == 7].drop_duplicates(subset=['userid'])\n", 1006 | " time_gap_last67 = pd.merge(last_6, last_7, on='userid', how='outer')\n", 1007 | " time_gap_last67['time_gap_last67'] = time_gap_last67.actionTime_y - time_gap_last67.actionTime_x\n", 1008 | " mydf = pd.merge(mydf, time_gap_last67[['userid', 'time_gap_last67']], on='userid', how='left')\n", 1009 | "\n", 1010 | " df['actionDate'] = pd.to_datetime(df['actionTime'], unit='s')\n", 1011 | " df = pd.merge(df, df.drop_duplicates(subset=['userid'])[['userid', 'actionDate']], on='userid', how='left')\n", 1012 | " df['lastDay'] = df.actionDate_x.dt.day == df.actionDate_y.dt.day\n", 1013 | " last_day = df[df.lastDay].groupby('userid')['lastDay'].size().reset_index()\n", 1014 | " last_day_5 = df[df.lastDay & (df.actionType == 5)].groupby('userid')['lastDay'].size().reset_index()\n", 1015 | " tmp = pd.merge(last_day, last_day_5, on='userid', how='left')\n", 1016 | " tmp['last_day_rate(type_5)'] = tmp.lastDay_y / tmp.lastDay_x\n", 1017 | " mydf = pd.merge(mydf, tmp[['userid', 'last_day_rate(type_5)']], on='userid', how='left')\n", 1018 | "\n", 1019 | " last_time = 
df.drop_duplicates(subset=['userid'])[['userid', 'actionTime']]\n", 1020 | " last_time.columns = ['userid', 'last_time']\n", 1021 | " mydf = pd.merge(mydf, last_time, on='userid', how='left')\n", 1022 | "\n", 1023 | " last_4 = df[df.actionType == 4].drop_duplicates(subset=['userid'])\n", 1024 | " last_5 = df[df.actionType == 5].drop_duplicates(subset=['userid'])\n", 1025 | " time_gap_last45 = pd.merge(last_4, last_5, on='userid', how='outer')\n", 1026 | " time_gap_last45['time_gap_last45'] = time_gap_last45.actionTime_y - time_gap_last45.actionTime_x\n", 1027 | " mydf = pd.merge(mydf, time_gap_last45[['userid', 'time_gap_last45']], on='userid', how='left')\n", 1028 | "\n", 1029 | " last_1 = df[df.actionType == 1].drop_duplicates(subset=['userid'])\n", 1030 | " last = df.drop_duplicates(subset=['userid'])\n", 1031 | " time_gap_last1 = pd.merge(last_1, last, on='userid', how='outer')\n", 1032 | " time_gap_last1['time_gap_last1'] = time_gap_last1.actionTime_y - time_gap_last1.actionTime_x\n", 1033 | " mydf = pd.merge(mydf, time_gap_last1[['userid', 'time_gap_last1']], on='userid', how='left')\n", 1034 | "\n", 1035 | " last_5 = df[df.actionType == 5].drop_duplicates(subset=['userid'])\n", 1036 | " last = df.drop_duplicates(subset=['userid'])\n", 1037 | " time_gap_last5 = pd.merge(last_5, last, on='userid', how='outer')\n", 1038 | " time_gap_last5['time_gap_last5'] = time_gap_last5.actionTime_y - time_gap_last5.actionTime_x\n", 1039 | " mydf = pd.merge(mydf, time_gap_last5[['userid', 'time_gap_last5']], on='userid', how='left')\n", 1040 | "\n", 1041 | " last_6 = df[df.actionType == 6].drop_duplicates(subset=['userid'])\n", 1042 | " last = df.drop_duplicates(subset=['userid'])\n", 1043 | " time_gap_last6 = pd.merge(last_6, last, on='userid', how='outer')\n", 1044 | " time_gap_last6['time_gap_last6'] = time_gap_last6.actionTime_y - time_gap_last6.actionTime_x\n", 1045 | " mydf = pd.merge(mydf, time_gap_last6[['userid', 'time_gap_last6']], on='userid', how='left')\n", 1046 | "\n", 1047 | " tmp = df[df.actionType.isin([5, 6])].copy()\n", 1048 | " tmp['actionTimeDiff'] = tmp['actionTime'].diff(1)\n", 1049 | " tmp = tmp.groupby('userid').apply(lambda x: x.iloc[1:, :]).reset_index(drop=True)\n", 1050 | " act_time_diff_stat = tmp.groupby('userid')['actionTimeDiff'].agg([min, max, np.mean, np.median, np.std, sum]).reset_index()\n", 1051 | " act_time_diff_stat.columns = [i if i == 'userid' else 'act_time_diff_56_%s' % i for i in act_time_diff_stat.columns]\n", 1052 | " mydf = pd.merge(mydf, act_time_diff_stat, on='userid', how='left')\n", 1053 | " \n", 1054 | " return mydf" 1055 | ] 1056 | }, 1057 | { 1058 | "cell_type": "code", 1059 | "execution_count": null, 1060 | "metadata": { 1061 | "collapsed": true 1062 | }, 1063 | "outputs": [], 1064 | "source": [ 1065 | "action_tr = pd.read_csv('../data/input/train/action_train.csv') # user action data\n", 1066 | "order_future_tr = pd.read_csv('../data/input/train/orderFuture_train.csv') # data to predict\n", 1067 | "order_history_tr = pd.read_csv('../data/input/train/orderHistory_train.csv') # user order history data\n", 1068 | "user_comment_tr = pd.read_csv('../data/input/train/userComment_train.csv') # user comment data\n", 1069 | "user_profile_tr = pd.read_csv('../data/input/train/userProfile_train.csv') # user profile data\n", 1070 | "\n", 1071 | "action_te = pd.read_csv('../data/input/test/action_test.csv')\n", 1072 | "order_future_te = pd.read_csv('../data/input/test/orderFuture_test.csv')\n", 1073 | "order_history_te = pd.read_csv('../data/input/test/orderHistory_test.csv')\n", 1074 | "user_comment_te = 
pd.read_csv('../data/input/test/userComment_test.csv')\n", 1075 | "user_profile_te = pd.read_csv('../data/input/test/userProfile_test.csv')\n", 1076 | "\n", 1077 | "action = pd.concat([action_tr, action_te], axis=0).reset_index(drop=True)\n", 1078 | "order_history = pd.concat([order_history_tr, order_history_te], axis=0).reset_index(drop=True)\n", 1079 | "user_comment = pd.concat([user_comment_tr, user_comment_te], axis=0).reset_index(drop=True)\n", 1080 | "user_profile = pd.concat([user_profile_tr, user_profile_te], axis=0).reset_index(drop=True)" 1081 | ] 1082 | }, 1083 | { 1084 | "cell_type": "code", 1085 | "execution_count": null, 1086 | "metadata": { 1087 | "collapsed": true 1088 | }, 1089 | "outputs": [], 1090 | "source": [ 1091 | "user_profile_feat = get_user_profile_feature(user_profile)\n", 1092 | "user_profile_feat.to_csv('../data/output/feat/%s' % 'user_profile', index=False)" 1093 | ] 1094 | }, 1095 | { 1096 | "cell_type": "code", 1097 | "execution_count": null, 1098 | "metadata": { 1099 | "collapsed": true 1100 | }, 1101 | "outputs": [], 1102 | "source": [ 1103 | "user_comment_feat = get_user_comment_feature(user_comment)\n", 1104 | "user_comment_feat.to_csv('../data/output/feat/%s' % 'user_comment', index=False)" 1105 | ] 1106 | }, 1107 | { 1108 | "cell_type": "code", 1109 | "execution_count": null, 1110 | "metadata": { 1111 | "collapsed": true 1112 | }, 1113 | "outputs": [], 1114 | "source": [ 1115 | "order_history_feat = get_order_history_feature(order_history)\n", 1116 | "order_history_feat.to_csv('../data/output/feat/%s' % 'order_history', index=False)" 1117 | ] 1118 | }, 1119 | { 1120 | "cell_type": "code", 1121 | "execution_count": null, 1122 | "metadata": { 1123 | "collapsed": true 1124 | }, 1125 | "outputs": [], 1126 | "source": [ 1127 | "order_history_last_w_feat = get_order_history_last_w_feature(order_history)\n", 1128 | "order_history_last_w_feat.to_csv('../data/output/feat/%s' % 'order_history_last_w', index=False)" 1129 | ] 1130 | }, 1131 | { 1132 | "cell_type": "code", 1133 | "execution_count": null, 1134 | "metadata": { 1135 | "collapsed": true 1136 | }, 1137 | "outputs": [], 1138 | "source": [ 1139 | "action_type_feat = get_action_type_feature(action)\n", 1140 | "action_type_feat.to_csv('../data/output/feat/%s' % 'action_type', index=False)" 1141 | ] 1142 | }, 1143 | { 1144 | "cell_type": "code", 1145 | "execution_count": null, 1146 | "metadata": { 1147 | "collapsed": true 1148 | }, 1149 | "outputs": [], 1150 | "source": [ 1151 | "action_type_based_on_time_feat = get_action_type_based_on_time_feature(action)\n", 1152 | "action_type_based_on_time_feat.to_csv('../data/output/feat/%s' % 'action_type_based_on_time', index=False)" 1153 | ] 1154 | }, 1155 | { 1156 | "cell_type": "code", 1157 | "execution_count": null, 1158 | "metadata": { 1159 | "collapsed": true, 1160 | "scrolled": true 1161 | }, 1162 | "outputs": [], 1163 | "source": [ 1164 | "for window in [3,4,5,6,7]:\n", 1165 | " util.log(window)\n", 1166 | " action_type_based_on_time_last_window_feat = get_action_type_based_on_time_last_window_feature(action, window)\n", 1167 | " action_type_based_on_time_last_window_feat.to_csv('../data/output/feat/%s%d' % ('action_type_based_on_time_last_window', window), index=False)" 1168 | ] 1169 | }, 1170 | { 1171 | "cell_type": "code", 1172 | "execution_count": null, 1173 | "metadata": { 1174 | "collapsed": true, 1175 | "scrolled": true 1176 | }, 1177 | "outputs": [], 1178 | "source": [ 1179 | "for window in [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 20]:\n", 
1180 | " util.log(window)\n", 1181 | " action_type_num_based_on_time_last_window_feat = get_action_type_num_based_on_time_last_window_feature(action, window)\n", 1182 | " action_type_num_based_on_time_last_window_feat.to_csv('../data/output/feat/%s%d' % ('action_type_num_based_on_time_last_window', window), index=False)" 1183 | ] 1184 | }, 1185 | { 1186 | "cell_type": "code", 1187 | "execution_count": null, 1188 | "metadata": { 1189 | "collapsed": true 1190 | }, 1191 | "outputs": [], 1192 | "source": [ 1193 | "for window in [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 20]:\n", 1194 | " util.log(window)\n", 1195 | " action_type_rate_based_on_time_last_window_feat = get_action_type_rate_based_on_time_last_window_feature(action, window)\n", 1196 | " action_type_rate_based_on_time_last_window_feat.to_csv('../data/output/feat/%s%d' % ('action_type_rate_based_on_time_last_window', window), index=False)" 1197 | ] 1198 | }, 1199 | { 1200 | "cell_type": "code", 1201 | "execution_count": null, 1202 | "metadata": { 1203 | "collapsed": true 1204 | }, 1205 | "outputs": [], 1206 | "source": [ 1207 | "for window in [6]:\n", 1208 | " util.log(window)\n", 1209 | " action_type_row_stat_based_on_time_last_window_feat = get_action_type_row_stat_based_on_time_last_window_feature(action, window)\n", 1210 | " action_type_row_stat_based_on_time_last_window_feat.to_csv('../data/output/feat/%s%d' % ('action_type_row_stat_based_on_time_last_window', window), index=False)" 1211 | ] 1212 | }, 1213 | { 1214 | "cell_type": "code", 1215 | "execution_count": null, 1216 | "metadata": { 1217 | "collapsed": true 1218 | }, 1219 | "outputs": [], 1220 | "source": [ 1221 | "for window in [4, 7, 13, 17, 20, 25, 30]:\n", 1222 | " util.log(window)\n", 1223 | " action_num_based_on_time_last_window_feat = get_action_num_based_on_time_last_window_feature(action, window)\n", 1224 | " action_num_based_on_time_last_window_feat.to_csv('../data/output/feat/%s%d' % ('action_num_based_on_time_last_window', window), index=False)" 1225 | ] 1226 | }, 1227 | { 1228 | "cell_type": "code", 1229 | "execution_count": null, 1230 | "metadata": { 1231 | "collapsed": true 1232 | }, 1233 | "outputs": [], 1234 | "source": [ 1235 | "for window in [4, 7, 13, 17, 20, 25, 30]:\n", 1236 | " util.log(window)\n", 1237 | " action_type_num_based_on_time_last_window_feat = get_action_type_num_based_on_time_last_window_feature(action, window)\n", 1238 | " action_type_num_based_on_time_last_window_feat.to_csv('../data/output/feat/%s%d' % ('action_type_num_based_on_time_last_window', window), index=False)" 1239 | ] 1240 | }, 1241 | { 1242 | "cell_type": "code", 1243 | "execution_count": null, 1244 | "metadata": { 1245 | "collapsed": true 1246 | }, 1247 | "outputs": [], 1248 | "source": [ 1249 | "action_time_based_on_time_feat = get_action_time_based_on_time_feature(action)\n", 1250 | "action_time_based_on_time_feat.to_csv('../data/output/feat/%s' % 'action_time_based_on_time', index=False)" 1251 | ] 1252 | }, 1253 | { 1254 | "cell_type": "code", 1255 | "execution_count": null, 1256 | "metadata": { 1257 | "collapsed": true 1258 | }, 1259 | "outputs": [], 1260 | "source": [ 1261 | "for window in [6]:\n", 1262 | " util.log(window)\n", 1263 | " action_time_based_on_time_last_window_feat = get_action_time_based_on_time_last_window_feature(action, window)\n", 1264 | " action_time_based_on_time_last_window_feat.to_csv('../data/output/feat/%s%d' % ('action_time_based_on_time_last_window', window), index=False)" 1265 | ] 1266 | }, 1267 | { 1268 | "cell_type": "code", 1269 
| "execution_count": null, 1270 | "metadata": { 1271 | "collapsed": true 1272 | }, 1273 | "outputs": [], 1274 | "source": [ 1275 | "for window in [3, 6, 10, 14]:\n", 1276 | " util.log(window)\n", 1277 | " action_time_row_stat_based_on_time_last_window_feat = get_action_time_row_stat_based_on_time_last_window_feature(action, window)\n", 1278 | " action_time_row_stat_based_on_time_last_window_feat.to_csv('../data/output/feat/%s%d' % ('action_time_row_stat_based_on_time_last_window', window), index=False)" 1279 | ] 1280 | }, 1281 | { 1282 | "cell_type": "code", 1283 | "execution_count": null, 1284 | "metadata": { 1285 | "collapsed": true 1286 | }, 1287 | "outputs": [], 1288 | "source": [ 1289 | "for window in [3, 4, 5, 6, 7, 8]:\n", 1290 | " util.log(window)\n", 1291 | " action_time_diff2_based_on_time_last_window_feat = get_action_time_diff2_based_on_time_last_window_feature(action, window)\n", 1292 | " action_time_diff2_based_on_time_last_window_feat.to_csv('../data/output/feat/%s%d' % ('action_time_diff2_based_on_time_last_window', window), index=False)" 1293 | ] 1294 | }, 1295 | { 1296 | "cell_type": "code", 1297 | "execution_count": null, 1298 | "metadata": { 1299 | "collapsed": true 1300 | }, 1301 | "outputs": [], 1302 | "source": [ 1303 | "for ttype in [1,5,6,7,8,9]:\n", 1304 | " for window in [6]:\n", 1305 | " util.log('type=%d window=%d' % (ttype, window))\n", 1306 | " action_time_based_on_time_last_window_on_type_feat = get_action_time_based_on_time_last_window_on_type_feature(action, window, ttype)\n", 1307 | " action_time_based_on_time_last_window_on_type_feat.to_csv('../data/output/feat/%s%d%s%d' % ('action_time_based_on_time_last_window', window, '_on_type', ttype), index=False)" 1308 | ] 1309 | }, 1310 | { 1311 | "cell_type": "code", 1312 | "execution_count": null, 1313 | "metadata": { 1314 | "collapsed": true 1315 | }, 1316 | "outputs": [], 1317 | "source": [ 1318 | "for window in [3, 4, 5, 6, 7, 8, 9, 10]:\n", 1319 | " util.log(window)\n", 1320 | " action_time_2order_based_on_time_last_window_feat = get_action_time_2order_based_on_time_last_window_feature(action, window)\n", 1321 | " action_time_2order_based_on_time_last_window_feat.to_csv('../data/output/feat/%s%d' % ('action_time_2order_based_on_time_last_window', window), index=False)" 1322 | ] 1323 | }, 1324 | { 1325 | "cell_type": "code", 1326 | "execution_count": null, 1327 | "metadata": { 1328 | "collapsed": true 1329 | }, 1330 | "outputs": [], 1331 | "source": [ 1332 | "for ttype in [1,5,6,7,8,9]:\n", 1333 | " for window in [4, 7, 10]:\n", 1334 | " util.log('type=%d window=%d' % (ttype, window))\n", 1335 | " action_real_time_based_on_time_last_window_on_type_feat = get_action_real_time_based_on_time_last_window_on_type_feature(action, window, ttype)\n", 1336 | " action_real_time_based_on_time_last_window_on_type_feat.to_csv('../data/output/feat/%s%d%s%d' % ('action_real_time_based_on_time_last_window', window, '_on_type', ttype), index=False)" 1337 | ] 1338 | }, 1339 | { 1340 | "cell_type": "code", 1341 | "execution_count": null, 1342 | "metadata": { 1343 | "collapsed": true 1344 | }, 1345 | "outputs": [], 1346 | "source": [ 1347 | "act_ord_time_diff_feat = get_act_ord_time_diff_feature(action, order_history)\n", 1348 | "act_ord_time_diff_feat.to_csv('../data/output/feat/%s' % 'action_order_time_diff', index=False)" 1349 | ] 1350 | }, 1351 | { 1352 | "cell_type": "code", 1353 | "execution_count": null, 1354 | "metadata": { 1355 | "collapsed": true 1356 | }, 1357 | "outputs": [], 1358 | "source": [ 1359 | 
"order_last_order_ydm_feat = get_order_last_order_ydm_feature(order_history)\n", 1360 | "order_last_order_ydm_feat.to_csv('../data/output/feat/%s' % 'order_last_order_ydm', index=False)" 1361 | ] 1362 | }, 1363 | { 1364 | "cell_type": "code", 1365 | "execution_count": null, 1366 | "metadata": { 1367 | "collapsed": true 1368 | }, 1369 | "outputs": [], 1370 | "source": [ 1371 | "order_type1_ydm_feat = get_order_type1_ydm_feature(order_history)\n", 1372 | "order_type1_ydm_feat.to_csv('../data/output/feat/%s' % 'order_type1_ydm', index=False)" 1373 | ] 1374 | }, 1375 | { 1376 | "cell_type": "code", 1377 | "execution_count": null, 1378 | "metadata": { 1379 | "collapsed": true 1380 | }, 1381 | "outputs": [], 1382 | "source": [ 1383 | "for window in [7,8,10,11]:\n", 1384 | " util.log(window)\n", 1385 | " act_ord_act_time_diff_last_window_feat = get_act_ord_act_time_diff_last_window_feature(action, order_history, window)\n", 1386 | " act_ord_act_time_diff_last_window_feat.to_csv('../data/output/feat/%s%d' % ('act_ord_act_time_diff_last_window', window), index=False)" 1387 | ] 1388 | }, 1389 | { 1390 | "cell_type": "code", 1391 | "execution_count": null, 1392 | "metadata": { 1393 | "collapsed": true 1394 | }, 1395 | "outputs": [], 1396 | "source": [ 1397 | "for window in [2,4]:\n", 1398 | " util.log(window)\n", 1399 | " act_ord_type1_act_time_diff_last_window_feat = get_act_ord_type1_act_time_diff_last_window_feature(action, order_history, window)\n", 1400 | " act_ord_type1_act_time_diff_last_window_feat.to_csv('../data/output/feat/%s%d' % ('act_ord_type1_act_time_diff_last_window', window), index=False)" 1401 | ] 1402 | }, 1403 | { 1404 | "cell_type": "code", 1405 | "execution_count": null, 1406 | "metadata": { 1407 | "collapsed": true 1408 | }, 1409 | "outputs": [], 1410 | "source": [ 1411 | "get_action_sequence_time_diff_feature(action)" 1412 | ] 1413 | }, 1414 | { 1415 | "cell_type": "code", 1416 | "execution_count": null, 1417 | "metadata": { 1418 | "collapsed": true 1419 | }, 1420 | "outputs": [], 1421 | "source": [ 1422 | "get_action_sequence_time_stat_feature(action)" 1423 | ] 1424 | }, 1425 | { 1426 | "cell_type": "code", 1427 | "execution_count": null, 1428 | "metadata": { 1429 | "collapsed": true 1430 | }, 1431 | "outputs": [], 1432 | "source": [ 1433 | "for window in [6]:\n", 1434 | " util.log(window)\n", 1435 | " action_time_diff_234_56789_last_window_feat = get_action_time_diff_234_56789_last_window_feature(action, window)\n", 1436 | " action_time_diff_234_56789_last_window_feat.to_csv('../data/output/feat/%s%d' % ('action_time_diff_234_56789_last_window', window), index=False)" 1437 | ] 1438 | }, 1439 | { 1440 | "cell_type": "code", 1441 | "execution_count": null, 1442 | "metadata": { 1443 | "collapsed": true 1444 | }, 1445 | "outputs": [], 1446 | "source": [ 1447 | "action_stat_last_every_type_feat = get_action_stat_last_every_type_feature(action)\n", 1448 | "action_stat_last_every_type_feat.to_csv('../data/output/feat/%s' % 'action_stat_last_every_type', index=False)" 1449 | ] 1450 | }, 1451 | { 1452 | "cell_type": "code", 1453 | "execution_count": null, 1454 | "metadata": { 1455 | "collapsed": true 1456 | }, 1457 | "outputs": [], 1458 | "source": [ 1459 | "act_ord_before_type1_stat_feat = get_act_ord_before_type1_stat_feature(action, order_history)\n", 1460 | "act_ord_before_type1_stat_feat.to_csv('../data/output/feat/%s' % 'act_ord_before_type1_stat', index=False)" 1461 | ] 1462 | }, 1463 | { 1464 | "cell_type": "code", 1465 | "execution_count": null, 1466 | "metadata": { 1467 | 
"collapsed": true 1468 | }, 1469 | "outputs": [], 1470 | "source": [ 1471 | "action_time_diff_stat_feat = get_action_time_diff_stat_feature(action) # untest\n", 1472 | "action_time_diff_stat_feat.to_csv('../data/output/feat/%s' % 'action_time_diff_stat', index=False)" 1473 | ] 1474 | }, 1475 | { 1476 | "cell_type": "code", 1477 | "execution_count": null, 1478 | "metadata": { 1479 | "collapsed": true 1480 | }, 1481 | "outputs": [], 1482 | "source": [ 1483 | "for window in [3, 4, 5, 6, 7, 8, 9]:\n", 1484 | " util.log(window)\n", 1485 | " action_time_diff_stat_last_window_feat = get_action_time_diff_stat_last_window_feature(action, window)\n", 1486 | " action_time_diff_stat_last_window_feat.to_csv('../data/output/feat/%s%d' % ('action_time_diff_stat_last_window', window), index=False)" 1487 | ] 1488 | }, 1489 | { 1490 | "cell_type": "code", 1491 | "execution_count": null, 1492 | "metadata": { 1493 | "collapsed": true 1494 | }, 1495 | "outputs": [], 1496 | "source": [ 1497 | "action_time_last_on_every_type_feat = get_action_time_last_on_every_type_feature(action)\n", 1498 | "action_time_last_on_every_type_feat.to_csv('../data/output/feat/%s' % 'action_time_last_on_every_type', index=False)" 1499 | ] 1500 | }, 1501 | { 1502 | "cell_type": "code", 1503 | "execution_count": null, 1504 | "metadata": { 1505 | "collapsed": true 1506 | }, 1507 | "outputs": [], 1508 | "source": [ 1509 | "try_feat = get_try_feat(action)\n", 1510 | "try_feat.to_csv('../data/output/feat/%s' % 'try', index=False)" 1511 | ] 1512 | } 1513 | ], 1514 | "metadata": { 1515 | "kernelspec": { 1516 | "display_name": "Python [default]", 1517 | "language": "python", 1518 | "name": "python2" 1519 | }, 1520 | "language_info": { 1521 | "codemirror_mode": { 1522 | "name": "ipython", 1523 | "version": 2 1524 | }, 1525 | "file_extension": ".py", 1526 | "mimetype": "text/x-python", 1527 | "name": "python", 1528 | "nbconvert_exporter": "python", 1529 | "pygments_lexer": "ipython2", 1530 | "version": "2.7.13" 1531 | } 1532 | }, 1533 | "nbformat": 4, 1534 | "nbformat_minor": 1 1535 | } 1536 | -------------------------------------------------------------------------------- /src/model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "C:\\ProgramData\\Anaconda2\\lib\\site-packages\\sklearn\\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. 
This module will be removed in 0.20.\n", 13 | " \"This module will be removed in 0.20.\", DeprecationWarning)\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "from __future__ import division\n", 19 | "import numpy as np\n", 20 | "import pandas as pd\n", 21 | "from sklearn import preprocessing\n", 22 | "import xgboost as xgb\n", 23 | "import lightgbm as lgb\n", 24 | "import catboost as cb\n", 25 | "import time\n", 26 | "import datetime\n", 27 | "import warnings\n", 28 | "warnings.filterwarnings('ignore')\n", 29 | "import util" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 3, 35 | "metadata": { 36 | "collapsed": true 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "def merge_feature(\n", 41 | " act_type_window,\n", 42 | " act_type_num_window,\n", 43 | " act_type_rate_window,\n", 44 | " act_type_row_stat_window,\n", 45 | " act_time_window,\n", 46 | " act_time_1type_window,\n", 47 | " act_ord_act_time_diff_window,\n", 48 | " action_sequence_time_diff_window,\n", 49 | " action_time_diff_234_56789_window,\n", 50 | " action_time_diff_stat_window\n", 51 | "):\n", 52 | " util.log('Merge feature...')\n", 53 | " \n", 54 | " order_future_tr = pd.read_csv('../data/input/train/orderFuture_train.csv')\n", 55 | " order_future_te = pd.read_csv('../data/input/test/orderFuture_test.csv')\n", 56 | "\n", 57 | " user_profile = pd.read_csv('../data/output/feat/%s' % 'user_profile')\n", 58 | " train = pd.merge(order_future_tr, user_profile, on='userid', how='left')\n", 59 | " test = pd.merge(order_future_te, user_profile, on='userid', how='left')\n", 60 | " \n", 61 | " user_comment = pd.read_csv('../data/output/feat/%s' % 'user_comment')\n", 62 | " train = pd.merge(train, user_comment, on='userid', how='left')\n", 63 | " test = pd.merge(test, user_comment, on='userid', how='left')\n", 64 | " \n", 65 | " order_history = pd.read_csv('../data/output/feat/%s' % 'order_history')\n", 66 | " train = pd.merge(train, order_history, on='userid', how='left')\n", 67 | " test = pd.merge(test, order_history, on='userid', how='left')\n", 68 | " \n", 69 | "# order_history_last_w = pd.read_csv('../data/output/feat/%s' % 'order_history_last_w')\n", 70 | "# train = pd.merge(train, order_history_last_w, on='userid', how='left')\n", 71 | "# test = pd.merge(test, order_history_last_w, on='userid', how='left')\n", 72 | " \n", 73 | " action_type = pd.read_csv('../data/output/feat/%s' % 'action_type')\n", 74 | " train = pd.merge(train, action_type, on='userid', how='left')\n", 75 | " test = pd.merge(test, action_type, on='userid', how='left')\n", 76 | " \n", 77 | " action_type_based_on_time = pd.read_csv('../data/output/feat/%s' % 'action_type_based_on_time')\n", 78 | " train = pd.merge(train, action_type_based_on_time, on='userid', how='left')\n", 79 | " test = pd.merge(test, action_type_based_on_time, on='userid', how='left')\n", 80 | " \n", 81 | " util.log('act_type_window=' + str(act_type_window))\n", 82 | " window = act_type_window\n", 83 | " action_type_based_on_time_last_window = pd.read_csv('../data/output/feat/%s%d' % ('action_type_based_on_time_last_window', window))\n", 84 | " train = pd.merge(train, action_type_based_on_time_last_window, on='userid', how='left')\n", 85 | " test = pd.merge(test, action_type_based_on_time_last_window, on='userid', how='left')\n", 86 | " \n", 87 | " util.log('act_type_num_window=' + str(act_type_num_window))\n", 88 | " window = act_type_num_window\n", 89 | " action_type_num_based_on_time_last_window = pd.read_csv('../data/output/feat/%s%d' % 
('action_type_num_based_on_time_last_window', window))\n", 90 | " train = pd.merge(train, action_type_num_based_on_time_last_window, on='userid', how='left')\n", 91 | " test = pd.merge(test, action_type_num_based_on_time_last_window, on='userid', how='left')\n", 92 | " \n", 93 | " util.log('act_type_rate_window=' + str(act_type_rate_window))\n", 94 | " window = act_type_rate_window\n", 95 | " action_type_rate_based_on_time_last_window = pd.read_csv('../data/output/feat/%s%d' % ('action_type_rate_based_on_time_last_window', window))\n", 96 | " train = pd.merge(train, action_type_rate_based_on_time_last_window, on='userid', how='left')\n", 97 | " test = pd.merge(test, action_type_rate_based_on_time_last_window, on='userid', how='left')\n", 98 | " \n", 99 | " util.log('act_type_row_stat_window=' + str(act_type_row_stat_window))\n", 100 | " window = act_type_row_stat_window\n", 101 | " action_type_row_stat_based_on_time_last_window_feat = pd.read_csv('../data/output/feat/%s%d' % ('action_type_row_stat_based_on_time_last_window', window))\n", 102 | " train = pd.merge(train, action_type_row_stat_based_on_time_last_window_feat, on='userid', how='left')\n", 103 | " test = pd.merge(test, action_type_row_stat_based_on_time_last_window_feat, on='userid', how='left')\n", 104 | " \n", 105 | "# util.log('action_num_window=' + str(action_num_window))\n", 106 | "# window = action_num_window\n", 107 | "# action_num_based_on_time_last_window = pd.read_csv('../data/output/feat/%s%d' % ('action_num_based_on_time_last_window', window))\n", 108 | "# train = pd.merge(train, action_num_based_on_time_last_window, on='userid', how='left')\n", 109 | "# test = pd.merge(test, action_num_based_on_time_last_window, on='userid', how='left')\n", 110 | "\n", 111 | "# util.log('action_type_num_window=' + str(action_type_num_window))\n", 112 | "# window = action_type_num_window\n", 113 | "# action_type_num_based_on_time_last_window = pd.read_csv('../data/output/feat/%s%d' % ('action_type_num_based_on_time_last_window', window))\n", 114 | "# train = pd.merge(train, action_type_num_based_on_time_last_window, on='userid', how='left')\n", 115 | "# test = pd.merge(test, action_type_num_based_on_time_last_window, on='userid', how='left')\n", 116 | "\n", 117 | " action_time_based_on_time = pd.read_csv('../data/output/feat/%s' % 'action_time_based_on_time')\n", 118 | " train = pd.merge(train, action_time_based_on_time, on='userid', how='left')\n", 119 | " test = pd.merge(test, action_time_based_on_time, on='userid', how='left')\n", 120 | " \n", 121 | " util.log('act_time_window=' + str(act_time_window))\n", 122 | " window = act_time_window\n", 123 | " action_time_based_on_time_last_window = pd.read_csv('../data/output/feat/%s%d' % ('action_time_based_on_time_last_window', window))\n", 124 | " train = pd.merge(train, action_time_based_on_time_last_window, on='userid', how='left')\n", 125 | " test = pd.merge(test, action_time_based_on_time_last_window, on='userid', how='left')\n", 126 | " \n", 127 | "# util.log('act_time_row_stat_window=' + str(act_time_row_stat_window))\n", 128 | "# window = act_time_row_stat_window\n", 129 | "# action_time_row_stat_based_on_time_last_window = pd.read_csv('../data/output/feat/%s%d' % ('action_time_row_stat_based_on_time_last_window', window))\n", 130 | "# train = pd.merge(train, action_time_row_stat_based_on_time_last_window, on='userid', how='left')\n", 131 | "# test = pd.merge(test, action_time_row_stat_based_on_time_last_window, on='userid', how='left')\n", 132 | " \n", 133 | "# 
util.log('action_time_diff2_window=' + str(action_time_diff2_window))\n", 134 | "# window = action_time_diff2_window\n", 135 | "# action_time_diff2_based_on_time_last_window = pd.read_csv('../data/output/feat/%s%d' % ('action_time_diff2_based_on_time_last_window', window))\n", 136 | "# train = pd.merge(train, action_time_diff2_based_on_time_last_window, on='userid', how='left')\n", 137 | "# test = pd.merge(test, action_time_diff2_based_on_time_last_window, on='userid', how='left')\n", 138 | "\n", 139 | " util.log('act_time_1type_window=%d' % act_time_1type_window)\n", 140 | " window = act_time_1type_window\n", 141 | " for ttype in [1, 5, 6, 7, 8, 9]:\n", 142 | " action_time_based_on_time_last_window_on_type = pd.read_csv('../data/output/feat/%s%d%s%d' % ('action_time_based_on_time_last_window', window, '_on_type', ttype))\n", 143 | " train = pd.merge(train, action_time_based_on_time_last_window_on_type, on='userid', how='left')\n", 144 | " test = pd.merge(test, action_time_based_on_time_last_window_on_type, on='userid', how='left')\n", 145 | " \n", 146 | "# util.log('action_time_2order_window=' + str(action_time_2order_window))\n", 147 | "# window = action_time_2order_window\n", 148 | "# action_time_2order_based_on_time_last_window = pd.read_csv('../data/output/feat/%s%d' % ('action_time_2order_based_on_time_last_window', window))\n", 149 | "# train = pd.merge(train, action_time_2order_based_on_time_last_window, on='userid', how='left')\n", 150 | "# test = pd.merge(test, action_time_2order_based_on_time_last_window, on='userid', how='left')\n", 151 | "\n", 152 | "# util.log('act_real_time_1type_window=%d' % act_real_time_1type_window)\n", 153 | "# window = act_real_time_1type_window\n", 154 | "# for ttype in [1, 5, 6, 7, 8, 9]:\n", 155 | "# action_real_time_based_on_time_last_window_on_type = pd.read_csv('../data/output/feat/%s%d%s%d' % ('action_real_time_based_on_time_last_window', window, '_on_type', ttype))\n", 156 | "# train = pd.merge(train, action_real_time_based_on_time_last_window_on_type, on='userid', how='left')\n", 157 | "# test = pd.merge(test, action_real_time_based_on_time_last_window_on_type, on='userid', how='left')\n", 158 | "\n", 159 | "# action_order_time_diff = pd.read_csv('../data/output/feat/%s' % 'action_order_time_diff')\n", 160 | "# train = pd.merge(train, action_order_time_diff, on='userid', how='left')\n", 161 | "# test = pd.merge(test, action_order_time_diff, on='userid', how='left')\n", 162 | "\n", 163 | "# order_last_order_ydm = pd.read_csv('../data/output/feat/%s' % 'order_last_order_ydm')\n", 164 | "# train = pd.merge(train, order_last_order_ydm, on='userid', how='left')\n", 165 | "# test = pd.merge(test, order_last_order_ydm, on='userid', how='left')\n", 166 | "\n", 167 | " order_type1_ydm = pd.read_csv('../data/output/feat/%s' % 'order_type1_ydm')\n", 168 | " train = pd.merge(train, order_type1_ydm, on='userid', how='left')\n", 169 | " test = pd.merge(test, order_type1_ydm, on='userid', how='left')\n", 170 | "\n", 171 | " util.log('act_ord_act_time_diff_window=' + str(act_ord_act_time_diff_window))\n", 172 | " window = act_ord_act_time_diff_window\n", 173 | " act_ord_act_time_diff_last_window = pd.read_csv('../data/output/feat/%s%d' % ('act_ord_act_time_diff_last_window', window))\n", 174 | " train = pd.merge(train, act_ord_act_time_diff_last_window, on='userid', how='left')\n", 175 | " test = pd.merge(test, act_ord_act_time_diff_last_window, on='userid', how='left')\n", 176 | "\n", 177 | "# util.log('act_ord_type1_act_time_diff_window=' + 
str(act_ord_type1_act_time_diff_window))\n", 178 | "# window = act_ord_type1_act_time_diff_window\n", 179 | "# act_ord_type1_act_time_diff_last_window = pd.read_csv('../data/output/feat/%s%d' % ('act_ord_type1_act_time_diff_last_window', window))\n", 180 | "# train = pd.merge(train, act_ord_type1_act_time_diff_last_window, on='userid', how='left')\n", 181 | "# test = pd.merge(test, act_ord_type1_act_time_diff_last_window, on='userid', how='left')\n", 182 | "\n", 183 | " util.log('action_sequence_time_diff_window=' + str(action_sequence_time_diff_window))\n", 184 | " window = action_sequence_time_diff_window\n", 185 | " action_sequence_time_diff_window = pd.read_csv('../data/output/feat/%s%d' % ('action_sequence_time_diff_window', window))\n", 186 | " train = pd.merge(train, action_sequence_time_diff_window, on='userid', how='left')\n", 187 | " test = pd.merge(test, action_sequence_time_diff_window, on='userid', how='left')\n", 188 | "\n", 189 | "# action_sequence_time_stat_last123 = pd.read_csv('../data/output/feat/%s' % 'action_sequence_time_stat_last123')\n", 190 | "# train = pd.merge(train, action_sequence_time_stat_last123, on='userid', how='left')\n", 191 | "# test = pd.merge(test, action_sequence_time_stat_last123, on='userid', how='left')\n", 192 | "\n", 193 | " util.log('action_time_diff_234_56789_window=' + str(action_time_diff_234_56789_window))\n", 194 | " window = action_time_diff_234_56789_window\n", 195 | " action_time_diff_234_56789_last_window = pd.read_csv('../data/output/feat/%s%d' % ('action_time_diff_234_56789_last_window', window))\n", 196 | " train = pd.merge(train, action_time_diff_234_56789_last_window, on='userid', how='left')\n", 197 | " test = pd.merge(test, action_time_diff_234_56789_last_window, on='userid', how='left')\n", 198 | " \n", 199 | "# action_stat_last_every_type = pd.read_csv('../data/output/feat/%s' % 'action_stat_last_every_type')\n", 200 | "# train = pd.merge(train, action_stat_last_every_type, on='userid', how='left')\n", 201 | "# test = pd.merge(test, action_stat_last_every_type, on='userid', how='left')\n", 202 | "\n", 203 | "# act_ord_before_type1_stat = pd.read_csv('../data/output/feat/%s' % 'act_ord_before_type1_stat')\n", 204 | "# train = pd.merge(train, act_ord_before_type1_stat, on='userid', how='left')\n", 205 | "# test = pd.merge(test, act_ord_before_type1_stat, on='userid', how='left')\n", 206 | "\n", 207 | " action_time_diff_stat = pd.read_csv('../data/output/feat/%s' % 'action_time_diff_stat')\n", 208 | " train = pd.merge(train, action_time_diff_stat, on='userid', how='left')\n", 209 | " test = pd.merge(test, action_time_diff_stat, on='userid', how='left')\n", 210 | "\n", 211 | " util.log('action_time_diff_stat_window=' + str(action_time_diff_stat_window))\n", 212 | " window = action_time_diff_stat_window\n", 213 | " action_time_diff_stat_last_window = pd.read_csv('../data/output/feat/%s%d' % ('action_time_diff_stat_last_window', window))\n", 214 | " train = pd.merge(train, action_time_diff_stat_last_window, on='userid', how='left')\n", 215 | " test = pd.merge(test, action_time_diff_stat_last_window, on='userid', how='left')\n", 216 | " \n", 217 | "# action_time_last_on_every_type = pd.read_csv('../data/output/feat/%s' % 'action_time_last_on_every_type')\n", 218 | "# train = pd.merge(train, action_time_last_on_every_type, on='userid', how='left')\n", 219 | "# test = pd.merge(test, action_time_last_on_every_type, on='userid', how='left')\n", 220 | "\n", 221 | " # bjw: 1 if the userid appears in the comment data but not in the order data\n", 222 | " bjw_train = 
pd.read_csv('../data/output/feat/bjw/train_fea.csv')\n", 223 | " bjw_test = pd.read_csv('../data/output/feat/bjw/test_fea.csv')\n", 224 | " train = pd.merge(train, bjw_train, on='userid', how='left')\n", 225 | " test = pd.merge(test, bjw_test, on='userid', how='left')\n", 226 | " \n", 227 | " # features open-sourced by other teams, partially re-implemented based on my own understanding\n", 228 | " tryy = pd.read_csv('../data/output/feat/%s' % 'try')\n", 229 | " train = pd.merge(train, tryy, on='userid', how='left')\n", 230 | " test = pd.merge(test, tryy, on='userid', how='left')\n", 231 | " \n", 232 | " # bjw's features\n", 233 | " bjw_train = pd.read_csv('../data/output/feat/bjw/all_features_train.csv').drop(['Unnamed: 0', 'orderType'], axis=1)\n", 234 | " bjw_train.columns = ['userid' if i == 0 else i for i in range(len(bjw_train.columns))]\n", 235 | " bjw_test = pd.read_csv('../data/output/feat/bjw/all_features_test.csv').drop(['Unnamed: 0'], axis=1)\n", 236 | " bjw_test.columns = ['userid' if i == 0 else i for i in range(len(bjw_test.columns))]\n", 237 | " train = pd.merge(train, bjw_train, on='userid', how='left')\n", 238 | " test = pd.merge(test, bjw_test, on='userid', how='left')\n", 239 | " \n", 240 | "#################################################################################################################\n", 241 | " \n", 242 | " # used only to build cross features; dropped again after use\n", 243 | " window = 1\n", 244 | " for ttype in [1, 2, 3, 4, 5, 6, 7, 8, 9]:\n", 245 | " action_real_time_based_on_time_last_window_on_type = pd.read_csv('../data/output/feat/%s%d%s%d' % ('action_real_time_based_on_time_last_window', window, '_on_type', ttype))\n", 246 | " train = pd.merge(train, action_real_time_based_on_time_last_window_on_type, on='userid', how='left')\n", 247 | " test = pd.merge(test, action_real_time_based_on_time_last_window_on_type, on='userid', how='left')\n", 248 | "\n", 249 | " train, test = cross_feature(train, test)\n", 250 | " \n", 251 | " train, test = drop_duplicate_column(train, test)\n", 252 | " \n", 253 | " train_feature = train.drop(['orderType'], axis = 1)\n", 254 | " train_label = train.orderType.values\n", 255 | " test_feature = test\n", 256 | " test_index = test.userid.values\n", 257 | " \n", 258 | " return train_feature, train_label, test_feature, test_index" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": 4, 264 | "metadata": { 265 | "collapsed": true 266 | }, 267 | "outputs": [], 268 | "source": [ 269 | "def cross_feature(train, test):\n", 270 | " util.log('Cross feature...')\n", 271 | " \n", 272 | " # time gaps between the latest action and the latest order\n", 273 | " train['act_last_time-ord_last_time'] = train['act_last_time'] - train['ord_last_time']\n", 274 | " train['act_last_time-ord_type0_time_max'] = train['act_last_time'] - train['ord_type0_time_max']\n", 275 | " train['act_last_time-ord_type1_time_max'] = train['act_last_time'] - train['ord_type1_time_max']\n", 276 | " test['act_last_time-ord_last_time'] = test['act_last_time'] - test['ord_last_time']\n", 277 | " test['act_last_time-ord_type0_time_max'] = test['act_last_time'] - test['ord_type0_time_max']\n", 278 | " test['act_last_time-ord_type1_time_max'] = test['act_last_time'] - test['ord_type1_time_max']\n", 279 | " \n", 280 | " # time gaps between the earliest action and the earliest order\n", 281 | " train['act_first_time-ord_first_time'] = train['act_first_time'] - train['ord_first_time']\n", 282 | " train['act_first_time-ord_type0_time_min'] = train['act_first_time'] - train['ord_type0_time_min']\n", 283 | " train['act_first_time-ord_type1_time_min'] = train['act_first_time'] - train['ord_type1_time_min']\n", 284 | " 
test['act_first_time-ord_first_time'] = test['act_first_time'] - test['ord_first_time']\n", 285 | " test['act_first_time-ord_type0_time_min'] = test['act_first_time'] - test['ord_type0_time_min']\n", 286 | " test['act_first_time-ord_type1_time_min'] = test['act_first_time'] - test['ord_type1_time_min']\n", 287 | " \n", 288 | " # time gap between the latest action and the latest action of each type + the same for the earliest action\n", 289 | " for ttype in [1, 2, 3, 4, 5, 6, 7, 8, 9]:\n", 290 | " train['act_last_time-act_time(rank_1)(window_1)(type_%d)' % ttype] = train['act_last_time'] - train['act_time(rank_1)(window_1)(type_%d)' % ttype]\n", 291 | " train['act_first_time-act_time(rank_1)(window_1)(type_%d)' % ttype] = train['act_first_time'] - train['act_time(rank_1)(window_1)(type_%d)' % ttype]\n", 292 | " test['act_last_time-act_time(rank_1)(window_1)(type_%d)' % ttype] = test['act_last_time'] - test['act_time(rank_1)(window_1)(type_%d)' % ttype]\n", 293 | " test['act_first_time-act_time(rank_1)(window_1)(type_%d)' % ttype] = test['act_first_time'] - test['act_time(rank_1)(window_1)(type_%d)' % ttype]\n", 294 | " train = train.drop(['act_time(rank_1)(window_1)(type_%d)' % ttype], axis=1)\n", 295 | " test = test.drop(['act_time(rank_1)(window_1)(type_%d)' % ttype], axis=1)\n", 296 | "\n", 297 | " # whether the user has ordered the premium service * time of the latest action\n", 298 | " tmp = train['ord_num(type_1)'].copy()\n", 299 | " tmp[tmp > 1] = 1\n", 300 | " tmp = pd.get_dummies(tmp.fillna(-1))\n", 301 | " tmp.columns = ['has_ord_serv_nan', 'has_ord_serv_no', 'has_ord_serv_yes']\n", 302 | " train = pd.concat([train, tmp.mul(train['act_last_time'], axis=0)], axis=1)\n", 303 | " tmp = test['ord_num(type_1)'].copy()\n", 304 | " tmp[tmp > 1] = 1\n", 305 | " tmp = pd.get_dummies(tmp.fillna(-1))\n", 306 | " tmp.columns = ['has_ord_serv_nan', 'has_ord_serv_no', 'has_ord_serv_yes']\n", 307 | " test = pd.concat([test, tmp.mul(test['act_last_time'], axis=0)], axis=1)\n", 308 | " \n", 309 | " # whether the user has ordered the premium service * number of actions of each type\n", 310 | " tmp = train['ord_num(type_1)'].copy()\n", 311 | " tmp[tmp > 1] = 1\n", 312 | " tmp = pd.get_dummies(tmp.fillna(-1))\n", 313 | " tmp.columns = ['has_ord_serv_nan', 'has_ord_serv_no', 'has_ord_serv_yes']\n", 314 | " for ttype in [1, 2, 3, 4, 5, 6, 7, 8, 9]:\n", 315 | " train = train.join(tmp.mul(train['act_num(type_%d)' % ttype], axis=0), rsuffix='*act_num(type_%d)' % ttype)\n", 316 | " tmp = test['ord_num(type_1)'].copy()\n", 317 | " tmp[tmp > 1] = 1\n", 318 | " tmp = pd.get_dummies(tmp.fillna(-1))\n", 319 | " tmp.columns = ['has_ord_serv_nan', 'has_ord_serv_no', 'has_ord_serv_yes']\n", 320 | " for ttype in [1, 2, 3, 4, 5, 6, 7, 8, 9]:\n", 321 | " test = test.join(tmp.mul(test['act_num(type_%d)' % ttype], axis=0), rsuffix='*act_num(type_%d)' % ttype)\n", 322 | " \n", 323 | "# # time gap between the latest order and the latest action of each type + the same for the earliest order (all/0/1)\n", 324 | "# for ttype in [1, 2, 3, 4, 5, 6, 7, 8, 9]:\n", 325 | "# train['ord_last_time-act_time(rank_1)(window_1)(type_%d)' % ttype] = train['ord_last_time'] - train['act_time(rank_1)(window_1)(type_%d)' % ttype]\n", 326 | "# train['ord_type0_time_max-act_time(rank_1)(window_1)(type_%d)' % ttype] = train['ord_type0_time_max'] - train['act_time(rank_1)(window_1)(type_%d)' % ttype]\n", 327 | "# train['ord_type1_time_max-act_time(rank_1)(window_1)(type_%d)' % ttype] = train['ord_type1_time_max'] - train['act_time(rank_1)(window_1)(type_%d)' % ttype]\n", 328 | "# train['ord_first_time-act_time(rank_1)(window_1)(type_%d)' % ttype] = train['ord_first_time'] - 
train['act_time(rank_1)(window_1)(type_%d)' % ttype]\n", 329 | "# train['ord_type0_time_min-act_time(rank_1)(window_1)(type_%d)' % ttype] = train['ord_type0_time_min'] - train['act_time(rank_1)(window_1)(type_%d)' % ttype]\n", 330 | "# train['ord_type1_time_min-act_time(rank_1)(window_1)(type_%d)' % ttype] = train['ord_type1_time_min'] - train['act_time(rank_1)(window_1)(type_%d)' % ttype]\n", 331 | "# test['ord_last_time-act_time(rank_1)(window_1)(type_%d)' % ttype] = test['ord_last_time'] - test['act_time(rank_1)(window_1)(type_%d)' % ttype]\n", 332 | "# test['ord_type0_time_max-act_time(rank_1)(window_1)(type_%d)' % ttype] = test['ord_type0_time_max'] - test['act_time(rank_1)(window_1)(type_%d)' % ttype]\n", 333 | "# test['ord_type1_time_max-act_time(rank_1)(window_1)(type_%d)' % ttype] = test['ord_type1_time_max'] - test['act_time(rank_1)(window_1)(type_%d)' % ttype]\n", 334 | "# test['ord_first_time-act_time(rank_1)(window_1)(type_%d)' % ttype] = test['ord_first_time'] - test['act_time(rank_1)(window_1)(type_%d)' % ttype]\n", 335 | "# test['ord_type0_time_min-act_time(rank_1)(window_1)(type_%d)' % ttype] = test['ord_type0_time_min'] - test['act_time(rank_1)(window_1)(type_%d)' % ttype]\n", 336 | "# test['ord_type1_time_min-act_time(rank_1)(window_1)(type_%d)' % ttype] = test['ord_type1_time_min'] - test['act_time(rank_1)(window_1)(type_%d)' % ttype]\n", 337 | " \n", 338 | " return train, test" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": 5, 344 | "metadata": { 345 | "collapsed": true 346 | }, 347 | "outputs": [], 348 | "source": [ 349 | "def drop_duplicate_column(train, test):\n", 350 | " util.log('Drop duplicate column...')\n", 351 | " \n", 352 | " train = train.drop(['act_type(rank_1)(window6)'], axis=1) # window9\n", 353 | " test = test.drop(['act_type(rank_1)(window6)'], axis=1)\n", 354 | " \n", 355 | " return train, test" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": 6, 361 | "metadata": { 362 | "collapsed": true 363 | }, 364 | "outputs": [], 365 | "source": [ 366 | "def lgb_cv(train_feature, train_label, params, folds, rounds):\n", 367 | " start = time.clock()\n", 368 | " print train_feature.columns\n", 369 | " dtrain = lgb.Dataset(train_feature, label=train_label)\n", 370 | " num_round = rounds\n", 371 | " print 'run cv: ' + 'round: ' + str(rounds)\n", 372 | " res = lgb.cv(params, dtrain, num_round, nfold=folds, verbose_eval=20, early_stopping_rounds=100)\n", 373 | " elapsed = (time.clock() - start)\n", 374 | " print 'Time used:', elapsed, 's'\n", 375 | " return len(res['auc-mean']), res['auc-mean'][len(res['auc-mean']) - 1]\n", 376 | "\n", 377 | "\n", 378 | "def lgb_predict(train_feature, train_label, test_feature, rounds, params):\n", 379 | " dtrain = lgb.Dataset(train_feature, label=train_label)\n", 380 | " valid_sets = [dtrain]\n", 381 | " num_round = rounds\n", 382 | " model = lgb.train(params, dtrain, num_round, valid_sets, verbose_eval=50)\n", 383 | " predict = model.predict(test_feature)\n", 384 | " return model, predict\n", 385 | "\n", 386 | "\n", 387 | "def store_result(test_index, pred, name):\n", 388 | " result = pd.DataFrame({'userid': test_index, 'orderType': pred})\n", 389 | " result.to_csv('../data/output/sub/' + name + '.csv', index=0, columns=['userid', 'orderType'])\n", 390 | " return result" 391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": 7, 396 | "metadata": {}, 397 | "outputs": [ 398 | { 399 | "name": "stdout", 400 | "output_type": "stream", 401 | "text": [ 402 | 
"2018-02-10 11:57:35 Merge feature...\n", 403 | "2018-02-10 11:57:36 act_type_window=6\n", 404 | "2018-02-10 11:57:36 act_type_num_window=6\n", 405 | "2018-02-10 11:57:36 act_type_rate_window=3\n", 406 | "2018-02-10 11:57:36 act_type_row_stat_window=6\n", 407 | "2018-02-10 11:57:36 act_time_window=6\n", 408 | "2018-02-10 11:57:36 act_time_1type_window=6\n", 409 | "2018-02-10 11:57:37 act_ord_act_time_diff_window=6\n", 410 | "2018-02-10 11:57:37 action_sequence_time_diff_window=6\n", 411 | "2018-02-10 11:57:37 action_time_diff_234_56789_window=6\n", 412 | "2018-02-10 11:57:38 action_time_diff_stat_window=3\n", 413 | "2018-02-10 11:57:42 Cross feature...\n", 414 | "2018-02-10 11:57:47 Drop duplicate column...\n", 415 | "(40307, 599) (40307L,) (10076, 599)\n" 416 | ] 417 | } 418 | ], 419 | "source": [ 420 | "train_feature, train_label, test_feature, test_index = merge_feature(6, 6, 3, 6, 6, 6, 6, 6, 6, 3)\n", 421 | "print train_feature.shape, train_label.shape, test_feature.shape" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": 8, 427 | "metadata": { 428 | "collapsed": true 429 | }, 430 | "outputs": [], 431 | "source": [ 432 | "config = {\n", 433 | " 'rounds': 10000,\n", 434 | " 'folds': 5\n", 435 | "}\n", 436 | "\n", 437 | "params_lgb = {\n", 438 | " 'task': 'train',\n", 439 | " 'boosting_type': 'gbdt',\n", 440 | " 'objective': 'binary',\n", 441 | " 'metric': 'auc',\n", 442 | " 'min_sum_hessian_in_leaf': 0.1,\n", 443 | " 'learning_rate': 0.01,\n", 444 | " 'verbosity': 2,\n", 445 | " 'tree_learner': 'feature',\n", 446 | " 'num_leaves': 128,\n", 447 | " 'feature_fraction': 0.75,\n", 448 | " 'bagging_fraction': 0.9,\n", 449 | " 'bagging_freq': 1,\n", 450 | " 'num_threads': 16,\n", 451 | " 'seed': 7\n", 452 | "}" 453 | ] 454 | }, 455 | { 456 | "cell_type": "code", 457 | "execution_count": 10, 458 | "metadata": { 459 | "scrolled": true 460 | }, 461 | "outputs": [ 462 | { 463 | "name": "stdout", 464 | "output_type": "stream", 465 | "text": [ 466 | "Index([ u'userid',\n", 467 | " u'gender',\n", 468 | " u'province',\n", 469 | " u'age',\n", 470 | " u'com_rating_sum',\n", 471 | " u'com_rating_count',\n", 472 | " u'ord_type0_time_min',\n", 473 | " u'ord_type0_time_max',\n", 474 | " u'ord_type0_time_ptp',\n", 475 | " u'ord_type0_time_mean',\n", 476 | " ...\n", 477 | " u'has_ord_serv_yes*act_num(type_6)',\n", 478 | " u'has_ord_serv_nan*act_num(type_7)',\n", 479 | " u'has_ord_serv_no*act_num(type_7)',\n", 480 | " u'has_ord_serv_yes*act_num(type_7)',\n", 481 | " u'has_ord_serv_nan*act_num(type_8)',\n", 482 | " u'has_ord_serv_no*act_num(type_8)',\n", 483 | " u'has_ord_serv_yes*act_num(type_8)',\n", 484 | " u'has_ord_serv_nan*act_num(type_9)',\n", 485 | " u'has_ord_serv_no*act_num(type_9)',\n", 486 | " u'has_ord_serv_yes*act_num(type_9)'],\n", 487 | " dtype='object', length=599)\n", 488 | "run cv: round: 10000\n", 489 | "[20]\tcv_agg's auc: 0.92985 + 0.00347325\n", 490 | "[40]\tcv_agg's auc: 0.937074 + 0.00334338\n", 491 | "[60]\tcv_agg's auc: 0.939803 + 0.00342068\n", 492 | "[80]\tcv_agg's auc: 0.942163 + 0.00369461\n", 493 | "[100]\tcv_agg's auc: 0.944753 + 0.00353578\n", 494 | "[120]\tcv_agg's auc: 0.946343 + 0.00347801\n", 495 | "[140]\tcv_agg's auc: 0.948005 + 0.00343983\n", 496 | "[160]\tcv_agg's auc: 0.94951 + 0.00354529\n", 497 | "[180]\tcv_agg's auc: 0.950927 + 0.0035433\n", 498 | "[200]\tcv_agg's auc: 0.952201 + 0.00358382\n", 499 | "[220]\tcv_agg's auc: 0.953184 + 0.00362432\n", 500 | "[240]\tcv_agg's auc: 0.954438 + 0.00351397\n", 501 | "[260]\tcv_agg's auc: 0.955437 
+ 0.00352917\n", 502 | "[280]\tcv_agg's auc: 0.956519 + 0.00349788\n", 503 | "[300]\tcv_agg's auc: 0.957527 + 0.00343062\n", 504 | "[320]\tcv_agg's auc: 0.958456 + 0.00334904\n", 505 | "[340]\tcv_agg's auc: 0.959236 + 0.00324421\n", 506 | "[360]\tcv_agg's auc: 0.960058 + 0.00316942\n", 507 | "[380]\tcv_agg's auc: 0.96071 + 0.00313101\n", 508 | "[400]\tcv_agg's auc: 0.96131 + 0.00305216\n", 509 | "[420]\tcv_agg's auc: 0.961842 + 0.00303876\n", 510 | "[440]\tcv_agg's auc: 0.962355 + 0.00306587\n", 511 | "[460]\tcv_agg's auc: 0.962823 + 0.00307233\n", 512 | "[480]\tcv_agg's auc: 0.963272 + 0.00308154\n", 513 | "[500]\tcv_agg's auc: 0.963682 + 0.00306173\n", 514 | "[520]\tcv_agg's auc: 0.964027 + 0.00304902\n", 515 | "[540]\tcv_agg's auc: 0.964375 + 0.00305624\n", 516 | "[560]\tcv_agg's auc: 0.964683 + 0.00307713\n", 517 | "[580]\tcv_agg's auc: 0.964964 + 0.00307794\n", 518 | "[600]\tcv_agg's auc: 0.965264 + 0.00307881\n", 519 | "[620]\tcv_agg's auc: 0.965557 + 0.00307066\n", 520 | "[640]\tcv_agg's auc: 0.965813 + 0.00304611\n", 521 | "[660]\tcv_agg's auc: 0.966083 + 0.00304641\n", 522 | "[680]\tcv_agg's auc: 0.966309 + 0.00301581\n", 523 | "[700]\tcv_agg's auc: 0.966539 + 0.0030049\n", 524 | "[720]\tcv_agg's auc: 0.966745 + 0.00296058\n", 525 | "[740]\tcv_agg's auc: 0.96696 + 0.00292508\n", 526 | "[760]\tcv_agg's auc: 0.967143 + 0.00291436\n", 527 | "[780]\tcv_agg's auc: 0.967317 + 0.00288889\n", 528 | "[800]\tcv_agg's auc: 0.967494 + 0.00287349\n", 529 | "[820]\tcv_agg's auc: 0.967665 + 0.00286154\n", 530 | "[840]\tcv_agg's auc: 0.967805 + 0.00284129\n", 531 | "[860]\tcv_agg's auc: 0.967929 + 0.00283724\n", 532 | "[880]\tcv_agg's auc: 0.968058 + 0.0028292\n", 533 | "[900]\tcv_agg's auc: 0.96821 + 0.00280956\n", 534 | "[920]\tcv_agg's auc: 0.968321 + 0.00280509\n", 535 | "[940]\tcv_agg's auc: 0.96843 + 0.00279607\n", 536 | "[960]\tcv_agg's auc: 0.968546 + 0.002785\n", 537 | "[980]\tcv_agg's auc: 0.968645 + 0.00277864\n", 538 | "[1000]\tcv_agg's auc: 0.96872 + 0.00278066\n", 539 | "[1020]\tcv_agg's auc: 0.968792 + 0.00277067\n", 540 | "[1040]\tcv_agg's auc: 0.968876 + 0.00276013\n", 541 | "[1060]\tcv_agg's auc: 0.968933 + 0.00276024\n", 542 | "[1080]\tcv_agg's auc: 0.969008 + 0.00272818\n", 543 | "[1100]\tcv_agg's auc: 0.969076 + 0.00272411\n", 544 | "[1120]\tcv_agg's auc: 0.969146 + 0.00270865\n", 545 | "[1140]\tcv_agg's auc: 0.969206 + 0.00269515\n", 546 | "[1160]\tcv_agg's auc: 0.96926 + 0.002696\n", 547 | "[1180]\tcv_agg's auc: 0.969323 + 0.00268535\n", 548 | "[1200]\tcv_agg's auc: 0.969387 + 0.00267229\n", 549 | "[1220]\tcv_agg's auc: 0.969441 + 0.00267172\n", 550 | "[1240]\tcv_agg's auc: 0.969482 + 0.00267564\n", 551 | "[1260]\tcv_agg's auc: 0.969523 + 0.00267744\n", 552 | "[1280]\tcv_agg's auc: 0.969565 + 0.00265628\n", 553 | "[1300]\tcv_agg's auc: 0.969616 + 0.00265951\n", 554 | "[1320]\tcv_agg's auc: 0.969652 + 0.00264378\n", 555 | "[1340]\tcv_agg's auc: 0.969683 + 0.00265488\n", 556 | "[1360]\tcv_agg's auc: 0.969716 + 0.00265775\n", 557 | "[1380]\tcv_agg's auc: 0.969763 + 0.00265908\n", 558 | "[1400]\tcv_agg's auc: 0.969788 + 0.00266174\n", 559 | "[1420]\tcv_agg's auc: 0.969816 + 0.002664\n", 560 | "[1440]\tcv_agg's auc: 0.969844 + 0.00266299\n", 561 | "[1460]\tcv_agg's auc: 0.969869 + 0.00266542\n", 562 | "[1480]\tcv_agg's auc: 0.96991 + 0.00266125\n", 563 | "[1500]\tcv_agg's auc: 0.969927 + 0.0026605\n", 564 | "[1520]\tcv_agg's auc: 0.969946 + 0.00265666\n", 565 | "[1540]\tcv_agg's auc: 0.969976 + 0.00266038\n", 566 | "[1560]\tcv_agg's auc: 0.969992 + 0.00266306\n", 567 | 
"[1580]\tcv_agg's auc: 0.970015 + 0.00267018\n", 568 | "[1600]\tcv_agg's auc: 0.970022 + 0.00266937\n", 569 | "[1620]\tcv_agg's auc: 0.970038 + 0.00266377\n", 570 | "[1640]\tcv_agg's auc: 0.970062 + 0.00265507\n", 571 | "[1660]\tcv_agg's auc: 0.970068 + 0.00264258\n", 572 | "[1680]\tcv_agg's auc: 0.970086 + 0.00263613\n", 573 | "[1700]\tcv_agg's auc: 0.970088 + 0.00263336\n", 574 | "[1720]\tcv_agg's auc: 0.970101 + 0.00262677\n", 575 | "[1740]\tcv_agg's auc: 0.97011 + 0.00262187\n", 576 | "[1760]\tcv_agg's auc: 0.970129 + 0.00262488\n", 577 | "[1780]\tcv_agg's auc: 0.970143 + 0.00262872\n", 578 | "[1800]\tcv_agg's auc: 0.970163 + 0.00261796\n", 579 | "[1820]\tcv_agg's auc: 0.970163 + 0.00261577\n", 580 | "[1840]\tcv_agg's auc: 0.970164 + 0.00261959\n", 581 | "[1860]\tcv_agg's auc: 0.970173 + 0.00262113\n", 582 | "[1880]\tcv_agg's auc: 0.970184 + 0.00261985\n", 583 | "[1900]\tcv_agg's auc: 0.970195 + 0.00262071\n", 584 | "[1920]\tcv_agg's auc: 0.970207 + 0.0026204\n", 585 | "[1940]\tcv_agg's auc: 0.970221 + 0.00262299\n", 586 | "[1960]\tcv_agg's auc: 0.970224 + 0.00262606\n", 587 | "[1980]\tcv_agg's auc: 0.970226 + 0.00262837\n", 588 | "[2000]\tcv_agg's auc: 0.970227 + 0.00262799\n", 589 | "[2020]\tcv_agg's auc: 0.970243 + 0.00262799\n", 590 | "[2040]\tcv_agg's auc: 0.970252 + 0.00263339\n", 591 | "[2060]\tcv_agg's auc: 0.970263 + 0.00262685\n", 592 | "[2080]\tcv_agg's auc: 0.970272 + 0.00261628\n", 593 | "[2100]\tcv_agg's auc: 0.970289 + 0.00260981\n", 594 | "[2120]\tcv_agg's auc: 0.970292 + 0.00261098\n", 595 | "[2140]\tcv_agg's auc: 0.970291 + 0.00261651\n", 596 | "[2160]\tcv_agg's auc: 0.97029 + 0.00261132\n", 597 | "[2180]\tcv_agg's auc: 0.970303 + 0.00260396\n", 598 | "[2200]\tcv_agg's auc: 0.970303 + 0.00260545\n", 599 | "[2220]\tcv_agg's auc: 0.970302 + 0.00260331\n", 600 | "[2240]\tcv_agg's auc: 0.970304 + 0.00259132\n", 601 | "[2260]\tcv_agg's auc: 0.970313 + 0.00258719\n", 602 | "[2280]\tcv_agg's auc: 0.97032 + 0.00258453\n", 603 | "[2300]\tcv_agg's auc: 0.970321 + 0.00258521\n", 604 | "[2320]\tcv_agg's auc: 0.97032 + 0.00258072\n", 605 | "[2340]\tcv_agg's auc: 0.970327 + 0.00258175\n", 606 | "[2360]\tcv_agg's auc: 0.970327 + 0.00258231\n", 607 | "[2380]\tcv_agg's auc: 0.970324 + 0.00257717\n", 608 | "[2400]\tcv_agg's auc: 0.970319 + 0.00258236\n", 609 | "[2420]\tcv_agg's auc: 0.970319 + 0.00257521\n", 610 | "[2440]\tcv_agg's auc: 0.970319 + 0.00256878\n", 611 | "Time used: 607.245468312 s\n" 612 | ] 613 | } 614 | ], 615 | "source": [ 616 | "iterations, best_score = lgb_cv(train_feature, train_label, params_lgb, config['folds'], config['rounds'])" 617 | ] 618 | }, 619 | { 620 | "cell_type": "code", 621 | "execution_count": 11, 622 | "metadata": { 623 | "scrolled": true 624 | }, 625 | "outputs": [ 626 | { 627 | "name": "stdout", 628 | "output_type": "stream", 629 | "text": [ 630 | "[50]\ttraining's auc: 0.943533\n", 631 | "[100]\ttraining's auc: 0.950872\n", 632 | "[150]\ttraining's auc: 0.955854\n", 633 | "[200]\ttraining's auc: 0.959148\n", 634 | "[250]\ttraining's auc: 0.962417\n", 635 | "[300]\ttraining's auc: 0.965764\n", 636 | "[350]\ttraining's auc: 0.968347\n", 637 | "[400]\ttraining's auc: 0.970468\n", 638 | "[450]\ttraining's auc: 0.972457\n", 639 | "[500]\ttraining's auc: 0.974226\n", 640 | "[550]\ttraining's auc: 0.975788\n", 641 | "[600]\ttraining's auc: 0.977252\n", 642 | "[650]\ttraining's auc: 0.978582\n", 643 | "[700]\ttraining's auc: 0.97979\n", 644 | "[750]\ttraining's auc: 0.980914\n", 645 | "[800]\ttraining's auc: 0.981934\n", 646 | "[850]\ttraining's auc: 
0.982887\n", 647 | "[900]\ttraining's auc: 0.983756\n", 648 | "[950]\ttraining's auc: 0.984568\n", 649 | "[1000]\ttraining's auc: 0.985332\n", 650 | "[1050]\ttraining's auc: 0.986045\n", 651 | "[1100]\ttraining's auc: 0.986688\n", 652 | "[1150]\ttraining's auc: 0.987315\n", 653 | "[1200]\ttraining's auc: 0.987904\n", 654 | "[1250]\ttraining's auc: 0.988446\n", 655 | "[1300]\ttraining's auc: 0.988976\n", 656 | "[1350]\ttraining's auc: 0.989463\n", 657 | "[1400]\ttraining's auc: 0.98994\n", 658 | "[1450]\ttraining's auc: 0.990374\n", 659 | "[1500]\ttraining's auc: 0.990802\n", 660 | "[1550]\ttraining's auc: 0.991203\n", 661 | "[1600]\ttraining's auc: 0.991564\n", 662 | "[1650]\ttraining's auc: 0.991903\n", 663 | "[1700]\ttraining's auc: 0.992221\n", 664 | "[1750]\ttraining's auc: 0.992537\n", 665 | "[1800]\ttraining's auc: 0.992855\n", 666 | "[1850]\ttraining's auc: 0.993147\n", 667 | "[1900]\ttraining's auc: 0.993422\n", 668 | "[1950]\ttraining's auc: 0.993685\n", 669 | "[2000]\ttraining's auc: 0.993928\n", 670 | "[2050]\ttraining's auc: 0.994164\n", 671 | "[2100]\ttraining's auc: 0.994403\n", 672 | "[2150]\ttraining's auc: 0.994626\n", 673 | "[2200]\ttraining's auc: 0.994846\n", 674 | "[2250]\ttraining's auc: 0.995046\n", 675 | "[2300]\ttraining's auc: 0.995233\n", 676 | "[50]\ttraining's auc: 0.944121\n", 677 | "[100]\ttraining's auc: 0.950113\n", 678 | "[150]\ttraining's auc: 0.954538\n", 679 | "[200]\ttraining's auc: 0.958038\n", 680 | "[250]\ttraining's auc: 0.961905\n", 681 | "[300]\ttraining's auc: 0.965661\n", 682 | "[350]\ttraining's auc: 0.96851\n", 683 | "[400]\ttraining's auc: 0.970647\n", 684 | "[450]\ttraining's auc: 0.97257\n", 685 | "[500]\ttraining's auc: 0.974361\n", 686 | "[550]\ttraining's auc: 0.975971\n", 687 | "[600]\ttraining's auc: 0.977432\n", 688 | "[650]\ttraining's auc: 0.978727\n", 689 | "[700]\ttraining's auc: 0.979893\n", 690 | "[750]\ttraining's auc: 0.980962\n", 691 | "[800]\ttraining's auc: 0.982016\n", 692 | "[850]\ttraining's auc: 0.982955\n", 693 | "[900]\ttraining's auc: 0.983835\n", 694 | "[950]\ttraining's auc: 0.984645\n", 695 | "[1000]\ttraining's auc: 0.985404\n", 696 | "[1050]\ttraining's auc: 0.986127\n", 697 | "[1100]\ttraining's auc: 0.986803\n", 698 | "[1150]\ttraining's auc: 0.987408\n", 699 | "[1200]\ttraining's auc: 0.987996\n", 700 | "[1250]\ttraining's auc: 0.988534\n", 701 | "[1300]\ttraining's auc: 0.989056\n", 702 | "[1350]\ttraining's auc: 0.98956\n", 703 | "[1400]\ttraining's auc: 0.990015\n", 704 | "[1450]\ttraining's auc: 0.990433\n", 705 | "[1500]\ttraining's auc: 0.990864\n", 706 | "[1550]\ttraining's auc: 0.991236\n", 707 | "[1600]\ttraining's auc: 0.991607\n", 708 | "[1650]\ttraining's auc: 0.991958\n", 709 | "[1700]\ttraining's auc: 0.992295\n", 710 | "[1750]\ttraining's auc: 0.992604\n", 711 | "[1800]\ttraining's auc: 0.992904\n", 712 | "[1850]\ttraining's auc: 0.993195\n", 713 | "[1900]\ttraining's auc: 0.993474\n", 714 | "[1950]\ttraining's auc: 0.993725\n", 715 | "[2000]\ttraining's auc: 0.993981\n", 716 | "[2050]\ttraining's auc: 0.994216\n", 717 | "[2100]\ttraining's auc: 0.994441\n", 718 | "[2150]\ttraining's auc: 0.994666\n", 719 | "[2200]\ttraining's auc: 0.994868\n", 720 | "[2250]\ttraining's auc: 0.99506\n", 721 | "[2300]\ttraining's auc: 0.995261\n", 722 | "[50]\ttraining's auc: 0.943344\n", 723 | "[100]\ttraining's auc: 0.950687\n", 724 | "[150]\ttraining's auc: 0.955448\n", 725 | "[200]\ttraining's auc: 0.958878\n", 726 | "[250]\ttraining's auc: 0.962563\n", 727 | "[300]\ttraining's auc: 0.96581\n", 728 | 
"[350]\ttraining's auc: 0.968427\n", 729 | "[400]\ttraining's auc: 0.970552\n", 730 | "[450]\ttraining's auc: 0.972443\n", 731 | "[500]\ttraining's auc: 0.974272\n", 732 | "[550]\ttraining's auc: 0.975921\n", 733 | "[600]\ttraining's auc: 0.977345\n", 734 | "[650]\ttraining's auc: 0.978615\n", 735 | "[700]\ttraining's auc: 0.979787\n", 736 | "[750]\ttraining's auc: 0.980919\n", 737 | "[800]\ttraining's auc: 0.981933\n", 738 | "[850]\ttraining's auc: 0.982875\n", 739 | "[900]\ttraining's auc: 0.983745\n", 740 | "[950]\ttraining's auc: 0.984559\n", 741 | "[1000]\ttraining's auc: 0.985313\n", 742 | "[1050]\ttraining's auc: 0.986013\n", 743 | "[1100]\ttraining's auc: 0.986682\n", 744 | "[1150]\ttraining's auc: 0.987308\n", 745 | "[1200]\ttraining's auc: 0.987906\n", 746 | "[1250]\ttraining's auc: 0.98846\n", 747 | "[1300]\ttraining's auc: 0.988989\n", 748 | "[1350]\ttraining's auc: 0.989478\n", 749 | "[1400]\ttraining's auc: 0.989943\n", 750 | "[1450]\ttraining's auc: 0.990387\n", 751 | "[1500]\ttraining's auc: 0.990801\n", 752 | "[1550]\ttraining's auc: 0.991205\n", 753 | "[1600]\ttraining's auc: 0.99156\n", 754 | "[1650]\ttraining's auc: 0.991911\n", 755 | "[1700]\ttraining's auc: 0.992244\n", 756 | "[1750]\ttraining's auc: 0.992559\n", 757 | "[1800]\ttraining's auc: 0.992854\n", 758 | "[1850]\ttraining's auc: 0.993143\n", 759 | "[1900]\ttraining's auc: 0.993427\n", 760 | "[1950]\ttraining's auc: 0.99369\n", 761 | "[2000]\ttraining's auc: 0.993945\n", 762 | "[2050]\ttraining's auc: 0.994184\n", 763 | "[2100]\ttraining's auc: 0.994413\n", 764 | "[2150]\ttraining's auc: 0.99463\n", 765 | "[2200]\ttraining's auc: 0.994838\n", 766 | "[2250]\ttraining's auc: 0.995043\n", 767 | "[2300]\ttraining's auc: 0.995236\n", 768 | "[50]\ttraining's auc: 0.942483\n", 769 | "[100]\ttraining's auc: 0.950969\n", 770 | "[150]\ttraining's auc: 0.955091\n", 771 | "[200]\ttraining's auc: 0.958299\n", 772 | "[250]\ttraining's auc: 0.962131\n", 773 | "[300]\ttraining's auc: 0.965567\n", 774 | "[350]\ttraining's auc: 0.968425\n", 775 | "[400]\ttraining's auc: 0.970545\n", 776 | "[450]\ttraining's auc: 0.972499\n", 777 | "[500]\ttraining's auc: 0.974226\n", 778 | "[550]\ttraining's auc: 0.975819\n", 779 | "[600]\ttraining's auc: 0.977303\n", 780 | "[650]\ttraining's auc: 0.978605\n", 781 | "[700]\ttraining's auc: 0.979826\n", 782 | "[750]\ttraining's auc: 0.980953\n", 783 | "[800]\ttraining's auc: 0.981973\n", 784 | "[850]\ttraining's auc: 0.982899\n", 785 | "[900]\ttraining's auc: 0.983789\n", 786 | "[950]\ttraining's auc: 0.984592\n", 787 | "[1000]\ttraining's auc: 0.985353\n", 788 | "[1050]\ttraining's auc: 0.98608\n", 789 | "[1100]\ttraining's auc: 0.986717\n", 790 | "[1150]\ttraining's auc: 0.987334\n", 791 | "[1200]\ttraining's auc: 0.987908\n", 792 | "[1250]\ttraining's auc: 0.988463\n", 793 | "[1300]\ttraining's auc: 0.98898\n", 794 | "[1350]\ttraining's auc: 0.989468\n", 795 | "[1400]\ttraining's auc: 0.98992\n", 796 | "[1450]\ttraining's auc: 0.990363\n", 797 | "[1500]\ttraining's auc: 0.990792\n", 798 | "[1550]\ttraining's auc: 0.99119\n", 799 | "[1600]\ttraining's auc: 0.991556\n", 800 | "[1650]\ttraining's auc: 0.991905\n", 801 | "[1700]\ttraining's auc: 0.992245\n", 802 | "[1750]\ttraining's auc: 0.992558\n", 803 | "[1800]\ttraining's auc: 0.99285\n", 804 | "[1850]\ttraining's auc: 0.99313\n", 805 | "[1900]\ttraining's auc: 0.993402\n", 806 | "[1950]\ttraining's auc: 0.993669\n", 807 | "[2000]\ttraining's auc: 0.993918\n", 808 | "[2050]\ttraining's auc: 0.994163\n", 809 | "[2100]\ttraining's auc: 
0.994396\n", 810 | "[2150]\ttraining's auc: 0.994608\n", 811 | "[2200]\ttraining's auc: 0.994833\n", 812 | "[2250]\ttraining's auc: 0.995031\n", 813 | "[2300]\ttraining's auc: 0.995223\n" 814 | ] 815 | } 816 | ], 817 | "source": [ 818 | "preds = 0\n", 819 | "for s in range(7, 11):\n", 820 | " params_lgb['seed'] = s\n", 821 | " model, pred = lgb_predict(train_feature, train_label, test_feature, iterations, params_lgb)\n", 822 | " preds += pred\n", 823 | "preds /= 4" 824 | ] 825 | }, 826 | { 827 | "cell_type": "code", 828 | "execution_count": 12, 829 | "metadata": { 830 | "collapsed": true 831 | }, 832 | "outputs": [], 833 | "source": [ 834 | "res = store_result(test_index, preds, '20180210-lgb-%f(r%d)' % (best_score, iterations))" 835 | ] 836 | }, 837 | { 838 | "cell_type": "code", 839 | "execution_count": 13, 840 | "metadata": {}, 841 | "outputs": [ 842 | { 843 | "name": "stdout", 844 | "output_type": "stream", 845 | "text": [ 846 | "ord_type1_time_min: 158788.50\n", 847 | "205: 119339.52\n", 848 | "192: 74425.39\n", 849 | "ord_type1_time_max: 71079.32\n", 850 | "last_day_rate(type_5): 35889.30\n", 851 | "act_time_diff(1-2)(window_6)(type_5): 33401.18\n", 852 | "act_last_type: 32289.74\n", 853 | "time_gap_last67: 31290.77\n", 854 | "time_gap_last56: 29933.51\n", 855 | "ord_type1_time_ptp: 29750.38\n", 856 | "ord_type1_time_mean: 23039.05\n", 857 | "27: 22502.02\n", 858 | "3: 21347.94\n", 859 | "13: 21197.36\n", 860 | "26: 20224.40\n", 861 | "25: 17148.59\n", 862 | "act_time_diff(1-2)(window_2)(type_5): 16612.83\n", 863 | "act_time_diff_56_max: 15666.05\n", 864 | "29: 15019.18\n", 865 | "ord_type1_time_median: 14009.13\n", 866 | "act_last_time-act_time(rank_1)(window_1)(type_4): 12531.98\n", 867 | "act_seq_time_diff(2-1)(window_6): 10592.32\n", 868 | "time_gap_last6: 10263.90\n", 869 | "30: 10063.68\n", 870 | "267: 10022.36\n", 871 | "time_gap_last1: 9354.54\n", 872 | "time_gap_last5: 8264.33\n", 873 | "act_time_diff_56789(2-3)(window_6): 7738.38\n", 874 | "act_time_diff(1-2)(window_6): 7738.33\n", 875 | "177: 7633.40\n", 876 | "264: 7547.36\n", 877 | "147: 7252.66\n", 878 | "has_ord_serv_nan: 6669.21\n", 879 | "act_time_diff(1-2)(window_6)(type_6): 6662.99\n", 880 | "47: 6109.28\n", 881 | "act_time_diff_56789(1-2)(window_6): 6105.44\n", 882 | "act_last_time: 6023.46\n", 883 | "261: 5941.93\n", 884 | "154: 5774.97\n", 885 | "16: 5362.74\n", 886 | "24: 5337.05\n", 887 | "140: 5322.88\n", 888 | "46: 5286.71\n", 889 | "260: 5243.96\n", 890 | "time_gap_last45: 5087.27\n", 891 | "204: 4967.24\n", 892 | "10: 4915.41\n", 893 | "act_time_diff(2-3)(window_6)(type_6): 4660.70\n", 894 | "act_ord_act_time_diff(1-2)(window_6): 4628.47\n", 895 | "56: 4623.05\n", 896 | "userid: 4575.01\n", 897 | "14: 4557.92\n", 898 | "280: 4342.64\n", 899 | "has_ord_serv_nan*act_num(type_8): 4241.34\n", 900 | "act_time_diff(3-4)(window_6)(type_6): 4156.29\n", 901 | "206: 4144.02\n", 902 | "act_first_time: 4091.94\n", 903 | "263: 3807.42\n", 904 | "act_last_time-act_time(rank_1)(window_1)(type_6): 3778.78\n", 905 | "148: 3764.90\n", 906 | "216: 3739.09\n", 907 | "act_num(type_6)(window_6): 3724.32\n", 908 | "ord_num(type_1): 3689.06\n", 909 | "act_time_diff(2-3)(window_6): 3635.89\n", 910 | "15: 3596.67\n", 911 | "191: 3585.44\n", 912 | "('country_27', 'sum'): 3500.75\n", 913 | "158: 3480.06\n", 914 | "139: 3439.16\n", 915 | "199: 3343.28\n", 916 | "159: 3308.34\n", 917 | "176: 3298.60\n", 918 | "act_last_time-ord_last_time: 3259.07\n", 919 | "act_time_diff_56_median: 3183.56\n", 920 | "act_type_diff(1-2)(window_6): 
3141.00\n", 921 | "act_time_diff_min: 3139.59\n", 922 | "act_time_diff_56789(3-4)(window_6): 3014.31\n", 923 | "55: 2942.95\n", 924 | "act_seq_time_diff(3-2)(window_6): 2889.98\n", 925 | "com_rating_sum: 2869.91\n", 926 | "291: 2868.15\n", 927 | "286: 2793.73\n", 928 | "act_time_diff(1-2)(window_6)(type_1): 2780.36\n", 929 | "134: 2778.35\n", 930 | "149: 2768.18\n", 931 | "act_time_diff_56789(4-5)(window_6): 2752.34\n", 932 | "132: 2705.49\n", 933 | "act_time_diff(3-4)(window_6)(type_5): 2695.59\n", 934 | "143: 2682.05\n", 935 | "city_num(type_1): 2661.58\n", 936 | "45: 2644.15\n", 937 | "197: 2626.26\n", 938 | "17: 2577.90\n", 939 | "262: 2556.67\n", 940 | "has_ord_serv_nan*act_num(type_6): 2464.99\n", 941 | "185: 2454.82\n", 942 | "act_last_time-ord_type0_time_max: 2452.02\n", 943 | "act_time_diff(2-3)(window_6)(type_1): 2439.18\n", 944 | "28: 2394.69\n", 945 | "act_time_diff(4-5)(window_6)(type_6): 2393.97\n", 946 | "31: 2389.98\n", 947 | "160: 2369.23\n", 948 | "has_ord_serv_nan*act_num(type_1): 2346.15\n", 949 | "196: 2327.92\n", 950 | "157: 2327.35\n", 951 | "145: 2300.42\n", 952 | "53: 2261.73\n", 953 | "42: 2219.45\n", 954 | "265: 2200.27\n", 955 | "193: 2194.49\n", 956 | "34: 2139.11\n", 957 | "60: 2136.58\n", 958 | "act_last_time-act_time(rank_1)(window_1)(type_1): 2122.49\n", 959 | "last_time: 2109.67\n", 960 | "33: 2061.19\n", 961 | "198: 2059.85\n", 962 | "act_time_diff_median: 2052.37\n", 963 | "act_time_diff_234(3-4)(window_6): 2004.49\n", 964 | "act_ord_act_time_diff(2-3)(window_6): 1988.31\n", 965 | "act_num(type_6): 1968.70\n", 966 | "220: 1929.72\n", 967 | "144: 1911.66\n", 968 | "act_time_diff(4-5)(window_6)(type_5): 1892.98\n", 969 | "act_time_diff(5-6)(window_6)(type_6): 1880.65\n", 970 | "act_time_diff_56_std: 1858.45\n", 971 | "act_seq_time_diff(4-3)(window_6): 1811.92\n", 972 | "act_seq_time_diff(7-6)(window_6): 1810.81\n", 973 | "156: 1808.93\n", 974 | "270: 1800.24\n", 975 | "act_seq_time_diff(5-4)(window_6): 1796.10\n", 976 | "296: 1783.64\n", 977 | "has_ord_serv_nan*act_num(type_5): 1771.57\n", 978 | "23: 1761.05\n", 979 | "106: 1695.97\n", 980 | "act_time_diff(3-4)(window_6)(type_1): 1680.84\n", 981 | "142: 1676.34\n", 982 | "301: 1659.14\n", 983 | "act_time_diff(2-3)(window_6)(type_5): 1641.21\n", 984 | "act_last_time-act_time(rank_1)(window_1)(type_7): 1632.01\n", 985 | "act_seq_time_diff(6-5)(window_6): 1607.94\n", 986 | "act_time_diff_56_sum: 1583.83\n", 987 | "285: 1568.50\n", 988 | "11: 1567.28\n", 989 | "61: 1565.68\n", 990 | "province: 1540.42\n", 991 | "act_time_diff_median(window_3): 1538.59\n", 992 | "288: 1536.18\n", 993 | "act_first_time-act_time(rank_1)(window_1)(type_7): 1519.74\n", 994 | "283: 1501.60\n", 995 | "184: 1481.73\n", 996 | "in_comment_not_in_order: 1467.84\n", 997 | "276: 1446.86\n", 998 | "act_row_type_sum(window_6): 1436.19\n", 999 | "146: 1430.59\n", 1000 | "266: 1411.94\n", 1001 | "act_time_diff_56_mean: 1396.26\n", 1002 | "137: 1390.03\n", 1003 | "74: 1382.75\n", 1004 | "289: 1375.64\n", 1005 | "act_time_diff(5-6)(window_6)(type_5): 1361.74\n", 1006 | "act_time_diff_56789(5-6)(window_6): 1351.64\n", 1007 | "act_time_diff_min(window_3): 1347.88\n", 1008 | "act_first_time-act_time(rank_1)(window_1)(type_1): 1343.85\n", 1009 | "302: 1342.18\n", 1010 | "161: 1335.74\n", 1011 | "act_column_rate(type_6)(window_3): 1322.73\n", 1012 | "act_last_time-act_time(rank_1)(window_1)(type_5): 1316.36\n", 1013 | "284: 1311.53\n", 1014 | "ord_type0_time_min: 1309.10\n", 1015 | "act_time_diff(5-6)(window_6): 1301.12\n", 1016 | "155: 1293.31\n", 
1017 | "act_type_diff(2-3)(window_6): 1279.50\n", 1018 | "275: 1270.86\n", 1019 | "act_num(type_4): 1265.26\n", 1020 | "287: 1251.95\n", 1021 | "act_time_diff_56_min: 1242.87\n", 1022 | "act_time_diff(4-5)(window_6): 1239.71\n", 1023 | "has_ord_serv_no: 1237.97\n", 1024 | "298: 1237.77\n", 1025 | "act_num(type_5)(window_6): 1207.33\n", 1026 | "ord_type0_time_max: 1200.36\n", 1027 | "act_time_diff_mean: 1196.56\n", 1028 | "act_time_diff(3-4)(window_6): 1192.79\n", 1029 | "has_ord_serv_no*act_num(type_6): 1146.12\n", 1030 | "act_time_diff_mean(window_3): 1128.77\n", 1031 | "act_num(type_1): 1122.70\n", 1032 | "278: 1107.60\n", 1033 | "act_row_type_ptp(window_6): 1092.74\n", 1034 | "act_first_time-act_time(rank_1)(window_1)(type_6): 1091.17\n", 1035 | "19: 1084.73\n", 1036 | "act_first_time-act_time(rank_1)(window_1)(type_5): 1081.12\n", 1037 | "152: 1075.11\n", 1038 | "57: 1068.34\n", 1039 | "299: 1067.17\n", 1040 | "297: 1054.28\n", 1041 | "7: 1045.67\n", 1042 | "290: 1032.36\n", 1043 | "292: 1027.20\n", 1044 | "act_row_type_std(window_6): 997.76\n", 1045 | "294: 992.39\n", 1046 | "186: 986.84\n", 1047 | "279: 970.18\n", 1048 | "153: 960.88\n", 1049 | "188: 957.32\n", 1050 | "act_last_time-act_time(rank_1)(window_1)(type_8): 956.99\n", 1051 | "271: 938.97\n", 1052 | "com_rating_count: 926.12\n", 1053 | "act_num(type_5): 911.37\n", 1054 | "281: 894.21\n", 1055 | "187: 887.71\n", 1056 | "act_column_rate(type_4)(window_3): 887.37\n", 1057 | "has_ord_serv_no*act_num(type_8): 868.85\n", 1058 | "35: 861.87\n", 1059 | "274: 855.11\n", 1060 | "282: 847.87\n", 1061 | "act_row_type_mean(window_6): 836.86\n", 1062 | "71: 824.62\n", 1063 | "ord_first_time: 821.98\n", 1064 | "138: 820.06\n", 1065 | "act_row_type_median(window_6): 815.67\n", 1066 | "act_time_diff_std(window_3): 812.03\n", 1067 | "act_time_diff(1-2)(window_6)(type_8): 801.78\n", 1068 | "277: 793.23\n", 1069 | "act_time_diff(4-5)(window_6)(type_1): 789.31\n", 1070 | "300: 782.75\n", 1071 | "has_ord_serv_no*act_num(type_1): 755.37\n", 1072 | "295: 753.92\n", 1073 | "70: 753.10\n", 1074 | "act_first_time-act_time(rank_1)(window_1)(type_8): 749.44\n", 1075 | "act_row_rate(type_6)(window_3): 749.17\n", 1076 | "gender: 739.94\n", 1077 | "293: 738.97\n", 1078 | "act_time_diff(5-6)(window_6)(type_1): 737.55\n", 1079 | "59: 717.40\n", 1080 | "ord_type0_time_std: 716.11\n", 1081 | "act_first_type: 705.41\n", 1082 | "51: 675.96\n", 1083 | "act_time_diff_234(2-3)(window_6): 671.28\n", 1084 | "act_time_diff_std: 669.50\n", 1085 | "190: 664.22\n", 1086 | "act_time_diff_sum: 663.76\n", 1087 | "act_time_diff_max(window_3): 661.46\n", 1088 | "act_row_rate(type_5)(window_3): 660.11\n", 1089 | "act_first_time-act_time(rank_1)(window_1)(type_4): 649.98\n", 1090 | "58: 648.62\n", 1091 | "act_type_num: 645.06\n", 1092 | "act_time_diff_234(1-2)(window_6): 645.05\n", 1093 | "272: 625.98\n", 1094 | "act_first_time-ord_type0_time_min: 607.96\n", 1095 | "189: 605.92\n", 1096 | "211: 602.81\n", 1097 | "act_time_diff_sum(window_3): 601.53\n", 1098 | "ord_last_time: 600.09\n", 1099 | "32: 596.78\n", 1100 | "act_row_rate(type_4)(window_3): 596.26\n", 1101 | "act_type_diff(3-4)(window_6): 595.63\n", 1102 | "8: 593.52\n", 1103 | "act_time_diff_234(4-5)(window_6): 586.77\n", 1104 | "act_type(rank_2)(window6): 586.45\n", 1105 | "act_first_time-ord_first_time: 578.54\n", 1106 | "act_num(type_4)(window_6): 574.12\n", 1107 | "act_time_last-first: 562.82\n", 1108 | "273: 561.08\n", 1109 | "act_column_rate(type_5)(window_3): 559.66\n", 1110 | 
"act_last_time-act_time(rank_1)(window_1)(type_2): 558.87\n", 1111 | "act_num(type_1)(window_6): 553.07\n", 1112 | "act_ord_act_time_diff(5-6)(window_6): 548.27\n", 1113 | "act_ord_act_time_diff(4-5)(window_6): 544.69\n", 1114 | "act_type(rank_4)(window6): 541.88\n", 1115 | "act_first_time-act_time(rank_1)(window_1)(type_2): 541.67\n", 1116 | "act_time_diff_max: 522.10\n", 1117 | "136: 499.26\n", 1118 | "ord_last_id: 490.98\n", 1119 | "age: 489.96\n", 1120 | "21: 465.80\n", 1121 | "200: 462.40\n", 1122 | "214: 450.18\n", 1123 | "151: 435.26\n", 1124 | "ord_type0_time_median: 430.44\n", 1125 | "208: 429.74\n", 1126 | "act_type_diff(5-6)(window_6): 423.72\n", 1127 | "act_row_type_max(window_6): 422.26\n", 1128 | "162: 420.51\n", 1129 | "act_num: 416.66\n", 1130 | "43: 405.37\n", 1131 | "52: 399.57\n", 1132 | "('continent_3', 'sum'): 396.81\n", 1133 | "167: 378.37\n", 1134 | "36: 376.55\n", 1135 | "114: 368.49\n", 1136 | "act_type_diff(4-5)(window_6): 341.62\n", 1137 | "2: 340.75\n", 1138 | "67: 334.79\n", 1139 | "ord_type0_time_mean: 328.52\n", 1140 | "20: 327.96\n", 1141 | "('country_34', 'sum'): 322.97\n", 1142 | "has_ord_serv_no*act_num(type_4): 320.20\n", 1143 | "54: 317.03\n", 1144 | "209: 303.39\n", 1145 | "act_row_rate(type_7)(window_3): 300.52\n", 1146 | "135: 294.93\n", 1147 | "act_num(type_2): 291.27\n", 1148 | "act_row_type_min(window_6): 284.56\n", 1149 | "1: 283.16\n", 1150 | "48: 278.25\n", 1151 | "268: 271.34\n", 1152 | "141: 268.71\n", 1153 | "act_ord_act_time_diff(3-4)(window_6): 264.33\n", 1154 | "act_time_diff_234(5-6)(window_6): 258.60\n", 1155 | "133: 251.06\n", 1156 | "210: 250.90\n", 1157 | "act_time_diff(5-6)(window_6)(type_9): 250.62\n", 1158 | "150: 237.32\n", 1159 | "act_num(type_2)(window_6): 236.80\n", 1160 | "country_num(type_1): 224.02\n", 1161 | "38: 223.78\n", 1162 | "37: 222.92\n", 1163 | "act_type(rank_3)(window6): 222.74\n", 1164 | "4: 222.04\n", 1165 | "('country_25', 'sum'): 220.33\n", 1166 | "203: 217.87\n", 1167 | "act_num(type_8)(window_6): 216.83\n", 1168 | "act_first_time-act_time(rank_1)(window_1)(type_9): 215.64\n", 1169 | "has_ord_serv_nan*act_num(type_2): 212.86\n", 1170 | "215: 208.83\n", 1171 | "act_type(rank_6)(window6): 208.78\n", 1172 | "303: 206.01\n", 1173 | "act_time_diff(1-2)(window_6)(type_7): 204.16\n", 1174 | "22: 200.08\n", 1175 | "226: 199.44\n", 1176 | "93: 199.07\n", 1177 | "ord_first_id: 196.04\n", 1178 | "act_first_time-act_time(rank_1)(window_1)(type_3): 193.07\n", 1179 | "has_ord_serv_no*act_num(type_9): 188.52\n", 1180 | "act_num(type_3): 186.01\n", 1181 | "has_ord_serv_nan*act_num(type_7): 183.59\n", 1182 | "has_ord_serv_no*act_num(type_7): 177.61\n", 1183 | "has_ord_serv_nan*act_num(type_3): 170.42\n", 1184 | "9: 169.50\n", 1185 | "has_ord_serv_nan*act_num(type_4): 166.98\n", 1186 | "has_ord_serv_nan*act_num(type_9): 159.77\n", 1187 | "ord_type0_time_ptp: 156.36\n", 1188 | "('continent_0', 'sum'): 150.92\n", 1189 | "act_time_diff(2-3)(window_6)(type_7): 150.12\n", 1190 | "has_ord_serv_no*act_num(type_5): 149.97\n", 1191 | "act_type(rank_5)(window6): 146.70\n", 1192 | "act_time_diff(2-3)(window_6)(type_8): 138.56\n", 1193 | "49: 135.74\n", 1194 | "119: 133.37\n", 1195 | "('country_3', 'sum'): 131.53\n", 1196 | "180: 131.08\n", 1197 | "5: 128.63\n", 1198 | "12: 128.51\n", 1199 | "68: 127.47\n", 1200 | "224: 114.75\n", 1201 | "6: 113.36\n", 1202 | "act_time_diff(1-2)(window_6)(type_9): 110.68\n", 1203 | "act_num(type_8): 110.32\n", 1204 | "act_row_rate(type_1)(window_3): 108.95\n", 1205 | "18: 107.46\n", 1206 | "183: 
105.46\n", 1207 | "act_last_time-act_time(rank_1)(window_1)(type_3): 103.01\n", 1208 | "act_column_rate(type_7)(window_3): 101.83\n", 1209 | "202: 101.53\n", 1210 | "act_num(type_7)(window_6): 99.50\n", 1211 | "39: 99.31\n", 1212 | "has_ord_serv_yes*act_num(type_1): 98.20\n", 1213 | "182: 97.19\n", 1214 | "44: 96.11\n", 1215 | "40: 95.63\n", 1216 | "('continent_4', 'sum'): 94.07\n", 1217 | "257: 92.09\n", 1218 | "201: 90.53\n", 1219 | "243: 88.92\n", 1220 | "ord_rate(type_1): 87.99\n", 1221 | "50: 85.77\n", 1222 | "('country_39', 'sum'): 85.33\n", 1223 | "245: 84.97\n", 1224 | "('continent_1', 'sum'): 81.00\n", 1225 | "181: 78.63\n", 1226 | "173: 75.30\n", 1227 | "238: 68.02\n", 1228 | "act_column_rate(type_1)(window_3): 67.25\n", 1229 | "78: 65.97\n", 1230 | "269: 64.41\n", 1231 | "104: 59.83\n", 1232 | "80: 59.32\n", 1233 | "act_num(type_7): 57.99\n", 1234 | "act_last_time-act_time(rank_1)(window_1)(type_9): 55.26\n", 1235 | "('country_33', 'sum'): 50.74\n", 1236 | "169: 49.31\n", 1237 | "act_num(type_3)(window_6): 48.61\n", 1238 | "219: 44.39\n", 1239 | "continent_num(type_1): 43.54\n", 1240 | "178: 39.29\n", 1241 | "41: 37.74\n", 1242 | "113: 37.51\n", 1243 | "179: 36.72\n", 1244 | "ord_num(type_0): 35.74\n", 1245 | "253: 34.60\n", 1246 | "108: 33.63\n", 1247 | "ord_first_type1_month: 32.27\n", 1248 | "has_ord_serv_yes*act_num(type_7): 31.59\n", 1249 | "act_time_diff(3-4)(window_6)(type_8): 31.04\n", 1250 | "172: 30.67\n", 1251 | "has_ord_serv_no*act_num(type_2): 29.88\n", 1252 | "('country_46', 'sum'): 26.48\n", 1253 | "85: 26.33\n", 1254 | "act_time_diff(4-5)(window_6)(type_9): 25.18\n", 1255 | "234: 22.98\n", 1256 | "act_time_diff(5-6)(window_6)(type_7): 22.55\n", 1257 | "act_row_rate(type_8)(window_3): 21.55\n", 1258 | "218: 20.47\n", 1259 | "222: 19.76\n", 1260 | "has_ord_serv_yes*act_num(type_6): 18.07\n", 1261 | "241: 17.94\n", 1262 | "act_num(type_9): 16.99\n", 1263 | "170: 16.87\n", 1264 | "212: 16.59\n", 1265 | "217: 15.31\n", 1266 | "has_ord_serv_yes*act_num(type_8): 14.42\n", 1267 | "166: 12.65\n", 1268 | "ord_last_type1_day: 12.60\n", 1269 | "('country_1', 'sum'): 12.27\n", 1270 | "has_ord_serv_no*act_num(type_3): 11.49\n", 1271 | "131: 11.25\n", 1272 | "country_num: 9.67\n", 1273 | "171: 9.17\n", 1274 | "ord_last_type1_month: 7.72\n", 1275 | "164: 7.51\n", 1276 | "act_row_rate(type_3)(window_3): 7.04\n", 1277 | "213: 6.87\n", 1278 | "207: 6.40\n", 1279 | "city_num: 6.37\n", 1280 | "act_time_diff(4-5)(window_6)(type_8): 6.03\n", 1281 | "act_time_diff(3-4)(window_6)(type_9): 5.84\n", 1282 | "ord_first_type1_day: 5.59\n", 1283 | "act_column_rate(type_3)(window_3): 5.11\n", 1284 | "225: 4.75\n", 1285 | "('country_50', 'sum'): 4.26\n", 1286 | "76: 3.79\n", 1287 | "ord_num: 3.70\n", 1288 | "act_time_diff(2-3)(window_6)(type_9): 3.44\n", 1289 | "('country_42', 'sum'): 3.22\n", 1290 | "continent_num: 3.03\n", 1291 | "194: 2.68\n", 1292 | "63: 2.68\n", 1293 | "act_time_diff(4-5)(window_6)(type_7): 2.65\n", 1294 | "has_ord_serv_yes*act_num(type_5): 2.03\n", 1295 | "228: 1.51\n", 1296 | "ord_type1_time_std: 0.00\n", 1297 | "ord_rate(type_0): 0.00\n", 1298 | "('country_0', 'sum'): 0.00\n", 1299 | "('country_2', 'sum'): 0.00\n", 1300 | "('country_4', 'sum'): 0.00\n", 1301 | "('country_5', 'sum'): 0.00\n", 1302 | "('country_6', 'sum'): 0.00\n", 1303 | "('country_7', 'sum'): 0.00\n", 1304 | "('country_8', 'sum'): 0.00\n", 1305 | "('country_9', 'sum'): 0.00\n", 1306 | "('country_10', 'sum'): 0.00\n", 1307 | "('country_11', 'sum'): 0.00\n", 1308 | "('country_12', 'sum'): 0.00\n", 1309 | 
"('country_13', 'sum'): 0.00\n", 1310 | "('country_14', 'sum'): 0.00\n", 1311 | "('country_15', 'sum'): 0.00\n", 1312 | "('country_16', 'sum'): 0.00\n", 1313 | "('country_17', 'sum'): 0.00\n", 1314 | "('country_18', 'sum'): 0.00\n", 1315 | "('country_19', 'sum'): 0.00\n", 1316 | "('country_20', 'sum'): 0.00\n", 1317 | "('country_21', 'sum'): 0.00\n", 1318 | "('country_22', 'sum'): 0.00\n", 1319 | "('country_23', 'sum'): 0.00\n", 1320 | "('country_24', 'sum'): 0.00\n", 1321 | "('country_26', 'sum'): 0.00\n", 1322 | "('country_28', 'sum'): 0.00\n", 1323 | "('country_29', 'sum'): 0.00\n", 1324 | "('country_30', 'sum'): 0.00\n", 1325 | "('country_31', 'sum'): 0.00\n", 1326 | "('country_32', 'sum'): 0.00\n", 1327 | "('country_35', 'sum'): 0.00\n", 1328 | "('country_36', 'sum'): 0.00\n", 1329 | "('country_37', 'sum'): 0.00\n", 1330 | "('country_38', 'sum'): 0.00\n", 1331 | "('country_40', 'sum'): 0.00\n", 1332 | "('country_41', 'sum'): 0.00\n", 1333 | "('country_43', 'sum'): 0.00\n", 1334 | "('country_44', 'sum'): 0.00\n", 1335 | "('country_45', 'sum'): 0.00\n", 1336 | "('country_47', 'sum'): 0.00\n", 1337 | "('country_48', 'sum'): 0.00\n", 1338 | "('country_49', 'sum'): 0.00\n", 1339 | "('continent_2', 'sum'): 0.00\n", 1340 | "('continent_5', 'sum'): 0.00\n", 1341 | "ord_last_type: 0.00\n", 1342 | "ord_first_type: 0.00\n", 1343 | "act_num(type_9)(window_6): 0.00\n", 1344 | "act_column_rate(type_2)(window_3): 0.00\n", 1345 | "act_column_rate(type_8)(window_3): 0.00\n", 1346 | "act_column_rate(type_9)(window_3): 0.00\n", 1347 | "act_row_rate(type_2)(window_3): 0.00\n", 1348 | "act_row_rate(type_9)(window_3): 0.00\n", 1349 | "act_time_diff(3-4)(window_6)(type_7): 0.00\n", 1350 | "act_time_diff(5-6)(window_6)(type_8): 0.00\n", 1351 | "ord_last_type1_year: 0.00\n", 1352 | "ord_first_type1_year: 0.00\n", 1353 | "62: 0.00\n", 1354 | "64: 0.00\n", 1355 | "65: 0.00\n", 1356 | "66: 0.00\n", 1357 | "69: 0.00\n", 1358 | "72: 0.00\n", 1359 | "73: 0.00\n", 1360 | "75: 0.00\n", 1361 | "77: 0.00\n", 1362 | "79: 0.00\n", 1363 | "81: 0.00\n", 1364 | "82: 0.00\n", 1365 | "83: 0.00\n", 1366 | "84: 0.00\n", 1367 | "86: 0.00\n", 1368 | "87: 0.00\n", 1369 | "88: 0.00\n", 1370 | "89: 0.00\n", 1371 | "90: 0.00\n", 1372 | "91: 0.00\n", 1373 | "92: 0.00\n", 1374 | "94: 0.00\n", 1375 | "95: 0.00\n", 1376 | "96: 0.00\n", 1377 | "97: 0.00\n", 1378 | "98: 0.00\n", 1379 | "99: 0.00\n", 1380 | "100: 0.00\n", 1381 | "101: 0.00\n", 1382 | "102: 0.00\n", 1383 | "103: 0.00\n", 1384 | "105: 0.00\n", 1385 | "107: 0.00\n", 1386 | "109: 0.00\n", 1387 | "110: 0.00\n", 1388 | "111: 0.00\n", 1389 | "112: 0.00\n", 1390 | "115: 0.00\n", 1391 | "116: 0.00\n", 1392 | "117: 0.00\n", 1393 | "118: 0.00\n", 1394 | "120: 0.00\n", 1395 | "121: 0.00\n", 1396 | "122: 0.00\n", 1397 | "123: 0.00\n", 1398 | "124: 0.00\n", 1399 | "125: 0.00\n", 1400 | "126: 0.00\n", 1401 | "127: 0.00\n", 1402 | "128: 0.00\n", 1403 | "129: 0.00\n", 1404 | "130: 0.00\n", 1405 | "163: 0.00\n", 1406 | "165: 0.00\n", 1407 | "168: 0.00\n", 1408 | "174: 0.00\n", 1409 | "175: 0.00\n", 1410 | "195: 0.00\n", 1411 | "221: 0.00\n", 1412 | "223: 0.00\n", 1413 | "227: 0.00\n", 1414 | "229: 0.00\n", 1415 | "230: 0.00\n", 1416 | "231: 0.00\n", 1417 | "232: 0.00\n", 1418 | "233: 0.00\n", 1419 | "235: 0.00\n", 1420 | "236: 0.00\n", 1421 | "237: 0.00\n", 1422 | "239: 0.00\n", 1423 | "240: 0.00\n", 1424 | "242: 0.00\n", 1425 | "244: 0.00\n", 1426 | "246: 0.00\n", 1427 | "247: 0.00\n", 1428 | "248: 0.00\n", 1429 | "249: 0.00\n", 1430 | "250: 0.00\n", 1431 | "251: 0.00\n", 1432 | "252: 
0.00\n", 1433 | "254: 0.00\n", 1434 | "255: 0.00\n", 1435 | "256: 0.00\n", 1436 | "258: 0.00\n", 1437 | "259: 0.00\n", 1438 | "act_last_time-ord_type1_time_max: 0.00\n", 1439 | "act_first_time-ord_type1_time_min: 0.00\n", 1440 | "has_ord_serv_yes: 0.00\n", 1441 | "has_ord_serv_yes*act_num(type_2): 0.00\n", 1442 | "has_ord_serv_yes*act_num(type_3): 0.00\n", 1443 | "has_ord_serv_yes*act_num(type_4): 0.00\n", 1444 | "has_ord_serv_yes*act_num(type_9): 0.00\n" 1445 | ] 1446 | } 1447 | ], 1448 | "source": [ 1449 | "print(\"\\n\".join((\"%s: %.2f\" % x) for x in sorted(zip(train_feature.columns, model.feature_importance(\"gain\")), key=lambda x: x[1], reverse=True)))" 1450 | ] 1451 | }, 1452 | { 1453 | "cell_type": "code", 1454 | "execution_count": null, 1455 | "metadata": { 1456 | "collapsed": true 1457 | }, 1458 | "outputs": [], 1459 | "source": [] 1460 | }, 1461 | { 1462 | "cell_type": "code", 1463 | "execution_count": null, 1464 | "metadata": { 1465 | "collapsed": true 1466 | }, 1467 | "outputs": [], 1468 | "source": [ 1469 | "######################################### blending #########################################" 1470 | ] 1471 | }, 1472 | { 1473 | "cell_type": "code", 1474 | "execution_count": null, 1475 | "metadata": { 1476 | "collapsed": true 1477 | }, 1478 | "outputs": [], 1479 | "source": [ 1480 | "test1 = pd.read_csv('../data/output/sub/bjw/result_addUserid_0125_1.csv')\n", 1481 | "test2 = pd.read_csv('../data/output/sub/20180203-lgb-0.966497(r1843).csv')\n", 1482 | "test3 = pd.read_csv('../data/output/sub/shawn_lgb_local9641_online9646.csv')\n", 1483 | "test4 = pd.read_csv('../data/output/sub/ym/lz96490.csv')\n", 1484 | "testa = pd.merge(test1, test2, on='userid', how='left')\n", 1485 | "testb = pd.merge(test3, test4, on='userid', how='left')\n", 1486 | "test = pd.merge(testa, testb, on='userid', how='left')" 1487 | ] 1488 | }, 1489 | { 1490 | "cell_type": "code", 1491 | "execution_count": null, 1492 | "metadata": { 1493 | "collapsed": true 1494 | }, 1495 | "outputs": [], 1496 | "source": [ 1497 | "test['orderType'] = 0.5 * test['orderType_x_x'] + 0.3 * test['orderType_y_x'] + 0.1 * test['orderType_x_y'] + 0.1 * test['orderType_y_y']" 1498 | ] 1499 | }, 1500 | { 1501 | "cell_type": "code", 1502 | "execution_count": null, 1503 | "metadata": { 1504 | "collapsed": true, 1505 | "scrolled": true 1506 | }, 1507 | "outputs": [], 1508 | "source": [ 1509 | "test[['userid','orderType']].to_csv('../data/output/sub/blend/20180203-0.5bjw+0.3+0.1+0.1ym.csv',index=False)" 1510 | ] 1511 | } 1512 | ], 1513 | "metadata": { 1514 | "kernelspec": { 1515 | "display_name": "Python [default]", 1516 | "language": "python", 1517 | "name": "python2" 1518 | }, 1519 | "language_info": { 1520 | "codemirror_mode": { 1521 | "name": "ipython", 1522 | "version": 2 1523 | }, 1524 | "file_extension": ".py", 1525 | "mimetype": "text/x-python", 1526 | "name": "python", 1527 | "nbconvert_exporter": "python", 1528 | "pygments_lexer": "ipython2", 1529 | "version": "2.7.13" 1530 | } 1531 | }, 1532 | "nbformat": 4, 1533 | "nbformat_minor": 1 1534 | } 1535 | --------------------------------------------------------------------------------