├── .gitignore
├── Instacart 2nd Place Solution.pptx
├── LICENSE
├── README.md
├── appendix
│   ├── 317_.py
│   ├── 501_concat.py
│   ├── 901_reorder_base.py
│   ├── 902_reorder.py
│   ├── 903_Faron_opt_bagging.py
│   ├── README.md
│   └── utils.py
├── input
│   └── README.md
├── output
│   └── sub
│       └── final
│           └── Faron-opt_bagging-v3.csv.gz
├── py_feature
│   ├── 000_mk.py
│   ├── 003_X_base_T.py
│   ├── 004_label.py
│   ├── 005_inarow.py
│   ├── 006_days_since_last_order.py
│   ├── 007_timezone.py
│   ├── 008_product_feature.py
│   ├── 009_None.py
│   ├── 010_streak.py
│   ├── 011_replacement.py
│   ├── 012_aisle_dep_cumsum.py
│   ├── 100_======user_feature======
│   ├── 101_repeat_previous_ratio_T.py
│   ├── 102_orderspan_average.py
│   ├── 103_visit_time.py
│   ├── 104_organic.py
│   ├── 105_delta_time.py
│   ├── 108_order_size.py
│   ├── 109_have_you_bought.py
│   ├── 110_None.py
│   ├── 200_======item_feature======
│   ├── 202_buy_time.py
│   ├── 203_cycle.py
│   ├── 205_co-occur.py
│   ├── 207_mean_pos_cart.py
│   ├── 208_one-shot.py
│   ├── 209_together.py
│   ├── 210_streak.py
│   ├── 211_1to1.py
│   ├── 212_withinN.py
│   ├── 213_dow_diff.py
│   ├── 214_first_order.py
│   ├── 215_onb_diff.py
│   ├── 300_======user x item======
│   ├── 301_total_buy.py
│   ├── 302-1_reorderd_all.py
│   ├── 303_last_order_date.py
│   ├── 304_buy_item_inarow.py
│   ├── 305_last_order_num.py
│   ├── 306_mean_pos_cart.py
│   ├── 307_timezone_dow.py
│   ├── 308_timezone_dow.py
│   ├── 309_order_ratio_by-chance.py
│   ├── 310_repeat_within_today.py
│   ├── 312_cycle.py
│   ├── 313_aisle_dep.py
│   ├── 314_co-occur.py
│   ├── 315_streak.py
│   ├── 316_replacement.py
│   ├── 400_===== daytime =====
│   ├── 401_how_many_come.py
│   ├── 500_===== concat =====
│   ├── 501_concat.py
│   ├── 502_concat.py
│   ├── 900_===== run =====
│   ├── 901_run_feature.py
│   ├── 902_run_concat.py
│   ├── run.py
│   └── utils.py
├── py_model
│   ├── 000_====== user x item prediction ======
│   ├── 002_xgb_holdout_item_812_1.py
│   ├── 002_xgb_holdout_item_813_1.py
│   ├── 002_xgb_holdout_item_813_3.py
│   ├── 100_====== None prediction ======
│   ├── 102_xgb_holdout_None_813_3.py
│   ├── 102_xgb_holdout_None_814_1.py
│   ├── 102_xgb_holdout_None_814_2.py
│   ├── 102_xgb_holdout_None_814_3.py
│   ├── 200_===== threshold estimation =====
│   ├── 201_Faron_opt_bagging_815_3.py
│   ├── 999_run.py
│   ├── LOG
│   │   ├── 812_1_xgb_item.txt
│   │   ├── 813_1_xgb_item.txt
│   │   ├── 813_3_xgb_None.txt
│   │   ├── 813_3_xgb_item.txt
│   │   ├── 814_1_xgb_None.txt
│   │   ├── 814_2_xgb_None.txt
│   │   └── 814_3_xgb_None.txt
│   ├── opt_fscore.py
│   ├── pyx_get_best_items.pyx
│   └── utils.py
└── requirements.txt

/.gitignore:
--------------------------------------------------------------------------------
1 | *.DS_Store
2 | input/
3 | output/
4 | data/
5 | py/.ipynb_checkpoints
6 | py/*.model
7 | py/*.p
8 |
--------------------------------------------------------------------------------
/Instacart 2nd Place Solution.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KazukiOnodera/Instacart/416b6b0220d3aed62c8d323caa3ee46f4b614a72/Instacart 2nd Place Solution.pptx
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 KazukiOnodera
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Instacart Market Basket Analysis 2nd place solution
2 |
3 | I made two models: one for predicting reorder and one for predicting None.
4 | The features I made are listed below.
5 |
6 | ## Features
7 | ### User feature
8 | * How often the user reordered items
9 | * Time between orders
10 | * Time of day the user visits
11 | * Whether the user ordered organic, gluten-free, or Asian items in the past
12 | * Features based on order sizes
13 | * How many of the user’s orders contained no previously purchased items
14 |
15 | ### Item feature
16 | * How often the item is purchased
17 | * Position in the cart
18 | * How many users buy it as a "one-shot" item
19 | * Stats on the number of items that co-occur with this item
20 | * Stats on the order streak
21 | * Probability of being reordered within N orders
22 | * Distribution of the day of week it is ordered
23 | * Probability it is reordered after the first order
24 | * Statistics around the time between orders
25 |
26 | ### User x Item feature
27 | * Number of orders in which the user purchases the item
28 | * Days since the user last purchased the item
29 | * Streak (number of orders in a row the user has purchased the item)
30 | * Position in the cart
31 | * Whether the user already ordered the item today
32 | * Co-occurrence statistics
33 | * Replacement items
34 |
35 | ### Datetime feature
36 | * Counts by day of week
37 | * Counts by hour
38 |
39 | For more detail, please refer to the code.
40 |
41 | ## F1 maximization
42 | Regarding F1 maximization, I hadn't read the paper until Faron published his kernel, but I still got a high score because of my own F1 maximization.
43 | Let me explain it.
44 | To maximize F1, I generate y_true samples according to the predicted probabilities, and then check F1 starting from the highest-probability items.
45 | For example, let's say an order has items with predicted probabilities like {A: 0.3, B: 0.5, C: 0.4}. Then we generate y_true many times (9999 times in my case).
46 | So now we have many samples of y_true, like [[A,B], [B], [B,C], [C], [B], [None], ...].
47 | As mentioned above, the next thing we do is check the expected F1 of [B], then [B,C], then [B,C,A]. Once the F1 peaks out, we can stop the calculation and go to the next order.
48 | Note that with this method we don't need to check every pattern, like [A], [A,B], [A,B,C], [B], ...
49 | I guess some people might have figured out this method from my comment about "tips to go farther".
50 | However, this method is time-consuming and depends on the random seed, so in the end I used Faron's kernel.
51 | Fortunately or not, I got almost the same result using Faron's kernel.
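Here is a minimal sketch of the sampling idea for a single order. It is illustrative only: `expected_f1` and `best_items` are hypothetical names, not the competition code (the real, optimized implementation is the Cython file referenced below).

```python
import numpy as np

def expected_f1(pred, samples):
    """Mean F1 of one candidate prediction over sampled ground truths."""
    pred = set(pred)
    total = 0.0
    for truth in samples:
        tp = len(pred & truth)
        if tp:
            total += 2.0 * tp / (len(pred) + len(truth))  # F1 = 2TP / (|P| + |T|)
    return total / len(samples)

def best_items(probs, n_samples=9999, seed=0):
    """probs: predicted reorder probabilities, e.g. {'A': 0.3, 'B': 0.5, 'C': 0.4}."""
    rng = np.random.RandomState(seed)
    items = sorted(probs, key=probs.get, reverse=True)   # ['B', 'C', 'A']
    p = np.array([probs[i] for i in items])
    draws = rng.rand(n_samples, len(items)) < p          # generate y_true samples
    samples = [set(i for i, hit in zip(items, row) if hit) or {'None'}
               for row in draws]
    best, best_f1 = ['None'], expected_f1(['None'], samples)
    for k in range(1, len(items) + 1):                   # check [B], [B,C], [B,C,A]
        f1 = expected_f1(items[:k], samples)
        if f1 < best_f1:
            break                                        # F1 peaked out: stop here
        best, best_f1 = items[:k], f1
    return best, best_f1
```

Because the ground truths are sampled, the output depends on the seed, which is exactly the drawback mentioned above.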
52 | Please refer to py_model/pyx_get_best_items.pyx
53 |
54 | ## How to run
55 | * cd py_feature
56 | * python 901_run_feature.py
57 | * python 902_run_concat.py
58 | * cd ../py_model
59 | * python 999_run.py
60 |
61 | ## Requirements
62 | Around 300 GB of RAM is needed (sorry).
63 | But I confirmed we can get 0.4073 on the private LB with only around 60 GB of RAM.
64 | Also, if you don't have enough memory but still want a high score, try continued training using
65 | the xgb_model argument of xgb.train.
66 |
67 | Python packages:
68 | - numpy==1.12.1
69 | - pandas==0.19.2
70 | - scipy==0.19.0
71 | - tqdm==4.11.2
72 | - xgboost==0.6
73 |
--------------------------------------------------------------------------------
/appendix/317_.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Sun Jun 18 12:55:38 2017
5 |
6 | @author: konodera
7 |
8 | https://twitter.com/jeremystan/status/911357665481080832
9 |
10 | 6/ most novel feature:
11 | binary user by product purchase sequence ->
12 | decimal -> XGBoost learns non-trivial sequence patterns
13 |
14 | """
15 |
16 | import pandas as pd
17 | import numpy as np
18 | from tqdm import tqdm
19 | from decimal import Decimal
20 | import utils
21 | #utils.start(__file__)
22 |
23 |
24 | #==============================================================================
25 | # load
26 | #==============================================================================
27 | col = ['order_id', 'user_id', 'product_id', 'order_number', 'order_number_rev']
28 | log = utils.read_pickles('../input/mk/log', col).sort_values(['user_id', 'product_id', 'order_number'])
29 |
30 |
31 | #==============================================================================
32 | # def
33 | #==============================================================================
34 | def conv_bi2dec(seq, onb_max, reverse=True, deci=10):
35 |     """
36 |     ex.
37 |     seq = [1,3,4]
38 |     onb_max = 6
39 |     101100 -> 44
40 |     001101 -> 13
41 |     """
42 |
43 |     bi = [0]*onb_max
44 |     for i in seq:
45 |         bi[i-1] = 1
46 |
47 |     if reverse:
48 |         bi = ''.join(map(str, bi))[::-1]
49 |     else:
50 |         bi = ''.join(map(str, bi))
51 |
52 |     if deci==10:
53 |         return int(bi, 2)
54 |     elif deci==2:
55 |         return int(bi)
56 |     elif deci==.2:
57 |         return float(bi[0] + '.' + bi[1:])
58 |     else:
59 |         raise ValueError(deci)
60 |
61 | def make(T):
62 |     """
63 |     T = 0
64 |     folder = 'trainT-0'
65 |     """
66 |     if T==-1:
67 |         folder = 'test'
68 |     else:
69 |         folder = 'trainT-'+str(T)
70 |
71 |     log_ = log[log.order_number_rev>T]
72 |     log_['onb_max'] = log_.groupby('user_id').order_number.transform(np.max)
73 |
74 |     r1_d10 = []
75 |     r1_d2 = []
76 |     r1_df2 = []
77 |     r0_d10 = []
78 |     r0_d2 = []
79 |     r0_df2 = []
80 |
81 |     seq = []
82 |     uid_bk = pid_bk = onb_max_bk = None
83 |     for uid,pid,onb,onb_max in tqdm(log_[['user_id', 'product_id', 'order_number', 'onb_max']].values):
84 |
85 |         if uid_bk is None:
86 |             pass
87 |
88 |         elif uid==uid_bk and pid==pid_bk:
89 |             pass
90 |
91 |         elif uid!=uid_bk or pid!=pid_bk:
92 |             r1_d10.append(conv_bi2dec(seq, onb_max_bk, True, 10))  # r1: reversed bits
93 |             r1_d2.append(conv_bi2dec(seq, onb_max_bk, True, 2))
94 |             r1_df2.append(conv_bi2dec(seq, onb_max_bk, True, .2))
95 |             r0_d10.append(conv_bi2dec(seq, onb_max_bk, False, 10))  # r0: original order
96 |             r0_d2.append(conv_bi2dec(seq, onb_max_bk, False, 2))
97 |             r0_df2.append(conv_bi2dec(seq, onb_max_bk, False, .2))
98 |             seq = []
99 |
100 |         seq.append(onb)
101 |         uid_bk = uid
102 |         pid_bk = pid
103 |         onb_max_bk = onb_max
104 |
105 |     r1_d10.append(conv_bi2dec(seq, onb_max_bk, True, 10))
106 |     r1_d2.append(conv_bi2dec(seq, onb_max_bk, True, 2))
107 |     r1_df2.append(conv_bi2dec(seq, onb_max_bk, True, .2))
108 |     r0_d10.append(conv_bi2dec(seq, onb_max_bk, False, 10))
109 |     r0_d2.append(conv_bi2dec(seq, onb_max_bk, False, 2))
110 |     r0_df2.append(conv_bi2dec(seq, onb_max_bk, False, .2))
111 |
112 |     df = log_[['user_id', 'product_id']].drop_duplicates(keep='first').reset_index(drop=True)
113 |     df['seq2dec_r1_d10'] = r1_d10
114 |     df['seq2dec_r1_d2'] = r1_d2
115 |     df['seq2dec_r1_df2'] = r1_df2
116 |     df['seq2dec_r0_d10'] = r0_d10
117 |     df['seq2dec_r0_d2'] = r0_d2
118 |     df['seq2dec_r0_df2'] = r0_df2
119 |
120 |     df.to_pickle('../feature/{}/f317_user-product.p'.format(folder))
121 |
122 |
123 | #==============================================================================
124 | # main
125 | #==============================================================================
126 | make(0)
127 | #make(1)
128 | #make(2)
129 |
130 | make(-1)
131 |
132 |
133 |
134 | #==============================================================================
135 | utils.end(__file__)
136 |
137 |
--------------------------------------------------------------------------------
/appendix/501_concat.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Mon Jul 10 04:11:27 2017
5 |
6 | @author: konodera
7 |
8 |
9 | nohup python -u 501_concat.py &
10 |
11 | """
12 | import pandas as pd
13 | import numpy as np
14 | from tqdm import tqdm
15 | import multiprocessing as mp
16 | import gc
17 | import utils
18 | #utils.start(__file__)
19 |
20 | #==============================================================================
21 | # def
22 | #==============================================================================
23 | def concat_pred_item(T, dryrun=False):
24 |     if T==-1:
25 |         name = 'test'
26 |     else:
27 |         name = 'trainT-'+str(T)
28 |
29 |     df = utils.load_pred_item(name)
30 |
31 |     df = pd.merge(df, pd.read_pickle('../feature/{}/f317_user-product.p'.format(name)),
32 |                   on=['user_id', 'product_id'], how='left')
33 |
34 |     gc.collect()
35 |
36 |     #==============================================================================
37 |     print('output')
38 |
#============================================================================== 39 | if dryrun == True: 40 | return df 41 | else: 42 | utils.to_pickles(df, '../feature/{}/all_apdx'.format(name), 20, inplace=True) 43 | 44 | def multi(name): 45 | concat_pred_item(name) 46 | 47 | #============================================================================== 48 | 49 | # multi 50 | mp_pool = mp.Pool(2) 51 | mp_pool.map(multi, [0, -1]) 52 | 53 | 54 | 55 | utils.end(__file__) 56 | 57 | -------------------------------------------------------------------------------- /appendix/901_reorder_base.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue May 30 23:28:19 2017 5 | 6 | @author: konodera 7 | 8 | nohup python -u 901_reorder_base.py > LOG/_xgb_reorder.txt & 9 | 10 | 11 | """ 12 | 13 | import warnings 14 | warnings.filterwarnings("ignore") 15 | import pandas as pd 16 | import numpy as np 17 | import gc 18 | import xgboost as xgb 19 | import utils 20 | 21 | utils.start(__file__) 22 | 23 | 24 | 25 | # setting 26 | OUTF = '../output/sub/apdx/bench.p' 27 | LOOP = 3 28 | ESR = 40 29 | 30 | #seed = np.random.randint(99999) 31 | seed = 71 32 | 33 | np.random.seed(seed) 34 | 35 | valid_size = 0.05 36 | 37 | 38 | # XGB param 39 | nround = 10000 40 | #nround = 10 41 | 42 | param = {'max_depth':10, 43 | 'eta':0.02, 44 | 'colsample_bytree':0.4, 45 | 'subsample':0.75, 46 | 'silent':1, 47 | 'nthread':27, 48 | 'eval_metric':'logloss', 49 | 'objective':'binary:logistic', 50 | 'tree_method':'hist' 51 | } 52 | 53 | print("""#==== print param ======""") 54 | print('OUTF:', OUTF) 55 | print('seed:', seed) 56 | 57 | utils.mkdir_p('../output/model/apdx') 58 | utils.mkdir_p('../output/imp/apdx') 59 | utils.mkdir_p('../output/sub/apdx') 60 | 61 | #============================================================================== 62 | # prepare 63 | #============================================================================== 64 | train = utils.load_pred_item('trainT-0') 65 | 66 | y_train = train['y'] 67 | X_train = train.drop('y', axis=1) 68 | del train 69 | gc.collect() 70 | 71 | # drop id 72 | col = [c for c in X_train.columns if '_id' in c] + ['is_train'] 73 | col.remove('user_id') 74 | print('drop1',col) 75 | X_train.drop(col, axis=1, inplace=True) # keep user_id 76 | 77 | # drop obj 78 | col = X_train.dtypes[X_train.dtypes=='object'].index.tolist() 79 | print('drop2',col) 80 | X_train.drop(col, axis=1, inplace=True) 81 | 82 | X_train.fillna(-1, inplace=1) 83 | 84 | #============================================================================== 85 | # SPLIT! 
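# NOTE: validation is split by user rather than by row -- each user
# contributes many (user, product) rows, so a row-wise split would put the
# same user's behaviour in both build and valid and leak into validation.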
86 | print('split by user') 87 | #============================================================================== 88 | train_user = X_train[['user_id']].drop_duplicates() 89 | 90 | def split_build_valid(): 91 | 92 | train_user['is_valid'] = np.random.choice([0,1], size=len(train_user), 93 | p=[1-valid_size, valid_size]) 94 | valid_n = train_user['is_valid'].sum() 95 | build_n = (train_user.shape[0] - valid_n) 96 | 97 | print('build user:{}, valid user:{}'.format(build_n, valid_n)) 98 | valid_user = train_user[train_user['is_valid']==1].user_id 99 | is_valid = X_train.user_id.isin(valid_user) 100 | 101 | dbuild = xgb.DMatrix(X_train[~is_valid].drop('user_id', axis=1), y_train[~is_valid]) 102 | dvalid = xgb.DMatrix(X_train[is_valid].drop('user_id', axis=1), label=y_train[is_valid]) 103 | watchlist = [(dbuild, 'build'),(dvalid, 'valid')] 104 | 105 | print('FINAL SHAPE') 106 | print('dbuild.shape:{} dvalid.shape:{}\n'.format((dbuild.num_row(), dbuild.num_col()), 107 | (dvalid.num_row(), dvalid.num_col()))) 108 | 109 | return dbuild, dvalid, watchlist 110 | 111 | #============================================================================== 112 | print('hold out') 113 | #============================================================================== 114 | 115 | # hold out 116 | models = [] 117 | for i in range(LOOP): 118 | print('LOOP',i) 119 | dbuild, dvalid, watchlist = split_build_valid() 120 | 121 | if i==0: 122 | col_train = dbuild.feature_names 123 | 124 | model = xgb.train(param, dbuild, nround, watchlist, 125 | early_stopping_rounds=ESR, verbose_eval=5) 126 | models.append(model) 127 | # model.save_model('../output/model/{}/xgb_item_{}.model'.format(DATE, i)) 128 | # VALID 129 | valid_yhat = model.predict(dvalid) 130 | print('Valid Mean:', np.mean(valid_yhat)) 131 | del dbuild, dvalid, watchlist 132 | gc.collect() 133 | 134 | del train_user, X_train, y_train 135 | gc.collect() 136 | 137 | #============================================================================== 138 | print('test') 139 | #============================================================================== 140 | test = utils.load_pred_item('test').fillna(-1) 141 | 142 | sub_test = test[['order_id', 'product_id']] 143 | 144 | dtest = xgb.DMatrix(test[col_train]) 145 | sub_test['yhat'] = 0 146 | for model in models: 147 | sub_test['yhat'] += model.predict(dtest) 148 | sub_test['yhat'] /= LOOP 149 | print('Test Mean:', sub_test['yhat'].mean()) 150 | 151 | sub_test.to_pickle(OUTF) 152 | 153 | 154 | #============================================================================== 155 | utils.end(__file__) 156 | 157 | 158 | 159 | -------------------------------------------------------------------------------- /appendix/902_reorder.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue May 30 23:28:19 2017 5 | 6 | @author: konodera 7 | 8 | nohup python -u 902_reorder.py > LOG/_xgb_item.txt & 9 | 10 | 11 | """ 12 | 13 | import warnings 14 | warnings.filterwarnings("ignore") 15 | import pandas as pd 16 | import numpy as np 17 | import gc 18 | import xgboost as xgb 19 | import utils 20 | 21 | utils.start(__file__) 22 | 23 | 24 | 25 | # setting 26 | OUTF = '../output/sub/apdx/seq2dec.p' 27 | LOOP = 2 28 | ESR = 40 29 | 30 | #seed = np.random.randint(99999) 31 | seed = 71 32 | 33 | np.random.seed(seed) 34 | 35 | valid_size = 0.05 36 | 37 | 38 | # XGB param 39 | nround = 10000 40 | #nround = 10 41 | 42 | param = 
{'max_depth':10, 43 | 'eta':0.02, 44 | 'colsample_bytree':0.4, 45 | 'subsample':0.75, 46 | 'silent':1, 47 | 'nthread':27, 48 | 'eval_metric':'logloss', 49 | 'objective':'binary:logistic', 50 | 'tree_method':'hist' 51 | } 52 | 53 | print("""#==== print param ======""") 54 | print('OUTF:', OUTF) 55 | print('seed:', seed) 56 | 57 | #============================================================================== 58 | # prepare 59 | #============================================================================== 60 | train = utils.read_pickles('../feature/{}/all_apdx'.format('trainT-0')) 61 | 62 | # f317 obj into int 63 | #col = [c for c in train.columns if 'seq2' in c and not '_df' in c] 64 | #train[col] = train[col].astype(np.float32) 65 | 66 | y_train = train['y'] 67 | X_train = train.drop('y', axis=1) 68 | del train 69 | gc.collect() 70 | 71 | # drop id 72 | col = [c for c in X_train.columns if '_id' in c] + ['is_train'] 73 | col.remove('user_id') 74 | print('drop1',col) 75 | X_train.drop(col, axis=1, inplace=True) # keep user_id 76 | 77 | # drop obj 78 | col = X_train.dtypes[X_train.dtypes=='object'].index.tolist()+['seq2dec_r0_df2'] 79 | print('drop2',col) 80 | X_train.drop(col, axis=1, inplace=True) 81 | 82 | X_train.fillna(-1, inplace=1) 83 | 84 | #============================================================================== 85 | # SPLIT! 86 | print('split by user') 87 | #============================================================================== 88 | train_user = X_train[['user_id']].drop_duplicates() 89 | 90 | def split_build_valid(): 91 | 92 | train_user['is_valid'] = np.random.choice([0,1], size=len(train_user), 93 | p=[1-valid_size, valid_size]) 94 | valid_n = train_user['is_valid'].sum() 95 | build_n = (train_user.shape[0] - valid_n) 96 | 97 | print('build user:{}, valid user:{}'.format(build_n, valid_n)) 98 | valid_user = train_user[train_user['is_valid']==1].user_id 99 | is_valid = X_train.user_id.isin(valid_user) 100 | 101 | dbuild = xgb.DMatrix(X_train[~is_valid].drop('user_id', axis=1), y_train[~is_valid]) 102 | dvalid = xgb.DMatrix(X_train[is_valid].drop('user_id', axis=1), label=y_train[is_valid]) 103 | watchlist = [(dbuild, 'build'),(dvalid, 'valid')] 104 | 105 | print('FINAL SHAPE') 106 | print('dbuild.shape:{} dvalid.shape:{}\n'.format((dbuild.num_row(), dbuild.num_col()), 107 | (dvalid.num_row(), dvalid.num_col()))) 108 | 109 | return dbuild, dvalid, watchlist 110 | 111 | #============================================================================== 112 | print('hold out') 113 | #============================================================================== 114 | 115 | # hold out 116 | models = [] 117 | for i in range(LOOP): 118 | print('LOOP',i) 119 | dbuild, dvalid, watchlist = split_build_valid() 120 | 121 | if i==0: 122 | col_train = dbuild.feature_names 123 | 124 | model = xgb.train(param, dbuild, nround, watchlist, 125 | early_stopping_rounds=ESR, verbose_eval=5) 126 | models.append(model) 127 | # model.save_model('../output/model/{}/xgb_item_{}.model'.format(DATE, i)) 128 | # VALID 129 | valid_yhat = model.predict(dvalid) 130 | print('Valid Mean:', np.mean(valid_yhat)) 131 | del dbuild, dvalid, watchlist 132 | gc.collect() 133 | 134 | del train_user, X_train, y_train 135 | gc.collect() 136 | 137 | #============================================================================== 138 | print('test') 139 | #============================================================================== 140 | test = 
utils.read_pickles('../feature/{}/all_apdx'.format('test')).fillna(-1) 141 | 142 | # f317 obj into int 143 | #col = [c for c in test.columns if 'seq2' in c and not '_df' in c] 144 | #test[col] = test[col].astype(np.float32) 145 | 146 | 147 | sub_test = test[['order_id', 'product_id']] 148 | 149 | dtest = xgb.DMatrix(test[col_train]) 150 | sub_test['yhat'] = 0 151 | for model in models: 152 | sub_test['yhat'] += model.predict(dtest) 153 | sub_test['yhat'] /= LOOP 154 | print('Test Mean:', sub_test['yhat'].mean()) 155 | 156 | sub_test.to_pickle(OUTF) 157 | 158 | 159 | #============================================================================== 160 | utils.end(__file__) 161 | 162 | 163 | 164 | -------------------------------------------------------------------------------- /appendix/903_Faron_opt_bagging.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sat Jul 29 18:59:46 2017 5 | 6 | @author: konodera 7 | 8 | nohup python -u 201_Faron_opt_bagging_815_3.py > LOG/_Faron-opt.txt & 9 | 10 | """ 11 | 12 | import pandas as pd 13 | import sys 14 | sys.path.append('../py_model') 15 | from opt_fscore import get_best_prediction 16 | import multiprocessing as mp 17 | import time 18 | import utils 19 | utils.start(__file__) 20 | 21 | 22 | # setting 23 | DATE_None = ['813_3', '814_1', '814_2', '814_3'] 24 | 25 | total_proc = 60 26 | 27 | 28 | utils.mkdir_p('../output/sub/apdx') 29 | #============================================================================== 30 | # def 31 | #============================================================================== 32 | def multi(i): 33 | if i%1000==0: 34 | print('{:.3f} min'.format((time.time()-st_time)/60)) 35 | items = sub.loc[i,'product_id'] 36 | preds = sub.loc[i,'yhat'] 37 | pNone = sub.loc[i,'yhat_None'] 38 | ret = get_best_prediction(items, preds, pNone) 39 | return ret 40 | 41 | def mk_sub(DATE_item): 42 | print("""#==== print param ======""") 43 | print('OUTF:', OUTF) 44 | print('DATE_item:', DATE_item) 45 | print('DATE_None:', DATE_None) 46 | print('total_proc:', total_proc) 47 | 48 | global sub, st_time 49 | 50 | sub_item = pd.read_pickle('../output/sub/{}/sub_test.p'.format(DATE_item)) 51 | sub = sub_item.groupby('order_id').product_id.apply(list).to_frame() 52 | sub['yhat'] = sub_item.groupby('order_id').yhat.apply(list) 53 | 54 | # weighted 55 | for i,(w,d) in enumerate(zip([0.1, 0.1, 0.4, 0.4], DATE_None)): 56 | tmp = pd.read_pickle('../output/sub/{}/sub_test_None.p'.format(d)).rename(columns={'yhat':'yhat_None'}) 57 | tmp.yhat_None *= w 58 | if i==0: 59 | sub_None = tmp 60 | else: 61 | sub_None = pd.concat([sub_None, tmp]) 62 | 63 | sub_None = sub_None.groupby('order_id').yhat_None.sum().reset_index() 64 | 65 | sub = pd.merge(sub.reset_index(), sub_None, on='order_id', how='left') 66 | 67 | # optimize start!!! 
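    # Each order is optimized independently (multi() calls get_best_prediction
    # once per row), so the rows are simply fanned out over `total_proc` processes.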
68 |     st_time = time.time()
69 |     pool = mp.Pool(total_proc)
70 |     callback = pool.map(multi, range(sub.shape[0]))
71 |
72 |     sub['products'] = callback
73 |
74 |     print('writing...')
75 |     sub[['order_id', 'products']].to_csv(OUTF, index=0, compression='gzip')
76 | #==============================================================================
77 | OUTF = "../output/sub/apdx/bench.csv.gz"
78 | mk_sub('apdx_base')
79 |
80 | OUTF = "../output/sub/apdx/seq2dec.csv.gz"
81 | mk_sub('apdx')
82 |
83 |
84 |
85 |
86 |
87 | #==============================================================================
88 | utils.end(__file__)
89 |
90 |
--------------------------------------------------------------------------------
/appendix/README.md:
--------------------------------------------------------------------------------
1 | # Appendix of Instacart Market Basket Analysis
2 |
3 | After the competition, I wanted to try some ideas.
4 |
5 | ## How to run
6 | * pending
7 |
8 | ## Requirements
9 |
10 | Python packages:
11 | - numpy==1.12.1
12 | - pandas==0.19.2
13 | - scipy==0.19.0
14 | - tqdm==4.11.2
15 | - xgboost==0.6
16 |
--------------------------------------------------------------------------------
/input/README.md:
--------------------------------------------------------------------------------
1 | You need to put the files below in this directory:
2 | - aisles.csv
3 | - departments.csv
4 | - order_products__prior.csv.gz
5 | - order_products__train.csv.gz
6 | - orders.csv.gz
7 | - products.csv
8 | - sample_submission.csv
--------------------------------------------------------------------------------
/output/sub/final/Faron-opt_bagging-v3.csv.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KazukiOnodera/Instacart/416b6b0220d3aed62c8d323caa3ee46f4b614a72/output/sub/final/Faron-opt_bagging-v3.csv.gz
--------------------------------------------------------------------------------
/py_feature/000_mk.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Wed May 17 08:55:13 2017
4 |
5 | @author: konodera
6 | """
7 |
8 | import pandas as pd
9 | import numpy as np
10 | import gc
11 | import os
12 | import utils
13 | utils.start(__file__)
14 |
15 | os.system('rm -rf ../input/mk')
16 | os.system('mkdir ../input/mk')
17 |
18 | os.system('rm -rf ../feature')
19 | os.system('mkdir ../feature')
20 |
21 | #==============================================================================
22 | # test user
23 | #==============================================================================
24 | orders = pd.read_csv('../input/orders.csv.gz')
25 |
26 | test_user = orders.loc[orders.eval_set=='test'].reset_index(drop=1)
27 | test_user[['order_id', 'user_id']].to_pickle('../input/mk/test_user.p')
28 |
29 |
30 | #==============================================================================
31 | # goods
32 | #==============================================================================
33 | products = pd.read_csv('../input/products.csv')
34 | products.product_name = products.product_name.str.replace(' ', '-')
35 |
36 | aisles = pd.read_csv('../input/aisles.csv', engine='c')
37 | departments = pd.read_csv('../input/departments.csv', engine='c')
38 |
39 | goods = pd.merge(left=pd.merge(left=products, right=departments, how='left'), right=aisles, how='left')
40 |
41 |
42 | goods.to_pickle('../input/mk/goods.p')
43 | gc.collect()
44 |
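# The master `log` built in the next block concatenates the prior and train
# order_products files, joins product and order metadata, and adds
# order_number_rev (0 = the user's most recent order), which the T-offset
# labelling downstream relies on.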
#============================================================================== 45 | # log 46 | #============================================================================== 47 | log = pd.concat([pd.read_csv('../input/order_products__prior.csv.gz'), 48 | pd.read_csv('../input/order_products__train.csv.gz')], 49 | ignore_index=1) 50 | 51 | log.sort_values(['order_id', 'add_to_cart_order'], inplace=True) 52 | log.reset_index(drop=1, inplace=True) 53 | log = pd.merge(log, goods, on='product_id', how='left') 54 | log = pd.merge(log, orders, on='order_id', how='left') 55 | log['order_number_rev'] = log.groupby('user_id').order_number.transform(np.max) - log.order_number 56 | 57 | utils.to_pickles(log, '../input/mk/log', 20) 58 | 59 | gc.collect() 60 | #============================================================================== 61 | # order_tbl 62 | #============================================================================== 63 | order_product = log.groupby('order_id').product_name.apply(list).reset_index() 64 | order_tbl = pd.merge(orders, order_product, on='order_id', how='left') 65 | 66 | order_tbl.sort_values(['user_id', 'order_number'],inplace=True) 67 | order_tbl.reset_index(drop=1, inplace=True) 68 | order_tbl = pd.merge(order_tbl, log[['order_id','order_number_rev']].drop_duplicates(), on='order_id', how='left') 69 | order_tbl.order_number_rev = order_tbl.order_number_rev.fillna(-1).astype(int) 70 | #order_tbl['order_number_rev'] = order_tbl.groupby('user_id').order_number.transform(np.max) - order_tbl.order_number 71 | order_tbl['days_since_first_order'] = order_tbl.groupby('user_id').days_since_prior_order.cumsum() 72 | 73 | def set_diff(items1, items2): 74 | if isinstance(items1, float) or isinstance(items2, float): 75 | return items1 76 | return [i1 for i1 in items1 if i1 not in items2] 77 | 78 | def same_products(items1, items2): 79 | if isinstance(items1, float) or isinstance(items2, float): 80 | return [] 81 | return [i1 for i1 in items1 if i1 in items2] 82 | 83 | order_tbl['t-1_product_name'] = order_tbl.groupby('user_id')['product_name'].shift(1) 84 | order_tbl['set_diff_products'] = order_tbl.apply(lambda x: set_diff(x['product_name'], x['t-1_product_name']), axis=1) 85 | order_tbl['same_products'] = order_tbl.apply(lambda x: same_products(x['product_name'], x['t-1_product_name']), axis=1) 86 | 87 | order_tbl.to_pickle('../input/mk/order_tbl.p') 88 | gc.collect() 89 | #============================================================================== 90 | # order_aisle-department 91 | #============================================================================== 92 | order_aisle = pd.crosstab(log['order_id'], 93 | log['aisle_id']).add_prefix('aisle_').reset_index() 94 | 95 | order_department = pd.crosstab(log['order_id'], 96 | log['department_id']).add_prefix('department_').reset_index() 97 | 98 | order_aisle = pd.merge(order_aisle, order_department, on='order_id', how='left') 99 | 100 | order_aisle.to_pickle('../input/mk/order_aisle-department.p') 101 | 102 | del order_aisle, order_department 103 | gc.collect() 104 | 105 | #============================================================================== 106 | # order_reorderd 107 | #============================================================================== 108 | log_ = log.loc[log.reordered==1] 109 | order_reorderd = log_.groupby('order_id').product_id.apply(list).reset_index() 110 | 111 | order_reorderd.to_pickle('../input/mk/order_reorderd.p') 112 | gc.collect() 113 | 114 | 
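# The user_order block below collects, per user, the unique set of product
# names ever ordered and maps the names back to product_ids via goods_di.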
#============================================================================== 115 | # user_order 116 | #============================================================================== 117 | from itertools import chain 118 | 119 | order_tbl = pd.read_pickle('../input/mk/order_tbl.p') 120 | order_tbl = order_tbl.loc[order_tbl.eval_set!='test'] 121 | 122 | goods = pd.read_pickle('../input/mk/goods.p') 123 | 124 | goods_di = {} 125 | for k,v in zip(goods.product_name, goods.product_id): 126 | goods_di[k] = v 127 | 128 | 129 | def sum_list(x): 130 | return list(chain.from_iterable(x)) 131 | 132 | def to_unique(lists): 133 | li = sum_list(lists) 134 | return list(set(li)) 135 | 136 | def to_ids(names): 137 | ids = [goods_di[n] for n in names] 138 | return ids 139 | 140 | user_hist = order_tbl.groupby('user_id').product_name.apply(to_unique).reset_index() 141 | user_hist['product_id'] = user_hist.product_name.map(to_ids) 142 | 143 | user_hist.to_pickle('../input/mk/user_order.p') 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | utils.end(__file__) 156 | 157 | 158 | -------------------------------------------------------------------------------- /py_feature/003_X_base_T.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sat May 27 17:05:46 2017 5 | 6 | @author: konodera 7 | """ 8 | 9 | import pandas as pd 10 | import numpy as np 11 | from tqdm import tqdm 12 | import utils 13 | utils.start(__file__) 14 | 15 | #============================================================================== 16 | # load 17 | #============================================================================== 18 | order_tbl = pd.read_pickle('../input/mk/order_tbl.p') 19 | order_tbl = order_tbl[['order_id', 'user_id', 'order_number', 'order_number_rev']] 20 | order_tbl.sort_values(['user_id', 'order_number', 'order_id'], inplace=True) 21 | 22 | test_order = pd.read_pickle('../input/mk/test_user.p') 23 | 24 | #============================================================================== 25 | # def 26 | #============================================================================== 27 | def main(T): 28 | for i in range(1, 1+T): 29 | order_tbl['t-{}_order_id'.format(i)] = order_tbl.groupby('user_id')['order_id'].shift(i) 30 | 31 | order_tbl.dropna(inplace=True) 32 | 33 | col = [c for c in order_tbl.columns if 'order_id' in c] 34 | for c in col: 35 | order_tbl[c] = order_tbl[c].map(int) 36 | 37 | order_tbl.reset_index(drop=1, inplace=True) 38 | 39 | order_tbl['is_train'] = 1-order_tbl.order_id.isin(test_order.order_id)*1 40 | 41 | order_tbl[col+['user_id','is_train']].to_pickle('../feature/X_base_t{}.p'.format(T)) 42 | 43 | 44 | main(3) 45 | main(5) 46 | 47 | #============================================================================== 48 | utils.end(__file__) 49 | 50 | """ 51 | 206209 rows 52 | """ -------------------------------------------------------------------------------- /py_feature/004_label.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Fri May 26 17:07:09 2017 5 | 6 | @author: konodera 7 | """ 8 | 9 | import pandas as pd 10 | import numpy as np 11 | from tqdm import tqdm 12 | import utils 13 | utils.start(__file__) 14 | 15 | col = ['order_id', 'user_id', 'product_id', 'order_number', 'reordered', 'order_number_rev'] 16 | log = utils.read_pickles('../input/mk/log', 
col).rename(columns={'reordered':'y'}) 17 | 18 | test_order = pd.read_pickle('../input/mk/test_user.p') 19 | 20 | #============================================================================== 21 | # train 22 | #============================================================================== 23 | def make(T): 24 | label_t1 = log[log.order_number_rev>T] 25 | label_t1.drop_duplicates(['user_id','product_id'], keep='last', inplace=True) 26 | label_t1.sort_values(['user_id','product_id'], inplace=True) 27 | 28 | label_t0_y1 = log.loc[log.order_number_rev==T].loc[log.y==1] 29 | label_t0_y1.sort_values(['user_id','product_id'], inplace=True) 30 | 31 | label_t1['key'] = label_t1.user_id.map(str) + ' ' + label_t1.product_id.map(str) 32 | label_t0_y1['key'] = label_t0_y1.user_id.map(str) + ' ' + label_t0_y1.product_id.map(str) 33 | label_t0_y0 = label_t1[~label_t1.key.isin(label_t0_y1.key)] 34 | 35 | label_t0_y0.drop('order_id', axis=1 ,inplace=True) 36 | label_t0_y0 = pd.merge(label_t0_y0, log.loc[log.order_number_rev==T, ['user_id','order_id']].drop_duplicates(), 37 | on='user_id', how='left') 38 | label_t0_y0.y = 0 39 | 40 | label_train = pd.concat([label_t0_y1, label_t0_y0], ignore_index=1) 41 | label_train.sort_values(['user_id','product_id'], inplace=True) 42 | label_train.reset_index(drop=1, inplace=True) 43 | 44 | col = ['order_id', 'product_id', 'y'] 45 | 46 | print(label_train[col].isnull().sum()) 47 | utils.mkdir_p('../feature/trainT-{}'.format(T)) 48 | label_train[col].to_pickle('../feature/trainT-{}/label_reordered.p'.format(T)) 49 | 50 | make(0) # basically train is T=0, for validation, train;T=1 valid;T=0 51 | make(1) 52 | make(2) 53 | 54 | #============================================================================== 55 | # test 56 | #============================================================================== 57 | log_test = log.drop_duplicates(['user_id','product_id'])[['user_id','product_id']] 58 | log_test = log_test[log_test.user_id.isin(test_order.user_id)] 59 | 60 | log_test.sort_values(['user_id','product_id'],inplace=True) 61 | log_test.reset_index(drop=1, inplace=True) 62 | 63 | test_order = pd.merge(test_order, log_test, on='user_id', how='left') 64 | 65 | print(test_order[['order_id', 'product_id']].isnull().sum()) 66 | utils.mkdir_p('../feature/test') 67 | test_order[['order_id', 'product_id']].to_pickle('../feature/test/label_reordered.p') 68 | 69 | 70 | 71 | 72 | #============================================================================== 73 | utils.end(__file__) 74 | 75 | -------------------------------------------------------------------------------- /py_feature/005_inarow.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed May 31 00:29:00 2017 5 | 6 | @author: konodera 7 | 8 | 9 | """ 10 | import pandas as pd 11 | import numpy as np 12 | from tqdm import tqdm 13 | import utils 14 | utils.start(__file__) 15 | 16 | 17 | col = ['order_id', 'user_id', 'product_id', 'order_number', 'reordered'] 18 | log = utils.read_pickles('../input/mk/log', col) 19 | log.sort_values(['user_id', 'product_id', 'order_number'], inplace=True) 20 | 21 | 22 | 23 | uid_bk = pid_bk = onum_bk = None 24 | ret = [] 25 | miniters = int(log.shape[0]/50) 26 | col = ['user_id', 'product_id', 'order_number'] 27 | for uid,pid,onum in tqdm(log[col].values,miniters=miniters): 28 | if uid_bk is None: 29 | cnt = 1 30 | ret.append(cnt) 31 | elif uid == uid_bk and pid == pid_bk: 32 | if 
onum - onum_bk == 1:
33 |             cnt+=1
34 |             ret.append(cnt)
35 |         else:
36 |             cnt = 1
37 |             ret.append(cnt)
38 |         pass
39 |     elif uid == uid_bk and pid != pid_bk: # item change
40 |         cnt = 1
41 |         ret.append(cnt)
42 |     elif uid != uid_bk: # user change
43 |         cnt = 1
44 |         ret.append(cnt)
45 |     else:
46 |         raise Exception('?')
47 |
48 |     uid_bk = uid
49 |     pid_bk = pid
50 |     onum_bk = onum
51 | log['buy_item_inarow'] = ret
52 |
53 | log.reset_index(drop=1, inplace=True)
54 |
55 | log.to_pickle('../input/mk/log_inarow.p')
56 |
57 |
58 | utils.end(__file__)
59 |
60 |
--------------------------------------------------------------------------------
/py_feature/006_days_since_last_order.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Sat Jun 3 07:41:26 2017
5 |
6 | @author: konodera
7 |
8 | How many days ago did the user last order this item?
9 | *This is not a leak
10 |
11 | """
12 |
13 | import pandas as pd
14 | import numpy as np
15 | from tqdm import tqdm
16 | import gc
17 | import multiprocessing as mp
18 | import utils
19 | utils.start(__file__)
20 |
21 |
22 | kfold = 10
23 |
24 | X_base = pd.read_pickle('../feature/X_base_t3.p')
25 |
26 | label_train = pd.read_pickle('../feature/trainT-0/label_reordered.p')
27 | label_test = pd.read_pickle('../feature/test/label_reordered.p')
28 |
29 | train = pd.merge(X_base[X_base.is_train==1], label_train, on='order_id', how='inner')
30 | test = pd.merge(X_base[X_base.is_train==0], label_test, on='order_id', how='inner')
31 |
32 | #==============================================================================
33 | # mk train * test log
34 | #==============================================================================
35 | col = ['order_id', 'user_id', 'product_id']
36 | train_log = utils.read_pickles('../input/mk/log', col)
37 |
38 | order_tbl = pd.read_pickle('../input/mk/order_tbl.p')\
39 |             [['order_id', 'user_id', 'order_number', 'days_since_first_order']]
40 |
41 | # merge user_id -> ['order_id', 'user_id', 'product_id']
42 | train_log = pd.merge(train_log[['order_id', 'product_id']],
43 |                      order_tbl[['order_id','user_id']],
44 |                      on='order_id', how='left')[['order_id', 'user_id', 'product_id']]
45 | test_log = pd.merge(test[['order_id', 'product_id']],
46 |                     order_tbl[['order_id','user_id']],
47 |                     on='order_id', how='left')[['order_id', 'user_id', 'product_id']]
48 |
49 | log = pd.concat([train_log, test_log])
50 | del X_base, train_log, test_log; gc.collect()
51 | log.sort_values(['user_id', 'product_id'], inplace=True)
52 |
53 | user_item = log.drop_duplicates(['user_id', 'product_id'])[['user_id', 'product_id']]
54 | order_user = order_tbl[['order_id', 'user_id',]]
55 |
56 | log = pd.merge(order_user, user_item, on='user_id', how='left')
57 | del order_user, user_item; gc.collect()
58 |
59 | users = log[['user_id']].drop_duplicates().reset_index(drop=1)
60 | users['kfold'] = users.index%kfold
61 |
62 |
63 | usecols = [ 'order_id', 'product_id']
64 | buy_tbl = utils.read_pickles('../input/mk/log', usecols)
65 | buy_tbl['key'] = buy_tbl.order_id.map(str) + ' ' + buy_tbl.product_id.map(str)
66 |
67 | utils.mkdir_p('../input/mk/days_since_last_order')
68 |
69 |
70 | #==============================================================================
71 | # days_since_last_order_this_item
72 | #==============================================================================
73 | def multi(i):
74 |     target_users = users[users.kfold==i].user_id
75 |
76 |     tbl = pd.merge(log[log.user_id.isin(target_users)],
77 | order_tbl[['order_id','order_number', 'days_since_first_order']], 78 | on='order_id', how='left') 79 | 80 | tbl.sort_values(['user_id', 'product_id', 'order_number'], inplace=True) 81 | 82 | 83 | tbl['key'] = tbl.order_id.map(str) + ' ' + tbl.product_id.map(str) 84 | tbl['buy'] = tbl.key.isin(buy_tbl.key)*1 85 | 86 | tbl.days_since_first_order = tbl.days_since_first_order.fillna(0) 87 | 88 | tbl.sort_values(['user_id', 'product_id', 'order_number'], inplace=True) 89 | 90 | tbl.reset_index(drop=1, inplace=True) 91 | 92 | uid_bk = pid_bk = day_bk = last_date = None 93 | first_buy = False 94 | ret = [] 95 | miniters = int(tbl.shape[0]/50) 96 | for uid,pid,day,buy in tqdm(tbl[['user_id', 'product_id','days_since_first_order','buy']].values, 97 | miniters=miniters): 98 | if uid_bk is None: 99 | if buy==1 and first_buy is False: 100 | ret.append(None) 101 | last_date = day 102 | first_buy = True 103 | elif buy==1: 104 | ret.append(day-last_date) 105 | last_date = day 106 | elif buy==0 and first_buy is True: 107 | ret.append(day-last_date) 108 | else: 109 | ret.append(None) 110 | 111 | elif uid == uid_bk and pid == pid_bk: 112 | if buy==1 and first_buy is False: 113 | ret.append(None) 114 | last_date = day 115 | first_buy = True 116 | elif buy==1: 117 | ret.append(day-last_date) 118 | last_date = day 119 | elif buy==0 and first_buy is True: 120 | ret.append(day-last_date) 121 | else: 122 | ret.append(None) 123 | 124 | elif uid == uid_bk and pid != pid_bk: # item change 125 | last_date = None 126 | first_buy = False 127 | if buy==1 and first_buy is False: 128 | ret.append(None) 129 | last_date = day 130 | first_buy = True 131 | elif buy==1: 132 | ret.append(day-last_date) 133 | last_date = day 134 | elif buy==0 and first_buy is True: 135 | ret.append(day-last_date) 136 | else: 137 | ret.append(None) 138 | 139 | elif uid != uid_bk: # user change 140 | last_date = None 141 | first_buy = False 142 | if buy==1 and first_buy is False: 143 | ret.append(None) 144 | last_date = day 145 | first_buy = True 146 | elif buy==1: 147 | ret.append(day-last_date) 148 | last_date = day 149 | elif buy==0 and first_buy is True: 150 | ret.append(day-last_date) 151 | else: 152 | ret.append(None) 153 | uid_bk = uid 154 | pid_bk = pid 155 | day_bk = day 156 | tbl['days_since_last_order_this_item'] = ret 157 | 158 | col = ['order_id', 'product_id','days_since_last_order_this_item'] 159 | tbl[col].to_pickle('../input/mk/days_since_last_order/{}.p'.format(i)) 160 | 161 | #============================================================================== 162 | 163 | 164 | 165 | mp_pool = mp.Pool(kfold) 166 | mp_pool.map(multi, range(kfold)) 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | utils.end(__file__) 179 | 180 | -------------------------------------------------------------------------------- /py_feature/007_timezone.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon Jun 19 12:46:49 2017 5 | 6 | @author: konodera 7 | 8 | Time Zone 9 | 10 | """ 11 | import pandas as pd 12 | import numpy as np 13 | from tqdm import tqdm 14 | import utils 15 | utils.start(__file__) 16 | 17 | 18 | 19 | orders = pd.read_csv('../input/orders.csv.gz', usecols=['order_hour_of_day']) 20 | 21 | orders.sort_values('order_hour_of_day', inplace=True) 22 | orders.drop_duplicates(inplace=True) 23 | orders.reset_index(drop=True, inplace=True) 24 | 25 | def timezone(s): 26 | if s < 6: 27 | return 'midnight' 28 | 
elif s < 12: 29 | return 'morning' 30 | elif s < 18: 31 | return 'noon' 32 | else: 33 | return 'night' 34 | 35 | 36 | orders['timezone'] = orders.order_hour_of_day.map(timezone) 37 | 38 | orders.to_pickle('../input/mk/timezone.p') 39 | 40 | 41 | 42 | 43 | utils.end(__file__) 44 | 45 | -------------------------------------------------------------------------------- /py_feature/008_product_feature.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue Jun 20 17:41:54 2017 5 | 6 | @author: konodera 7 | """ 8 | 9 | import pandas as pd 10 | import numpy as np 11 | from tqdm import tqdm 12 | import utils 13 | utils.start(__file__) 14 | 15 | 16 | item = pd.read_csv('../input/products.csv') 17 | 18 | 19 | item['item_is_Organic'] = item.product_name.map(lambda x: 'organic' in x.lower())*1 20 | item['item_is_Gluten-Free'] = item.product_name.map(lambda x: 'gluten' in x.lower() and 'free' in x.lower())*1 21 | item['item_is_Asian'] = item.product_name.map(lambda x: 'asian' in x.lower())*1 22 | 23 | 24 | col = ['product_id', 'item_is_Organic', 'item_is_Gluten-Free', 'item_is_Asian'] 25 | item[col].to_pickle('../input/mk/products_feature.p') 26 | 27 | 28 | 29 | 30 | 31 | 32 | utils.end(__file__) 33 | 34 | -------------------------------------------------------------------------------- /py_feature/009_None.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sun Jun 25 07:55:48 2017 5 | 6 | @author: konodera 7 | 8 | None 9 | 10 | Leak! 11 | 12 | """ 13 | 14 | import pandas as pd 15 | import numpy as np 16 | from tqdm import tqdm 17 | import utils 18 | utils.start(__file__) 19 | 20 | #============================================================================== 21 | # load 22 | #============================================================================== 23 | col = ['order_id', 'user_id','order_number','product_name', 'eval_set'] 24 | order_tbl = pd.read_pickle('../input/mk/order_tbl.p')[col] 25 | order_tbl.sort_values(['user_id', 'order_number'], inplace=True) 26 | order_tbl = order_tbl[order_tbl.eval_set!='test'] 27 | 28 | #============================================================================== 29 | # main 30 | #============================================================================== 31 | 32 | uid_bk = None 33 | product_name_all = [] # 2d list 34 | pname_unq = [] # 1d list 35 | pname_unq_len = [] # 1d list 36 | for uid,pnames in tqdm(order_tbl[['user_id', 'product_name']].values): 37 | if uid_bk is None: 38 | pname_unq += pnames 39 | elif uid == uid_bk: 40 | pname_unq += pnames 41 | elif uid != uid_bk: 42 | pname_unq = pnames[:] 43 | 44 | uid_bk = uid 45 | pname_unq = list(set(pname_unq)) 46 | pname_unq_len.append(len(pname_unq)) 47 | product_name_all.append(pname_unq) 48 | 49 | order_tbl['product_name_all'] = product_name_all 50 | order_tbl['product_unq_len'] = pname_unq_len 51 | order_tbl['new_item_cnt'] = order_tbl.groupby('user_id').product_unq_len.diff() 52 | order_tbl['product_len'] = order_tbl['product_name'].map(len) 53 | order_tbl['is_None'] = (order_tbl.new_item_cnt == order_tbl.product_len)*1 54 | 55 | col = ['order_id', 'product_unq_len', 'is_None'] 56 | order_tbl[col].to_pickle('../input/mk/order_None.p') 57 | 58 | 59 | 60 | utils.end(__file__) 61 | 62 | -------------------------------------------------------------------------------- 
/py_feature/010_streak.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Fri Jul 28 23:31:30 2017 5 | 6 | @author: konodera 7 | 8 | LEAK 9 | 10 | """ 11 | import pandas as pd 12 | import numpy as np 13 | from tqdm import tqdm 14 | import multiprocessing as mp 15 | total_proc = 60 16 | import utils 17 | utils.start(__file__) 18 | 19 | usecols = [ 'order_id', 'user_id', 'product_id', 'order_number'] 20 | log = utils.read_pickles('../input/mk/log', usecols) 21 | 22 | 23 | def multi(uid): 24 | tmp = log[log.user_id==uid] 25 | ct = pd.crosstab(tmp.order_number, tmp.product_id).reset_index().set_index('order_number') 26 | li = [] 27 | for pid in ct.columns: 28 | streak = 0 29 | sw_odr = False 30 | for onb,odr in enumerate(ct[pid].values): 31 | onb+=1 32 | if sw_odr == False and odr == 1: 33 | sw_odr = True 34 | streak = 1 35 | li.append([uid, pid, onb, streak]) 36 | continue 37 | if sw_odr == True: 38 | if odr == 1 and streak>0: 39 | streak += 1 40 | li.append([uid, pid, onb, streak]) 41 | elif odr == 1 and streak<=0: 42 | streak = 1 43 | li.append([uid, pid, onb, streak]) 44 | elif odr == 0 and streak>0: 45 | streak = 0 46 | li.append([uid, pid, onb, streak]) 47 | elif odr == 0 and streak<=0: 48 | streak -= 1 49 | li.append([uid, pid, onb, streak]) 50 | return pd.DataFrame(li, columns=['user_id', 'product_id', 'order_number', 'streak']) 51 | 52 | 53 | user_id = log.user_id.unique() 54 | mp_pool = mp.Pool(total_proc) 55 | callback = mp_pool.map(multi, user_id) 56 | 57 | df = pd.concat(callback, ignore_index=True) 58 | 59 | order = log[['order_id', 'user_id', 'order_number']].drop_duplicates().reset_index(drop=True) 60 | df = pd.merge(df, order, on=['user_id', 'order_number'], how='left') 61 | 62 | df[['order_id', 'product_id', 'streak']].to_pickle('../input/mk/streak_order-product.p') 63 | 64 | 65 | utils.end(__file__) 66 | 67 | -------------------------------------------------------------------------------- /py_feature/011_replacement.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed Aug 9 09:26:45 2017 5 | 6 | @author: konodera 7 | 8 | nohup python -u 011_replacement.py & 9 | 10 | 11 | t-3 -> t-2 -> t-1 12 | a a a 13 | b d c 14 | c e d 15 | f 16 | 17 | pids_3notin2: b,c 18 | pids_2notin3: d,e 19 | pids_1notin2: c,f 20 | pids_skip: c 21 | 22 | c -> e -> c 23 | 24 | ratio: freq(c -> d -> c)/freq(c -> d) 25 | 26 | merge t-1: c->d 27 | 28 | """ 29 | 30 | import pandas as pd 31 | import gc 32 | import numpy as np 33 | from tqdm import tqdm 34 | from collections import defaultdict 35 | from itertools import product 36 | import utils 37 | utils.start(__file__) 38 | 39 | #============================================================================== 40 | # load 41 | #============================================================================== 42 | 43 | usecols = ['user_id', 'order_number', 'product_id', 'product_name', 'order_id', 'order_number_rev'] 44 | log = utils.read_pickles('../input/mk/log', usecols).sort_values(usecols[:3]) 45 | 46 | order_tbl = log[['order_id', 'user_id', 'order_number', 'order_number_rev']].drop_duplicates().reset_index(drop=True) 47 | for i in range(1, 4): 48 | order_tbl['t-{}_order_id'.format(i)] = order_tbl.groupby('user_id')['order_id'].shift(i) 49 | order_tbl.dropna(inplace=True) 50 | 51 | #order_pids = 
log.head(999999).groupby('order_id').product_id.apply(set).reset_index() 52 | order_pids = log.groupby('order_id').product_id.apply(set).reset_index() 53 | 54 | order_tbl = pd.merge(order_tbl, 55 | order_pids.add_prefix('t-1_'), 56 | on='t-1_order_id', how='inner') 57 | order_tbl = pd.merge(order_tbl, 58 | order_pids.add_prefix('t-2_'), 59 | on='t-2_order_id', how='inner') 60 | order_tbl = pd.merge(order_tbl, 61 | order_pids.add_prefix('t-3_'), 62 | on='t-3_order_id', how='inner') 63 | 64 | #============================================================================== 65 | # def 66 | #============================================================================== 67 | 68 | def make(T): 69 | """ 70 | T = 0 71 | folder = 'trainT-0' 72 | """ 73 | if T==-1: 74 | folder = 'test' 75 | else: 76 | folder = 'trainT-'+str(T) 77 | 78 | order_tbl_ = order_tbl[order_tbl.order_number_rev>T] 79 | 80 | pid_cnt = defaultdict(int) 81 | pid_chance = defaultdict(int) 82 | 83 | # for pids_bk3, pids_bk2, pids_bk1 in tqdm(order_tbl_[['t-3_product_id', 't-2_product_id', 't-1_product_id']].values): 84 | # for uid, onb, pid in tqdm(log_[['user_id', 'order_number', 'product_name']].head(1999999).values): 85 | # for uid, onb, pid in tqdm(log_[['user_id', 'order_number', 'product_id']].values, miniters=99999): 86 | for pids_bk3, pids_bk2, pids_bk1 in tqdm(order_tbl_[['t-3_product_id', 't-2_product_id', 't-1_product_id']].values, miniters=99999): 87 | 88 | pids_3notin2 = pids_bk3 - pids_bk2 89 | pids_2notin3 = pids_bk2 - pids_bk3 90 | pids_hub = pids_bk2 - pids_bk3 - pids_bk1 91 | pids_skip = (pids_bk3 & pids_bk1) - pids_bk2 92 | 93 | li = [] 94 | for i1, i2 in list(product(pids_3notin2, pids_2notin3)): 95 | key = str(i1)+' -> '+str(i2) 96 | li.append(key) 97 | pid_chance[key] +=1 98 | 99 | li = [] 100 | for i1, i2 in list(product(pids_skip, pids_hub)): 101 | key = str(i1)+' -> '+str(i2) 102 | li.append(key) 103 | pid_cnt[key] +=1 104 | 105 | 106 | 107 | pid_chance = pd.DataFrame.from_dict(pid_chance, orient='index').reset_index() 108 | pid_chance.columns = ['pids', 'chance'] 109 | 110 | pid_cnt = pd.DataFrame.from_dict(pid_cnt, orient='index').reset_index() 111 | pid_cnt.columns = ['pids', 'back'] 112 | 113 | df = pd.merge(pid_chance, pid_cnt, on='pids', how='left').fillna(0) 114 | 115 | df['ratio'] = df.back/df.chance 116 | df.sort_values('ratio', ascending=False, inplace=True) 117 | 118 | df.reset_index(drop=True, inplace=True) 119 | df['pid1'] = df.pids.map(lambda x: x.split(' -> ')[0]).astype(int) 120 | df['pid2'] = df.pids.map(lambda x: x.split(' -> ')[1]).astype(int) 121 | df[['pid1', 'pid2', 'back', 'chance', 'ratio']].to_pickle('../input/mk/replacement.p') 122 | 123 | #============================================================================== 124 | # main 125 | #============================================================================== 126 | 127 | make(2) 128 | 129 | #============================================================================== 130 | utils.end(__file__) 131 | 132 | -------------------------------------------------------------------------------- /py_feature/012_aisle_dep_cumsum.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sun Aug 13 05:30:59 2017 5 | 6 | @author: konodera 7 | """ 8 | 9 | import pandas as pd 10 | import numpy as np 11 | from tqdm import tqdm 12 | from collections import defaultdict 13 | import utils 14 | utils.start(__file__) 15 | 16 | 
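# f012: running (cumulative) counts of how many items the user has bought
# from each aisle and department, accumulated order by order within a user.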
#============================================================================== 17 | # load 18 | #============================================================================== 19 | col = ['user_id', 'order_number', 'order_id'] 20 | log = utils.read_pickles('../input/mk/log', col).drop_duplicates().sort_values(col) 21 | 22 | ai_dep = pd.read_pickle('../input/mk/order_aisle-department.p') 23 | 24 | log = pd.merge(log, ai_dep, on='order_id', how='left') 25 | 26 | #============================================================================== 27 | # calc 28 | #============================================================================== 29 | col = [c for c in log.columns if 'aisle_' in c or 'dep' in c] 30 | di = defaultdict(int) 31 | uid_bk = None 32 | 33 | li1 = [] 34 | for args in tqdm(log[['user_id']+col].values): 35 | uid = args[0] 36 | 37 | if uid_bk is None: 38 | pass 39 | elif uid == uid_bk: 40 | pass 41 | elif uid != uid_bk: 42 | di = defaultdict(int) # new user -> running totals restart 43 | li2 = [] 44 | for i,c in enumerate(col): 45 | di[c] += args[i+1] 46 | li2.append(di[c]) 47 | li1.append(li2) 48 | 49 | uid_bk = uid 50 | #============================================================================== 51 | df = pd.DataFrame(li1, columns=col).add_suffix('_cumsum') 52 | df['order_id'] = log['order_id'] # safe: the merge above gave log a fresh RangeIndex, so this aligns row by row 53 | 54 | df.to_pickle('../input/mk/order_aisle-department_cumsum.p') 55 | 56 | 57 | #============================================================================== 58 | utils.end(__file__) 59 | 60 | -------------------------------------------------------------------------------- /py_feature/100_======user_feature======: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KazukiOnodera/Instacart/416b6b0220d3aed62c8d323caa3ee46f4b614a72/py_feature/100_======user_feature====== -------------------------------------------------------------------------------- /py_feature/101_repeat_previous_ratio_T.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Thu May 25 22:45:03 2017 5 | 6 | @author: konodera 7 | 8 | This feature leaks, so shift it before use! 
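#==============================================================================
# aside: "shift before use" in practice -- a toy sketch of how a leaky
# per-order feature is turned into a safe t-1 feature (toy frame, not
# pipeline data)
#==============================================================================
import pandas as pd
toy = pd.DataFrame({'user_id': [1, 1, 1], 'order_id': [10, 11, 12],
                    'repeat_ratio': [0.0, 0.5, 0.7]})
toy['t-1_repeat_ratio'] = toy.groupby('user_id')['repeat_ratio'].shift(1)
# the row for order 12 now carries order 11's value (0.5), never its own
#==============================================================================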
9 | """ 10 | 11 | import pandas as pd 12 | import numpy as np 13 | from tqdm import tqdm 14 | import gc 15 | import utils 16 | utils.start(__file__) 17 | 18 | # setting T 19 | T = 3 20 | 21 | 22 | #============================================================================== 23 | # load base 24 | #============================================================================== 25 | X_base = pd.read_pickle('../feature/X_base_t{}.p'.format(T)) 26 | all_order = pd.concat([X_base[c] for c in X_base.columns if 't-' in c]).unique() 27 | 28 | order_tbl = pd.read_pickle('../input/mk/order_tbl.p') 29 | 30 | col = ['order_id', 'order_number', 'order_dow', 'order_hour_of_day', 31 | 'days_since_prior_order', 'days_since_first_order'] 32 | X = pd.merge(X_base, order_tbl[col], on='order_id', how='left') 33 | 34 | col_feature = [] 35 | 36 | #============================================================================== 37 | # repeat_previous_ratio 38 | #============================================================================== 39 | order_tbl['t-2_product_name'] = order_tbl.groupby('user_id')['product_name'].shift(2) 40 | order_tbl['t-3_product_name'] = order_tbl.groupby('user_id')['product_name'].shift(3) 41 | order_tbl['t-4_product_name'] = order_tbl.groupby('user_id')['product_name'].shift(4) 42 | order_tbl['t-5_product_name'] = order_tbl.groupby('user_id')['product_name'].shift(5) 43 | 44 | order_tbl = order_tbl[order_tbl.order_id.isin(all_order)] 45 | 46 | # fill list 47 | col = ['product_name'] + [c for c in order_tbl.columns if 't-' in c] 48 | 49 | def fill_list(s): 50 | if isinstance(s, float): 51 | return [] 52 | return s 53 | 54 | for c in col: 55 | order_tbl[c] = order_tbl[c].map(fill_list) 56 | 57 | def ratio(list1, list2): 58 | """ 59 | list1: previous 60 | list2: current 61 | 62 | return: intersection(previous & current) / current 63 | """ 64 | if len(list1)==0 or len(list2)==0: 65 | return 66 | ret = sum([1 for i in list2 if i in list1]) / len(list2) 67 | 68 | return ret 69 | 70 | # w means window size 71 | order_tbl['repeat_previous_ratio-w1'] = order_tbl.apply(\ 72 | lambda x: ratio(x['t-1_product_name'], x['product_name']), axis=1) 73 | 74 | order_tbl['repeat_previous_ratio-w2'] = order_tbl.apply(\ 75 | lambda x: ratio(x['t-1_product_name']+x['t-2_product_name'], 76 | x['product_name']), axis=1) 77 | 78 | order_tbl['repeat_previous_ratio-w3'] = order_tbl.apply(\ 79 | lambda x: ratio(x['t-1_product_name']+x['t-2_product_name']+x['t-3_product_name'], 80 | x['product_name']), axis=1) 81 | 82 | order_tbl['repeat_previous_ratio-w4'] = order_tbl.apply(\ 83 | lambda x: ratio(x['t-1_product_name']+x['t-2_product_name']+x['t-3_product_name']+\ 84 | x['t-4_product_name'], x['product_name']), axis=1) 85 | 86 | order_tbl['repeat_previous_ratio-w5'] = order_tbl.apply(\ 87 | lambda x: ratio(x['t-1_product_name']+x['t-2_product_name']+x['t-3_product_name']+\ 88 | x['t-4_product_name']+x['t-5_product_name'], x['product_name']), axis=1) 89 | 90 | col_feature += ['repeat_previous_ratio-w1','repeat_previous_ratio-w2', 91 | 'repeat_previous_ratio-w3','repeat_previous_ratio-w4', 92 | 'repeat_previous_ratio-w5'] 93 | 94 | #============================================================================== 95 | # reordered_ratio 96 | #============================================================================== 97 | log = utils.read_pickles('../input/mk/log') 98 | reordered_ratio = log.groupby(['order_id']).reordered.mean().reset_index() 99 | reordered_ratio.columns = ['order_id', 'reordered_ratio'] 100 | 
order_tbl = pd.merge(order_tbl, reordered_ratio, on='order_id', how='left') 101 | 102 | log['unreordered'] = 1-log.reordered 103 | unreordered_ratio = log.groupby(['order_id']).unreordered.mean().reset_index() 104 | unreordered_ratio.columns = ['order_id', 'unreordered_ratio'] 105 | 106 | order_tbl = pd.merge(order_tbl, unreordered_ratio, on='order_id', how='left') 107 | 108 | 109 | del reordered_ratio, unreordered_ratio; gc.collect() 110 | 111 | col_feature += ['reordered_ratio'] 112 | 113 | #============================================================================== 114 | # total_unique_item 115 | #============================================================================== 116 | 117 | order_unique_item = log.groupby('order_id').unreordered.sum().reset_index() 118 | order_unique_item.columns = ['order_id', 'unreordered_sum'] 119 | 120 | order_tbl = pd.merge(order_tbl, order_unique_item, on='order_id', how='left') 121 | 122 | order_tbl['total_unique_item'] = order_tbl.groupby('user_id').unreordered_sum.cumsum() 123 | order_tbl['total_unique_item_ratio'] = order_tbl['total_unique_item']/order_tbl['order_number'] 124 | 125 | del order_unique_item; gc.collect() 126 | 127 | col_feature += ['unreordered_sum','total_unique_item', 'total_unique_item_ratio'] 128 | 129 | #============================================================================== 130 | # ordered item 131 | #============================================================================== 132 | 133 | ordered_item = log.groupby('order_id').size().reset_index() 134 | ordered_item.columns = ['order_id', 'ordered_item'] 135 | 136 | order_tbl = pd.merge(order_tbl, ordered_item, on='order_id', how='left') 137 | 138 | order_tbl['total_ordered_item'] = order_tbl.groupby('user_id').ordered_item.cumsum() 139 | order_tbl['total_ordered_item_ratio'] = order_tbl['total_ordered_item']/order_tbl['order_number'] 140 | 141 | del ordered_item; gc.collect() 142 | 143 | col_feature += ['ordered_item','total_ordered_item', 'total_ordered_item_ratio'] 144 | 145 | 146 | 147 | #============================================================================== 148 | # merge & split 149 | #============================================================================== 150 | col = ['order_id', 'order_dow', 'order_hour_of_day', 151 | 'days_since_prior_order', 'days_since_first_order'] 152 | for i in range(1, 1+T): 153 | X = pd.merge(X, order_tbl[col+col_feature].add_prefix('t-{}_'.format(i)), 154 | on='t-{}_order_id'.format(i), how='left') 155 | 156 | 157 | train = X[X.is_train==1].drop(['user_id','is_train'], axis=1).reset_index(drop=1) 158 | test = X[X.is_train==0].drop(['user_id','is_train'], axis=1).reset_index(drop=1) 159 | 160 | #============================================================================== 161 | # write 162 | #============================================================================== 163 | col = [c for c in train.columns if not ('t-' in c and '_id' in c)] 164 | train[col].to_pickle('../feature/trainT-0/f101_order.p') 165 | test[col].to_pickle('../feature/test/f101_order.p') 166 | 167 | 168 | #============================================================================== 169 | utils.end(__file__) 170 | 171 | -------------------------------------------------------------------------------- /py_feature/102_orderspan_average.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue Jun 13 15:58:46 2017 5 | 6 | @author: konodera 7 
| 8 | order span 9 | 10 | """ 11 | 12 | import pandas as pd 13 | import numpy as np 14 | from tqdm import tqdm 15 | import utils 16 | utils.start(__file__) 17 | 18 | 19 | X_base = pd.read_pickle('../feature/X_base_t3.p') 20 | col = ['order_id', 'user_id', 'days_since_prior_order', 'eval_set', 'order_number_rev'] 21 | order_tbl = pd.read_pickle('../input/mk/order_tbl.p')[col] 22 | 23 | #============================================================================== 24 | # train 25 | #============================================================================== 26 | def make(T): 27 | order_tbl_tr = order_tbl[order_tbl.order_number_rev>T] 28 | 29 | user = order_tbl_tr.groupby('user_id')['days_since_prior_order'].mean().reset_index() 30 | user.columns = ['user_id', 'days_order_mean'] 31 | 32 | user.to_pickle('../feature/trainT-{}/f102_user.p'.format(T)) 33 | 34 | make(0) 35 | make(1) 36 | make(2) 37 | 38 | 39 | 40 | #============================================================================== 41 | # test 42 | #============================================================================== 43 | order_tbl_te = order_tbl[order_tbl.eval_set != 'test'] 44 | 45 | user = order_tbl_te.groupby('user_id')['days_since_prior_order'].mean().reset_index() 46 | user.columns = ['user_id', 'days_order_mean'] 47 | 48 | user.to_pickle('../feature/test/f102_user.p') 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | #============================================================================== 61 | utils.end(__file__) 62 | 63 | -------------------------------------------------------------------------------- /py_feature/103_visit_time.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sat Jun 17 08:45:57 2017 5 | 6 | @author: konodera 7 | 8 | visit time ratio 9 | 10 | """ 11 | 12 | import pandas as pd 13 | import numpy as np 14 | from tqdm import tqdm 15 | import utils 16 | utils.start(__file__) 17 | 18 | #============================================================================== 19 | # load 20 | #============================================================================== 21 | X_base = pd.read_pickle('../feature/X_base_t3.p') 22 | 23 | col = ['order_id', 'user_id', 'product_id', 'order_dow', 'order_hour_of_day', 'order_number_rev'] 24 | log = utils.read_pickles('../input/mk/log', col) 25 | log = pd.merge(log, pd.read_pickle('../input/mk/timezone.p'), 26 | on='order_hour_of_day', how='left') 27 | log['dow_tz'] = log.order_dow.map(str) + '_' + log.timezone 28 | 29 | #============================================================================== 30 | # train 31 | #============================================================================== 32 | def make(T): 33 | log_tr = log[log.order_number_rev>T] 34 | 35 | # dow 36 | dow = pd.crosstab(log_tr.user_id, log_tr.order_dow).add_prefix('user_dow_freq_') 37 | dow_ = pd.crosstab(log_tr.user_id, log_tr.order_dow, normalize='index').add_prefix('user_dow_norm_') 38 | 39 | # timezone 40 | timezone = pd.crosstab(log_tr.user_id, log_tr.timezone).add_prefix('user_timezone_freq_') 41 | timezone_ = pd.crosstab(log_tr.user_id, log_tr.timezone, normalize='index').add_prefix('user_timezone_norm_') 42 | 43 | # dow * timezone 44 | dow_tz = pd.crosstab(log_tr.user_id, log_tr.dow_tz).add_prefix('user_dow-tz_freq_') 45 | dow_tz_ = pd.crosstab(log_tr.user_id, log_tr.dow_tz, normalize='index').add_prefix('user_dow-tz_norm_') 46 | 47 | tab = pd.concat([dow, dow_, 
timezone, timezone_, dow_tz, dow_tz_], axis=1) 48 | 49 | tab.reset_index().to_pickle('../feature/trainT-{}/f103_user.p'.format(T)) 50 | 51 | make(0) 52 | make(1) 53 | make(2) 54 | 55 | #============================================================================== 56 | # test 57 | #============================================================================== 58 | 59 | # dow 60 | dow = pd.crosstab(log.user_id, log.order_dow).add_prefix('user_dow_freq_') 61 | dow_ = pd.crosstab(log.user_id, log.order_dow, normalize='index').add_prefix('user_dow_norm_') 62 | 63 | # timezone 64 | timezone = pd.crosstab(log.user_id, log.timezone).add_prefix('user_timezone_freq_') 65 | timezone_ = pd.crosstab(log.user_id, log.timezone, normalize='index').add_prefix('user_timezone_norm_') 66 | 67 | # dow * timezone 68 | dow_tz = pd.crosstab(log.user_id, log.dow_tz).add_prefix('user_dow-tz_freq_') 69 | dow_tz_ = pd.crosstab(log.user_id, log.dow_tz, normalize='index').add_prefix('user_dow-tz_norm_') 70 | 71 | tab = pd.concat([dow, dow_, timezone, timezone_, dow_tz, dow_tz_], axis=1) 72 | 73 | tab.reset_index().to_pickle('../feature/test/f103_user.p') 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | utils.end(__file__) 83 | 84 | -------------------------------------------------------------------------------- /py_feature/104_organic.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed Jun 21 07:51:30 2017 5 | 6 | @author: konodera 7 | """ 8 | 9 | import pandas as pd 10 | import numpy as np 11 | from tqdm import tqdm 12 | import utils 13 | utils.start(__file__) 14 | 15 | 16 | #============================================================================== 17 | # load 18 | #============================================================================== 19 | X_base = pd.read_pickle('../feature/X_base_t3.p') 20 | 21 | 22 | col = ['order_id', 'user_id', 'product_id', 'order_dow', 'order_hour_of_day', 'order_number_rev'] 23 | log = utils.read_pickles('../input/mk/log', col) 24 | log = pd.merge(log, pd.read_pickle('../input/mk/timezone.p'), 25 | on='order_hour_of_day', how='left') 26 | log['dow_tz'] = log.order_dow.map(str) + '_' + log.timezone 27 | 28 | log = pd.merge(log, pd.read_pickle('../input/mk/products_feature.p'), 29 | on='product_id', how='left') 30 | 31 | #============================================================================== 32 | # train 33 | #============================================================================== 34 | def make(T): 35 | log_tr = log[log.order_number_rev>T] 36 | 37 | user = log_tr.groupby(['user_id']).size().to_frame() 38 | user.columns = ['total'] 39 | user['organic_cnt'] = log_tr.groupby(['user_id'])['item_is_Organic'].sum() 40 | user['glutenfree_cnt'] = log_tr.groupby(['user_id'])['item_is_Gluten-Free'].sum() 41 | user['Asian_cnt'] = log_tr.groupby(['user_id'])['item_is_Asian'].sum() 42 | 43 | user['organic_ratio'] = user['organic_cnt'] / user.total 44 | user['glutenfree_ratio'] = user['glutenfree_cnt'] / user.total 45 | user['Asian_ratio'] = user['Asian_cnt'] / user.total 46 | 47 | user.drop('total', axis=1, inplace=True) 48 | user.reset_index().to_pickle('../feature/trainT-{}/f104_user.p'.format(T)) 49 | 50 | make(0) 51 | make(1) 52 | make(2) 53 | 54 | #============================================================================== 55 | # test 56 | #============================================================================== 57 | 58 | user = 
log.groupby(['user_id']).size().to_frame() 59 | user.columns = ['total'] 60 | user['organic_cnt'] = log.groupby(['user_id'])['item_is_Organic'].sum() 61 | user['glutenfree_cnt'] = log.groupby(['user_id'])['item_is_Gluten-Free'].sum() 62 | user['Asian_cnt'] = log.groupby(['user_id'])['item_is_Asian'].sum() 63 | 64 | user['organic_ratio'] = user['organic_cnt'] / user.total 65 | user['glutenfree_ratio'] = user['glutenfree_cnt'] / user.total 66 | user['Asian_ratio'] = user['Asian_cnt'] / user.total 67 | 68 | user.drop('total', axis=1, inplace=True) 69 | user.reset_index().to_pickle('../feature/test/f104_user.p') 70 | 71 | 72 | #============================================================================== 73 | utils.end(__file__) 74 | 75 | -------------------------------------------------------------------------------- /py_feature/105_delta_time.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sun Jun 25 09:41:08 2017 5 | 6 | @author: konodera 7 | 8 | delta order time 9 | 10 | """ 11 | 12 | import pandas as pd 13 | import numpy as np 14 | from tqdm import tqdm 15 | import utils 16 | utils.start(__file__) 17 | 18 | #============================================================================== 19 | # load 20 | #============================================================================== 21 | 22 | col = ['order_id', 'user_id','order_number', 'order_dow', 'order_hour_of_day', 23 | 'days_since_prior_order', 'eval_set'] 24 | order_tbl = pd.read_pickle('../input/mk/order_tbl.p')[col] 25 | order_tbl.sort_values(['user_id', 'order_number'], inplace=True) 26 | #order_tbl = order_tbl[order_tbl.eval_set!='test'] 27 | 28 | 29 | #============================================================================== 30 | # main 31 | #============================================================================== 32 | order_tbl['t-1_order_id'] = order_tbl.groupby('user_id')['order_id'].shift(1) 33 | order_tbl['t-2_order_id'] = order_tbl.groupby('user_id')['order_id'].shift(2) 34 | order_tbl['t-3_order_id'] = order_tbl.groupby('user_id')['order_id'].shift(3) 35 | 36 | col = ['order_id', 'order_dow', 'order_hour_of_day'] 37 | order_tbl = pd.merge(order_tbl, order_tbl[col].add_prefix('t-1_'), on='t-1_order_id', how='left') 38 | order_tbl = pd.merge(order_tbl, order_tbl[col].add_prefix('t-2_'), on='t-2_order_id', how='left') 39 | order_tbl = pd.merge(order_tbl, order_tbl[col].add_prefix('t-3_'), on='t-3_order_id', how='left') 40 | 41 | order_tbl['delta_hour_t-1'] = order_tbl['order_hour_of_day'] - order_tbl['t-1_order_hour_of_day'] 42 | order_tbl['delta_hour_t-2'] = order_tbl['order_hour_of_day'] - order_tbl['t-2_order_hour_of_day'] 43 | order_tbl['delta_hour_t-3'] = order_tbl['order_hour_of_day'] - order_tbl['t-3_order_hour_of_day'] 44 | 45 | 46 | col = ['order_id', 'delta_hour_t-1', 'delta_hour_t-2', 47 | 'delta_hour_t-3'] 48 | order_tbl[col].to_pickle('../feature/trainT-0/f105_order.p') 49 | order_tbl[col].to_pickle('../feature/test/f105_order.p') 50 | 51 | 52 | 53 | utils.end(__file__) 54 | 55 | -------------------------------------------------------------------------------- /py_feature/108_order_size.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sun Jul 16 21:04:09 2017 5 | 6 | @author: konodera 7 | 8 | 9 | """ 10 | 11 | import pandas as pd 12 | import numpy as np 13 | from tqdm import 
tqdm 14 | import utils 15 | utils.start(__file__) 16 | 17 | #============================================================================== 18 | # load 19 | #============================================================================== 20 | 21 | col = ['order_id', 'user_id', 'product_id', 'order_number', 'order_number_rev'] 22 | log = utils.read_pickles('../input/mk/log', col).sort_values('user_id') 23 | 24 | #============================================================================== 25 | # def 26 | #============================================================================== 27 | def make(T): 28 | """ 29 | T = 0 30 | folder = 'trainT-0' 31 | """ 32 | 33 | if T==-1: 34 | folder = 'test' 35 | else: 36 | folder = 'trainT-'+str(T) 37 | 38 | log_ = log[log.order_number_rev>T] 39 | 40 | order_tbl = log_.groupby('order_id').size().to_frame() 41 | order_tbl.columns = ['order_size'] 42 | order_tbl.reset_index(inplace=True) 43 | 44 | order_tbl = pd.merge(order_tbl, log_[['order_id', 'user_id']].drop_duplicates()) 45 | 46 | user_osz = order_tbl.groupby(['user_id']).order_size.min().to_frame() 47 | user_osz.columns = ['user_order_size-min'] 48 | user_osz['user_order_size-max'] = order_tbl.groupby(['user_id']).order_size.max() 49 | user_osz['user_order_size-median'] = order_tbl.groupby(['user_id']).order_size.median() 50 | user_osz['user_order_size-mean'] = order_tbl.groupby(['user_id']).order_size.mean() 51 | user_osz['user_order_size-std'] = order_tbl.groupby(['user_id']).order_size.std() 52 | user_osz.reset_index(inplace=True) 53 | 54 | user_osz.to_pickle('../feature/{}/f108_user.p'.format(folder)) 55 | 56 | #============================================================================== 57 | # main 58 | #============================================================================== 59 | make(0) 60 | make(1) 61 | make(2) 62 | 63 | make(-1) 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | utils.end(__file__) 81 | 82 | -------------------------------------------------------------------------------- /py_feature/109_have_you_bought.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Fri Jul 21 00:54:02 2017 5 | 6 | @author: konodera 7 | 8 | pid freq 9 | ------------- 10 | 24852 57186 11 | 13176 47063 12 | 21137 39871 13 | 21903 38095 14 | 47209 30047 15 | 47626 28741 16 | 47766 28478 17 | 26209 26199 18 | 16797 25621 19 | 24964 21090 20 | 22935 20824 21 | 27966 20193 22 | 39275 20134 23 | 45007 19652 24 | 49683 17508 25 | 4605 16176 26 | 27845 16134 27 | 40706 16054 28 | 5876 15765 29 | 4920 15150 30 | 28204 14802 31 | 42265 14766 32 | 30391 14089 33 | 31717 13949 34 | 8277 13900 35 | 8518 13770 36 | 27104 13719 37 | 17794 13642 38 | 46979 13491 39 | 45066 13289 40 | 41 | """ 42 | 43 | import pandas as pd 44 | import numpy as np 45 | from tqdm import tqdm 46 | import utils 47 | utils.start(__file__) 48 | 49 | #============================================================================== 50 | # load 51 | #============================================================================== 52 | 53 | col = [ 'order_id', 'user_id', 'product_id', 'order_number', 'reordered', 'order_number_rev'] 54 | log = utils.read_pickles('../input/mk/log', col) 55 | 56 | 57 | #============================================================================== 58 | # def 59 | #============================================================================== 60 | def make(T): 61 | """ 
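#==============================================================================
# aside: the two-stage aggregation of 108_order_size.py above on a toy log --
# order -> basket size, then user -> distribution stats of those sizes
# (toy frame, illustrative only)
#==============================================================================
import pandas as pd
toy = pd.DataFrame({'user_id': [1, 1, 1, 2, 2],
                    'order_id': [10, 10, 11, 20, 20]})
order_size = toy.groupby(['user_id', 'order_id']).size().rename('order_size')
user_stats = order_size.groupby('user_id').agg(['min', 'max', 'median', 'mean', 'std'])
# user 1: sizes [2, 1] -> min 1, max 2, mean 1.5; user 2: a single order of 2
#==============================================================================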
62 | T = 0 63 | folder = 'trainT-0' 64 | """ 65 | 66 | if T==-1: 67 | folder = 'test' 68 | else: 69 | folder = 'trainT-'+str(T) 70 | 71 | log_ = log[log.order_number_rev>T] 72 | 73 | user = log_.drop_duplicates('user_id')[['user_id']].reset_index(drop=True) 74 | 75 | # have you bought -> hyb 76 | tag_user = log_[log_.product_id==24852].user_id 77 | user['hyb_Banana'] = 0 78 | user.loc[user.user_id.isin(tag_user), 'hyb_Banana'] = 1 79 | 80 | tag_user = log_[log_.product_id==13176].user_id 81 | user['hyb_BoO-Bananas'] = 0 82 | user.loc[user.user_id.isin(tag_user), 'hyb_BoO-Bananas'] = 1 83 | 84 | tag_user = log_[log_.product_id==21137].user_id 85 | user['hyb_Organic-Strawberries'] = 0 86 | user.loc[user.user_id.isin(tag_user), 'hyb_Organic-Strawberries'] = 1 87 | 88 | tag_user = log_[log_.product_id==21903].user_id 89 | user['hyb_Organic-Baby-Spinach'] = 0 90 | user.loc[user.user_id.isin(tag_user), 'hyb_Organic-Baby-Spinach'] = 1 91 | 92 | tag_user = log_[log_.product_id==47209].user_id 93 | user['hyb_Organic-Hass-Avocado'] = 0 94 | user.loc[user.user_id.isin(tag_user), 'hyb_Organic-Hass-Avocado'] = 1 95 | 96 | user.to_pickle('../feature/{}/f109_user.p'.format(folder)) 97 | 98 | #============================================================================== 99 | # main 100 | #============================================================================== 101 | make(0) 102 | make(1) 103 | make(2) 104 | 105 | make(-1) 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | utils.end(__file__) 121 | 122 | -------------------------------------------------------------------------------- /py_feature/110_None.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon Jul 31 23:59:01 2017 5 | 6 | @author: konodera 7 | 8 | """ 9 | 10 | import pandas as pd 11 | import numpy as np 12 | from tqdm import tqdm 13 | import multiprocessing as mp 14 | import utils 15 | utils.start(__file__) 16 | 17 | LOOP = 20 18 | #============================================================================== 19 | # load 20 | #============================================================================== 21 | order_tbl = pd.read_pickle('../input/mk/order_tbl.p')[['order_id', 'user_id', 'order_number']].sort_values(['user_id', 'order_number', 'order_id']) 22 | for i in range(1, LOOP): 23 | order_tbl['t-{}_order_id'.format(i)] = order_tbl.groupby('user_id')['order_id'].shift(i) 24 | 25 | col = [c for c in order_tbl.columns if 'order_id' in c] 26 | order_tbl = order_tbl[col] 27 | 28 | order_None = pd.read_pickle('../input/mk/order_None.p') 29 | 30 | 31 | #============================================================================== 32 | # main 33 | #============================================================================== 34 | df = order_tbl.copy() 35 | 36 | for i in tqdm(range(1, LOOP)): 37 | df = pd.merge(df, order_None.add_prefix('t-{}_'.format(i)), 38 | on='t-{}_order_id'.format(i), how='left') 39 | 40 | col = [c for c in df.columns if c.endswith('_order_id')] 41 | df.drop(col, axis=1, inplace=True) 42 | 43 | df.fillna(-1, inplace=True) 44 | 45 | df.to_pickle('../feature/trainT-0/f110_order.p') 46 | df.to_pickle('../feature/test/f110_order.p') 47 | 48 | 49 | #============================================================================== 50 | 51 | utils.end(__file__) 52 | 53 | -------------------------------------------------------------------------------- 
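#==============================================================================
# aside: the shift-and-merge lag pattern used in 110_None.py above (and in
# several other scripts): per-user t-N order ids become merge keys for any
# per-order feature table. Toy frames, illustrative only.
#==============================================================================
import pandas as pd
orders = pd.DataFrame({'user_id': [1, 1, 1], 'order_id': [10, 11, 12]})
feat = pd.DataFrame({'order_id': [10, 11, 12], 'is_None': [0, 1, 0]})
for i in (1, 2):
    orders['t-{}_order_id'.format(i)] = orders.groupby('user_id')['order_id'].shift(i)
    orders = pd.merge(orders, feat.add_prefix('t-{}_'.format(i)),
                      on='t-{}_order_id'.format(i), how='left')
# order 12 now carries t-1_is_None (from order 11) and t-2_is_None (order 10)
#==============================================================================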
/py_feature/200_======item_feature======: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KazukiOnodera/Instacart/416b6b0220d3aed62c8d323caa3ee46f4b614a72/py_feature/200_======item_feature====== -------------------------------------------------------------------------------- /py_feature/202_buy_time.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon May 29 19:58:46 2017 5 | 6 | @author: konodera 7 | 8 | Time of day when the item is bought 9 | 10 | """ 11 | 12 | import pandas as pd 13 | import numpy as np 14 | import gc 15 | from tqdm import tqdm 16 | import utils 17 | utils.start(__file__) 18 | 19 | col = ['order_id', 'user_id', 'product_id', 'order_dow', 'order_hour_of_day', 'order_number_rev'] 20 | log = utils.read_pickles('../input/mk/log', col) 21 | log = pd.merge(log, pd.read_pickle('../input/mk/timezone.p'), on='order_hour_of_day', how='left') 22 | log['dow_tz'] = log.order_dow.map(str) + '_' + log.timezone 23 | 24 | 25 | # TODO: rolling mean 26 | def make(log, folder): 27 | #============================================================================== 28 | # hour 29 | #============================================================================== 30 | gc.collect() 31 | tbl = log.groupby(['product_id', 'order_hour_of_day']).size().reset_index() 32 | tbl.columns = ['product_id', 'order_hour_of_day', 'item_hour_cnt'] 33 | 34 | tbl['item_hour_ratio'] = tbl.item_hour_cnt / tbl.groupby('product_id').transform(np.sum).item_hour_cnt 35 | 36 | tbl.to_pickle('../feature/{}/f202_product_hour.p'.format(folder)) 37 | 38 | # unique 39 | tbl = log.drop_duplicates(['user_id', 'product_id', 'order_hour_of_day']).groupby(['product_id', 'order_hour_of_day']).size().reset_index() 40 | tbl.columns = ['product_id', 'order_hour_of_day', 'item_hour_cnt_unq'] 41 | 42 | tbl['item_hour_ratio_unq'] = tbl.item_hour_cnt_unq / tbl.groupby('product_id').transform(np.sum).item_hour_cnt_unq 43 | 44 | tbl.to_pickle('../feature/{}/f202_uniq_product_hour.p'.format(folder)) 45 | 46 | 47 | #============================================================================== 48 | # dow 49 | #============================================================================== 50 | gc.collect() 51 | tbl = log.groupby(['product_id', 'order_dow']).size().reset_index() 52 | tbl.columns = ['product_id', 'order_dow', 'item_dow_cnt'] 53 | 54 | tbl['item_dow_ratio'] = tbl.item_dow_cnt / tbl.groupby('product_id').transform(np.sum).item_dow_cnt 55 | 56 | tbl.to_pickle('../feature/{}/f202_product_dow.p'.format(folder)) 57 | 58 | # unique 59 | tbl = log.drop_duplicates(['user_id', 'product_id', 'order_dow']).groupby(['product_id', 'order_dow']).size().reset_index() 60 | tbl.columns = ['product_id', 'order_dow', 'item_dow_cnt_unq'] 61 | 62 | tbl['item_dow_ratio_unq'] = tbl.item_dow_cnt_unq / tbl.groupby('product_id').transform(np.sum).item_dow_cnt_unq 63 | 64 | tbl.to_pickle('../feature/{}/f202_uniq_product_dow.p'.format(folder)) 65 | 66 | 67 | #============================================================================== 68 | # timezone 69 | #============================================================================== 70 | gc.collect() 71 | tbl = log.groupby(['product_id', 'timezone']).size().reset_index() 72 | tbl.columns = ['product_id', 'timezone', 'item_timezone_cnt'] 73 | 74 | tbl['item_timezone_ratio'] = (tbl.item_timezone_cnt / 
tbl.groupby('product_id').transform(np.sum).item_timezone_cnt).map(float) 75 | 76 | tbl.to_pickle('../feature/{}/f202_product_timezone.p'.format(folder)) 77 | 78 | # unique 79 | tbl = log.drop_duplicates(['user_id', 'product_id', 'timezone']).groupby(['product_id', 'timezone']).size().reset_index() 80 | tbl.columns = ['product_id', 'timezone', 'item_timezone_cnt_uniq'] 81 | 82 | tbl['item_timezone_ratio_uniq'] = (tbl.item_timezone_cnt_uniq / tbl.groupby('product_id').transform(np.sum).item_timezone_cnt_uniq).map(float) 83 | 84 | tbl.to_pickle('../feature/{}/f202_uniq_product_timezone.p'.format(folder)) 85 | 86 | #============================================================================== 87 | # timezone * dow 88 | #============================================================================== 89 | gc.collect() 90 | 91 | tbl = log.groupby(['product_id', 'order_dow', 'timezone']).size().reset_index() 92 | tbl.columns = ['product_id', 'order_dow', 'timezone', 'item_dow-tz_cnt'] 93 | 94 | tbl['item_dow-tz_ratio'] = (tbl['item_dow-tz_cnt'] / tbl.groupby('product_id').transform(np.sum)['item_dow-tz_cnt']).map(float) 95 | 96 | tbl.to_pickle('../feature/{}/f202_product_dow-timezone.p'.format(folder)) 97 | 98 | # unique 99 | tbl = log.drop_duplicates(['user_id', 'product_id', 'order_dow', 'timezone']).groupby(['product_id', 'order_dow', 'timezone']).size().reset_index() 100 | tbl.columns = ['product_id', 'order_dow', 'timezone', 'item_dow-tz_cnt_uniq'] 101 | 102 | tbl['item_dow-tz_ratio_uniq'] = (tbl['item_dow-tz_cnt_uniq'] / tbl.groupby('product_id').transform(np.sum)['item_dow-tz_cnt_uniq']).map(float) 103 | 104 | tbl.to_pickle('../feature/{}/f202_uniq_product_dow-timezone.p'.format(folder)) 105 | 106 | 107 | #============================================================================== 108 | # flat 109 | #============================================================================== 110 | gc.collect() 111 | tbl = pd.crosstab(log.product_id, log.dow_tz, normalize='index').add_prefix('item_flat_dow-tz_') 112 | 113 | tbl.reset_index().to_pickle('../feature/{}/f202_flat_product.p'.format(folder)) 114 | #============================================================================== 115 | # main 116 | #============================================================================== 117 | make(log[log.order_number_rev>0], 'trainT-0') 118 | make(log[log.order_number_rev>1], 'trainT-1') 119 | make(log[log.order_number_rev>2], 'trainT-2') 120 | 121 | make(log, 'test') 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | #============================================================================== 135 | utils.end(__file__) 136 | 137 | -------------------------------------------------------------------------------- /py_feature/203_cycle.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sat Jun 3 06:46:06 2017 5 | 6 | @author: konodera 7 | 8 | Item buy cycle 9 | 10 | """ 11 | 12 | import pandas as pd 13 | import numpy as np 14 | from tqdm import tqdm 15 | #import gc 16 | import utils 17 | utils.start(__file__) 18 | 19 | usecols = [ 'order_id', 'user_id', 'product_id', 'order_number', 'reordered', 'order_number_rev'] 20 | log = pd.merge(utils.read_pickles('../input/mk/log', usecols), 21 | utils.read_pickles('../input/mk/days_since_last_order'), 22 | on=['order_id','product_id'], how='left') 23 | 24 | def make(log, folder): 25 | 26 | tbl = 
log.groupby('product_id').days_since_last_order_this_item.mean().to_frame() 27 | tbl.columns = ['item_order_days_mean'] 28 | tbl['item_order_days_min'] = log.groupby('product_id').days_since_last_order_this_item.min() 29 | tbl['item_order_days_max'] = log.groupby('product_id').days_since_last_order_this_item.max() 30 | tbl['item_order_days_median'] = log.groupby('product_id').days_since_last_order_this_item.median() 31 | 32 | tbl['item_order_freq'] = log.groupby('product_id').size() 33 | 34 | tbl['item_reorderd_freq'] = log.groupby('product_id').reordered.sum() 35 | tbl['item_reorder_ratio'] = (tbl.item_reorderd_freq / tbl.item_order_freq).astype(np.float32) 36 | 37 | tbl['item_unique_user'] = log.drop_duplicates(['user_id', 'product_id']).groupby('product_id').size() 38 | tbl['item_order_per-user'] = tbl['item_order_freq'] / tbl['item_unique_user'] 39 | 40 | tbl.reset_index(inplace=True) 41 | 42 | 43 | tbl.to_pickle('../feature/{}/f203_product.p'.format(folder)) 44 | #============================================================================== 45 | # main 46 | #============================================================================== 47 | make(log[log.order_number_rev>0], 'trainT-0') 48 | make(log[log.order_number_rev>1], 'trainT-1') 49 | make(log[log.order_number_rev>2], 'trainT-2') 50 | 51 | make(log, 'test') 52 | 53 | 54 | 55 | #============================================================================== 56 | utils.end(__file__) 57 | 58 | -------------------------------------------------------------------------------- /py_feature/205_co-occur.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed Jun 7 22:00:22 2017 5 | 6 | @author: konodera 7 | 8 | nohup python -u 205_co-occur.py & 9 | 10 | 11 | === co-occurrence across order_number === 12 | What bought at t-1 (other than the item itself) leads to a reorder at t-0?
13 | exp: 14 | 30% of the users who bought banana at t-1 buy strawberry at t-0 15 | 16 | takes 3 hours 17 | """ 18 | 19 | import pandas as pd 20 | import numpy as np 21 | #from tqdm import tqdm 22 | from collections import Counter 23 | from itertools import product 24 | from operator import itemgetter 25 | import gc 26 | import multiprocessing as mp 27 | import utils 28 | utils.start(__file__) 29 | 30 | #============================================================================== 31 | # load 32 | #============================================================================== 33 | col = ['order_id', 'user_id', 'product_name', 't-1_product_name', 'order_number', 'order_number_rev'] 34 | order_tbl = pd.read_pickle('../input/mk/order_tbl.p')[col] 35 | order_tbl.sort_values(['user_id','order_number'], inplace=True) 36 | order_tbl['t-1_order_id'] = order_tbl.groupby('user_id')['order_id'].shift(1) 37 | order_tbl.reset_index(drop=True, inplace=True) 38 | 39 | prods = pd.read_pickle('../input/mk/goods.p')[['product_id','product_name']] 40 | 41 | log = utils.read_pickles('../input/mk/log', ['order_id', 'product_id', 'order_number_rev']) 42 | order_item_array = log.groupby('order_id').product_id.apply(np.array).reset_index() 43 | del log; gc.collect() 44 | #============================================================================== 45 | # def 46 | #============================================================================== 47 | 48 | def make(T): 49 | """ 50 | T = 0 51 | folder = 'trainT-0' 52 | """ 53 | if T==-1: 54 | folder = 'test' 55 | else: 56 | folder = 'trainT-'+str(T) 57 | print("start T:{} folder:{}".format(T, folder)) 58 | order_tbl_ = order_tbl[order_tbl.order_number_rev>T].dropna() # drop first order 59 | 60 | item2item = [] 61 | item_bunbo = Counter() 62 | for item_prior, item_now in order_tbl_[['t-1_product_name', 'product_name']].values: 63 | item2item += [i1+' -> '+i2 for i1, i2 in list(product(item_prior, item_now))] 64 | item_bunbo += Counter(item_prior) 65 | item2item = Counter(item2item) 66 | 67 | df = pd.DataFrame.from_dict(item2item, orient='index').reset_index() 68 | df.columns = ['item', 'cnt'] 69 | del item2item; gc.collect() 70 | 71 | df_ = pd.DataFrame.from_dict(item_bunbo, orient='index').reset_index() 72 | df_.columns = ['before', 'total_cnt'] 73 | del item_bunbo; gc.collect() 74 | 75 | df.sort_values('cnt', ascending=False, inplace=True) 76 | 77 | df['before'] = df.item.map(lambda x: x.split(' -> ')[0]) 78 | df['after'] = df.item.map(lambda x: x.split(' -> ')[1]) 79 | df = df[df.before!=df.after] 80 | 81 | df = pd.merge(df, df_, on='before', how='left') 82 | 83 | df['before_to_after_ratio'] = df.cnt / df.total_cnt 84 | df = df[['before', 'after', 'before_to_after_ratio']] 85 | gc.collect() 86 | 87 | df = pd.merge(df, prods.rename(columns={'product_name':'before', 'product_id':'before_id'}), 88 | on='before', how='left') 89 | df = pd.merge(df, prods.rename(columns={'product_name':'after', 'product_id':'after_id'}), 90 | on='after', how='left') 91 | 92 | df = df[['before_id', 'after_id', 'before_to_after_ratio']] 93 | gc.collect() 94 | """ 95 | df.head() 96 | before_id after_id before_to_after_ratio 97 | 0 47209 13176 0.288618 98 | 1 13176 47209 0.175736 99 | 2 13176 21137 0.148974 100 | 3 21137 13176 0.188769 101 | """ 102 | #============================================================================== 103 | print('Merge', T) 104 | #============================================================================== 105 | label = pd.read_pickle('../feature/{}/label_reordered.p'.format(folder)) 106 
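#==============================================================================
# aside: the counting scheme above in miniature -- "i1 -> i2" pairs across
# consecutive baskets, normalized by how often i1 appeared at t-1 (toy data,
# illustrative only)
#==============================================================================
from collections import Counter
from itertools import product as iproduct
pairs, bunbo = Counter(), Counter()
baskets = [({'banana'}, {'banana', 'strawberry'}), ({'banana'}, {'milk'})]
for prior, now in baskets:
    pairs.update(i1 + ' -> ' + i2 for i1, i2 in iproduct(prior, now))
    bunbo.update(prior)
assert pairs['banana -> strawberry'] / bunbo['banana'] == 0.5
#==============================================================================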
| label = pd.merge(label, order_tbl[['order_id', 't-1_order_id']], 107 | on='order_id', how='left') 108 | print('items bought so far and items bought at t-1') 109 | order_b4after = pd.merge(label, order_item_array.add_prefix('t-1_'), 110 | on='t-1_order_id', how='left') 111 | gc.collect() 112 | 113 | col = ['order_id', 't-1_product_id', 'product_id'] 114 | order_b4after = order_b4after[col] 115 | gc.collect() 116 | """ 117 | order_b4after.head() 118 | Out[9]: 119 | order_id t-1_product_id product_id 120 | 0 1187899 [46149, 39657, 38928, 25133, 10258, 35951, 130... 196 121 | 1 1187899 [46149, 39657, 38928, 25133, 10258, 35951, 130... 10258 122 | 2 1187899 [46149, 39657, 38928, 25133, 10258, 35951, 130... 10326 123 | 3 1187899 [46149, 39657, 38928, 25133, 10258, 35951, 130... 12427 124 | 4 1187899 [46149, 39657, 38928, 25133, 10258, 35951, 130... 13032 125 | """ 126 | #============================================================================== 127 | print('search max ratio',T) 128 | #============================================================================== 129 | df['key'] = df.before_id.map(str) + 'to' + df.after_id.map(str) 130 | 131 | ratio_tbl = {} 132 | for k,v in df[['key','before_to_after_ratio']].values: 133 | ratio_tbl[k] = v 134 | 135 | del df; gc.collect() 136 | 137 | def get_ratio(key): 138 | try: 139 | return ratio_tbl[key] 140 | except KeyError: 141 | return -1 142 | 143 | def search_max_ratio(before_items, item): 144 | """ 145 | before_items = order_tr.loc[0,'t-1_product_id'] 146 | item = order_tr.loc[0,'product_id'] 147 | """ 148 | comb = list(product(before_items, [item])) 149 | comb = [str(x) + 'to' + str(y) for x,y in sorted(comb, key=itemgetter(1))] 150 | return np.max([get_ratio(k) for k in comb]) 151 | 152 | 153 | print('== before_to_after_ratio ==', T) 154 | ret = [] 155 | for before_items, item in order_b4after[['t-1_product_id', 'product_id']].values: 156 | ret.append(search_max_ratio(before_items, item)) 157 | order_b4after['before_to_after_ratio'] = ret 158 | 159 | col = ['order_id', 'product_id', 'before_to_after_ratio'] 160 | order_b4after[col].to_pickle('../feature/{}/f205_order_product.p'.format(folder)) 161 | 162 | #============================================================================== 163 | # main 164 | #============================================================================== 165 | 166 | mp_pool = mp.Pool(3) 167 | mp_pool.map(make, [-1, 0, 1, 2, #3,# 4, 5 168 | ]) 169 | 170 | 171 | 172 | 173 | #============================================================================== 174 | utils.end(__file__) 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | -------------------------------------------------------------------------------- /py_feature/207_mean_pos_cart.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Fri Jun 16 07:11:23 2017 5 | 6 | @author: konodera 7 | """ 8 | 9 | import pandas as pd 10 | import numpy as np 11 | from tqdm import tqdm 12 | import utils 13 | utils.start(__file__) 14 | 15 | 16 | #============================================================================== 17 | # load 18 | #============================================================================== 19 | 20 | col = ['order_id', 'product_id', 'add_to_cart_order', 'order_number_rev'] 21 | log = utils.read_pickles('../input/mk/log', col) 22 | 23 | #============================================================================== 24 | # def 25 | 
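#==============================================================================
# aside: search_max_ratio in 205 above, reduced to its core -- for one
# candidate item, take the best transition ratio from any t-1 item
# (hypothetical ratios, illustrative only)
#==============================================================================
ratio_tbl_toy = {'47209to13176': 0.29, '13176to47209': 0.18}
def max_ratio(before_items, item):
    keys = ['{}to{}'.format(b, item) for b in before_items]
    return max(ratio_tbl_toy.get(k, -1) for k in keys)
assert max_ratio([47209, 21137], 13176) == 0.29   # best over the t-1 basket
#==============================================================================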
#============================================================================== 26 | def make(T): 27 | """ 28 | T = 0 29 | folder = 'trainT-0' 30 | """ 31 | if T==-1: 32 | folder = 'test' 33 | else: 34 | folder = 'trainT-'+str(T) 35 | 36 | log_ = log[log.order_number_rev>T] 37 | 38 | gr = log_.groupby('product_id') 39 | 40 | items = gr.add_to_cart_order.mean().to_frame() 41 | items.columns = ['item_mean_pos_cart'] 42 | items['item_sum_pos_cart'] = gr.add_to_cart_order.sum() 43 | items['item_min_pos_cart'] = gr.add_to_cart_order.min() 44 | items['item_median_pos_cart'] = gr.add_to_cart_order.median() 45 | items['item_max_pos_cart'] = gr.add_to_cart_order.max() 46 | items['item_std_pos_cart'] = gr.add_to_cart_order.std() 47 | items.reset_index(inplace=True) 48 | 49 | items.to_pickle('../feature/{}/f207_product.p'.format(folder)) 50 | 51 | #============================================================================== 52 | # main 53 | #============================================================================== 54 | make(0) 55 | make(1) 56 | make(2) 57 | 58 | make(-1) 59 | 60 | #============================================================================== 61 | utils.end(__file__) 62 | 63 | -------------------------------------------------------------------------------- /py_feature/208_one-shot.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue Jul 4 03:58:09 2017 5 | 6 | @author: konodera 7 | 8 | Number of users who buy the item only once 9 | 10 | """ 11 | 12 | import pandas as pd 13 | import numpy as np 14 | from tqdm import tqdm 15 | import utils 16 | utils.start(__file__) 17 | 18 | #============================================================================== 19 | # load 20 | #============================================================================== 21 | 22 | col = ['order_id', 'user_id', 'product_id', 'order_dow', 'order_hour_of_day', 'order_number_rev'] 23 | log = utils.read_pickles('../input/mk/log', col).sort_values('user_id') 24 | log = pd.merge(log, pd.read_pickle('../input/mk/timezone.p'), 25 | on='order_hour_of_day', how='left') 26 | 27 | #============================================================================== 28 | # def 29 | #============================================================================== 30 | def make(T): 31 | """ 32 | T = 0 33 | folder = 'trainT-0' 34 | """ 35 | if T==-1: 36 | folder = 'test' 37 | else: 38 | folder = 'trainT-'+str(T) 39 | 40 | log_ = log[log.order_number_rev>T] 41 | 42 | item = log_.groupby(['product_id', 'user_id']).size().reset_index() 43 | item.columns = ['product_id', 'user_id', 'cnt'] 44 | 45 | item_one = item[item.cnt==1].groupby('product_id').size().reset_index() 46 | item_one.columns = ['product_id', 'item_only_one_user_cnt'] 47 | 48 | item_size = item.groupby('product_id').size().reset_index() 49 | item_size.columns = ['product_id', 'item_unique_user'] 50 | 51 | item = pd.merge(item_one, item_size, on='product_id', how='left') 52 | item['item_only_one_user_cnt_ratio'] = item['item_only_one_user_cnt']/item['item_unique_user'] 53 | 54 | col = ['product_id', 'item_only_one_user_cnt', 'item_only_one_user_cnt_ratio'] 55 | item[col].to_pickle('../feature/{}/f208_product.p'.format(folder)) 56 | 57 | #============================================================================== 58 | # main 59 | #============================================================================== 60 | make(0) 61 | make(1) 62 | make(2) 63 | 64 | make(-1) 65 | 66 | 67 
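#==============================================================================
# aside: 208's "one-shot" share on a toy log -- of the users who ever bought
# an item, the fraction that bought it exactly once (toy frame, illustrative
# only)
#==============================================================================
import pandas as pd
toy = pd.DataFrame({'user_id': [1, 1, 2, 3], 'product_id': [7, 7, 7, 8]})
cnt = toy.groupby(['product_id', 'user_id']).size()
one_shot_ratio = (cnt == 1).groupby('product_id').mean()
# item 7: 1 of its 2 buyers bought it once -> 0.5; item 8 -> 1.0
#==============================================================================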
| 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | utils.end(__file__) 79 | 80 | -------------------------------------------------------------------------------- /py_feature/209_together.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed Jul 5 22:36:10 2017 5 | 6 | @author: konodera 7 | """ 8 | 9 | import pandas as pd 10 | import numpy as np 11 | from tqdm import tqdm 12 | import utils 13 | utils.start(__file__) 14 | 15 | #============================================================================== 16 | # load 17 | #============================================================================== 18 | 19 | col = ['order_id', 'user_id', 'product_id', 'order_number_rev'] 20 | log = utils.read_pickles('../input/mk/log', col).sort_values('user_id') 21 | 22 | #============================================================================== 23 | # def 24 | #============================================================================== 25 | def make(T): 26 | """ 27 | T = 0 28 | folder = 'trainT-0' 29 | """ 30 | if T==-1: 31 | folder = 'test' 32 | else: 33 | folder = 'trainT-'+str(T) 34 | 35 | log_ = log[log.order_number_rev>T] 36 | 37 | order_size = log_.groupby('order_id').size().reset_index() 38 | order_size.columns = ['order_id', 'total'] 39 | 40 | log_ = pd.merge(log_, order_size, on='order_id', how='left') 41 | 42 | item = log_.groupby('product_id').total.mean().to_frame() 43 | item.columns = ['item_together_mean'] 44 | 45 | item['item_together_min'] = log_.groupby('product_id').total.min() 46 | item['item_together_max'] = log_.groupby('product_id').total.max() 47 | item['item_together_std'] = log_.groupby('product_id').total.std() 48 | 49 | item.reset_index().to_pickle('../feature/{}/f209_product.p'.format(folder)) 50 | 51 | #============================================================================== 52 | # main 53 | #============================================================================== 54 | make(0) 55 | make(1) 56 | make(2) 57 | 58 | make(-1) 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | utils.end(__file__) 74 | 75 | -------------------------------------------------------------------------------- /py_feature/210_streak.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed Jul 5 22:36:10 2017 5 | 6 | @author: konodera 7 | """ 8 | 9 | import pandas as pd 10 | import numpy as np 11 | from tqdm import tqdm 12 | import utils 13 | utils.start(__file__) 14 | 15 | #============================================================================== 16 | # load 17 | #============================================================================== 18 | 19 | col = ['order_id', 'user_id', 'product_id', 'order_number_rev'] 20 | log = utils.read_pickles('../input/mk/log', col).sort_values('user_id') 21 | 22 | streak = pd.read_pickle('../input/mk/streak_order-product.p') 23 | #============================================================================== 24 | # def 25 | #============================================================================== 26 | def make(T): 27 | """ 28 | T = 0 29 | folder = 'trainT-0' 30 | """ 31 | if T==-1: 32 | folder = 'test' 33 | else: 34 | folder = 'trainT-'+str(T) 35 | 36 | log_ = pd.merge(log[log.order_number_rev>T], streak, 37 | on=['order_id', 'product_id'], how='left') 38 | 39 | gr = 
log_.groupby('product_id') 40 | item = gr.streak.mean().to_frame() 41 | item.columns = ['item_streak_mean'] 42 | 43 | item['item_streak_min'] = gr.streak.min() 44 | item['item_streak_max'] = gr.streak.max() 45 | item['item_streak_std'] = gr.streak.std() 46 | 47 | item.reset_index().to_pickle('../feature/{}/f210_product.p'.format(folder)) 48 | 49 | #============================================================================== 50 | # main 51 | #============================================================================== 52 | make(0) 53 | make(1) 54 | make(2) 55 | 56 | make(-1) 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | utils.end(__file__) 72 | 73 | -------------------------------------------------------------------------------- /py_feature/212_withinN.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed Jul 5 22:36:10 2017 5 | 6 | @author: konodera 7 | 8 | 9 | nohup python -u 212_withinN.py & 10 | 11 | 12 | """ 13 | 14 | import pandas as pd 15 | import gc 16 | import numpy as np 17 | from collections import defaultdict 18 | import multiprocessing as mp 19 | total_proc = 3 20 | import utils 21 | utils.start(__file__) 22 | 23 | #============================================================================== 24 | # load 25 | #============================================================================== 26 | 27 | usecols = ['product_id', 'user_id', 'order_number', 'order_id', 'order_number_rev'] 28 | log = utils.read_pickles('../input/mk/log', usecols).sort_values(usecols[:3]) 29 | 30 | #============================================================================== 31 | # def 32 | #============================================================================== 33 | 34 | def make(T): 35 | """ 36 | T = 0 37 | folder = 'trainT-0' 38 | """ 39 | if T==-1: 40 | folder = 'test' 41 | else: 42 | folder = 'trainT-'+str(T) 43 | 44 | log_ = log[log.order_number_rev>T] 45 | log_['user_max_onb'] = log_.groupby('user_id').order_number.transform(np.max) 46 | 47 | item_N2_cnt = defaultdict(int) 48 | item_N2_chance = defaultdict(int) 49 | item_N3_cnt = defaultdict(int) 50 | item_N3_chance = defaultdict(int) 51 | item_N4_cnt = defaultdict(int) 52 | item_N4_chance = defaultdict(int) 53 | item_N5_cnt = defaultdict(int) 54 | item_N5_chance = defaultdict(int) 55 | pid_bk = uid_bk = onb_bk = None 56 | # for pid, uid, onb, max_onb in tqdm(log_[['product_id', 'user_id', 'order_number','user_max_onb']].values): 57 | for pid, uid, onb, max_onb in log_[['product_id', 'user_id', 'order_number','user_max_onb']].values: 58 | 59 | if pid==pid_bk and uid==uid_bk and (onb-onb_bk)<=2 and (max_onb-onb) >=2: 60 | item_N2_cnt[pid] +=1 61 | if pid==pid_bk and uid==uid_bk and (max_onb-onb) >=2: 62 | item_N2_chance[pid] +=1 63 | 64 | if pid==pid_bk and uid==uid_bk and (onb-onb_bk)<=3 and (max_onb-onb) >=3: 65 | item_N3_cnt[pid] +=1 66 | if pid==pid_bk and uid==uid_bk and (max_onb-onb) >=3: 67 | item_N3_chance[pid] +=1 68 | 69 | if pid==pid_bk and uid==uid_bk and (onb-onb_bk)<=4 and (max_onb-onb) >=4: 70 | item_N4_cnt[pid] +=1 71 | if pid==pid_bk and uid==uid_bk and (max_onb-onb) >=4: 72 | item_N4_chance[pid] +=1 73 | 74 | if pid==pid_bk and uid==uid_bk and (onb-onb_bk)<=5 and (max_onb-onb) >=5: 75 | item_N5_cnt[pid] +=1 76 | if pid==pid_bk and uid==uid_bk and (max_onb-onb) >=5: 77 | item_N5_chance[pid] +=1 78 | 79 | pid_bk = pid 80 | uid_bk = uid 81 | onb_bk = onb 82 | 83 | item_N2_cnt = 
pd.DataFrame.from_dict(item_N2_cnt, orient='index').reset_index() 84 | item_N2_cnt.columns = ['product_id', 'item_N2_cnt'] 85 | item_N2_chance = pd.DataFrame.from_dict(item_N2_chance, orient='index').reset_index() 86 | item_N2_chance.columns = ['product_id', 'item_N2_chance'] 87 | 88 | item_N3_cnt = pd.DataFrame.from_dict(item_N3_cnt, orient='index').reset_index() 89 | item_N3_cnt.columns = ['product_id', 'item_N3_cnt'] 90 | item_N3_chance = pd.DataFrame.from_dict(item_N3_chance, orient='index').reset_index() 91 | item_N3_chance.columns = ['product_id', 'item_N3_chance'] 92 | 93 | item_N4_cnt = pd.DataFrame.from_dict(item_N4_cnt, orient='index').reset_index() 94 | item_N4_cnt.columns = ['product_id', 'item_N4_cnt'] 95 | item_N4_chance = pd.DataFrame.from_dict(item_N4_chance, orient='index').reset_index() 96 | item_N4_chance.columns = ['product_id', 'item_N4_chance'] 97 | 98 | item_N5_cnt = pd.DataFrame.from_dict(item_N5_cnt, orient='index').reset_index() 99 | item_N5_cnt.columns = ['product_id', 'item_N5_cnt'] 100 | item_N5_chance = pd.DataFrame.from_dict(item_N5_chance, orient='index').reset_index() 101 | item_N5_chance.columns = ['product_id', 'item_N5_chance'] 102 | 103 | df2 = pd.merge(item_N2_cnt, item_N2_chance, on='product_id', how='outer') 104 | df3 = pd.merge(item_N3_cnt, item_N3_chance, on='product_id', how='outer') 105 | df4 = pd.merge(item_N4_cnt, item_N4_chance, on='product_id', how='outer') 106 | df5 = pd.merge(item_N5_cnt, item_N5_chance, on='product_id', how='outer') 107 | 108 | df = pd.merge(pd.merge(df2, df3, on='product_id', how='outer'), 109 | pd.merge(df4, df5, on='product_id', how='outer'), 110 | on='product_id', how='outer').fillna(0) 111 | 112 | df['item_N2_ratio'] = df['item_N2_cnt']/df['item_N2_chance'] 113 | df['item_N3_ratio'] = df['item_N3_cnt']/df['item_N3_chance'] 114 | df['item_N4_ratio'] = df['item_N4_cnt']/df['item_N4_chance'] 115 | df['item_N5_ratio'] = df['item_N5_cnt']/df['item_N5_chance'] 116 | 117 | df.fillna(0, inplace=True) 118 | df.reset_index(drop=True, inplace=True) 119 | df.to_pickle('../feature/{}/f212_product.p'.format(folder)) 120 | 121 | #============================================================================== 122 | # main 123 | #============================================================================== 124 | 125 | mp_pool = mp.Pool(total_proc) 126 | mp_pool.map(make, range(-1,3)) 127 | 128 | #============================================================================== 129 | utils.end(__file__) 130 | 131 | -------------------------------------------------------------------------------- /py_feature/213_dow_diff.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon Aug 7 13:58:58 2017 5 | 6 | @author: konodera 7 | """ 8 | 9 | import pandas as pd 10 | import numpy as np 11 | import utils 12 | utils.start(__file__) 13 | 14 | #============================================================================== 15 | # load 16 | #============================================================================== 17 | 18 | usecols = ['product_id', 'order_dow', 'order_number_rev'] 19 | log = utils.read_pickles('../input/mk/log', usecols) 20 | 21 | #============================================================================== 22 | # def 23 | #============================================================================== 24 | 25 | def make(T): 26 | """ 27 | T = 0 28 | folder = 'trainT-0' 29 | """ 30 | if T==-1: 31 | folder = 'test' 32 | 
else:
33 | folder = 'trainT-'+str(T)
34 | 
35 | log_ = log[log.order_number_rev>T]
36 | 
37 | all_item_dist = log_.order_dow.value_counts(normalize=True).reset_index()
38 | all_item_dist.columns = ['order_dow', 'dow_dist_ratio']
39 | 
40 | tbl = log_.groupby(['product_id', 'order_dow']).size().reset_index()
41 | tbl.columns = ['product_id', 'order_dow', 'item_dow_cnt']
42 | tbl['item_dow_ratio'] = tbl.item_dow_cnt / tbl.groupby('product_id').transform(np.sum).item_dow_cnt
43 | 
44 | tbl = pd.merge(tbl, all_item_dist, on='order_dow', how='left')
45 | 
46 | tbl['item_dow_ratio_diff'] = tbl.item_dow_ratio - tbl.dow_dist_ratio
47 | 
48 | tbl[['product_id','order_dow', 'item_dow_ratio_diff']].to_pickle('../feature/{}/f213_product-dow.p'.format(folder))
49 | 
50 | #==============================================================================
51 | # main
52 | #==============================================================================
53 | make(0)
54 | make(1)
55 | make(2)
56 | 
57 | make(-1)
58 | 
59 | #==============================================================================
60 | utils.end(__file__)
61 | 
62 | 
63 | 
64 | 
65 | 
66 | 
67 | 
68 | 
69 | 
70 | 
71 | 
72 | 
73 | 
74 | 
75 | 
76 | 
77 | 
78 | 
-------------------------------------------------------------------------------- /py_feature/214_first_order.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Mon Aug 7 15:52:10 2017
5 | 
6 | @author: konodera
7 | 
8 | if t-1 == first buy, what's the ratio of reordered?
9 | 
10 | """
11 | 
12 | import pandas as pd
13 | import numpy as np
14 | from collections import defaultdict
15 | from tqdm import tqdm
16 | import utils
17 | utils.start(__file__)
18 | 
19 | #==============================================================================
20 | # load
21 | #==============================================================================
22 | 
23 | usecols = ['user_id', 'product_id', 'order_number', 'reordered', 'order_number_rev']
24 | log = utils.read_pickles('../input/mk/log', usecols).sort_values(usecols[:3])
25 | 
26 | #==============================================================================
27 | # def
28 | #==============================================================================
29 | 
30 | def make(T):
31 | """
32 | T = 0
33 | folder = 'trainT-0'
34 | """
35 | if T==-1:
36 | folder = 'test'
37 | else:
38 | folder = 'trainT-'+str(T)
39 | 
40 | log_ = log[log.order_number_rev>T]
41 | log_['user_max_onb'] = log_.groupby('user_id').order_number.transform(np.max)
42 | log_ = log_.groupby(['user_id', 'product_id']).head(2)
43 | 
44 | item_cnt = defaultdict(int)
45 | item_chance = defaultdict(int)
46 | pid_bk = uid_bk = onb_bk = None
47 | 
48 | for uid, pid, onb, max_onb in log_[['user_id', 'product_id', 'order_number', 'user_max_onb']].values:
49 | 
50 | if uid==uid_bk and pid==pid_bk and (onb-onb_bk==1):
51 | item_cnt[pid] +=1
52 | if onb!=max_onb:
53 | item_chance[pid] +=1
54 | 
55 | pid_bk = pid
56 | uid_bk = uid
57 | onb_bk = onb
58 | 
59 | item_cnt = pd.DataFrame.from_dict(item_cnt, orient='index').reset_index()
60 | item_cnt.columns = ['product_id', 'item_first_cnt']
61 | item_chance = pd.DataFrame.from_dict(item_chance, orient='index').reset_index()
62 | item_chance.columns = ['product_id', 'item_first_chance']
63 | 
64 | df = pd.merge(item_cnt, item_chance, on='product_id', how='outer').fillna(0)
65 | df['item_first_ratio'] = df.item_first_cnt/df.item_first_chance
66 | 
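# item_first_ratio: of the first purchases that still had at least one later
# order in which a repeat was possible (item_first_chance), the share that
# were repurchased in the very next order (item_first_cnt) -- the docstring's
# "if t-1 == first buy" reorder rate.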
67 | df.to_pickle('../feature/{}/f214_product.p'.format(folder))
68 | 
69 | 
70 | #==============================================================================
71 | # main
72 | #==============================================================================
73 | 
74 | make(0)
75 | make(1)
76 | make(2)
77 | 
78 | make(-1)
79 | 
80 | #==============================================================================
81 | 
82 | utils.end(__file__)
83 | 
84 | 
-------------------------------------------------------------------------------- /py_feature/215_onb_diff.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Wed Jul 5 22:36:10 2017
5 | 
6 | @author: konodera
7 | 
8 | 
9 | """
10 | 
11 | import pandas as pd
12 | import gc
13 | import numpy as np
14 | from collections import defaultdict
15 | from scipy.stats import skew
16 | import utils
17 | utils.start(__file__)
18 | 
19 | #==============================================================================
20 | # load
21 | #==============================================================================
22 | 
23 | col = ['product_id', 'user_id', 'order_number', 'order_number_rev']
24 | log = utils.read_pickles('../input/mk/log', col).sort_values(col[:3])
25 | 
26 | """
27 | 1 1 1
28 | 1 1 2
29 | 1 1 4
30 | 1 2 3
31 | 1 2 4
32 | 2 2 5
33 | """
34 | #==============================================================================
35 | # def
36 | #==============================================================================
37 | def make(T):
38 | """
39 | T = 0
40 | folder = 'trainT-0'
41 | """
42 | if T==-1:
43 | folder = 'test'
44 | else:
45 | folder = 'trainT-'+str(T)
46 | 
47 | log_ = log[log.order_number_rev>T]
48 | log_['user_max_onb'] = log_.groupby('user_id').order_number.transform(np.max)
49 | 
50 | item_min = defaultdict(int)
51 | item_mean = defaultdict(int)
52 | item_median = defaultdict(int)
53 | item_max = defaultdict(int)
54 | item_std = defaultdict(int)
55 | item_skew = defaultdict(int)
56 | 
57 | pid_bk = uid_bk = onb_bk = None
58 | diff = []
59 | 
60 | for pid, uid, onb, max_onb in log_[['product_id', 'user_id', 'order_number', 'user_max_onb']].values:
61 | 
62 | if pid==pid_bk and uid==uid_bk:
63 | diff.append(onb-onb_bk)
64 | """
65 | pattern would be like:
66 | onb -> diff
67 | 1111 1,2,3,4 -> [1,1,1]
68 | 11101 1,2,3,5 -> [1,1,2]
69 | 111 1,2,3 -> [1,1]
70 | 1101 1,2,4 -> [1,2]
71 | 1011 1,3,4 -> [2,1]
72 | """
73 | 
74 | elif pid==pid_bk and uid!=uid_bk:
75 | pass
76 | elif pid!=pid_bk:
77 | if len(diff)>0:
78 | item_min[pid_bk] = np.min(diff) # key on pid_bk: diff holds the gaps of the product whose block just ended
79 | item_mean[pid_bk] = np.mean(diff)
80 | item_median[pid_bk] = np.median(diff)
81 | item_max[pid_bk] = np.max(diff)
82 | item_std[pid_bk] = np.std(diff)
83 | item_skew[pid_bk] = skew(diff)
84 | diff = []
85 | 
86 | pid_bk = pid
87 | uid_bk = uid
88 | onb_bk = onb
89 | if len(diff)>0: item_min[pid_bk] = np.min(diff); item_mean[pid_bk] = np.mean(diff); item_median[pid_bk] = np.median(diff); item_max[pid_bk] = np.max(diff); item_std[pid_bk] = np.std(diff); item_skew[pid_bk] = skew(diff) # flush the last product after the loop
90 | item_min = pd.DataFrame.from_dict(item_min, orient='index').reset_index()
91 | item_min.columns = ['product_id', 'item_onb_diff_min']
92 | item_mean = pd.DataFrame.from_dict(item_mean, orient='index').reset_index()
93 | item_mean.columns = ['product_id', 'item_onb_diff_mean']
94 | item_median = pd.DataFrame.from_dict(item_median, orient='index').reset_index()
95 | item_median.columns = ['product_id', 'item_onb_diff_median']
96 | item_max = pd.DataFrame.from_dict(item_max, orient='index').reset_index()
97 | item_max.columns = ['product_id', 'item_onb_diff_max']
98 | item_std = pd.DataFrame.from_dict(item_std, orient='index').reset_index()
99 | item_std.columns = ['product_id', 'item_onb_diff_std']
100 | item_skew = pd.DataFrame.from_dict(item_skew, orient='index').reset_index()
101 | item_skew.columns = ['product_id', 'item_onb_diff_skew']
102 | 
103 | df1 = pd.merge(item_min, item_mean, on='product_id', how='outer')
104 | df2 = pd.merge(item_median, item_max, on='product_id', how='outer')
105 | df3 = pd.merge(item_std, item_skew, on='product_id', how='outer')
106 | 
107 | df = pd.merge(pd.merge(df1, df2, on='product_id', how='outer'),
108 | df3, on='product_id', how='outer')
109 | 
110 | df.fillna(-99, inplace=True)
111 | df.to_pickle('../feature/{}/f215_product.p'.format(folder))
112 | 
113 | 
114 | #==============================================================================
115 | # main
116 | #==============================================================================
117 | make(0)
118 | make(1)
119 | make(2)
120 | 
121 | make(-1)
122 | 
123 | 
124 | #==============================================================================
125 | utils.end(__file__)
126 | 
127 | 
-------------------------------------------------------------------------------- /py_feature/300_======user x item======: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KazukiOnodera/Instacart/416b6b0220d3aed62c8d323caa3ee46f4b614a72/py_feature/300_======user x item====== -------------------------------------------------------------------------------- /py_feature/301_total_buy.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Fri May 26 22:12:32 2017
5 | 
6 | @author: konodera
7 | 
8 | How many times has this user bought this item?
9 | *no leakage
10 | 
11 | """
12 | 
13 | import pandas as pd
14 | import numpy as np
15 | from tqdm import tqdm
16 | from collections import defaultdict
17 | import utils
18 | utils.start(__file__)
19 | 
20 | 
21 | col = ['order_id', 'user_id', 'product_id', 'order_number_rev']
22 | log = utils.read_pickles('../input/mk/log', col)
23 | 
24 | orders = pd.read_csv('../input/orders.csv.gz',usecols=['order_id','user_id','order_number'])
25 | 
26 | #==============================================================================
27 | # def
28 | #==============================================================================
29 | def make(T):
30 | """
31 | T = 0
32 | folder = 'trainT-0'
33 | """
34 | if T==-1:
35 | folder = 'test'
36 | else:
37 | folder = 'trainT-'+str(T)
38 | 
39 | label = pd.read_pickle('../feature/{}/label_reordered.p'.format(folder))
40 | df = pd.merge(label, orders, on='order_id', how='left')
41 | 
42 | total_buy = log[log.order_number_rev>T].groupby(['user_id', 'product_id']).size().reset_index()
43 | total_buy.columns = ['user_id', 'product_id','total_buy']
44 | 
45 | df = pd.merge(df, total_buy, on=['user_id', 'product_id'], how='left')
46 | df['total_buy_ratio'] = df.total_buy / (df.order_number-1)
47 | 
48 | col = ['order_id', 'product_id','total_buy', 'total_buy_ratio']
49 | df[col].to_pickle('../feature/{}/f301_order-product.p'.format(folder))
50 | 
51 | # near5
52 | df = pd.merge(label, orders, on='order_id', how='left')
53 | total_buy = log[log.order_number_rev>T][log.order_number_rev<=(T+5)].groupby(['user_id', 'product_id']).size().reset_index()
54 | total_buy.columns = ['user_id', 'product_id','total_buy_n5']
55 | 
56 | df = pd.merge(df, total_buy, on=['user_id', 'product_id'], how='left').fillna(0)
57 | df['total_buy_ratio_n5'] = df['total_buy_n5'] / 
df.order_number.map(lambda x: min(5, x)) 58 | 59 | col = ['order_id', 'product_id','total_buy_n5', 'total_buy_ratio_n5'] 60 | df[col].to_pickle('../feature/{}/f301_order-product_n5.p'.format(folder)) 61 | 62 | 63 | #============================================================================== 64 | # main 65 | #============================================================================== 66 | make(0) 67 | make(1) 68 | make(2) 69 | 70 | make(-1) 71 | 72 | #============================================================================== 73 | utils.end(__file__) 74 | 75 | -------------------------------------------------------------------------------- /py_feature/302-1_reorderd_all.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed Jun 14 09:42:55 2017 5 | 6 | @author: konodera 7 | 8 | 9 | """ 10 | 11 | import pandas as pd 12 | import numpy as np 13 | from tqdm import tqdm 14 | import multiprocessing as mp 15 | import utils 16 | utils.start(__file__) 17 | 18 | LOOP = 20 19 | #============================================================================== 20 | # load 21 | #============================================================================== 22 | order_tbl = pd.read_pickle('../input/mk/order_tbl.p')[['order_id', 'user_id', 'order_number']].sort_values(['user_id', 'order_number', 'order_id']) 23 | for i in range(1, LOOP): 24 | order_tbl['t-{}_order_id'.format(i)] = order_tbl.groupby('user_id')['order_id'].shift(i) 25 | 26 | col = [c for c in order_tbl.columns if 'order_id' in c] 27 | order_tbl = order_tbl[col] 28 | 29 | col = ['order_id', 'user_id', 'order_number', 'product_id', 'reordered'] 30 | log = utils.read_pickles('../input/mk/log', col) 31 | log.sort_values(['user_id', 'order_number', 'product_id'], inplace=True) 32 | 33 | 34 | #============================================================================== 35 | # def 36 | #============================================================================== 37 | def multi(T): 38 | """ 39 | T = 0 40 | folder = 'trainT-0' 41 | """ 42 | if T==-1: 43 | folder = 'test' 44 | else: 45 | folder = 'trainT-'+str(T) 46 | 47 | label = pd.read_pickle('../feature/{}/label_reordered.p'.format(folder)) 48 | df = pd.merge(label, order_tbl, on='order_id', how='left') 49 | 50 | for i in tqdm(range(1, LOOP)): 51 | oid = 't-{}_order_id'.format(i) 52 | v = 't-{}_reordered'.format(i) 53 | log_ = log.rename(columns={'order_id':oid, 54 | 'reordered':v})[[oid, 'product_id', v]] 55 | df = pd.merge(df, log_, on=[oid, 'product_id'], how='left') 56 | 57 | col = ['order_id', 'product_id'] + [c for c in df.columns if '_reordered' in c] 58 | 59 | df[col].fillna(-1).to_pickle('../feature/{}/f302_order-product_all.p'.format(folder)) 60 | #============================================================================== 61 | # main 62 | #============================================================================== 63 | mp_pool = mp.Pool(7) 64 | mp_pool.map(multi, [0, 1, 2, #3, 4, 5, 65 | -1]) 66 | 67 | 68 | 69 | #============================================================================== 70 | 71 | utils.end(__file__) 72 | 73 | -------------------------------------------------------------------------------- /py_feature/303_last_order_date.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sun May 28 18:06:05 2017 5 | 6 | @author: konodera 7 | 8 | 
How many days ago did this user last order this item?
9 | *not a leak
10 | 
11 | """
12 | 
13 | import pandas as pd
14 | import numpy as np
15 | from tqdm import tqdm
16 | import gc
17 | import utils
18 | utils.start(__file__)
19 | 
20 | 
21 | #==============================================================================
22 | # mk train * test log
23 | #==============================================================================
24 | tbl = utils.read_pickles('../input/mk/days_since_last_order')
25 | 
26 | #==============================================================================
27 | # def
28 | #==============================================================================
29 | def make(T):
30 | """
31 | T = 0
32 | folder = 'trainT-0'
33 | """
34 | if T==-1:
35 | folder = 'test'
36 | else:
37 | folder = 'trainT-'+str(T)
38 | 
39 | label = pd.read_pickle('../feature/{}/label_reordered.p'.format(folder))
40 | 
41 | df = pd.merge(label[['order_id', 'product_id']],
42 | tbl[['order_id', 'product_id','days_since_last_order_this_item']],
43 | on=['order_id', 'product_id'], how='left')
44 | 
45 | df.to_pickle('../feature/{}/f303_order-product.p'.format(folder))
46 | #==============================================================================
47 | # main
48 | #==============================================================================
49 | make(0)
50 | make(1)
51 | make(2)
52 | 
53 | make(-1)
54 | 
55 | 
56 | 
57 | #==============================================================================
58 | utils.end(__file__)
59 | 
60 | 
-------------------------------------------------------------------------------- /py_feature/304_buy_item_inarow.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Wed May 31 02:10:45 2017
5 | 
6 | @author: konodera
7 | 
8 | Current consecutive-purchase streak (as of this order)
9 | *leaky
10 | 
11 | """
12 | 
13 | import pandas as pd
14 | import numpy as np
15 | from tqdm import tqdm
16 | import utils
17 | utils.start(__file__)
18 | 
19 | 
20 | log = pd.read_pickle('../input/mk/log_inarow.p')
21 | X_base = pd.read_pickle('../feature/X_base_t3.p')
22 | 
23 | #==============================================================================
24 | # def
25 | #==============================================================================
26 | def make(T):
27 | """
28 | T = 0
29 | folder = 'trainT-0'
30 | """
31 | if T==-1:
32 | folder = 'test'
33 | else:
34 | folder = 'trainT-'+str(T)
35 | 
36 | label = pd.read_pickle('../feature/{}/label_reordered.p'.format(folder))
37 | label = pd.merge(label, X_base, on='order_id', how='left') # TODO: change to inner
38 | 
39 | # ======== T-1~3 ========
40 | for t in range(1,4):
41 | col = ['order_id', 'product_id', 'buy_item_inarow']
42 | df = pd.merge(label, log[col].rename(columns={'order_id':'t-{}_order_id'.format(t)}),
43 | on=['t-{}_order_id'.format(t),'product_id'], how='left')
44 | 
45 | col = ['order_id', 'order_number']
46 | df = pd.merge(df, log[col].rename(columns={'order_id':'t-{}_order_id'.format(t)}).drop_duplicates(),
47 | on=['t-{}_order_id'.format(t)], how='left')
48 | 
49 | df['buy_item_inarow_ratio'] = df['buy_item_inarow']/df['order_number']
50 | df = df.rename(columns={'buy_item_inarow':'t-{}_buy_item_inarow'.format(t),
51 | 'buy_item_inarow_ratio':'t-{}_buy_item_inarow_ratio'.format(t)})
52 | print(df.isnull().sum())
53 | df.fillna(0, inplace=1)
54 | df.reset_index(drop=1, inplace=1)
55 | 
56 | col = ['order_id', 'product_id', 't-{}_buy_item_inarow'.format(t),'t-{}_buy_item_inarow_ratio'.format(t)]
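# one pickle per lag t (t-1..t-3): the streak observed at that past order,
# plus its ratio to the order_number of that order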
57 | df[col].to_pickle('../feature/{}/f304-{}_order-product.p'.format(folder, t))
58 | 
59 | #==============================================================================
60 | # main
61 | #==============================================================================
62 | make(0)
63 | make(1)
64 | make(2)
65 | 
66 | make(-1)
67 | 
68 | #==============================================================================
69 | utils.end(__file__)
70 | 
71 | 
-------------------------------------------------------------------------------- /py_feature/305_last_order_num.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Wed Jun 14 00:00:43 2017
5 | 
6 | @author: konodera
7 | 
8 | 
9 | order_num - last_order_num
10 | """
11 | 
12 | 
13 | import pandas as pd
14 | import numpy as np
15 | from tqdm import tqdm
16 | import utils
17 | utils.start(__file__)
18 | 
19 | 
20 | col = ['order_id', 'user_id', 'product_id', 'order_number', 'order_number_rev']
21 | log = utils.read_pickles('../input/mk/log', col).sort_values(['user_id', 'order_number'])
22 | 
23 | orders = pd.read_csv('../input/orders.csv.gz', usecols=['order_id', 'order_number'])
24 | 
25 | X_base = pd.read_pickle('../feature/X_base_t3.p')
26 | X_base = pd.merge(X_base, orders, on='order_id', how='left')
27 | 
28 | 
29 | #==============================================================================
30 | # def
31 | #==============================================================================
32 | def make(T):
33 | """
34 | T = 0
35 | folder = 'trainT-0'
36 | """
37 | if T==-1:
38 | folder = 'test'
39 | else:
40 | folder = 'trainT-'+str(T)
41 | 
42 | label = pd.read_pickle('../feature/{}/label_reordered.p'.format(folder))
43 | label = pd.merge(label, X_base, on='order_id', how='left')
44 | 
45 | log_ = log[log.order_number_rev>T]
46 | log_.drop_duplicates(['user_id', 'product_id'], keep='last', inplace=True)
47 | log_.drop(['order_id','order_number_rev'], axis=1, inplace=1)
48 | log_.columns = ['user_id', 'product_id', 'last_order_number']
49 | 
50 | df = pd.merge(label, log_, on=['user_id', 'product_id'], how='left')
51 | df['order_number_diff'] = df.order_number - df.last_order_number
52 | 
53 | col = ['order_id', 'product_id', 'last_order_number', 'order_number_diff']
54 | df[col].to_pickle('../feature/{}/f305_order-product.p'.format(folder))
55 | 
56 | #==============================================================================
57 | # main
58 | #==============================================================================
59 | make(0)
60 | make(1)
61 | make(2)
62 | 
63 | make(-1)
64 | 
65 | 
66 | 
67 | 
68 | utils.end(__file__)
69 | 
70 | 
-------------------------------------------------------------------------------- /py_feature/306_mean_pos_cart.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Fri Jun 16 06:42:38 2017
5 | 
6 | @author: konodera
7 | 
8 | Mean add-to-cart position (pos_cart)
9 | 
10 | """
11 | 
12 | import pandas as pd
13 | import numpy as np
14 | from tqdm import tqdm
15 | import utils
16 | utils.start(__file__)
17 | 
18 | 
19 | col = ['order_id', 'user_id', 'product_id', 'add_to_cart_order', 'order_number_rev']
20 | log = utils.read_pickles('../input/mk/log', col)
21 | 
22 | #==============================================================================
23 | # def
24 | #==============================================================================
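# f306: summary stats of add_to_cart_order per (user, product), over the full
# history and, below, over the 5 most recent prior orders (*_n5); a low mean
# position suggests a habitual first pick.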
25 | def make(T):
26 | """
27 | T = 0
28 | folder = 'trainT-0'
29 | """
30 | if T==-1:
31 | folder = 'test'
32 | else:
33 | folder = 'trainT-'+str(T)
34 | 
35 | log_ = log[log.order_number_rev>T]
36 | 
37 | gr = log_.groupby(['user_id', 'product_id'])
38 | 
39 | user = gr.add_to_cart_order.mean().to_frame()
40 | user.columns = ['useritem_mean_pos_cart']
41 | user['useritem_sum_pos_cart'] = gr.add_to_cart_order.sum()
42 | user['useritem_min_pos_cart'] = gr.add_to_cart_order.min()
43 | user['useritem_median_pos_cart'] = gr.add_to_cart_order.median()
44 | user['useritem_max_pos_cart'] = gr.add_to_cart_order.max()
45 | user['useritem_std_pos_cart'] = gr.add_to_cart_order.std()
46 | user.reset_index(inplace=True)
47 | 
48 | user.to_pickle('../feature/{}/f306_user-product.p'.format(folder))
49 | 
50 | # === near5 ===
51 | log_ = log[log.order_number_rev>T][log.order_number_rev<=(T+5)]
52 | 
53 | gr = log_.groupby(['user_id', 'product_id'])
54 | 
55 | user = gr.add_to_cart_order.mean().to_frame()
56 | user.columns = ['useritem_mean_pos_cart_n5']
57 | user['useritem_sum_pos_cart_n5'] = gr.add_to_cart_order.sum()
58 | user['useritem_min_pos_cart_n5'] = gr.add_to_cart_order.min()
59 | user['useritem_median_pos_cart_n5'] = gr.add_to_cart_order.median()
60 | user['useritem_max_pos_cart_n5'] = gr.add_to_cart_order.max()
61 | user['useritem_std_pos_cart_n5'] = gr.add_to_cart_order.std()
62 | user.reset_index(inplace=True)
63 | 
64 | user.to_pickle('../feature/{}/f306_user-product_n5.p'.format(folder))
65 | 
66 | 
67 | #==============================================================================
68 | # main
69 | #==============================================================================
70 | make(0)
71 | make(1)
72 | make(2)
73 | 
74 | make(-1)
75 | 
76 | 
77 | 
78 | 
79 | #==============================================================================
80 | utils.end(__file__)
81 | 
82 | 
-------------------------------------------------------------------------------- /py_feature/307_timezone_dow.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Fri Jun 16 15:50:03 2017
5 | 
6 | @author: konodera
7 | 
8 | Share of this user's purchases of this item by time slot (and by day of week)
9 | 
10 | """
11 | 
12 | import pandas as pd
13 | import numpy as np
14 | from tqdm import tqdm
15 | from collections import defaultdict
16 | import utils
17 | utils.start(__file__)
18 | 
19 | #==============================================================================
20 | # load
21 | #==============================================================================
22 | 
23 | col = ['order_id', 'user_id', 'product_id', 'order_dow', 'order_hour_of_day', 'order_number_rev']
24 | log = utils.read_pickles('../input/mk/log', col).sort_values('user_id')
25 | log = pd.merge(log, pd.read_pickle('../input/mk/timezone.p'),
26 | on='order_hour_of_day', how='left')
27 | 
28 | #==============================================================================
29 | # def
30 | #==============================================================================
31 | def make(T):
32 | """
33 | T = 0
34 | folder = 'trainT-0'
35 | """
36 | if T==-1:
37 | folder = 'test'
38 | else:
39 | folder = 'trainT-'+str(T)
40 | 
41 | log_ = log[log.order_number_rev>T]
42 | 
43 | cnt = log_.groupby(['user_id', 'product_id', 'timezone']).size()
44 | cnt.name = 'useritem_buy_timezone_cnt'
45 | cnt = cnt.reset_index()
46 | 
47 | sum_ = log_.groupby(['user_id', 'product_id']).size()
48 | sum_.name = 'total'
49 | sum_ = sum_.reset_index()
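# the denominator is the user x item purchase total, so the timezone ratios
# sum to 1 within each (user, product); f308 instead divides by the user's
# order count per slot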
50 | 
51 | df = pd.merge(cnt, sum_, on=['user_id', 'product_id'], how='left')
52 | 
53 | df['useritem_buy_timezone_ratio'] = df.useritem_buy_timezone_cnt / df.total
54 | 
55 | col = ['user_id', 'product_id', 'timezone',
56 | 'useritem_buy_timezone_cnt', 'useritem_buy_timezone_ratio']
57 | 
58 | df[col].to_pickle('../feature/{}/f307_user-product-timezone.p'.format(folder))
59 | 
60 | #==============================================================================
61 | 
62 | 
63 | cnt = log_.groupby(['user_id', 'product_id', 'order_dow']).size()
64 | cnt.name = 'useritem_buy_dow_cnt'
65 | cnt = cnt.reset_index()
66 | 
67 | sum_ = log_.groupby(['user_id', 'product_id']).size()
68 | sum_.name = 'total'
69 | sum_ = sum_.reset_index()
70 | 
71 | df = pd.merge(cnt, sum_, on=['user_id', 'product_id'], how='left')
72 | 
73 | df['useritem_buy_dow_ratio'] = df.useritem_buy_dow_cnt / df.total
74 | 
75 | col = ['user_id', 'product_id', 'order_dow',
76 | 'useritem_buy_dow_cnt', 'useritem_buy_dow_ratio']
77 | 
78 | df[col].to_pickle('../feature/{}/f307_user-product-dow.p'.format(folder))
79 | 
80 | #==============================================================================
81 | # main
82 | #==============================================================================
83 | make(0)
84 | make(1)
85 | make(2)
86 | 
87 | make(-1)
88 | 
89 | 
90 | 
91 | 
92 | 
93 | 
94 | 
95 | #==============================================================================
96 | utils.end(__file__)
97 | 
-------------------------------------------------------------------------------- /py_feature/308_timezone_dow.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Sat Jun 17 23:28:15 2017
5 | 
6 | @author: konodera
7 | 
8 | Rate at which this user buys this item when ordering in a given time slot (or day of week)
9 | 
10 | """
11 | 
12 | import pandas as pd
13 | import numpy as np
14 | from tqdm import tqdm
15 | import utils
16 | utils.start(__file__)
17 | 
18 | #==============================================================================
19 | # load
20 | #==============================================================================
21 | 
22 | col = ['order_id', 'user_id', 'product_id', 'order_dow', 'order_hour_of_day', 'order_number_rev']
23 | log = utils.read_pickles('../input/mk/log', col).sort_values('user_id')
24 | log = pd.merge(log, pd.read_pickle('../input/mk/timezone.p'),
25 | on='order_hour_of_day', how='left')
26 | 
27 | #==============================================================================
28 | # def
29 | #==============================================================================
30 | def make(T):
31 | """
32 | T = 0
33 | folder = 'trainT-0'
34 | """
35 | if T==-1:
36 | folder = 'test'
37 | else:
38 | folder = 'trainT-'+str(T)
39 | 
40 | log_ = log[log.order_number_rev>T]
41 | 
42 | # timezone
43 | cnt = log_.groupby(['user_id', 'product_id', 'timezone']).size()
44 | cnt.name = 'useritem_buy_timezone_cnt'
45 | cnt = cnt.reset_index()
46 | 
47 | chance = log_.drop_duplicates('order_id').groupby(['user_id', 'timezone']).size()
48 | chance.name = 'total'
49 | chance = chance.reset_index()
50 | 
51 | df = pd.merge(cnt, chance, on=['user_id', 'timezone'], how='left')
52 | df['useritem_buy_timezone_ratio2'] = df.useritem_buy_timezone_cnt / df.total
53 | 
54 | col = ['user_id', 'product_id', 'timezone', 'useritem_buy_timezone_ratio2']
55 | 
56 | df[col].to_pickle('../feature/{}/f308_user-product-timezone.p'.format(folder))
57 | 
58 | # dow
59 | cnt = log_.groupby(['user_id', 'product_id', 'order_dow']).size()
60 | cnt.name = 'useritem_buy_dow_cnt'
61 | cnt = cnt.reset_index()
62 | 
63 | chance = log_.drop_duplicates('order_id').groupby(['user_id', 'order_dow']).size()
64 | chance.name = 'total'
65 | chance = chance.reset_index()
66 | 
67 | df = pd.merge(cnt, chance, on=['user_id', 'order_dow'], how='left')
68 | df['useritem_buy_dow_ratio2'] = df.useritem_buy_dow_cnt / df.total
69 | 
70 | col = ['user_id', 'product_id', 'order_dow', 'useritem_buy_dow_ratio2']
71 | 
72 | df[col].to_pickle('../feature/{}/f308_user-product-dow.p'.format(folder))
73 | 
74 | #==============================================================================
75 | # main
76 | #==============================================================================
77 | make(0)
78 | make(1)
79 | make(2)
80 | 
81 | make(-1)
82 | 
83 | 
84 | 
85 | 
86 | 
87 | 
88 | #==============================================================================
89 | utils.end(__file__)
90 | 
-------------------------------------------------------------------------------- /py_feature/309_order_ratio_by-chance.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Sun Jun 18 12:55:38 2017
5 | 
6 | @author: konodera
7 | 
8 | item order ratio divided by chance
9 | 
10 | 
11 | ex1:
12 | onb_buy = [5,8,9]
13 | onb_visit = [1,2,5,8,9]
14 | return: 3/3
15 | 
16 | ex2:
17 | onb_buy = [5,9]
18 | onb_visit = [1,2,5,8,9]
19 | return: 2/3
20 | 
21 | """
22 | 
23 | import pandas as pd
24 | import numpy as np
25 | from tqdm import tqdm
26 | import utils
27 | utils.start(__file__)
28 | 
29 | 
30 | #==============================================================================
31 | # load
32 | #==============================================================================
33 | col = ['order_id', 'user_id', 'product_id', 'order_number', 'order_number_rev']
34 | log = utils.read_pickles('../input/mk/log', col).sort_values(['user_id', 'product_id', 'order_number'])
35 | 
36 | 
37 | #==============================================================================
38 | # def
39 | #==============================================================================
40 | def make(T):
41 | """
42 | T = 0
43 | folder = 'trainT-0'
44 | """
45 | if T==-1:
46 | folder = 'test'
47 | else:
48 | folder = 'trainT-'+str(T)
49 | 
50 | log_ = log[log.order_number_rev>T]
51 | 
52 | cnt = log_.groupby(['user_id', 'product_id']).size()
53 | cnt.name = 'cnt'
54 | cnt = cnt.reset_index()
55 | 
56 | # chance
57 | user_onb_max = log_.groupby('user_id').order_number.max().reset_index()
58 | user_onb_max.columns = ['user_id', 'onb_max']
59 | 
60 | user_item_min = log_.groupby(['user_id', 'product_id']).order_number.min().reset_index()
61 | user_item_min.columns = ['user_id', 'product_id', 'onb_min']
62 | 
63 | chance = pd.merge(user_item_min, user_onb_max, on='user_id', how='left')
64 | chance['chance'] = chance.onb_max - chance.onb_min +1
65 | 
66 | df = pd.merge(cnt, chance, on=['user_id', 'product_id'], how='left')
67 | 
68 | df['order_ratio_bychance'] = df.cnt / df.chance
69 | 
70 | col = ['user_id', 'product_id', 'chance', 'order_ratio_bychance']
71 | df[col].to_pickle('../feature/{}/f309_user-product.p'.format(folder))
72 | 
73 | # === near5 ===
74 | log_ = log[log.order_number_rev>T][log.order_number_rev<=(T+5)]
75 | 
76 | cnt = log_.groupby(['user_id', 'product_id']).size()
77 | cnt.name = 'cnt'
78 | cnt = cnt.reset_index()
79 | 
80 | # chance
81 | user_onb_max = log_.groupby('user_id').order_number.max().reset_index()
82 | user_onb_max.columns = ['user_id', 'onb_max']
83 | 
84 | user_item_min = log_.groupby(['user_id', 'product_id']).order_number.min().reset_index()
85 | user_item_min.columns = ['user_id', 'product_id', 'onb_min']
86 | 
87 | chance = pd.merge(user_item_min, user_onb_max, on='user_id', how='left')
88 | chance['chance_n5'] = chance.onb_max - chance.onb_min +1
89 | 
90 | df = pd.merge(cnt, chance, on=['user_id', 'product_id'], how='left')
91 | 
92 | df['order_ratio_bychance_n5'] = df.cnt / df.chance_n5
93 | 
94 | col = ['user_id', 'product_id', 'chance_n5', 'order_ratio_bychance_n5']
95 | df[col].to_pickle('../feature/{}/f309_user-product_n5.p'.format(folder))
96 | 
97 | 
98 | #==============================================================================
99 | # main
100 | #==============================================================================
101 | make(0)
102 | make(1)
103 | make(2)
104 | 
105 | make(-1)
106 | 
107 | 
108 | 
109 | #==============================================================================
110 | utils.end(__file__)
111 | 
112 | 
-------------------------------------------------------------------------------- /py_feature/310_repeat_within_today.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Sun Jun 18 15:58:38 2017
5 | 
6 | @author: konodera
7 | 
8 | Has the user ever bought the same item again on the same day?
9 | 
10 | 
11 | """
12 | 
13 | import pandas as pd
14 | import numpy as np
15 | from tqdm import tqdm
16 | from collections import defaultdict
17 | import utils
18 | utils.start(__file__)
19 | 
20 | #==============================================================================
21 | # load
22 | #==============================================================================
23 | col = ['order_id', 'user_id', 'product_id', 'order_number','days_since_prior_order', 'order_number_rev']
24 | log = utils.read_pickles('../input/mk/log', col).sort_values(['user_id', 'product_id', 'order_number'])
25 | log.user_id = log.user_id.map(str)
26 | log.product_id = log.product_id.map(str)
27 | 
28 | 
29 | #==============================================================================
30 | # def
31 | #==============================================================================
32 | def make(T):
33 | """
34 | T = 0
35 | folder = 'trainT-0'
36 | """
37 | if T==-1:
38 | folder = 'test'
39 | else:
40 | folder = 'trainT-'+str(T)
41 | 
42 | log_ = log[log.order_number_rev>T]
43 | 
44 | uid_pid = {}
45 | uid_bk = pid_bk = onb_bk = None
46 | col = ['user_id', 'product_id', 'order_number', 'days_since_prior_order']
47 | 
48 | for uid,pid,onb,days in log_[col].values:
49 | # uid = str(uid)
50 | # pid = str(pid)
51 | if uid_bk is None:
52 | pass
53 | elif uid+'@'+pid in uid_pid:
54 | continue
55 | elif days == 0 and uid == uid_bk and pid == pid_bk and onb-onb_bk==1:
56 | uid_pid[uid+'@'+pid] = 1
57 | 
58 | uid_bk = uid
59 | pid_bk = pid
60 | onb_bk = onb
61 | 
62 | df = pd.DataFrame().from_dict(uid_pid, orient='index').reset_index()
63 | df.columns = ['uidpid', 'buy_within_sameday']
64 | df['user_id'] = df.uidpid.map(lambda x:x.split('@')[0])
65 | df['product_id'] = df.uidpid.map(lambda x:x.split('@')[1])
66 | 
67 | df = df[['user_id', 'product_id', 'buy_within_sameday']]
68 | for c in df.columns:
69 | df[c] = df[c].map(int)
70 | df.sort_values(df.columns.tolist(), inplace=True)
71 | df.reset_index(drop=1, inplace=1)
72 | 
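# buy_within_sameday == 1 iff the user once bought the item in two consecutive
# orders placed on the same day (days_since_prior_order == 0); only such
# (user, product) pairs end up in df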
73 | df.to_pickle('../feature/{}/f310_user-product.p'.format(folder))
74 | 
75 | #==============================================================================
76 | # main
77 | #==============================================================================
78 | make(0)
79 | make(1)
80 | make(2)
81 | 
82 | make(-1)
83 | 
84 | #==============================================================================
85 | utils.end(__file__)
86 | 
87 | 
-------------------------------------------------------------------------------- /py_feature/312_cycle.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Mon Jun 26 10:35:09 2017
5 | 
6 | @author: konodera
7 | 
8 | The user's reorder cycle (in days) for this item
9 | 
10 | """
11 | 
12 | import pandas as pd
13 | import numpy as np
14 | from tqdm import tqdm
15 | import utils
16 | utils.start(__file__)
17 | 
18 | 
19 | #==============================================================================
20 | # load
21 | #==============================================================================
22 | usecols = [ 'order_id', 'user_id', 'product_id', 'order_number', 'reordered', 'order_number_rev']
23 | log = pd.merge(utils.read_pickles('../input/mk/log', usecols),
24 | utils.read_pickles('../input/mk/days_since_last_order'),
25 | on=['order_id','product_id'], how='left')
26 | 
27 | 
28 | #==============================================================================
29 | # def
30 | #==============================================================================
31 | def make(T):
32 | """
33 | T = 0
34 | folder = 'trainT-0'
35 | """
36 | if T==-1:
37 | folder = 'test'
38 | else:
39 | folder = 'trainT-'+str(T)
40 | 
41 | log_ = log[log.order_number_rev>T]
42 | 
43 | key = ['user_id', 'product_id']
44 | tbl = log_.groupby(key).days_since_last_order_this_item.mean().to_frame()
45 | tbl.columns = ['useritem_order_days_mean']
46 | tbl['useritem_order_days_min'] = log_.groupby(key).days_since_last_order_this_item.min()
47 | tbl['useritem_order_days_max'] = log_.groupby(key).days_since_last_order_this_item.max()
48 | tbl['useritem_order_days_median'] = log_.groupby(key).days_since_last_order_this_item.median()
49 | 
50 | tbl.reset_index().to_pickle('../feature/{}/f312_user_product.p'.format(folder))
51 | 
52 | # === near5 ===
53 | log_ = log[log.order_number_rev>T][log.order_number_rev<=(T+5)]
54 | 
55 | key = ['user_id', 'product_id']
56 | tbl = log_.groupby(key).days_since_last_order_this_item.mean().to_frame()
57 | tbl.columns = ['useritem_order_days_mean_n5']
58 | tbl['useritem_order_days_min_n5'] = log_.groupby(key).days_since_last_order_this_item.min()
59 | tbl['useritem_order_days_max_n5'] = log_.groupby(key).days_since_last_order_this_item.max()
60 | tbl['useritem_order_days_median_n5'] = log_.groupby(key).days_since_last_order_this_item.median()
61 | 
62 | tbl.reset_index().to_pickle('../feature/{}/f312_user_product_n5.p'.format(folder))
63 | 
64 | #==============================================================================
65 | # main
66 | #==============================================================================
67 | make(0)
68 | make(1)
69 | make(2)
70 | 
71 | make(-1)
72 | 
73 | 
74 | 
75 | #==============================================================================
76 | utils.end(__file__)
77 | 
78 | 
-------------------------------------------------------------------------------- /py_feature/313_aisle_dep.py: --------------------------------------------------------------------------------
1 | 
#!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed Jul 12 00:48:08 2017 5 | 6 | @author: konodera 7 | 8 | aisle & department 9 | 10 | """ 11 | 12 | import pandas as pd 13 | import numpy as np 14 | from tqdm import tqdm 15 | import utils 16 | utils.start(__file__) 17 | 18 | 19 | #============================================================================== 20 | # load 21 | #============================================================================== 22 | usecols = [ 'order_id', 'user_id', 'product_id', 'order_number', 'order_number_rev'] 23 | log = utils.read_pickles('../input/mk/log', usecols) 24 | 25 | goods = pd.read_pickle('../input/mk/goods.p')[['product_id', 'aisle_id', 'department_id']] 26 | 27 | log = pd.merge(log, goods, on='product_id', how='left') 28 | 29 | #============================================================================== 30 | # def 31 | #============================================================================== 32 | def make(T): 33 | """ 34 | T = 0 35 | folder = 'trainT-0' 36 | """ 37 | if T==-1: 38 | folder = 'test' 39 | else: 40 | folder = 'trainT-'+str(T) 41 | 42 | log_ = log[log.order_number_rev>T] 43 | 44 | user = log_.groupby(['user_id']).size().to_frame() 45 | user.columns = ['total'] 46 | user.reset_index(inplace=True) 47 | 48 | user_aisle = log_.groupby(['user_id', 'aisle_id']).size().to_frame() 49 | user_aisle.columns = ['user_aisle_cnt'] 50 | user_aisle.reset_index(inplace=True) 51 | user_aisle = pd.merge(user_aisle, user, on='user_id', how='left') 52 | user_aisle['user_aisle_ratio'] = user_aisle.user_aisle_cnt / user_aisle.total 53 | user_aisle.drop('total', axis=1, inplace=True) 54 | user_aisle.to_pickle('../feature/{}/f313_user_aisle.p'.format(folder)) 55 | 56 | user_dep = log_.groupby(['user_id', 'department_id']).size().to_frame() 57 | user_dep.columns = ['user_dep_cnt'] 58 | user_dep.reset_index(inplace=True) 59 | user_dep = pd.merge(user, user_dep, on='user_id', how='left') 60 | user_dep['user_dep_ratio'] = user_dep.user_dep_cnt / user_dep.total 61 | user_dep.drop('total', axis=1, inplace=True) 62 | user_dep.to_pickle('../feature/{}/f313_user_dep.p'.format(folder)) 63 | 64 | #============================================================================== 65 | # main 66 | #============================================================================== 67 | make(0) 68 | make(1) 69 | make(2) 70 | 71 | make(-1) 72 | 73 | 74 | 75 | #============================================================================== 76 | utils.end(__file__) 77 | 78 | -------------------------------------------------------------------------------- /py_feature/314_co-occur.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sun Jul 16 16:02:01 2017 5 | 6 | @author: konodera 7 | """ 8 | 9 | import pandas as pd 10 | import numpy as np 11 | from tqdm import tqdm 12 | import utils 13 | utils.start(__file__) 14 | 15 | 16 | col = ['order_id', 'user_id', 'product_id', 'order_number', 'order_number_rev'] 17 | log = utils.read_pickles('../input/mk/log', col) 18 | 19 | #============================================================================== 20 | # def 21 | #============================================================================== 22 | def make(T): 23 | """ 24 | T = 0 25 | folder = 'trainT-0' 26 | """ 27 | if T==-1: 28 | folder = 'test' 29 | else: 30 | folder = 'trainT-'+str(T) 31 | 32 | log_ = log[log.order_number_rev>T] 33 | 34 | 
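# "co-occur" here means basket size: for each (user, product), summarize the
# sizes of the user's baskets that contained the product, then (below) compare
# them with the user's overall min/max basket size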
order_tbl = log_.groupby('order_id').size().to_frame()
35 | order_tbl.columns = ['order_size']
36 | order_tbl.reset_index(inplace=True)
37 | 
38 | order_tbl = pd.merge(order_tbl, log_[['order_id', 'user_id', 'product_id']])
39 | 
40 | col = ['user_id', 'product_id']
41 | tbl = log_.sort_values(col).drop_duplicates(col)[col]
42 | tbl = tbl.set_index(col)
43 | 
44 | gr = order_tbl.groupby(['user_id', 'product_id'])
45 | 
46 | tbl['useritem_cooccur-min'] = gr.order_size.min()
47 | tbl['useritem_cooccur-max'] = gr.order_size.max()
48 | tbl['useritem_cooccur-mean'] = gr.order_size.mean()
49 | tbl['useritem_cooccur-median'] = gr.order_size.median()
50 | tbl['useritem_cooccur-std'] = gr.order_size.std()
51 | tbl.reset_index(inplace=True)
52 | 
53 | user_osz = order_tbl.groupby(['user_id']).order_size.min().to_frame()
54 | user_osz.columns = ['user_order_size-min']
55 | user_osz['user_order_size-max'] = order_tbl.groupby(['user_id']).order_size.max()
56 | user_osz.reset_index(inplace=True)
57 | 
58 | tbl = pd.merge(tbl, user_osz, on='user_id', how='left')
59 | 
60 | tbl['useritem_cooccur-min-min'] = tbl['user_order_size-min'] - tbl['useritem_cooccur-min']
61 | tbl['useritem_cooccur-max-min'] = tbl['useritem_cooccur-max'] - tbl['useritem_cooccur-min']
62 | tbl['useritem_cooccur-max-max'] = tbl['user_order_size-max'] - tbl['useritem_cooccur-max']
63 | tbl.drop(['user_order_size-min', 'user_order_size-max'], axis=1, inplace=True)
64 | 
65 | tbl.to_pickle('../feature/{}/f314_user-product.p'.format(folder))
66 | 
67 | #==============================================================================
68 | # main
69 | #==============================================================================
70 | make(0)
71 | make(1)
72 | make(2)
73 | 
74 | make(-1)
75 | 
76 | 
77 | utils.end(__file__)
78 | 
79 | 
-------------------------------------------------------------------------------- /py_feature/315_streak.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Wed May 31 02:10:45 2017
5 | 
6 | @author: konodera
7 | 
8 | Current consecutive-purchase streak (as of this order)
9 | *leaky
10 | 
11 | """
12 | 
13 | import pandas as pd
14 | import numpy as np
15 | from tqdm import tqdm
16 | import multiprocessing as mp
17 | import utils
18 | utils.start(__file__)
19 | 
20 | 
21 | streak = pd.read_pickle('../input/mk/streak_order-product.p')
22 | X_base = pd.read_pickle('../feature/X_base_t3.p')
23 | 
24 | #==============================================================================
25 | # def
26 | #==============================================================================
27 | def multi(T):
28 | """
29 | T = 0
30 | folder = 'trainT-0'
31 | """
32 | if T==-1:
33 | folder = 'test'
34 | else:
35 | folder = 'trainT-'+str(T)
36 | 
37 | label = pd.read_pickle('../feature/{}/label_reordered.p'.format(folder))
38 | label = pd.merge(label, X_base, on='order_id', how='inner')
39 | 
40 | # ======== T-1~3 ========
41 | for t in range(1,4):
42 | 
43 | df = pd.merge(label, streak.rename(columns={'order_id':'t-{}_order_id'.format(t),
44 | 'streak':'t-{}_streak'.format(t)}),
45 | on=['t-{}_order_id'.format(t),'product_id'], how='left')
46 | 
47 | print(df.isnull().sum())
48 | df.fillna(-99, inplace=1)
49 | df.reset_index(drop=1, inplace=1)
50 | 
51 | col = ['order_id', 'product_id', 't-{}_streak'.format(t)]
52 | df[col].to_pickle('../feature/{}/f315-{}_order-product.p'.format(folder, t))
53 | 
54 | #==============================================================================
55 | # main
56 | #============================================================================== 57 | mp_pool = mp.Pool(3) 58 | callback = mp_pool.map(multi, list(range(-1,3))) 59 | 60 | #============================================================================== 61 | utils.end(__file__) 62 | 63 | -------------------------------------------------------------------------------- /py_feature/316_replacement.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed Jul 5 22:36:10 2017 5 | 6 | @author: konodera 7 | 8 | 9 | nohup python -u 316_replacement.py & 10 | 11 | 12 | """ 13 | 14 | import pandas as pd 15 | import gc 16 | import numpy as np 17 | from tqdm import tqdm 18 | from collections import defaultdict 19 | from itertools import product 20 | import utils 21 | utils.start(__file__) 22 | 23 | #============================================================================== 24 | # load 25 | #============================================================================== 26 | 27 | usecols = ['user_id', 'order_number', 'product_id', 'product_name', 'order_id', 'order_number_rev'] 28 | log = utils.read_pickles('../input/mk/log', usecols).sort_values(usecols[:3]) 29 | order_pids = log.groupby('order_id').product_id.apply(set).reset_index() 30 | 31 | #item = pd.read_pickle('../input/mk/replacement2.p').head(999) 32 | item = pd.read_pickle('../input/mk/replacement.p') 33 | item = item[item.back>9] 34 | 35 | # parse 36 | item_di = defaultdict(int) 37 | for pid1,pid2,ratio in item[['pid1', 'pid2', 'ratio']].values: 38 | item_di['{} {}'.format(int(pid1),int(pid2))] = ratio 39 | #============================================================================== 40 | # def 41 | #============================================================================== 42 | def make(T): 43 | """ 44 | T = 0 45 | folder = 'trainT-0' 46 | """ 47 | if T==-1: 48 | folder = 'test' 49 | else: 50 | folder = 'trainT-'+str(T) 51 | 52 | X_base = pd.read_pickle('../feature/X_base_t3.p') 53 | label = pd.read_pickle('../feature/{}/label_reordered.p'.format(folder)) 54 | 55 | # 'inner' for removing t-n_order_id == NaN 56 | if 'train' in folder: 57 | df = pd.merge(X_base[X_base.is_train==1], label, on='order_id', how='inner') 58 | elif folder == 'test': 59 | df = pd.merge(X_base[X_base.is_train==0], label, on='order_id', how='inner') 60 | 61 | df = pd.merge(df, 62 | order_pids.add_prefix('t-1_'), 63 | on='t-1_order_id', how='left') 64 | df = pd.merge(df, 65 | order_pids.add_prefix('t-2_'), 66 | on='t-2_order_id', how='left') 67 | 68 | ratio_min = [] 69 | ratio_mean = [] 70 | ratio_max = [] 71 | ratio_sum = [] 72 | ratio_len = [] 73 | for t_2,t_1,pid in tqdm(df[['t-2_product_id', 't-1_product_id', 'product_id']].values, miniters=99999): 74 | rep = t_1 - t_2 75 | if pid not in t_1 and pid in t_2 and len(rep)>0: 76 | ratios = [item_di['{} {}'.format(i1,i2)] for i1,i2 in list(product([pid], rep))] 77 | ratio_min.append(np.min(ratios)) 78 | ratio_mean.append(np.mean(ratios)) 79 | ratio_max.append(np.max(ratios)) 80 | ratio_sum.append(np.sum(ratios)) 81 | ratio_len.append(len(ratios)) 82 | else: 83 | ratio_min.append(-1) 84 | ratio_mean.append(-1) 85 | ratio_max.append(-1) 86 | ratio_sum.append(-1) 87 | ratio_len.append(-1) 88 | 89 | df['comeback_ratio_min'] = ratio_min 90 | df['comeback_ratio_mean'] = ratio_mean 91 | df['comeback_ratio_max'] = ratio_max 92 | df['comeback_ratio_sum'] = ratio_sum 93 | df['comeback_ratio_len'] = ratio_len 94 | 
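# comeback_ratio_*: for a product seen at t-2 but dropped at t-1, aggregate the
# replacement-table ratios between it and the items that newly appeared at t-1
# (rep = t_1 - t_2); -1 marks rows with no such replacement evidence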
95 | col = ['order_id', 'product_id', 'comeback_ratio_min', 'comeback_ratio_mean', 96 | 'comeback_ratio_max', 'comeback_ratio_sum', 'comeback_ratio_len'] 97 | df[col].to_pickle('../feature/{}/f316_order_product.p'.format(folder)) 98 | del df 99 | gc.collect() 100 | 101 | #============================================================================== 102 | # main 103 | #============================================================================== 104 | make(0) 105 | make(1) 106 | make(2) 107 | 108 | make(-1) 109 | 110 | #============================================================================== 111 | utils.end(__file__) 112 | 113 | -------------------------------------------------------------------------------- /py_feature/400_===== daytime =====: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KazukiOnodera/Instacart/416b6b0220d3aed62c8d323caa3ee46f4b614a72/py_feature/400_===== daytime ===== -------------------------------------------------------------------------------- /py_feature/401_how_many_come.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sun Jun 18 01:09:41 2017 5 | 6 | @author: konodera 7 | """ 8 | 9 | import pandas as pd 10 | import numpy as np 11 | from tqdm import tqdm 12 | import utils 13 | utils.start(__file__) 14 | 15 | 16 | #============================================================================== 17 | # load 18 | #============================================================================== 19 | col = ['order_id', 'user_id', 'product_id', 'order_number', 'order_dow', 'order_hour_of_day', 'order_number_rev'] 20 | log = utils.read_pickles('../input/mk/log', col).sort_values(['user_id', 'product_id', 'order_number']) 21 | 22 | 23 | #============================================================================== 24 | # def 25 | #============================================================================== 26 | def make(T): 27 | """ 28 | T = 0 29 | folder = 'trainT-0' 30 | """ 31 | if T==-1: 32 | folder = 'test' 33 | else: 34 | folder = 'trainT-'+str(T) 35 | 36 | log_ = log[log.order_number_rev>T] 37 | 38 | # dow 39 | dow = log_.drop_duplicates('order_id').groupby('order_dow').size() 40 | dow.name = 'dow_order_cnt' 41 | dow = dow.to_frame() 42 | 43 | dow['dow_item_cnt'] = log_.groupby('order_dow').size() 44 | 45 | dow /= dow.sum() 46 | 47 | dow['dow_rank_diff'] = dow.dow_order_cnt.rank() - dow.dow_item_cnt.rank() 48 | 49 | dow.reset_index().to_pickle('../feature/{}/f401_dow.p'.format(folder)) 50 | 51 | 52 | # hour 53 | hour = log_.drop_duplicates('order_id').groupby('order_hour_of_day').size() 54 | hour.name = 'hour_order_cnt' 55 | hour = hour.to_frame() 56 | 57 | hour['hour_item_cnt'] = log_.groupby('order_hour_of_day').size() 58 | 59 | hour /= hour.sum() 60 | 61 | hour['hour_rank_diff'] = hour.hour_order_cnt.rank() - hour.hour_item_cnt.rank() 62 | 63 | hour.reset_index().to_pickle('../feature/{}/f401_hour.p'.format(folder)) 64 | 65 | #============================================================================== 66 | # main 67 | #============================================================================== 68 | make(0) 69 | make(1) 70 | make(2) 71 | 72 | make(-1) 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | #============================================================================== 81 | utils.end(__file__) 82 | 83 | 
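# f401 contrasts, per dow/hour slot, the share of orders with the share of
# items bought; a positive rank_diff marks slots where people order often but
# buy few items per visit. Toy illustration (hypothetical numbers):
#   dow_order_cnt = [0.20, 0.10] -> ranks [2, 1]
#   dow_item_cnt  = [0.15, 0.15] -> ranks [1.5, 1.5] (ties get average rank)
#   dow_rank_diff = [+0.5, -0.5]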
-------------------------------------------------------------------------------- /py_feature/500_===== concat =====: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KazukiOnodera/Instacart/416b6b0220d3aed62c8d323caa3ee46f4b614a72/py_feature/500_===== concat ===== -------------------------------------------------------------------------------- /py_feature/900_===== run =====: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KazukiOnodera/Instacart/416b6b0220d3aed62c8d323caa3ee46f4b614a72/py_feature/900_===== run ===== -------------------------------------------------------------------------------- /py_feature/901_run_feature.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Thu May 18 23:13:37 2017 5 | 6 | @author: konodera 7 | 8 | 9 | nohup python -u 901_run_feature.py > log_run_feature.txt & 10 | 11 | 12 | from glob import glob 13 | files = [f for f in sorted(glob('*.py')) if f[0].isdigit()] 14 | for f in files: 15 | print("os.system('python -u {}')".format(f)) 16 | 17 | 18 | """ 19 | 20 | import os 21 | import utils 22 | utils.start(__file__) 23 | 24 | utils.mkdir_p('../input/mk') 25 | utils.mkdir_p('../output') 26 | utils.mkdir_p('../output/model') 27 | utils.mkdir_p('../output/sub') 28 | utils.mkdir_p('../output/imp') 29 | utils.mkdir_p('../feature') 30 | utils.mkdir_p('../feature/trainT-0') 31 | utils.mkdir_p('../feature/trainT-1') 32 | utils.mkdir_p('../feature/trainT-2') 33 | utils.mkdir_p('../feature/test') 34 | 35 | 36 | os.system('python -u 000_mk.py') 37 | os.system('python -u 003_X_base_T.py') 38 | os.system('python -u 004_label.py') 39 | os.system('python -u 005_inarow.py') 40 | os.system('python -u 006_days_since_last_order.py') 41 | os.system('python -u 007_timezone.py') 42 | os.system('python -u 008_product_feature.py') 43 | os.system('python -u 009_None.py') 44 | os.system('python -u 010_streak.py') 45 | os.system('python -u 011_replacement.py') 46 | os.system('python -u 012_aisle_dep_cumsum.py') 47 | 48 | os.system('nohup python -u 101_repeat_previous_ratio_T.py &') 49 | os.system('python -u 102_orderspan_average.py') 50 | os.system('nohup python -u 103_visit_time.py &') 51 | os.system('python -u 104_organic.py') 52 | os.system('python -u 105_delta_time.py') 53 | os.system('python -u 108_order_size.py') 54 | os.system('python -u 109_have_you_bought.py') 55 | os.system('python -u 110_None.py') 56 | 57 | os.system('nohup python -u 202_buy_time.py &') 58 | os.system('python -u 203_cycle.py') 59 | os.system('nohup python -u 205_co-occur.py &') 60 | os.system('python -u 207_mean_pos_cart.py') 61 | os.system('python -u 208_one-shot.py') 62 | os.system('python -u 209_together.py') 63 | os.system('nohup python -u 210_streak.py &') 64 | os.system('nohup python -u 211_1to1.py &') 65 | os.system('nohup python -u 212_withinN.py &') 66 | os.system('nohup python -u 213_dow_diff.py &') 67 | os.system('nohup python -u 214_first_order.py &') 68 | os.system('nohup python -u 215_onb_diff.py &') 69 | 70 | os.system('python -u 301_total_buy.py') 71 | os.system('nohup python -u 302-1_reorderd_all.py &') 72 | os.system('nohup python -u 303_last_order_date.py &') 73 | os.system('nohup python -u 304_buy_item_inarow.py &') 74 | os.system('nohup python -u 305_last_order_num.py &') 75 | os.system('nohup python -u 306_mean_pos_cart.py &') 76 | 
os.system('nohup python -u 307_timezone_dow.py &') 77 | os.system('nohup python -u 308_timezone_dow.py &') 78 | os.system('nohup python -u 309_order_ratio_by-chance.py &') 79 | os.system('python -u 310_repeat_within_today.py') 80 | os.system('python -u 312_cycle.py') 81 | os.system('python -u 313_aisle_dep.py') 82 | os.system('python -u 314_co-occur.py') 83 | os.system('nohup python -u 315_streak.py &') 84 | os.system('nohup python -u 316_replacement.py &') 85 | 86 | os.system('python -u 401_how_many_come.py') 87 | 88 | 89 | 90 | #============================================================================== 91 | utils.end(__file__) 92 | -------------------------------------------------------------------------------- /py_feature/902_run_concat.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Thu May 18 23:13:37 2017 5 | 6 | @author: konodera 7 | 8 | 9 | 10 | nohup python -u 902_run_concat.py > log_run_concat.txt & 11 | 12 | 13 | """ 14 | 15 | import os 16 | import utils 17 | utils.start(__file__) 18 | 19 | os.system('python -u 501_concat.py') 20 | os.system('python -u 502_concat.py') 21 | 22 | 23 | 24 | utils.end(__file__) -------------------------------------------------------------------------------- /py_feature/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sat Mar 17 19:37:49 2018 5 | 6 | @author: Kazuki 7 | """ 8 | 9 | import os 10 | from time import sleep 11 | import sys 12 | argv = sys.argv 13 | 14 | file = argv[1] 15 | if len(argv)>2: 16 | sec = 60 * int(argv[2]) 17 | print(f'wait {sec} sec') 18 | else: 19 | sec = 0 20 | 21 | sleep(sec) 22 | os.system(f'nohup python -u {file} > LOG/log_{file}.txt &') 23 | 24 | -------------------------------------------------------------------------------- /py_feature/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed May 17 01:21:53 2017 4 | 5 | @author: konodera 6 | """ 7 | 8 | import warnings 9 | warnings.filterwarnings("ignore") 10 | import pandas as pd 11 | import numpy as np 12 | from glob import glob 13 | import os 14 | from tqdm import tqdm 15 | from sklearn.model_selection import KFold 16 | #import pickle 17 | from time import time 18 | from datetime import datetime 19 | import gc 20 | #from itertools import chain 21 | 22 | 23 | # ============================================================================= 24 | # def 25 | # ============================================================================= 26 | def start(fname): 27 | global st_time 28 | st_time = time() 29 | print(""" 30 | #============================================================================== 31 | # START!!! {} PID: {} time: {} 32 | #============================================================================== 33 | """.format( fname, os.getpid(), datetime.today() )) 34 | 35 | # send_line(f'START {fname} time: {elapsed_minute():.2f}min') 36 | 37 | return 38 | 39 | def end(fname): 40 | 41 | print(""" 42 | #============================================================================== 43 | # SUCCESS !!! 
{} 44 | #============================================================================== 45 | """.format(fname)) 46 | print('time: {:.2f}min'.format( elapsed_minute() )) 47 | 48 | # send_line(f'FINISH {fname} time: {elapsed_minute():.2f}min') 49 | 50 | return 51 | 52 | def elapsed_minute(): 53 | return (time() - st_time)/60 54 | 55 | def mkdir_p(path): 56 | try: 57 | os.stat(path) 58 | except: 59 | os.mkdir(path) 60 | 61 | def to_pickles(df, path, split_size=3, inplace=True): 62 | """ 63 | path = '../output/mydf' 64 | 65 | write '../output/mydf/0.p' 66 | '../output/mydf/1.p' 67 | '../output/mydf/2.p' 68 | 69 | """ 70 | if inplace==True: 71 | df.reset_index(drop=True, inplace=True) 72 | else: 73 | df = df.reset_index(drop=True) 74 | gc.collect() 75 | mkdir_p(path) 76 | 77 | kf = KFold(n_splits=split_size) 78 | for i, (train_index, val_index) in enumerate(tqdm(kf.split(df))): 79 | df.iloc[val_index].to_pickle(f'{path}/{i:03d}.p') 80 | return 81 | 82 | def read_pickles(path, col=None): 83 | if col is None: 84 | df = pd.concat([pd.read_pickle(f) for f in tqdm(sorted(glob(path+'/*')))]) 85 | else: 86 | df = pd.concat([pd.read_pickle(f)[col] for f in tqdm(sorted(glob(path+'/*')))]) 87 | return df 88 | 89 | def reduce_memory(df, ix_start=0): 90 | df.fillna(-1, inplace=True) 91 | df_ = df.sample(9999, random_state=71) 92 | ## int 93 | col_int8 = [] 94 | col_int16 = [] 95 | col_int32 = [] 96 | for c in tqdm(df.columns[ix_start:], miniters=20): 97 | if df[c].dtype=='O': 98 | continue 99 | if (df_[c] == df_[c].astype(np.int8)).all(): 100 | col_int8.append(c) 101 | elif (df_[c] == df_[c].astype(np.int16)).all(): 102 | col_int16.append(c) 103 | elif (df_[c] == df_[c].astype(np.int32)).all(): 104 | col_int32.append(c) 105 | 106 | df[col_int8] = df[col_int8].astype(np.int8) 107 | df[col_int16] = df[col_int16].astype(np.int16) 108 | df[col_int32] = df[col_int32].astype(np.int32) 109 | 110 | ## float 111 | col = [c for c in df.dtypes[df.dtypes==np.float64].index if '_id' not in c] 112 | df[col] = df[col].astype(np.float32) 113 | 114 | gc.collect() 115 | 116 | #============================================================================== 117 | # main 118 | #============================================================================== 119 | if __name__ == "__main__": 120 | 121 | files = sorted(glob('../input/*')) 122 | data = {} 123 | for f in files: 124 | if os.path.isfile(f): 125 | data[f.split('/')[-1]] = pd.read_csv(f) 126 | 127 | print(""" 128 | #============================================================================== 129 | # SUCCESS !!! 
{} 130 | #============================================================================== 131 | """.format(__file__)) 132 | 133 | -------------------------------------------------------------------------------- /py_model/000_====== user x item prediction ======: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KazukiOnodera/Instacart/416b6b0220d3aed62c8d323caa3ee46f4b614a72/py_model/000_====== user x item prediction ====== -------------------------------------------------------------------------------- /py_model/002_xgb_holdout_item_812_1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue May 30 23:28:19 2017 5 | 6 | @author: konodera 7 | 8 | nohup python -u 002_xgb_holdout_item_812_1.py > LOG/_xgb_item.txt & 9 | 10 | 11 | """ 12 | 13 | import warnings 14 | warnings.filterwarnings("ignore") 15 | import pandas as pd 16 | import numpy as np 17 | import gc 18 | import xgboost as xgb 19 | import utils 20 | 21 | utils.start(__file__) 22 | 23 | 24 | 25 | # setting 26 | DATE = '812_1' 27 | LOOP = 2 28 | ESR = 40 29 | 30 | #seed = np.random.randint(99999) 31 | seed = 71 32 | 33 | np.random.seed(seed) 34 | 35 | valid_size = 0.05 36 | 37 | 38 | # XGB param 39 | nround = 10000 40 | #nround = 10 41 | 42 | param = {'max_depth':10, 43 | 'eta':0.02, 44 | 'colsample_bytree':0.4, 45 | 'subsample':0.75, 46 | 'silent':1, 47 | 'nthread':27, 48 | 'eval_metric':'logloss', 49 | 'objective':'binary:logistic', 50 | 'tree_method':'hist' 51 | } 52 | 53 | print("""#==== print param ======""") 54 | print('DATE:', DATE) 55 | print('seed:', seed) 56 | 57 | #============================================================================== 58 | # prepare 59 | #============================================================================== 60 | train = pd.concat([utils.load_pred_item('trainT-0'), 61 | utils.load_pred_item('trainT-1'), 62 | utils.load_pred_item('trainT-2') 63 | ], ignore_index=True) 64 | 65 | y_train = train['y'] 66 | X_train = train.drop('y', axis=1) 67 | del train 68 | gc.collect() 69 | 70 | # drop id 71 | col = [c for c in X_train.columns if '_id' in c] + ['is_train'] 72 | col.remove('user_id') 73 | print('drop1',col) 74 | X_train.drop(col, axis=1, inplace=True) # keep user_id 75 | 76 | # drop obj 77 | col = X_train.dtypes[X_train.dtypes=='object'].index.tolist() 78 | print('drop2',col) 79 | X_train.drop(col, axis=1, inplace=True) 80 | 81 | X_train.fillna(-1, inplace=1) 82 | 83 | #============================================================================== 84 | # SPLIT! 
85 | print('split by user') 86 | #============================================================================== 87 | train_user = X_train[['user_id']].drop_duplicates() 88 | 89 | def split_build_valid(): 90 | 91 | train_user['is_valid'] = np.random.choice([0,1], size=len(train_user), 92 | p=[1-valid_size, valid_size]) 93 | valid_n = train_user['is_valid'].sum() 94 | build_n = (train_user.shape[0] - valid_n) 95 | 96 | print('build user:{}, valid user:{}'.format(build_n, valid_n)) 97 | valid_user = train_user[train_user['is_valid']==1].user_id 98 | is_valid = X_train.user_id.isin(valid_user) 99 | 100 | dbuild = xgb.DMatrix(X_train[~is_valid].drop('user_id', axis=1), y_train[~is_valid]) 101 | dvalid = xgb.DMatrix(X_train[is_valid].drop('user_id', axis=1), label=y_train[is_valid]) 102 | watchlist = [(dbuild, 'build'),(dvalid, 'valid')] 103 | 104 | print('FINAL SHAPE') 105 | print('dbuild.shape:{} dvalid.shape:{}\n'.format((dbuild.num_row(), dbuild.num_col()), 106 | (dvalid.num_row(), dvalid.num_col()))) 107 | 108 | return dbuild, dvalid, watchlist 109 | 110 | #============================================================================== 111 | print('hold out') 112 | #============================================================================== 113 | utils.mkdir_p('../output/model/{}/'.format(DATE)) 114 | utils.mkdir_p('../output/imp/{}/'.format(DATE)) 115 | utils.mkdir_p('../output/sub/{}/'.format(DATE)) 116 | 117 | # hold out 118 | models = [] 119 | for i in range(LOOP): 120 | print('LOOP',i) 121 | dbuild, dvalid, watchlist = split_build_valid() 122 | 123 | if i==0: 124 | col_train = dbuild.feature_names 125 | 126 | model = xgb.train(param, dbuild, nround, watchlist, 127 | early_stopping_rounds=ESR, verbose_eval=5) 128 | models.append(model) 129 | model.save_model('../output/model/{}/xgb_item_{}.model'.format(DATE, i)) 130 | # VALID 131 | valid_yhat = model.predict(dvalid) 132 | print('Valid Mean:', np.mean(valid_yhat)) 133 | del dbuild, dvalid, watchlist 134 | gc.collect() 135 | 136 | del train_user, X_train, y_train 137 | gc.collect() 138 | 139 | #============================================================================== 140 | print('test') 141 | #============================================================================== 142 | test = utils.load_pred_item('test').fillna(-1) 143 | 144 | sub_test = test[['order_id', 'product_id']] 145 | 146 | dtest = xgb.DMatrix(test[col_train]) 147 | sub_test['yhat'] = 0 148 | for model in models: 149 | sub_test['yhat'] += model.predict(dtest) 150 | sub_test['yhat'] /= LOOP 151 | print('Test Mean:', sub_test['yhat'].mean()) 152 | 153 | sub_test.to_pickle('../output/sub/{}/sub_test.p'.format(DATE)) 154 | 155 | 156 | #============================================================================== 157 | utils.end(__file__) 158 | 159 | 160 | 161 | -------------------------------------------------------------------------------- /py_model/002_xgb_holdout_item_813_1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue May 30 23:28:19 2017 5 | 6 | @author: konodera 7 | 8 | nohup python -u 002_xgb_holdout_item_813_1.py > LOG/_xgb_item.txt & 9 | 10 | 11 | """ 12 | 13 | import warnings 14 | warnings.filterwarnings("ignore") 15 | import pandas as pd 16 | import numpy as np 17 | import gc 18 | import xgboost as xgb 19 | import utils 20 | 21 | utils.start(__file__) 22 | 23 | 24 | 25 | # setting 26 | DATE = '813_1' 27 | LOOP = 2 28 | ESR = 60 29 | 30 | 
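# DATE tags the output folders (model/imp/sub) for this run,
# LOOP is the number of holdout models trained and averaged,
# and ESR is passed to xgb.train as early_stopping_rounds.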
#seed = np.random.randint(99999) 31 | seed = 72 32 | 33 | np.random.seed(seed) 34 | 35 | valid_size = 0.05 36 | 37 | 38 | # XGB param 39 | nround = 10000 40 | #nround = 10 41 | 42 | param = {'max_depth':10, 43 | 'eta':0.02, 44 | 'colsample_bytree':0.4, 45 | 'subsample':0.75, 46 | 'silent':1, 47 | 'nthread':27, 48 | 'eval_metric':'logloss', 49 | 'objective':'binary:logistic', 50 | 'tree_method':'hist' 51 | } 52 | 53 | print("""#==== print param ======""") 54 | print('DATE:', DATE) 55 | print('seed:', seed) 56 | 57 | #============================================================================== 58 | # prepare 59 | #============================================================================== 60 | train = pd.concat([utils.load_pred_item('trainT-0'), 61 | utils.load_pred_item('trainT-1'), 62 | utils.load_pred_item('trainT-2') 63 | ], ignore_index=True) 64 | 65 | y_train = train['y'] 66 | X_train = train.drop('y', axis=1) 67 | del train 68 | gc.collect() 69 | 70 | # drop id 71 | col = [c for c in X_train.columns if '_id' in c] + ['is_train'] 72 | col.remove('user_id') 73 | print('drop1',col) 74 | X_train.drop(col, axis=1, inplace=True) # keep user_id 75 | 76 | # drop obj 77 | col = X_train.dtypes[X_train.dtypes=='object'].index.tolist() 78 | print('drop2',col) 79 | X_train.drop(col, axis=1, inplace=True) 80 | 81 | X_train.fillna(-1, inplace=1) 82 | 83 | #============================================================================== 84 | # SPLIT! 85 | print('split by user') 86 | #============================================================================== 87 | train_user = X_train[['user_id']].drop_duplicates() 88 | 89 | def split_build_valid(): 90 | 91 | train_user['is_valid'] = np.random.choice([0,1], size=len(train_user), 92 | p=[1-valid_size, valid_size]) 93 | valid_n = train_user['is_valid'].sum() 94 | build_n = (train_user.shape[0] - valid_n) 95 | 96 | print('build user:{}, valid user:{}'.format(build_n, valid_n)) 97 | valid_user = train_user[train_user['is_valid']==1].user_id 98 | is_valid = X_train.user_id.isin(valid_user) 99 | 100 | dbuild = xgb.DMatrix(X_train[~is_valid].drop('user_id', axis=1), y_train[~is_valid]) 101 | dvalid = xgb.DMatrix(X_train[is_valid].drop('user_id', axis=1), label=y_train[is_valid]) 102 | watchlist = [(dbuild, 'build'),(dvalid, 'valid')] 103 | 104 | print('FINAL SHAPE') 105 | print('dbuild.shape:{} dvalid.shape:{}\n'.format((dbuild.num_row(), dbuild.num_col()), 106 | (dvalid.num_row(), dvalid.num_col()))) 107 | 108 | return dbuild, dvalid, watchlist 109 | 110 | #============================================================================== 111 | print('hold out') 112 | #============================================================================== 113 | utils.mkdir_p('../output/model/{}/'.format(DATE)) 114 | utils.mkdir_p('../output/imp/{}/'.format(DATE)) 115 | utils.mkdir_p('../output/sub/{}/'.format(DATE)) 116 | 117 | # hold out 118 | models = [] 119 | for i in range(LOOP): 120 | print('LOOP',i) 121 | dbuild, dvalid, watchlist = split_build_valid() 122 | 123 | if i==0: 124 | col_train = dbuild.feature_names 125 | 126 | model = xgb.train(param, dbuild, nround, watchlist, 127 | early_stopping_rounds=ESR, verbose_eval=5) 128 | models.append(model) 129 | model.save_model('../output/model/{}/xgb_item_{}.model'.format(DATE, i)) 130 | # VALID 131 | valid_yhat = model.predict(dvalid) 132 | print('Valid Mean:', np.mean(valid_yhat)) 133 | del dbuild, dvalid, watchlist 134 | gc.collect() 135 | 136 | del train_user, X_train, y_train 137 | gc.collect() 138 | 139 | 
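# col_train, the feature names captured from the first build DMatrix,
# is reused in the test step so the test DMatrix carries the same
# columns, in the same order, that the models were trained on.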
#============================================================================== 140 | print('test') 141 | #============================================================================== 142 | test = utils.load_pred_item('test').fillna(-1) 143 | 144 | sub_test = test[['order_id', 'product_id']] 145 | 146 | dtest = xgb.DMatrix(test[col_train]) 147 | sub_test['yhat'] = 0 148 | for model in models: 149 | sub_test['yhat'] += model.predict(dtest) 150 | sub_test['yhat'] /= LOOP 151 | print('Test Mean:', sub_test['yhat'].mean()) 152 | 153 | sub_test.to_pickle('../output/sub/{}/sub_test.p'.format(DATE)) 154 | 155 | 156 | #============================================================================== 157 | utils.end(__file__) 158 | 159 | 160 | 161 | -------------------------------------------------------------------------------- /py_model/002_xgb_holdout_item_813_3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue May 30 23:28:19 2017 5 | 6 | @author: konodera 7 | 8 | nohup python -u 002_xgb_holdout_item_813_3.py > LOG/_xgb_item.txt & 9 | 10 | 11 | """ 12 | 13 | import warnings 14 | warnings.filterwarnings("ignore") 15 | import pandas as pd 16 | import numpy as np 17 | import gc 18 | import xgboost as xgb 19 | import utils 20 | 21 | utils.start(__file__) 22 | 23 | 24 | 25 | # setting 26 | DATE = '813_3' 27 | LOOP = 2 28 | ESR = 60 29 | 30 | #seed = np.random.randint(99999) 31 | seed = 73 32 | 33 | np.random.seed(seed) 34 | 35 | valid_size = 0.05 36 | 37 | 38 | # XGB param 39 | nround = 10000 40 | #nround = 10 41 | 42 | param = {'max_depth':10, 43 | 'eta':0.02, 44 | 'colsample_bytree':0.4, 45 | 'subsample':0.75, 46 | 'silent':1, 47 | 'nthread':27, 48 | 'eval_metric':'logloss', 49 | 'objective':'binary:logistic', 50 | 'tree_method':'hist' 51 | } 52 | 53 | print("""#==== print param ======""") 54 | print('DATE:', DATE) 55 | print('seed:', seed) 56 | 57 | #============================================================================== 58 | # prepare 59 | #============================================================================== 60 | train = pd.concat([utils.load_pred_item('trainT-0'), 61 | utils.load_pred_item('trainT-1'), 62 | utils.load_pred_item('trainT-2') 63 | ], ignore_index=True) 64 | 65 | y_train = train['y'] 66 | X_train = train.drop('y', axis=1) 67 | del train 68 | gc.collect() 69 | 70 | # drop id 71 | col = [c for c in X_train.columns if '_id' in c] + ['is_train'] 72 | col.remove('user_id') 73 | print('drop1',col) 74 | X_train.drop(col, axis=1, inplace=True) # keep user_id 75 | 76 | # drop obj 77 | col = X_train.dtypes[X_train.dtypes=='object'].index.tolist() 78 | print('drop2',col) 79 | X_train.drop(col, axis=1, inplace=True) 80 | 81 | X_train.fillna(-1, inplace=1) 82 | 83 | #============================================================================== 84 | # SPLIT! 
85 | print('split by user') 86 | #============================================================================== 87 | train_user = X_train[['user_id']].drop_duplicates() 88 | 89 | def split_build_valid(): 90 | 91 | train_user['is_valid'] = np.random.choice([0,1], size=len(train_user), 92 | p=[1-valid_size, valid_size]) 93 | valid_n = train_user['is_valid'].sum() 94 | build_n = (train_user.shape[0] - valid_n) 95 | 96 | print('build user:{}, valid user:{}'.format(build_n, valid_n)) 97 | valid_user = train_user[train_user['is_valid']==1].user_id 98 | is_valid = X_train.user_id.isin(valid_user) 99 | 100 | dbuild = xgb.DMatrix(X_train[~is_valid].drop('user_id', axis=1), y_train[~is_valid]) 101 | dvalid = xgb.DMatrix(X_train[is_valid].drop('user_id', axis=1), label=y_train[is_valid]) 102 | watchlist = [(dbuild, 'build'),(dvalid, 'valid')] 103 | 104 | print('FINAL SHAPE') 105 | print('dbuild.shape:{} dvalid.shape:{}\n'.format((dbuild.num_row(), dbuild.num_col()), 106 | (dvalid.num_row(), dvalid.num_col()))) 107 | 108 | return dbuild, dvalid, watchlist 109 | 110 | #============================================================================== 111 | print('hold out') 112 | #============================================================================== 113 | utils.mkdir_p('../output/model/{}/'.format(DATE)) 114 | utils.mkdir_p('../output/imp/{}/'.format(DATE)) 115 | utils.mkdir_p('../output/sub/{}/'.format(DATE)) 116 | 117 | # hold out 118 | models = [] 119 | for i in range(LOOP): 120 | print('LOOP',i) 121 | dbuild, dvalid, watchlist = split_build_valid() 122 | 123 | if i==0: 124 | col_train = dbuild.feature_names 125 | 126 | model = xgb.train(param, dbuild, nround, watchlist, 127 | early_stopping_rounds=ESR, verbose_eval=5) 128 | models.append(model) 129 | model.save_model('../output/model/{}/xgb_item_{}.model'.format(DATE, i)) 130 | # VALID 131 | valid_yhat = model.predict(dvalid) 132 | print('Valid Mean:', np.mean(valid_yhat)) 133 | del dbuild, dvalid, watchlist 134 | gc.collect() 135 | 136 | del train_user, X_train, y_train 137 | gc.collect() 138 | 139 | 140 | 141 | #============================================================================== 142 | print('test') 143 | #============================================================================== 144 | test = utils.load_pred_item('test').fillna(-1) 145 | 146 | sub_test = test[['order_id', 'product_id']] 147 | 148 | dtest = xgb.DMatrix(test[col_train]) 149 | sub_test['yhat'] = 0 150 | for model in models: 151 | sub_test['yhat'] += model.predict(dtest) 152 | sub_test['yhat'] /= LOOP 153 | print('Test Mean:', sub_test['yhat'].mean()) 154 | 155 | sub_test.to_pickle('../output/sub/{}/sub_test.p'.format(DATE)) 156 | 157 | #============================================================================== 158 | utils.end(__file__) 159 | 160 | 161 | 162 | -------------------------------------------------------------------------------- /py_model/100_====== None prediction ======: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KazukiOnodera/Instacart/416b6b0220d3aed62c8d323caa3ee46f4b614a72/py_model/100_====== None prediction ====== -------------------------------------------------------------------------------- /py_model/102_xgb_holdout_None_813_3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue May 30 23:28:19 2017 5 | 6 | @author: konodera 7 | 8 | nohup python -u 
102_xgb_holdout_None_813_3.py > LOG/_xgb_None.txt & 9 | 10 | """ 11 | 12 | import warnings 13 | warnings.filterwarnings("ignore") 14 | import pandas as pd 15 | import numpy as np 16 | import gc 17 | import xgboost as xgb 18 | import utils 19 | utils.start(__file__) 20 | 21 | 22 | 23 | # setting 24 | DATE = '813_3' 25 | LOOP = 5 26 | ESR = 40 27 | 28 | #seed = np.random.randint(99999) 29 | seed = 71 30 | 31 | np.random.seed(seed) 32 | 33 | valid_size = 0.05 34 | 35 | 36 | # XGB param 37 | nround = 10000 38 | #nround = 10 39 | 40 | param = {'max_depth':10, 41 | 'eta':0.01, 42 | 'colsample_bytree':0.5, 43 | 'subsample':0.75, 44 | 'silent':1, 45 | 'nthread':27, 46 | 'eval_metric':'logloss', 47 | 'objective':'binary:logistic', 48 | 'tree_method':'hist' 49 | } 50 | 51 | print("""#==== print param ======""") 52 | print('DATE:', DATE) 53 | print('seed:', seed) 54 | 55 | #============================================================================== 56 | # prepare 57 | #============================================================================== 58 | train = pd.concat([utils.load_pred_None('trainT-0', 3), 59 | utils.load_pred_None('trainT-1', 3), 60 | utils.load_pred_None('trainT-2', 3) 61 | ], ignore_index=True) 62 | 63 | sub_train = train[['order_id', 'y']] 64 | y_train = train['y'] 65 | X_train = train.drop('y', axis=1) 66 | del train; gc.collect() 67 | 68 | # drop id 69 | col = [c for c in X_train.columns if '_id' in c] + ['is_train'] 70 | col.remove('user_id') 71 | print('drop1',col) 72 | X_train.drop(col, axis=1, inplace=True) # keep user_id 73 | 74 | # drop obj 75 | col = X_train.dtypes[X_train.dtypes=='object'].index.tolist() 76 | print('drop2',col) 77 | X_train.drop(col, axis=1, inplace=True) 78 | 79 | X_train.fillna(-1, inplace=1) 80 | 81 | #============================================================================== 82 | # SPLIT! 
83 | print('split by user') 84 | #============================================================================== 85 | train_user = X_train[['user_id']].drop_duplicates() 86 | #utils.to_pickles(X_train, 'X_train', 10) 87 | #del X_train; gc.collect() 88 | 89 | 90 | def split_build_valid(): 91 | 92 | train_user['is_valid'] = np.random.choice([0,1], size=len(train_user), 93 | p=[1-valid_size, valid_size]) 94 | valid_n = train_user['is_valid'].sum() 95 | build_n = (train_user.shape[0] - valid_n) 96 | 97 | print('build user:{}, valid user:{}'.format(build_n, valid_n)) 98 | valid_user = train_user[train_user['is_valid']==1].user_id 99 | is_valid = X_train.user_id.isin(valid_user) 100 | 101 | dbuild = xgb.DMatrix(X_train[~is_valid].drop('user_id', axis=1), y_train[~is_valid]) 102 | dvalid = xgb.DMatrix(X_train[is_valid].drop('user_id', axis=1), label=y_train[is_valid]) 103 | watchlist = [(dbuild, 'build'),(dvalid, 'valid')] 104 | 105 | label = dbuild.get_label() 106 | scale_pos_weight = float(np.sum(label == 0)) / np.sum(label==1) 107 | 108 | print('scale_pos_weight', scale_pos_weight) 109 | print('FINAL SHAPE') 110 | print('dbuild.shape:{} dvalid.shape:{}\n'.format((dbuild.num_row(), dbuild.num_col()), 111 | (dvalid.num_row(), dvalid.num_col()))) 112 | 113 | return dbuild, dvalid, watchlist, scale_pos_weight 114 | 115 | dbuild, dvalid, watchlist, weight = split_build_valid() 116 | 117 | col_train = dbuild.feature_names 118 | #============================================================================== 119 | print('hold out') 120 | #============================================================================== 121 | utils.mkdir_p('../output/model/{}/'.format(DATE)) 122 | utils.mkdir_p('../output/imp/{}/'.format(DATE)) 123 | utils.mkdir_p('../output/sub/{}/'.format(DATE)) 124 | 125 | # hold out 126 | models = [] 127 | for i in range(LOOP): 128 | print('LOOP',i) 129 | # param['scale_pos_weight'] = weight 130 | model = xgb.train(param, dbuild, nround, watchlist, 131 | early_stopping_rounds=ESR, verbose_eval=5) 132 | models.append(model) 133 | model.save_model('../output/model/{}/xgb_None_{}.model'.format(DATE, i)) 134 | 135 | # VALID 136 | yhat = model.predict(dvalid) 137 | print('Valid Mean:', np.mean(yhat)) 138 | 139 | if i != (LOOP-1): 140 | del dbuild, dvalid, watchlist 141 | gc.collect() 142 | dbuild, dvalid, watchlist, weight = split_build_valid() 143 | 144 | 145 | del train_user, sub_train, X_train, y_train 146 | del dbuild, dvalid 147 | gc.collect() 148 | 149 | 150 | #============================================================================== 151 | print('test') 152 | #============================================================================== 153 | test = utils.load_pred_None('test', 3).fillna(-1) 154 | sub_test = test[['order_id']] 155 | 156 | dtest = xgb.DMatrix(test[col_train]) 157 | sub_test['yhat'] = 0 158 | for model in models: 159 | sub_test['yhat'] += model.predict(dtest) 160 | sub_test['yhat'] /= LOOP 161 | print('Test Mean:', sub_test['yhat'].mean()) 162 | 163 | sub_test.to_pickle('../output/sub/{}/sub_test_None.p'.format(DATE)) 164 | 165 | 166 | #============================================================================== 167 | utils.end(__file__) 168 | 169 | 170 | 171 | -------------------------------------------------------------------------------- /py_model/102_xgb_holdout_None_814_1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue May 30 23:28:19 2017 
5 | 6 | @author: konodera 7 | 8 | nohup python -u 102_xgb_holdout_None_814_1.py > LOG/_xgb_None.txt & 9 | 10 | """ 11 | 12 | import warnings 13 | warnings.filterwarnings("ignore") 14 | import pandas as pd 15 | import numpy as np 16 | import gc 17 | import xgboost as xgb 18 | import utils 19 | utils.start(__file__) 20 | 21 | 22 | 23 | # setting 24 | DATE = '814_1' 25 | LOOP = 6 26 | ESR = 50 27 | 28 | #seed = np.random.randint(99999) 29 | seed = 72 30 | 31 | np.random.seed(seed) 32 | 33 | valid_size = 0.05 34 | 35 | 36 | # XGB param 37 | nround = 10000 38 | #nround = 10 39 | 40 | param = {'max_depth':10, 41 | 'eta':0.01, 42 | 'colsample_bytree':0.5, 43 | 'subsample':0.75, 44 | 'silent':1, 45 | 'nthread':28, 46 | 'eval_metric':'logloss', 47 | 'objective':'binary:logistic', 48 | 'tree_method':'hist' 49 | } 50 | 51 | print("""#==== print param ======""") 52 | print('DATE:', DATE) 53 | print('seed:', seed) 54 | 55 | #============================================================================== 56 | # prepare 57 | #============================================================================== 58 | train = pd.concat([utils.load_pred_None('trainT-0', 3), 59 | utils.load_pred_None('trainT-1', 3), 60 | utils.load_pred_None('trainT-2', 3) 61 | ], ignore_index=True) 62 | 63 | sub_train = train[['order_id', 'y']] 64 | y_train = train['y'] 65 | X_train = train.drop('y', axis=1) 66 | del train; gc.collect() 67 | 68 | # drop id 69 | col = [c for c in X_train.columns if '_id' in c] + ['is_train'] 70 | col.remove('user_id') 71 | print('drop1',col) 72 | X_train.drop(col, axis=1, inplace=True) # keep user_id 73 | 74 | # drop obj 75 | col = X_train.dtypes[X_train.dtypes=='object'].index.tolist() 76 | print('drop2',col) 77 | X_train.drop(col, axis=1, inplace=True) 78 | 79 | X_train.fillna(-1, inplace=1) 80 | 81 | #============================================================================== 82 | # SPLIT! 
83 | print('split by user') 84 | #============================================================================== 85 | train_user = X_train[['user_id']].drop_duplicates() 86 | #utils.to_pickles(X_train, 'X_train', 10) 87 | #del X_train; gc.collect() 88 | 89 | 90 | def split_build_valid(): 91 | 92 | train_user['is_valid'] = np.random.choice([0,1], size=len(train_user), 93 | p=[1-valid_size, valid_size]) 94 | valid_n = train_user['is_valid'].sum() 95 | build_n = (train_user.shape[0] - valid_n) 96 | 97 | print('build user:{}, valid user:{}'.format(build_n, valid_n)) 98 | valid_user = train_user[train_user['is_valid']==1].user_id 99 | is_valid = X_train.user_id.isin(valid_user) 100 | 101 | dbuild = xgb.DMatrix(X_train[~is_valid].drop('user_id', axis=1), y_train[~is_valid]) 102 | dvalid = xgb.DMatrix(X_train[is_valid].drop('user_id', axis=1), label=y_train[is_valid]) 103 | watchlist = [(dbuild, 'build'),(dvalid, 'valid')] 104 | 105 | label = dbuild.get_label() 106 | scale_pos_weight = float(np.sum(label == 0)) / np.sum(label==1) 107 | 108 | print('scale_pos_weight', scale_pos_weight) 109 | print('FINAL SHAPE') 110 | print('dbuild.shape:{} dvalid.shape:{}\n'.format((dbuild.num_row(), dbuild.num_col()), 111 | (dvalid.num_row(), dvalid.num_col()))) 112 | 113 | return dbuild, dvalid, watchlist, scale_pos_weight 114 | 115 | dbuild, dvalid, watchlist, weight = split_build_valid() 116 | 117 | col_train = dbuild.feature_names 118 | #============================================================================== 119 | print('hold out') 120 | #============================================================================== 121 | utils.mkdir_p('../output/model/{}/'.format(DATE)) 122 | utils.mkdir_p('../output/imp/{}/'.format(DATE)) 123 | utils.mkdir_p('../output/sub/{}/'.format(DATE)) 124 | 125 | # hold out 126 | models = [] 127 | for i in range(LOOP): 128 | print('LOOP',i) 129 | # param['scale_pos_weight'] = weight 130 | model = xgb.train(param, dbuild, nround, watchlist, 131 | early_stopping_rounds=ESR, verbose_eval=5) 132 | models.append(model) 133 | model.save_model('../output/model/{}/xgb_None_{}.model'.format(DATE, i)) 134 | 135 | # VALID 136 | yhat = model.predict(dvalid) 137 | print('Valid Mean:', np.mean(yhat)) 138 | 139 | if i != (LOOP-1): 140 | del dbuild, dvalid, watchlist 141 | gc.collect() 142 | dbuild, dvalid, watchlist, weight = split_build_valid() 143 | 144 | 145 | del train_user, sub_train, X_train, y_train 146 | del dbuild, dvalid 147 | gc.collect() 148 | 149 | 150 | #============================================================================== 151 | print('test') 152 | #============================================================================== 153 | test = utils.load_pred_None('test', 3).fillna(-1) 154 | sub_test = test[['order_id']] 155 | 156 | dtest = xgb.DMatrix(test[col_train]) 157 | sub_test['yhat'] = 0 158 | for model in models: 159 | sub_test['yhat'] += model.predict(dtest) 160 | sub_test['yhat'] /= LOOP 161 | print('Test Mean:', sub_test['yhat'].mean()) 162 | 163 | sub_test.to_pickle('../output/sub/{}/sub_test_None.p'.format(DATE)) 164 | 165 | 166 | #============================================================================== 167 | utils.end(__file__) 168 | 169 | 170 | 171 | -------------------------------------------------------------------------------- /py_model/102_xgb_holdout_None_814_2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue May 30 23:28:19 2017 
5 | 6 | @author: konodera 7 | 8 | nohup python -u 102_xgb_holdout_None_814_2.py > LOG/_xgb_None.txt & 9 | 10 | """ 11 | 12 | import warnings 13 | warnings.filterwarnings("ignore") 14 | import pandas as pd 15 | import numpy as np 16 | import gc 17 | import xgboost as xgb 18 | import utils 19 | utils.start(__file__) 20 | 21 | 22 | 23 | # setting 24 | DATE = '814_2' 25 | LOOP = 3 26 | ESR = 60 27 | 28 | #seed = np.random.randint(99999) 29 | seed = 73 30 | 31 | np.random.seed(seed) 32 | 33 | valid_size = 0.05 34 | 35 | 36 | # XGB param 37 | nround = 20000 38 | #nround = 10 39 | 40 | param = {'max_depth':10, 41 | 'eta':0.002, 42 | 'colsample_bytree':0.5, 43 | 'subsample':0.75, 44 | 'silent':1, 45 | 'nthread':28, 46 | 'eval_metric':'logloss', 47 | 'objective':'binary:logistic', 48 | 'tree_method':'hist' 49 | } 50 | 51 | print("""#==== print param ======""") 52 | print('DATE:', DATE) 53 | print('seed:', seed) 54 | 55 | #============================================================================== 56 | # prepare 57 | #============================================================================== 58 | train = pd.concat([utils.load_pred_None('trainT-0', 3), 59 | utils.load_pred_None('trainT-1', 3), 60 | utils.load_pred_None('trainT-2', 3) 61 | ], ignore_index=True) 62 | 63 | sub_train = train[['order_id', 'y']] 64 | y_train = train['y'] 65 | X_train = train.drop('y', axis=1) 66 | del train; gc.collect() 67 | 68 | # drop id 69 | col = [c for c in X_train.columns if '_id' in c] + ['is_train'] 70 | col.remove('user_id') 71 | print('drop1',col) 72 | X_train.drop(col, axis=1, inplace=True) # keep user_id 73 | 74 | # drop obj 75 | col = X_train.dtypes[X_train.dtypes=='object'].index.tolist() 76 | print('drop2',col) 77 | X_train.drop(col, axis=1, inplace=True) 78 | 79 | X_train.fillna(-1, inplace=1) 80 | 81 | #============================================================================== 82 | # SPLIT! 
83 | print('split by user') 84 | #============================================================================== 85 | train_user = X_train[['user_id']].drop_duplicates() 86 | #utils.to_pickles(X_train, 'X_train', 10) 87 | #del X_train; gc.collect() 88 | 89 | 90 | def split_build_valid(): 91 | 92 | train_user['is_valid'] = np.random.choice([0,1], size=len(train_user), 93 | p=[1-valid_size, valid_size]) 94 | valid_n = train_user['is_valid'].sum() 95 | build_n = (train_user.shape[0] - valid_n) 96 | 97 | print('build user:{}, valid user:{}'.format(build_n, valid_n)) 98 | valid_user = train_user[train_user['is_valid']==1].user_id 99 | is_valid = X_train.user_id.isin(valid_user) 100 | 101 | dbuild = xgb.DMatrix(X_train[~is_valid].drop('user_id', axis=1), y_train[~is_valid]) 102 | dvalid = xgb.DMatrix(X_train[is_valid].drop('user_id', axis=1), label=y_train[is_valid]) 103 | watchlist = [(dbuild, 'build'),(dvalid, 'valid')] 104 | 105 | label = dbuild.get_label() 106 | scale_pos_weight = float(np.sum(label == 0)) / np.sum(label==1) 107 | 108 | print('scale_pos_weight', scale_pos_weight) 109 | print('FINAL SHAPE') 110 | print('dbuild.shape:{} dvalid.shape:{}\n'.format((dbuild.num_row(), dbuild.num_col()), 111 | (dvalid.num_row(), dvalid.num_col()))) 112 | 113 | return dbuild, dvalid, watchlist, scale_pos_weight 114 | 115 | dbuild, dvalid, watchlist, weight = split_build_valid() 116 | 117 | col_train = dbuild.feature_names 118 | #============================================================================== 119 | print('hold out') 120 | #============================================================================== 121 | utils.mkdir_p('../output/model/{}/'.format(DATE)) 122 | utils.mkdir_p('../output/imp/{}/'.format(DATE)) 123 | utils.mkdir_p('../output/sub/{}/'.format(DATE)) 124 | 125 | # hold out 126 | models = [] 127 | for i in range(LOOP): 128 | print('LOOP',i) 129 | # param['scale_pos_weight'] = weight 130 | model = xgb.train(param, dbuild, nround, watchlist, 131 | early_stopping_rounds=ESR, verbose_eval=5) 132 | models.append(model) 133 | model.save_model('../output/model/{}/xgb_None_{}.model'.format(DATE, i)) 134 | 135 | # VALID 136 | yhat = model.predict(dvalid) 137 | print('Valid Mean:', np.mean(yhat)) 138 | 139 | if i != (LOOP-1): 140 | del dbuild, dvalid, watchlist 141 | gc.collect() 142 | dbuild, dvalid, watchlist, weight = split_build_valid() 143 | 144 | 145 | del train_user, sub_train, X_train, y_train 146 | del dbuild, dvalid 147 | gc.collect() 148 | 149 | #============================================================================== 150 | print('test') 151 | #============================================================================== 152 | test = utils.load_pred_None('test', 3).fillna(-1) 153 | sub_test = test[['order_id']] 154 | 155 | dtest = xgb.DMatrix(test[col_train]) 156 | sub_test['yhat'] = 0 157 | for model in models: 158 | sub_test['yhat'] += model.predict(dtest) 159 | sub_test['yhat'] /= LOOP 160 | print('Test Mean:', sub_test['yhat'].mean()) 161 | 162 | sub_test.to_pickle('../output/sub/{}/sub_test_None.p'.format(DATE)) 163 | 164 | #============================================================================== 165 | utils.end(__file__) 166 | 167 | 168 | 169 | -------------------------------------------------------------------------------- /py_model/102_xgb_holdout_None_814_3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue May 30 23:28:19 2017 5 | 6 | 
@author: konodera 7 | 8 | nohup python -u 102_xgb_holdout_None_814_3.py > LOG/_xgb_None.txt & 9 | 10 | """ 11 | 12 | import warnings 13 | warnings.filterwarnings("ignore") 14 | import pandas as pd 15 | import numpy as np 16 | import gc 17 | import xgboost as xgb 18 | import utils 19 | utils.start(__file__) 20 | 21 | 22 | 23 | # setting 24 | DATE = '814_3' 25 | LOOP = 3 26 | ESR = 60 27 | 28 | #seed = np.random.randint(99999) 29 | seed = 74 30 | 31 | np.random.seed(seed) 32 | 33 | valid_size = 0.05 34 | 35 | 36 | # XGB param 37 | nround = 20000 38 | #nround = 10 39 | 40 | param = {'max_depth':10, 41 | 'eta':0.002, 42 | 'colsample_bytree':0.5, 43 | 'subsample':0.75, 44 | 'silent':1, 45 | 'nthread':28, 46 | 'eval_metric':'logloss', 47 | 'objective':'binary:logistic', 48 | 'tree_method':'hist' 49 | } 50 | 51 | print("""#==== print param ======""") 52 | print('DATE:', DATE) 53 | print('seed:', seed) 54 | 55 | #============================================================================== 56 | # prepare 57 | #============================================================================== 58 | train = pd.concat([utils.load_pred_None('trainT-0', 3), 59 | utils.load_pred_None('trainT-1', 3), 60 | utils.load_pred_None('trainT-2', 3) 61 | ], ignore_index=True) 62 | 63 | sub_train = train[['order_id', 'y']] 64 | y_train = train['y'] 65 | X_train = train.drop('y', axis=1) 66 | del train; gc.collect() 67 | 68 | # drop id 69 | col = [c for c in X_train.columns if '_id' in c] + ['is_train'] 70 | col.remove('user_id') 71 | print('drop1',col) 72 | X_train.drop(col, axis=1, inplace=True) # keep user_id 73 | 74 | # drop obj 75 | col = X_train.dtypes[X_train.dtypes=='object'].index.tolist() 76 | print('drop2',col) 77 | X_train.drop(col, axis=1, inplace=True) 78 | 79 | X_train.fillna(-1, inplace=1) 80 | 81 | #============================================================================== 82 | # SPLIT! 
83 | print('split by user') 84 | #============================================================================== 85 | train_user = X_train[['user_id']].drop_duplicates() 86 | #utils.to_pickles(X_train, 'X_train', 10) 87 | #del X_train; gc.collect() 88 | 89 | 90 | def split_build_valid(): 91 | 92 | train_user['is_valid'] = np.random.choice([0,1], size=len(train_user), 93 | p=[1-valid_size, valid_size]) 94 | valid_n = train_user['is_valid'].sum() 95 | build_n = (train_user.shape[0] - valid_n) 96 | 97 | print('build user:{}, valid user:{}'.format(build_n, valid_n)) 98 | valid_user = train_user[train_user['is_valid']==1].user_id 99 | is_valid = X_train.user_id.isin(valid_user) 100 | 101 | dbuild = xgb.DMatrix(X_train[~is_valid].drop('user_id', axis=1), y_train[~is_valid]) 102 | dvalid = xgb.DMatrix(X_train[is_valid].drop('user_id', axis=1), label=y_train[is_valid]) 103 | watchlist = [(dbuild, 'build'),(dvalid, 'valid')] 104 | 105 | label = dbuild.get_label() 106 | scale_pos_weight = float(np.sum(label == 0)) / np.sum(label==1) 107 | 108 | print('scale_pos_weight', scale_pos_weight) 109 | print('FINAL SHAPE') 110 | print('dbuild.shape:{} dvalid.shape:{}\n'.format((dbuild.num_row(), dbuild.num_col()), 111 | (dvalid.num_row(), dvalid.num_col()))) 112 | 113 | return dbuild, dvalid, watchlist, scale_pos_weight 114 | 115 | dbuild, dvalid, watchlist, weight = split_build_valid() 116 | 117 | col_train = dbuild.feature_names 118 | #============================================================================== 119 | print('hold out') 120 | #============================================================================== 121 | utils.mkdir_p('../output/model/{}/'.format(DATE)) 122 | utils.mkdir_p('../output/imp/{}/'.format(DATE)) 123 | utils.mkdir_p('../output/sub/{}/'.format(DATE)) 124 | 125 | # hold out 126 | models = [] 127 | for i in range(LOOP): 128 | print('LOOP',i) 129 | # param['scale_pos_weight'] = weight 130 | model = xgb.train(param, dbuild, nround, watchlist, 131 | early_stopping_rounds=ESR, verbose_eval=5) 132 | models.append(model) 133 | model.save_model('../output/model/{}/xgb_None_{}.model'.format(DATE, i)) 134 | 135 | # VALID 136 | yhat = model.predict(dvalid) 137 | print('Valid Mean:', np.mean(yhat)) 138 | 139 | if i != (LOOP-1): 140 | del dbuild, dvalid, watchlist 141 | gc.collect() 142 | dbuild, dvalid, watchlist, weight = split_build_valid() 143 | 144 | 145 | del train_user, sub_train, X_train, y_train 146 | del dbuild, dvalid 147 | gc.collect() 148 | 149 | #============================================================================== 150 | print('test') 151 | #============================================================================== 152 | test = utils.load_pred_None('test', 3).fillna(-1) 153 | sub_test = test[['order_id']] 154 | 155 | dtest = xgb.DMatrix(test[col_train]) 156 | sub_test['yhat'] = 0 157 | for model in models: 158 | sub_test['yhat'] += model.predict(dtest) 159 | sub_test['yhat'] /= LOOP 160 | print('Test Mean:', sub_test['yhat'].mean()) 161 | 162 | sub_test.to_pickle('../output/sub/{}/sub_test_None.p'.format(DATE)) 163 | 164 | #============================================================================== 165 | utils.end(__file__) 166 | 167 | 168 | 169 | -------------------------------------------------------------------------------- /py_model/200_===== threshold estimation =====: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/KazukiOnodera/Instacart/416b6b0220d3aed62c8d323caa3ee46f4b614a72/py_model/200_===== threshold estimation ===== -------------------------------------------------------------------------------- /py_model/201_Faron_opt_bagging_815_3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sat Jul 29 18:59:46 2017 5 | 6 | @author: konodera 7 | 8 | nohup python -u 201_Faron_opt_bagging_815_3.py > LOG/_Faron-opt.txt & 9 | 10 | """ 11 | 12 | import pandas as pd 13 | from opt_fscore import get_best_prediction 14 | import multiprocessing as mp 15 | import time 16 | import utils 17 | utils.start(__file__) 18 | 19 | 20 | # setting 21 | DATE_item = ['812_1', '813_1', '813_3'] 22 | 23 | DATE_None = ['813_3', '814_1', '814_2', '814_3'] 24 | 25 | total_proc = 60 26 | 27 | OUTF = "../output/sub/final/Faron-opt_bagging-v3.csv.gz" 28 | 29 | print("""#==== print param ======""") 30 | print('OUTF:', OUTF) 31 | print('DATE_item:', DATE_item) 32 | print('DATE_None:', DATE_None) 33 | print('total_proc:', total_proc) 34 | 35 | utils.mkdir_p('../output/sub/final') 36 | #============================================================================== 37 | # load 38 | #============================================================================== 39 | sub_item = pd.concat([pd.read_pickle('../output/sub/{}/sub_test.p'.format(d)) for d in DATE_item]) 40 | sub_item = sub_item.groupby(['order_id','product_id']).yhat.mean().reset_index() 41 | sub = sub_item.groupby('order_id').product_id.apply(list).to_frame() 42 | sub['yhat'] = sub_item.groupby('order_id').yhat.apply(list) 43 | 44 | # weighted 45 | for i,(w,d) in enumerate(zip([0.1, 0.1, 0.4, 0.4], DATE_None)): 46 | tmp = pd.read_pickle('../output/sub/{}/sub_test_None.p'.format(d)).rename(columns={'yhat':'yhat_None'}) 47 | tmp.yhat_None *= w 48 | if i==0: 49 | sub_None = tmp 50 | else: 51 | sub_None = pd.concat([sub_None, tmp]) 52 | 53 | sub_None = sub_None.groupby('order_id').yhat_None.sum().reset_index() 54 | 55 | sub = pd.merge(sub.reset_index(), sub_None, on='order_id', how='left') 56 | 57 | #============================================================================== 58 | # optimize 59 | #============================================================================== 60 | def multi(i): 61 | if i%1000==0: 62 | print('{:.3f} min'.format((time.time()-st_time)/60)) 63 | items = sub.loc[i,'product_id'] 64 | preds = sub.loc[i,'yhat'] 65 | pNone = sub.loc[i,'yhat_None'] 66 | ret = get_best_prediction(items, preds, pNone) 67 | return ret 68 | 69 | # start!!! 
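# mp.Pool.map returns results in input order, so callback[i] lines up
# with row i of `sub` (the merge above left a fresh RangeIndex), and the
# optimized product strings can be assigned back as a column directly.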
70 | st_time = time.time() 71 | pool = mp.Pool(total_proc) 72 | callback = pool.map(multi, range(sub.shape[0])) 73 | 74 | sub['products'] = callback 75 | 76 | print('writing...') 77 | sub[['order_id', 'products']].to_csv(OUTF, index=0, compression='gzip') 78 | 79 | #============================================================================== 80 | utils.end(__file__) 81 | 82 | -------------------------------------------------------------------------------- /py_model/999_run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon Aug 21 13:13:57 2017 5 | 6 | @author: konodera 7 | """ 8 | 9 | import os 10 | import utils 11 | utils.start(__file__) 12 | 13 | 14 | os.system('python -u 002_xgb_holdout_item_812_1.py') 15 | os.system('python -u 002_xgb_holdout_item_813_1.py') 16 | os.system('python -u 002_xgb_holdout_item_813_3.py') 17 | 18 | os.system('python -u 102_xgb_holdout_None_813_3.py') 19 | os.system('python -u 102_xgb_holdout_None_814_1.py') 20 | os.system('python -u 102_xgb_holdout_None_814_2.py') 21 | os.system('python -u 102_xgb_holdout_None_814_3.py') 22 | 23 | os.system('python -u 201_Faron_opt_bagging_815_3.py') 24 | 25 | utils.end(__file__) 26 | 27 | -------------------------------------------------------------------------------- /py_model/opt_fscore.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Faron 4 | """ 5 | import numpy as np 6 | from operator import itemgetter 7 | 8 | ''' 9 | This kernel implements the O(n²) F1-Score expectation maximization algorithm presented in 10 | "Ye, N., Chai, K., Lee, W., and Chieu, H. Optimizing F-measures: A Tale of Two Approaches. In ICML, 2012." 11 | 12 | It solves argmax_(0 <= k <= n,[[None]]) E[F1(P,k,[[None]])] 13 | with [[None]] being the indicator for predicting label "None" 14 | given posteriors P = [p_1, p_2, ... , p_n], where p_1 > p_2 > ... > p_n 15 | under label independence assumption by means of dynamic programming in O(n²). 16 | ''' 17 | 18 | 19 | class F1Optimizer(): 20 | def __init__(self): 21 | pass 22 | 23 | @staticmethod 24 | def get_expectations(P, pNone=None): 25 | expectations = [] 26 | P = np.sort(P)[::-1] 27 | 28 | n = np.array(P).shape[0] 29 | DP_C = np.zeros((n + 2, n + 1)) 30 | if pNone is None: 31 | pNone = (1.0 - P).prod() 32 | 33 | DP_C[0][0] = 1.0 34 | for j in range(1, n): 35 | DP_C[0][j] = (1.0 - P[j - 1]) * DP_C[0, j - 1] 36 | 37 | for i in range(1, n + 1): 38 | DP_C[i, i] = DP_C[i - 1, i - 1] * P[i - 1] 39 | for j in range(i + 1, n + 1): 40 | DP_C[i, j] = P[j - 1] * DP_C[i - 1, j - 1] + (1.0 - P[j - 1]) * DP_C[i, j - 1] 41 | 42 | DP_S = np.zeros((2 * n + 1,)) 43 | DP_SNone = np.zeros((2 * n + 1,)) 44 | for i in range(1, 2 * n + 1): 45 | DP_S[i] = 1. / (1. * i) 46 | DP_SNone[i] = 1. / (1. 
* i + 1) 47 | for k in range(n + 1)[::-1]: 48 | f1 = 0 49 | f1None = 0 50 | for k1 in range(n + 1): 51 | f1 += 2 * k1 * DP_C[k1][k] * DP_S[k + k1] 52 | f1None += 2 * k1 * DP_C[k1][k] * DP_SNone[k + k1] 53 | for i in range(1, 2 * k - 1): 54 | DP_S[i] = (1 - P[k - 1]) * DP_S[i] + P[k - 1] * DP_S[i + 1] 55 | DP_SNone[i] = (1 - P[k - 1]) * DP_SNone[i] + P[k - 1] * DP_SNone[i + 1] 56 | expectations.append([f1None + 2 * pNone / (2 + k), f1]) 57 | 58 | return np.array(expectations[::-1]).T 59 | 60 | @staticmethod 61 | def maximize_expectation(P, pNone=None): 62 | expectations = F1Optimizer.get_expectations(P, pNone) 63 | 64 | ix_max = np.unravel_index(expectations.argmax(), expectations.shape) 65 | max_f1 = expectations[ix_max] 66 | 67 | predNone = True if ix_max[0] == 0 else False 68 | best_k = ix_max[1] 69 | 70 | return best_k, predNone, max_f1 71 | 72 | @staticmethod 73 | def _F1(tp, fp, fn): 74 | return 2 * tp / (2 * tp + fp + fn) 75 | 76 | @staticmethod 77 | def _Fbeta(tp, fp, fn, beta=1.0): 78 | beta_squared = beta ** 2 79 | return (1.0 + beta_squared) * tp / ((1.0 + beta_squared) * tp + fp + beta_squared * fn) 80 | 81 | 82 | def get_best_prediction(items, preds, pNone=None): 83 | # print("Maximize F1-Expectation") 84 | # print("=" * 23) 85 | items_preds = sorted(list(zip(items, preds)), key=itemgetter(1), reverse=True) 86 | P = [p for i,p in items_preds] 87 | L = [i for i,p in items_preds] 88 | 89 | opt = F1Optimizer.maximize_expectation(P, pNone) 90 | best_prediction = ['None'] if opt[1] else [] 91 | best_prediction += (L[:opt[0]]) 92 | # f1_max = opt[2] 93 | 94 | # print("Prediction {} yields best E[F1] of {}\n".format(best_prediction, f1_max)) 95 | return ' '.join(list(map(str,best_prediction))) 96 | 97 | if __name__ == '__main__': 98 | get_best_prediction(['a', 'b'], [0.9, 0.3], 0.5) 99 | 100 | 101 | 102 | -------------------------------------------------------------------------------- /py_model/pyx_get_best_items.pyx: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Fri Jun 30 15:09:33 2017 3 | 4 | @author: konodera 5 | """ 6 | from operator import itemgetter 7 | import numpy as np 8 | 9 | LOOP = 9999 10 | np.random.seed(71) 11 | 12 | cdef int __tp__(y_true, y_pred): 13 | return len(y_true & y_pred) 14 | 15 | cdef int __tpfp__(y_pred): 16 | return len(y_pred) 17 | 18 | cdef int __tpfn__(y_true): 19 | return len(y_true) 20 | 21 | cdef double multilabel_fscore(y_true, y_pred): 22 | cdef double precision, recall 23 | cdef double tp, tpfp, tpfn 24 | 25 | tp = __tp__(y_true, y_pred) 26 | tpfp = __tpfp__(y_pred) 27 | tpfn = __tpfn__(y_true) 28 | 29 | precision = tp/tpfp 30 | recall = tp/tpfn 31 | 32 | if precision + recall == 0: 33 | return 0 34 | return (2 * precision * recall) / (precision + recall) 35 | 36 | cdef get_y_true(items): 37 | """ 38 | items: dict 39 | {A:0.9, B:0.3} 40 | """ 41 | cdef list y_true = [] 42 | for k in items.keys(): 43 | if items[k]>np.random.uniform(): 44 | y_true.append(k) 45 | if len(y_true)==0 or 'None' in y_true: 46 | y_true = ['None'] 47 | return y_true 48 | 49 | def get_best_items(items, preds): 50 | """ 51 | items: list 52 | [1, 2, 3...] 53 | 54 | preds: list 55 | [0.3, 0.9, 0.2...] 
56 | 57 | items = [1, 2, 3, 4, 5, 6, 7] 58 | preds = [0.2, 0.19, 0.18, 0.17, 0.16, 0.15, 0.14] 59 | 60 | """ 61 | items_true = dict(zip(items, preds)) 62 | cdef list items_pred = sorted(list(zip(items, preds)), key=itemgetter(1), reverse=True) 63 | items_pred = [k for k,v in items_pred] 64 | cdef list y_trues = [set(get_y_true(items_true)) for i in range(LOOP)] 65 | cdef list best_items 66 | 67 | cdef double best_score = 0 68 | for i in range(1,len(items_pred)+1): 69 | score = np.mean([multilabel_fscore(y_trues[j], set(items_pred[:i])) for j in range(LOOP)]) 70 | if best_score < score: 71 | best_score = score 72 | elif best_score > score: 73 | best_items = items_pred[:i-1] 74 | break 75 | if i==len(items_pred): 76 | # last 77 | best_items = items_pred[:] 78 | break 79 | 80 | if 'None' in best_items: 81 | return ' '.join(map(str, best_items)) 82 | 83 | # search None 84 | best_items = best_items[::-1] # low is head 85 | for i in range(len(best_items)+1): 86 | score = np.mean([multilabel_fscore(y_trues[j], set(best_items[i:]+['None'])) for j in range(LOOP)]) 87 | if best_score < score: 88 | best_score = score 89 | elif best_score > score and i==0: 90 | break 91 | elif best_score > score: 92 | best_items = best_items[i-1:]+['None'] 93 | break 94 | elif i==len(best_items): 95 | # last 96 | best_items = ['None'] 97 | break 98 | 99 | return ' '.join(map(str, best_items)) 100 | 101 | def get_best_items2(items, preds): 102 | """ 103 | items: list 104 | [1, 2, 3...] 105 | 106 | preds: list 107 | [0.3, 0.9, 0.2...] 108 | 109 | ex: 110 | items = [1, 2, 3, 4, 5, 6, 7] 111 | preds = [0.2, 0.19, 0.18, 0.17, 0.16, 0.15, 0.14] 112 | 113 | """ 114 | items_true = dict(zip(items, preds)) 115 | cdef list items_pred = sorted(list(zip(items, preds)), key=itemgetter(1), reverse=True) 116 | items_pred = [k for k,v in items_pred] 117 | cdef list y_trues = [set(get_y_true(items_true)) for i in range(LOOP)] 118 | cdef list best_items 119 | 120 | cdef double best_score = 0 121 | for i in range(1,len(items_pred)+1): 122 | score = np.mean([multilabel_fscore(y_trues[j], set(items_pred[:i])) for j in range(LOOP)]) 123 | if best_score < score: 124 | best_score = score 125 | elif best_score > score: 126 | best_items = items_pred[:i-1] 127 | break 128 | if i==len(items_pred): 129 | # last 130 | best_items = items_pred[:] 131 | break 132 | 133 | return ' '.join(map(str, best_items)) 134 | 135 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.12.1 2 | pandas==0.19.2 3 | scipy==0.19.0 4 | tqdm==4.11.2 5 | xgboost==0.6 6 | --------------------------------------------------------------------------------
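For reference, the threshold-estimation step reduces to one call per order into opt_fscore.get_best_prediction. The sketch below shows that call pattern in isolation; the product ids and probabilities are made-up toy values, not output of the models above.

# Toy demonstration of opt_fscore.get_best_prediction (run from py_model/).
# The candidate items and probabilities here are invented for illustration.
from opt_fscore import get_best_prediction

items = [24852, 13176, 21137]   # candidate product_ids for one order
preds = [0.82, 0.35, 0.04]      # per-item reorder probabilities
pNone = 0.10                    # probability the order reorders nothing

# Returns a space-joined string of the product ids (highest probability
# first) that maximize expected F1, with 'None' prepended when predicting
# an empty basket is optimal.
print(get_best_prediction(items, preds, pNone))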