├── .gitignore
├── Instacart 2nd Place Solution.pptx
├── LICENSE
├── README.md
├── appendix
│   ├── 317_.py
│   ├── 501_concat.py
│   ├── 901_reorder_base.py
│   ├── 902_reorder.py
│   ├── 903_Faron_opt_bagging.py
│   ├── README.md
│   └── utils.py
├── input
│   └── README.md
├── output
│   └── sub
│       └── final
│           └── Faron-opt_bagging-v3.csv.gz
├── py_feature
│   ├── 000_mk.py
│   ├── 003_X_base_T.py
│   ├── 004_label.py
│   ├── 005_inarow.py
│   ├── 006_days_since_last_order.py
│   ├── 007_timezone.py
│   ├── 008_product_feature.py
│   ├── 009_None.py
│   ├── 010_streak.py
│   ├── 011_replacement.py
│   ├── 012_aisle_dep_cumsum.py
│   ├── 100_======user_feature======
│   ├── 101_repeat_previous_ratio_T.py
│   ├── 102_orderspan_average.py
│   ├── 103_visit_time.py
│   ├── 104_organic.py
│   ├── 105_delta_time.py
│   ├── 108_order_size.py
│   ├── 109_have_you_bought.py
│   ├── 110_None.py
│   ├── 200_======item_feature======
│   ├── 202_buy_time.py
│   ├── 203_cycle.py
│   ├── 205_co-occur.py
│   ├── 207_mean_pos_cart.py
│   ├── 208_one-shot.py
│   ├── 209_together.py
│   ├── 210_streak.py
│   ├── 211_1to1.py
│   ├── 212_withinN.py
│   ├── 213_dow_diff.py
│   ├── 214_first_order.py
│   ├── 215_onb_diff.py
│   ├── 300_======user x item======
│   ├── 301_total_buy.py
│   ├── 302-1_reorderd_all.py
│   ├── 303_last_order_date.py
│   ├── 304_buy_item_inarow.py
│   ├── 305_last_order_num.py
│   ├── 306_mean_pos_cart.py
│   ├── 307_timezone_dow.py
│   ├── 308_timezone_dow.py
│   ├── 309_order_ratio_by-chance.py
│   ├── 310_repeat_within_today.py
│   ├── 312_cycle.py
│   ├── 313_aisle_dep.py
│   ├── 314_co-occur.py
│   ├── 315_streak.py
│   ├── 316_replacement.py
│   ├── 400_===== daytime =====
│   ├── 401_how_many_come.py
│   ├── 500_===== concat =====
│   ├── 501_concat.py
│   ├── 502_concat.py
│   ├── 900_===== run =====
│   ├── 901_run_feature.py
│   ├── 902_run_concat.py
│   ├── run.py
│   └── utils.py
├── py_model
│   ├── 000_====== user x item prediction ======
│   ├── 002_xgb_holdout_item_812_1.py
│   ├── 002_xgb_holdout_item_813_1.py
│   ├── 002_xgb_holdout_item_813_3.py
│   ├── 100_====== None prediction ======
│   ├── 102_xgb_holdout_None_813_3.py
│   ├── 102_xgb_holdout_None_814_1.py
│   ├── 102_xgb_holdout_None_814_2.py
│   ├── 102_xgb_holdout_None_814_3.py
│   ├── 200_===== threshold estimation =====
│   ├── 201_Faron_opt_bagging_815_3.py
│   ├── 999_run.py
│   ├── LOG
│   │   ├── 812_1_xgb_item.txt
│   │   ├── 813_1_xgb_item.txt
│   │   ├── 813_3_xgb_None.txt
│   │   ├── 813_3_xgb_item.txt
│   │   ├── 814_1_xgb_None.txt
│   │   ├── 814_2_xgb_None.txt
│   │   └── 814_3_xgb_None.txt
│   ├── opt_fscore.py
│   ├── pyx_get_best_items.pyx
│   └── utils.py
└── requirements.txt

/.gitignore:
--------------------------------------------------------------------------------
1 | *.DS_Store
2 | input/
3 | output/
4 | data/
5 | py/.ipynb_checkpoints
6 | py/*.model
7 | py/*.p
8 |
--------------------------------------------------------------------------------
/Instacart 2nd Place Solution.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KazukiOnodera/Instacart/416b6b0220d3aed62c8d323caa3ee46f4b614a72/Instacart 2nd Place Solution.pptx
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 KazukiOnodera
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Instacart Market Basket Analysis 2nd place solution
2 |
3 | I made two models: one for predicting reorder and one for predicting None.
4 | The features I made are listed below.
5 |
6 | ## Features
7 | ### User feature
8 | * How often the user reordered items
9 | * Time between orders
10 | * Time of day the user visits
11 | * Whether the user ordered organic, gluten-free, or Asian items in the past
12 | * Features based on order sizes
13 | * How many of the user’s orders contained no previously purchased items
14 |
15 | ### Item feature
16 | * How often the item is purchased
17 | * Position in the cart
18 | * How many users buy it as a "one-shot" item
19 | * Stats on the number of items that co-occur with this item
20 | * Stats on the order streak
21 | * Probability of being reordered within N orders
22 | * Distribution of the day of week it is ordered
23 | * Probability it is reordered after the first order
24 | * Statistics around the time between orders
25 |
26 | ### User x Item feature
27 | * Number of orders in which the user purchases the item
28 | * Days since the user last purchased the item
29 | * Streak (number of orders in a row the user has purchased the item)
30 | * Position in the cart
31 | * Whether the user already ordered the item today
32 | * Co-occurrence statistics
33 | * Replacement items
34 |
35 | ### Datetime feature
36 | * Counts by day of week
37 | * Counts by hour
38 |
39 | For more detail, please refer to the code.
40 |
41 | ## F1 maximization
42 | Regarding F1 maximization, I hadn't read the paper until Faron published his kernel, but I still got a high score because of my own F1 maximization.
43 | Let me explain it.
44 | To maximize F1, I generate y_true samples according to the predicted probabilities, and then check F1 starting from the highest-probability items.
45 | For example, let's say an order has items with predicted probabilities like {A: 0.3, B: 0.5, C: 0.4}. Then we generate y_true many times (9999 times in my case).
46 | So now we have many samples of y_true, like [[A,B], [B], [B,C], [C], [B], [None], ...].
47 | As mentioned above, the next thing we do is check the expected F1 of [B], then [B,C], then [B,C,A]. Once the F1 peaks out, we can stop the calculation and go to the next order.
48 | Note that with this method we don't need to check every pattern, like [A], [A,B], [A,B,C], [B], ...
49 | I guess some people might have figured out this method from my comment about "tips to go farther".
50 | However, this method is time-consuming and depends on the random seed, so in the end I used Faron's kernel.
51 | Fortunately or not, I got almost the same result using Faron's kernel.
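Here is a minimal sketch of the sampling idea for a single order. It is illustrative only: `expected_f1` and `best_items` are hypothetical names, not the competition code (the real, optimized implementation is the Cython file referenced below).

```python
import numpy as np

def expected_f1(pred, samples):
    """Mean F1 of one candidate prediction over sampled ground truths."""
    pred = set(pred)
    total = 0.0
    for truth in samples:
        tp = len(pred & truth)
        if tp:
            total += 2.0 * tp / (len(pred) + len(truth))  # F1 = 2TP / (|P| + |T|)
    return total / len(samples)

def best_items(probs, n_samples=9999, seed=0):
    """probs: predicted reorder probabilities, e.g. {'A': 0.3, 'B': 0.5, 'C': 0.4}."""
    rng = np.random.RandomState(seed)
    items = sorted(probs, key=probs.get, reverse=True)   # ['B', 'C', 'A']
    p = np.array([probs[i] for i in items])
    draws = rng.rand(n_samples, len(items)) < p          # generate y_true samples
    samples = [set(i for i, hit in zip(items, row) if hit) or {'None'}
               for row in draws]
    best, best_f1 = ['None'], expected_f1(['None'], samples)
    for k in range(1, len(items) + 1):                   # check [B], [B,C], [B,C,A]
        f1 = expected_f1(items[:k], samples)
        if f1 < best_f1:
            break                                        # F1 peaked out: stop here
        best, best_f1 = items[:k], f1
    return best, best_f1
```

Because the ground truths are sampled, the output depends on the seed, which is exactly the drawback mentioned above.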
52 | Please refer to py_model/pyx_get_best_items.pyx
53 |
54 | ## How to run
55 | * cd py_feature
56 | * python 901_run_feature.py
57 | * python 902_run_concat.py
58 | * cd ../py_model
59 | * python 999_run.py
60 |
61 | ## Requirements
62 | Around 300 GB of RAM is needed (sorry).
63 | But I confirmed we can get 0.4073 on the private LB with only around 60 GB of RAM.
64 | Also, if you don't have enough memory but still want a high score, try continued training using
65 | the xgb_model argument of xgb.train.
66 |
67 | Python packages:
68 | - numpy==1.12.1
69 | - pandas==0.19.2
70 | - scipy==0.19.0
71 | - tqdm==4.11.2
72 | - xgboost==0.6
73 |
--------------------------------------------------------------------------------
/appendix/317_.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Sun Jun 18 12:55:38 2017
5 |
6 | @author: konodera
7 |
8 | https://twitter.com/jeremystan/status/911357665481080832
9 |
10 | 6/ most novel feature:
11 | binary user by product purchase sequence ->
12 | decimal -> XGBoost learns non-trivial sequence patterns
13 |
14 | """
15 |
16 | import pandas as pd
17 | import numpy as np
18 | from tqdm import tqdm
19 | from decimal import Decimal
20 | import utils
21 | #utils.start(__file__)
22 |
23 |
24 | #==============================================================================
25 | # load
26 | #==============================================================================
27 | col = ['order_id', 'user_id', 'product_id', 'order_number', 'order_number_rev']
28 | log = utils.read_pickles('../input/mk/log', col).sort_values(['user_id', 'product_id', 'order_number'])
29 |
30 |
31 | #==============================================================================
32 | # def
33 | #==============================================================================
34 | def conv_bi2dec(seq, onb_max, reverse=True, deci=10):
35 |     """
36 |     ex.
37 |     seq = [1,3,4]
38 |     onb_max = 6
39 |     101100 -> 44
40 |     001101 -> 13
41 |     """
42 |
43 |     bi = [0]*onb_max
44 |     for i in seq:
45 |         bi[i-1] = 1
46 |
47 |     if reverse:
48 |         bi = ''.join(map(str, bi))[::-1]
49 |     else:
50 |         bi = ''.join(map(str, bi))
51 |
52 |     if deci==10:
53 |         return int(bi, 2)
54 |     elif deci==2:
55 |         return int(bi)
56 |     elif deci==.2:
57 |         return float(bi[0] + '.' + bi[1:])
58 |     else:
59 |         raise ValueError(deci)
60 |
61 | def make(T):
62 |     """
63 |     T = 0
64 |     folder = 'trainT-0'
65 |     """
66 |     if T==-1:
67 |         folder = 'test'
68 |     else:
69 |         folder = 'trainT-'+str(T)
70 |
71 |     log_ = log[log.order_number_rev>T]
72 |     log_['onb_max'] = log_.groupby('user_id').order_number.transform(np.max)
73 |
74 |     r1_d10 = []
75 |     r1_d2 = []
76 |     r1_df2 = []
77 |     r0_d10 = []
78 |     r0_d2 = []
79 |     r0_df2 = []
80 |
81 |     seq = []
82 |     uid_bk = pid_bk = onb_max_bk = None
83 |     for uid,pid,onb,onb_max in tqdm(log_[['user_id', 'product_id', 'order_number', 'onb_max']].values):
84 |
85 |         if uid_bk is None:
86 |             pass
87 |
88 |         elif uid==uid_bk and pid==pid_bk:
89 |             pass
90 |
91 |         elif uid!=uid_bk or pid!=pid_bk:
92 |             r1_d10.append(conv_bi2dec(seq, onb_max_bk, True, 10))  # r1: reversed bits
93 |             r1_d2.append(conv_bi2dec(seq, onb_max_bk, True, 2))
94 |             r1_df2.append(conv_bi2dec(seq, onb_max_bk, True, .2))
95 |             r0_d10.append(conv_bi2dec(seq, onb_max_bk, False, 10))  # r0: original order
96 |             r0_d2.append(conv_bi2dec(seq, onb_max_bk, False, 2))
97 |             r0_df2.append(conv_bi2dec(seq, onb_max_bk, False, .2))
98 |             seq = []
99 |
100 |         seq.append(onb)
101 |         uid_bk = uid
102 |         pid_bk = pid
103 |         onb_max_bk = onb_max
104 |
105 |     r1_d10.append(conv_bi2dec(seq, onb_max_bk, True, 10))
106 |     r1_d2.append(conv_bi2dec(seq, onb_max_bk, True, 2))
107 |     r1_df2.append(conv_bi2dec(seq, onb_max_bk, True, .2))
108 |     r0_d10.append(conv_bi2dec(seq, onb_max_bk, False, 10))
109 |     r0_d2.append(conv_bi2dec(seq, onb_max_bk, False, 2))
110 |     r0_df2.append(conv_bi2dec(seq, onb_max_bk, False, .2))
111 |
112 |     df = log_[['user_id', 'product_id']].drop_duplicates(keep='first').reset_index(drop=True)
113 |     df['seq2dec_r1_d10'] = r1_d10
114 |     df['seq2dec_r1_d2'] = r1_d2
115 |     df['seq2dec_r1_df2'] = r1_df2
116 |     df['seq2dec_r0_d10'] = r0_d10
117 |     df['seq2dec_r0_d2'] = r0_d2
118 |     df['seq2dec_r0_df2'] = r0_df2
119 |
120 |     df.to_pickle('../feature/{}/f317_user-product.p'.format(folder))
121 |
122 |
123 | #==============================================================================
124 | # main
125 | #==============================================================================
126 | make(0)
127 | #make(1)
128 | #make(2)
129 |
130 | make(-1)
131 |
132 |
133 |
134 | #==============================================================================
135 | utils.end(__file__)
136 |
137 |
--------------------------------------------------------------------------------
/appendix/501_concat.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Mon Jul 10 04:11:27 2017
5 |
6 | @author: konodera
7 |
8 |
9 | nohup python -u 501_concat.py &
10 |
11 | """
12 | import pandas as pd
13 | import numpy as np
14 | from tqdm import tqdm
15 | import multiprocessing as mp
16 | import gc
17 | import utils
18 | #utils.start(__file__)
19 |
20 | #==============================================================================
21 | # def
22 | #==============================================================================
23 | def concat_pred_item(T, dryrun=False):
24 |     if T==-1:
25 |         name = 'test'
26 |     else:
27 |         name = 'trainT-'+str(T)
28 |
29 |     df = utils.load_pred_item(name)
30 |
31 |     df = pd.merge(df, pd.read_pickle('../feature/{}/f317_user-product.p'.format(name)),
32 |                   on=['user_id', 'product_id'], how='left')
33 |
34 |     gc.collect()
35 |
36 |     #==============================================================================
37 |     print('output')
38 |
#============================================================================== 39 | if dryrun == True: 40 | return df 41 | else: 42 | utils.to_pickles(df, '../feature/{}/all_apdx'.format(name), 20, inplace=True) 43 | 44 | def multi(name): 45 | concat_pred_item(name) 46 | 47 | #============================================================================== 48 | 49 | # multi 50 | mp_pool = mp.Pool(2) 51 | mp_pool.map(multi, [0, -1]) 52 | 53 | 54 | 55 | utils.end(__file__) 56 | 57 | -------------------------------------------------------------------------------- /appendix/901_reorder_base.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue May 30 23:28:19 2017 5 | 6 | @author: konodera 7 | 8 | nohup python -u 901_reorder_base.py > LOG/_xgb_reorder.txt & 9 | 10 | 11 | """ 12 | 13 | import warnings 14 | warnings.filterwarnings("ignore") 15 | import pandas as pd 16 | import numpy as np 17 | import gc 18 | import xgboost as xgb 19 | import utils 20 | 21 | utils.start(__file__) 22 | 23 | 24 | 25 | # setting 26 | OUTF = '../output/sub/apdx/bench.p' 27 | LOOP = 3 28 | ESR = 40 29 | 30 | #seed = np.random.randint(99999) 31 | seed = 71 32 | 33 | np.random.seed(seed) 34 | 35 | valid_size = 0.05 36 | 37 | 38 | # XGB param 39 | nround = 10000 40 | #nround = 10 41 | 42 | param = {'max_depth':10, 43 | 'eta':0.02, 44 | 'colsample_bytree':0.4, 45 | 'subsample':0.75, 46 | 'silent':1, 47 | 'nthread':27, 48 | 'eval_metric':'logloss', 49 | 'objective':'binary:logistic', 50 | 'tree_method':'hist' 51 | } 52 | 53 | print("""#==== print param ======""") 54 | print('OUTF:', OUTF) 55 | print('seed:', seed) 56 | 57 | utils.mkdir_p('../output/model/apdx') 58 | utils.mkdir_p('../output/imp/apdx') 59 | utils.mkdir_p('../output/sub/apdx') 60 | 61 | #============================================================================== 62 | # prepare 63 | #============================================================================== 64 | train = utils.load_pred_item('trainT-0') 65 | 66 | y_train = train['y'] 67 | X_train = train.drop('y', axis=1) 68 | del train 69 | gc.collect() 70 | 71 | # drop id 72 | col = [c for c in X_train.columns if '_id' in c] + ['is_train'] 73 | col.remove('user_id') 74 | print('drop1',col) 75 | X_train.drop(col, axis=1, inplace=True) # keep user_id 76 | 77 | # drop obj 78 | col = X_train.dtypes[X_train.dtypes=='object'].index.tolist() 79 | print('drop2',col) 80 | X_train.drop(col, axis=1, inplace=True) 81 | 82 | X_train.fillna(-1, inplace=1) 83 | 84 | #============================================================================== 85 | # SPLIT! 
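# NOTE: validation is split by user rather than by row -- each user
# contributes many (user, product) rows, so a row-wise split would put the
# same user's behaviour in both build and valid and leak into validation.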
86 | print('split by user') 87 | #============================================================================== 88 | train_user = X_train[['user_id']].drop_duplicates() 89 | 90 | def split_build_valid(): 91 | 92 | train_user['is_valid'] = np.random.choice([0,1], size=len(train_user), 93 | p=[1-valid_size, valid_size]) 94 | valid_n = train_user['is_valid'].sum() 95 | build_n = (train_user.shape[0] - valid_n) 96 | 97 | print('build user:{}, valid user:{}'.format(build_n, valid_n)) 98 | valid_user = train_user[train_user['is_valid']==1].user_id 99 | is_valid = X_train.user_id.isin(valid_user) 100 | 101 | dbuild = xgb.DMatrix(X_train[~is_valid].drop('user_id', axis=1), y_train[~is_valid]) 102 | dvalid = xgb.DMatrix(X_train[is_valid].drop('user_id', axis=1), label=y_train[is_valid]) 103 | watchlist = [(dbuild, 'build'),(dvalid, 'valid')] 104 | 105 | print('FINAL SHAPE') 106 | print('dbuild.shape:{} dvalid.shape:{}\n'.format((dbuild.num_row(), dbuild.num_col()), 107 | (dvalid.num_row(), dvalid.num_col()))) 108 | 109 | return dbuild, dvalid, watchlist 110 | 111 | #============================================================================== 112 | print('hold out') 113 | #============================================================================== 114 | 115 | # hold out 116 | models = [] 117 | for i in range(LOOP): 118 | print('LOOP',i) 119 | dbuild, dvalid, watchlist = split_build_valid() 120 | 121 | if i==0: 122 | col_train = dbuild.feature_names 123 | 124 | model = xgb.train(param, dbuild, nround, watchlist, 125 | early_stopping_rounds=ESR, verbose_eval=5) 126 | models.append(model) 127 | # model.save_model('../output/model/{}/xgb_item_{}.model'.format(DATE, i)) 128 | # VALID 129 | valid_yhat = model.predict(dvalid) 130 | print('Valid Mean:', np.mean(valid_yhat)) 131 | del dbuild, dvalid, watchlist 132 | gc.collect() 133 | 134 | del train_user, X_train, y_train 135 | gc.collect() 136 | 137 | #============================================================================== 138 | print('test') 139 | #============================================================================== 140 | test = utils.load_pred_item('test').fillna(-1) 141 | 142 | sub_test = test[['order_id', 'product_id']] 143 | 144 | dtest = xgb.DMatrix(test[col_train]) 145 | sub_test['yhat'] = 0 146 | for model in models: 147 | sub_test['yhat'] += model.predict(dtest) 148 | sub_test['yhat'] /= LOOP 149 | print('Test Mean:', sub_test['yhat'].mean()) 150 | 151 | sub_test.to_pickle(OUTF) 152 | 153 | 154 | #============================================================================== 155 | utils.end(__file__) 156 | 157 | 158 | 159 | -------------------------------------------------------------------------------- /appendix/902_reorder.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue May 30 23:28:19 2017 5 | 6 | @author: konodera 7 | 8 | nohup python -u 902_reorder.py > LOG/_xgb_item.txt & 9 | 10 | 11 | """ 12 | 13 | import warnings 14 | warnings.filterwarnings("ignore") 15 | import pandas as pd 16 | import numpy as np 17 | import gc 18 | import xgboost as xgb 19 | import utils 20 | 21 | utils.start(__file__) 22 | 23 | 24 | 25 | # setting 26 | OUTF = '../output/sub/apdx/seq2dec.p' 27 | LOOP = 2 28 | ESR = 40 29 | 30 | #seed = np.random.randint(99999) 31 | seed = 71 32 | 33 | np.random.seed(seed) 34 | 35 | valid_size = 0.05 36 | 37 | 38 | # XGB param 39 | nround = 10000 40 | #nround = 10 41 | 42 | param = 
{'max_depth':10, 43 | 'eta':0.02, 44 | 'colsample_bytree':0.4, 45 | 'subsample':0.75, 46 | 'silent':1, 47 | 'nthread':27, 48 | 'eval_metric':'logloss', 49 | 'objective':'binary:logistic', 50 | 'tree_method':'hist' 51 | } 52 | 53 | print("""#==== print param ======""") 54 | print('OUTF:', OUTF) 55 | print('seed:', seed) 56 | 57 | #============================================================================== 58 | # prepare 59 | #============================================================================== 60 | train = utils.read_pickles('../feature/{}/all_apdx'.format('trainT-0')) 61 | 62 | # f317 obj into int 63 | #col = [c for c in train.columns if 'seq2' in c and not '_df' in c] 64 | #train[col] = train[col].astype(np.float32) 65 | 66 | y_train = train['y'] 67 | X_train = train.drop('y', axis=1) 68 | del train 69 | gc.collect() 70 | 71 | # drop id 72 | col = [c for c in X_train.columns if '_id' in c] + ['is_train'] 73 | col.remove('user_id') 74 | print('drop1',col) 75 | X_train.drop(col, axis=1, inplace=True) # keep user_id 76 | 77 | # drop obj 78 | col = X_train.dtypes[X_train.dtypes=='object'].index.tolist()+['seq2dec_r0_df2'] 79 | print('drop2',col) 80 | X_train.drop(col, axis=1, inplace=True) 81 | 82 | X_train.fillna(-1, inplace=1) 83 | 84 | #============================================================================== 85 | # SPLIT! 86 | print('split by user') 87 | #============================================================================== 88 | train_user = X_train[['user_id']].drop_duplicates() 89 | 90 | def split_build_valid(): 91 | 92 | train_user['is_valid'] = np.random.choice([0,1], size=len(train_user), 93 | p=[1-valid_size, valid_size]) 94 | valid_n = train_user['is_valid'].sum() 95 | build_n = (train_user.shape[0] - valid_n) 96 | 97 | print('build user:{}, valid user:{}'.format(build_n, valid_n)) 98 | valid_user = train_user[train_user['is_valid']==1].user_id 99 | is_valid = X_train.user_id.isin(valid_user) 100 | 101 | dbuild = xgb.DMatrix(X_train[~is_valid].drop('user_id', axis=1), y_train[~is_valid]) 102 | dvalid = xgb.DMatrix(X_train[is_valid].drop('user_id', axis=1), label=y_train[is_valid]) 103 | watchlist = [(dbuild, 'build'),(dvalid, 'valid')] 104 | 105 | print('FINAL SHAPE') 106 | print('dbuild.shape:{} dvalid.shape:{}\n'.format((dbuild.num_row(), dbuild.num_col()), 107 | (dvalid.num_row(), dvalid.num_col()))) 108 | 109 | return dbuild, dvalid, watchlist 110 | 111 | #============================================================================== 112 | print('hold out') 113 | #============================================================================== 114 | 115 | # hold out 116 | models = [] 117 | for i in range(LOOP): 118 | print('LOOP',i) 119 | dbuild, dvalid, watchlist = split_build_valid() 120 | 121 | if i==0: 122 | col_train = dbuild.feature_names 123 | 124 | model = xgb.train(param, dbuild, nround, watchlist, 125 | early_stopping_rounds=ESR, verbose_eval=5) 126 | models.append(model) 127 | # model.save_model('../output/model/{}/xgb_item_{}.model'.format(DATE, i)) 128 | # VALID 129 | valid_yhat = model.predict(dvalid) 130 | print('Valid Mean:', np.mean(valid_yhat)) 131 | del dbuild, dvalid, watchlist 132 | gc.collect() 133 | 134 | del train_user, X_train, y_train 135 | gc.collect() 136 | 137 | #============================================================================== 138 | print('test') 139 | #============================================================================== 140 | test = 
utils.read_pickles('../feature/{}/all_apdx'.format('test')).fillna(-1) 141 | 142 | # f317 obj into int 143 | #col = [c for c in test.columns if 'seq2' in c and not '_df' in c] 144 | #test[col] = test[col].astype(np.float32) 145 | 146 | 147 | sub_test = test[['order_id', 'product_id']] 148 | 149 | dtest = xgb.DMatrix(test[col_train]) 150 | sub_test['yhat'] = 0 151 | for model in models: 152 | sub_test['yhat'] += model.predict(dtest) 153 | sub_test['yhat'] /= LOOP 154 | print('Test Mean:', sub_test['yhat'].mean()) 155 | 156 | sub_test.to_pickle(OUTF) 157 | 158 | 159 | #============================================================================== 160 | utils.end(__file__) 161 | 162 | 163 | 164 | -------------------------------------------------------------------------------- /appendix/903_Faron_opt_bagging.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sat Jul 29 18:59:46 2017 5 | 6 | @author: konodera 7 | 8 | nohup python -u 201_Faron_opt_bagging_815_3.py > LOG/_Faron-opt.txt & 9 | 10 | """ 11 | 12 | import pandas as pd 13 | import sys 14 | sys.path.append('../py_model') 15 | from opt_fscore import get_best_prediction 16 | import multiprocessing as mp 17 | import time 18 | import utils 19 | utils.start(__file__) 20 | 21 | 22 | # setting 23 | DATE_None = ['813_3', '814_1', '814_2', '814_3'] 24 | 25 | total_proc = 60 26 | 27 | 28 | utils.mkdir_p('../output/sub/apdx') 29 | #============================================================================== 30 | # def 31 | #============================================================================== 32 | def multi(i): 33 | if i%1000==0: 34 | print('{:.3f} min'.format((time.time()-st_time)/60)) 35 | items = sub.loc[i,'product_id'] 36 | preds = sub.loc[i,'yhat'] 37 | pNone = sub.loc[i,'yhat_None'] 38 | ret = get_best_prediction(items, preds, pNone) 39 | return ret 40 | 41 | def mk_sub(DATE_item): 42 | print("""#==== print param ======""") 43 | print('OUTF:', OUTF) 44 | print('DATE_item:', DATE_item) 45 | print('DATE_None:', DATE_None) 46 | print('total_proc:', total_proc) 47 | 48 | global sub, st_time 49 | 50 | sub_item = pd.read_pickle('../output/sub/{}/sub_test.p'.format(DATE_item)) 51 | sub = sub_item.groupby('order_id').product_id.apply(list).to_frame() 52 | sub['yhat'] = sub_item.groupby('order_id').yhat.apply(list) 53 | 54 | # weighted 55 | for i,(w,d) in enumerate(zip([0.1, 0.1, 0.4, 0.4], DATE_None)): 56 | tmp = pd.read_pickle('../output/sub/{}/sub_test_None.p'.format(d)).rename(columns={'yhat':'yhat_None'}) 57 | tmp.yhat_None *= w 58 | if i==0: 59 | sub_None = tmp 60 | else: 61 | sub_None = pd.concat([sub_None, tmp]) 62 | 63 | sub_None = sub_None.groupby('order_id').yhat_None.sum().reset_index() 64 | 65 | sub = pd.merge(sub.reset_index(), sub_None, on='order_id', how='left') 66 | 67 | # optimize start!!! 
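    # Each order is optimized independently (multi() calls get_best_prediction
    # once per row), so the rows are simply fanned out over `total_proc` processes.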
68 |     st_time = time.time()
69 |     pool = mp.Pool(total_proc)
70 |     callback = pool.map(multi, range(sub.shape[0]))
71 |
72 |     sub['products'] = callback
73 |
74 |     print('writing...')
75 |     sub[['order_id', 'products']].to_csv(OUTF, index=0, compression='gzip')
76 | #==============================================================================
77 | OUTF = "../output/sub/apdx/bench.csv.gz"
78 | mk_sub('apdx_base')
79 |
80 | OUTF = "../output/sub/apdx/seq2dec.csv.gz"
81 | mk_sub('apdx')
82 |
83 |
84 |
85 |
86 |
87 | #==============================================================================
88 | utils.end(__file__)
89 |
90 |
--------------------------------------------------------------------------------
/appendix/README.md:
--------------------------------------------------------------------------------
1 | # Appendix of Instacart Market Basket Analysis
2 |
3 | After the competition, I wanted to try some ideas.
4 |
5 | ## How to run
6 | * pending
7 |
8 | ## Requirements
9 |
10 | Python packages:
11 | - numpy==1.12.1
12 | - pandas==0.19.2
13 | - scipy==0.19.0
14 | - tqdm==4.11.2
15 | - xgboost==0.6
16 |
--------------------------------------------------------------------------------
/input/README.md:
--------------------------------------------------------------------------------
1 | You need to put the files below in this directory:
2 | - aisles.csv
3 | - departments.csv
4 | - order_products__prior.csv.gz
5 | - order_products__train.csv.gz
6 | - orders.csv.gz
7 | - products.csv
8 | - sample_submission.csv
--------------------------------------------------------------------------------
/output/sub/final/Faron-opt_bagging-v3.csv.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KazukiOnodera/Instacart/416b6b0220d3aed62c8d323caa3ee46f4b614a72/output/sub/final/Faron-opt_bagging-v3.csv.gz
--------------------------------------------------------------------------------
/py_feature/000_mk.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Wed May 17 08:55:13 2017
4 |
5 | @author: konodera
6 | """
7 |
8 | import pandas as pd
9 | import numpy as np
10 | import gc
11 | import os
12 | import utils
13 | utils.start(__file__)
14 |
15 | os.system('rm -rf ../input/mk')
16 | os.system('mkdir ../input/mk')
17 |
18 | os.system('rm -rf ../feature')
19 | os.system('mkdir ../feature')
20 |
21 | #==============================================================================
22 | # test user
23 | #==============================================================================
24 | orders = pd.read_csv('../input/orders.csv.gz')
25 |
26 | test_user = orders.loc[orders.eval_set=='test'].reset_index(drop=1)
27 | test_user[['order_id', 'user_id']].to_pickle('../input/mk/test_user.p')
28 |
29 |
30 | #==============================================================================
31 | # goods
32 | #==============================================================================
33 | products = pd.read_csv('../input/products.csv')
34 | products.product_name = products.product_name.str.replace(' ', '-')
35 |
36 | aisles = pd.read_csv('../input/aisles.csv', engine='c')
37 | departments = pd.read_csv('../input/departments.csv', engine='c')
38 |
39 | goods = pd.merge(left=pd.merge(left=products, right=departments, how='left'), right=aisles, how='left')
40 |
41 |
42 | goods.to_pickle('../input/mk/goods.p')
43 | gc.collect()
44 |
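# The master `log` built in the next block concatenates the prior and train
# order_products files, joins product and order metadata, and adds
# order_number_rev (0 = the user's most recent order), which the T-offset
# labelling downstream relies on.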
#============================================================================== 45 | # log 46 | #============================================================================== 47 | log = pd.concat([pd.read_csv('../input/order_products__prior.csv.gz'), 48 | pd.read_csv('../input/order_products__train.csv.gz')], 49 | ignore_index=1) 50 | 51 | log.sort_values(['order_id', 'add_to_cart_order'], inplace=True) 52 | log.reset_index(drop=1, inplace=True) 53 | log = pd.merge(log, goods, on='product_id', how='left') 54 | log = pd.merge(log, orders, on='order_id', how='left') 55 | log['order_number_rev'] = log.groupby('user_id').order_number.transform(np.max) - log.order_number 56 | 57 | utils.to_pickles(log, '../input/mk/log', 20) 58 | 59 | gc.collect() 60 | #============================================================================== 61 | # order_tbl 62 | #============================================================================== 63 | order_product = log.groupby('order_id').product_name.apply(list).reset_index() 64 | order_tbl = pd.merge(orders, order_product, on='order_id', how='left') 65 | 66 | order_tbl.sort_values(['user_id', 'order_number'],inplace=True) 67 | order_tbl.reset_index(drop=1, inplace=True) 68 | order_tbl = pd.merge(order_tbl, log[['order_id','order_number_rev']].drop_duplicates(), on='order_id', how='left') 69 | order_tbl.order_number_rev = order_tbl.order_number_rev.fillna(-1).astype(int) 70 | #order_tbl['order_number_rev'] = order_tbl.groupby('user_id').order_number.transform(np.max) - order_tbl.order_number 71 | order_tbl['days_since_first_order'] = order_tbl.groupby('user_id').days_since_prior_order.cumsum() 72 | 73 | def set_diff(items1, items2): 74 | if isinstance(items1, float) or isinstance(items2, float): 75 | return items1 76 | return [i1 for i1 in items1 if i1 not in items2] 77 | 78 | def same_products(items1, items2): 79 | if isinstance(items1, float) or isinstance(items2, float): 80 | return [] 81 | return [i1 for i1 in items1 if i1 in items2] 82 | 83 | order_tbl['t-1_product_name'] = order_tbl.groupby('user_id')['product_name'].shift(1) 84 | order_tbl['set_diff_products'] = order_tbl.apply(lambda x: set_diff(x['product_name'], x['t-1_product_name']), axis=1) 85 | order_tbl['same_products'] = order_tbl.apply(lambda x: same_products(x['product_name'], x['t-1_product_name']), axis=1) 86 | 87 | order_tbl.to_pickle('../input/mk/order_tbl.p') 88 | gc.collect() 89 | #============================================================================== 90 | # order_aisle-department 91 | #============================================================================== 92 | order_aisle = pd.crosstab(log['order_id'], 93 | log['aisle_id']).add_prefix('aisle_').reset_index() 94 | 95 | order_department = pd.crosstab(log['order_id'], 96 | log['department_id']).add_prefix('department_').reset_index() 97 | 98 | order_aisle = pd.merge(order_aisle, order_department, on='order_id', how='left') 99 | 100 | order_aisle.to_pickle('../input/mk/order_aisle-department.p') 101 | 102 | del order_aisle, order_department 103 | gc.collect() 104 | 105 | #============================================================================== 106 | # order_reorderd 107 | #============================================================================== 108 | log_ = log.loc[log.reordered==1] 109 | order_reorderd = log_.groupby('order_id').product_id.apply(list).reset_index() 110 | 111 | order_reorderd.to_pickle('../input/mk/order_reorderd.p') 112 | gc.collect() 113 | 114 | 
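# The user_order block below collects, per user, the unique set of product
# names ever ordered and maps the names back to product_ids via goods_di.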
#============================================================================== 115 | # user_order 116 | #============================================================================== 117 | from itertools import chain 118 | 119 | order_tbl = pd.read_pickle('../input/mk/order_tbl.p') 120 | order_tbl = order_tbl.loc[order_tbl.eval_set!='test'] 121 | 122 | goods = pd.read_pickle('../input/mk/goods.p') 123 | 124 | goods_di = {} 125 | for k,v in zip(goods.product_name, goods.product_id): 126 | goods_di[k] = v 127 | 128 | 129 | def sum_list(x): 130 | return list(chain.from_iterable(x)) 131 | 132 | def to_unique(lists): 133 | li = sum_list(lists) 134 | return list(set(li)) 135 | 136 | def to_ids(names): 137 | ids = [goods_di[n] for n in names] 138 | return ids 139 | 140 | user_hist = order_tbl.groupby('user_id').product_name.apply(to_unique).reset_index() 141 | user_hist['product_id'] = user_hist.product_name.map(to_ids) 142 | 143 | user_hist.to_pickle('../input/mk/user_order.p') 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | utils.end(__file__) 156 | 157 | 158 | -------------------------------------------------------------------------------- /py_feature/003_X_base_T.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sat May 27 17:05:46 2017 5 | 6 | @author: konodera 7 | """ 8 | 9 | import pandas as pd 10 | import numpy as np 11 | from tqdm import tqdm 12 | import utils 13 | utils.start(__file__) 14 | 15 | #============================================================================== 16 | # load 17 | #============================================================================== 18 | order_tbl = pd.read_pickle('../input/mk/order_tbl.p') 19 | order_tbl = order_tbl[['order_id', 'user_id', 'order_number', 'order_number_rev']] 20 | order_tbl.sort_values(['user_id', 'order_number', 'order_id'], inplace=True) 21 | 22 | test_order = pd.read_pickle('../input/mk/test_user.p') 23 | 24 | #============================================================================== 25 | # def 26 | #============================================================================== 27 | def main(T): 28 | for i in range(1, 1+T): 29 | order_tbl['t-{}_order_id'.format(i)] = order_tbl.groupby('user_id')['order_id'].shift(i) 30 | 31 | order_tbl.dropna(inplace=True) 32 | 33 | col = [c for c in order_tbl.columns if 'order_id' in c] 34 | for c in col: 35 | order_tbl[c] = order_tbl[c].map(int) 36 | 37 | order_tbl.reset_index(drop=1, inplace=True) 38 | 39 | order_tbl['is_train'] = 1-order_tbl.order_id.isin(test_order.order_id)*1 40 | 41 | order_tbl[col+['user_id','is_train']].to_pickle('../feature/X_base_t{}.p'.format(T)) 42 | 43 | 44 | main(3) 45 | main(5) 46 | 47 | #============================================================================== 48 | utils.end(__file__) 49 | 50 | """ 51 | 206209 rows 52 | """ -------------------------------------------------------------------------------- /py_feature/004_label.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Fri May 26 17:07:09 2017 5 | 6 | @author: konodera 7 | """ 8 | 9 | import pandas as pd 10 | import numpy as np 11 | from tqdm import tqdm 12 | import utils 13 | utils.start(__file__) 14 | 15 | col = ['order_id', 'user_id', 'product_id', 'order_number', 'reordered', 'order_number_rev'] 16 | log = utils.read_pickles('../input/mk/log', 
col).rename(columns={'reordered':'y'}) 17 | 18 | test_order = pd.read_pickle('../input/mk/test_user.p') 19 | 20 | #============================================================================== 21 | # train 22 | #============================================================================== 23 | def make(T): 24 | label_t1 = log[log.order_number_rev>T] 25 | label_t1.drop_duplicates(['user_id','product_id'], keep='last', inplace=True) 26 | label_t1.sort_values(['user_id','product_id'], inplace=True) 27 | 28 | label_t0_y1 = log.loc[log.order_number_rev==T].loc[log.y==1] 29 | label_t0_y1.sort_values(['user_id','product_id'], inplace=True) 30 | 31 | label_t1['key'] = label_t1.user_id.map(str) + ' ' + label_t1.product_id.map(str) 32 | label_t0_y1['key'] = label_t0_y1.user_id.map(str) + ' ' + label_t0_y1.product_id.map(str) 33 | label_t0_y0 = label_t1[~label_t1.key.isin(label_t0_y1.key)] 34 | 35 | label_t0_y0.drop('order_id', axis=1 ,inplace=True) 36 | label_t0_y0 = pd.merge(label_t0_y0, log.loc[log.order_number_rev==T, ['user_id','order_id']].drop_duplicates(), 37 | on='user_id', how='left') 38 | label_t0_y0.y = 0 39 | 40 | label_train = pd.concat([label_t0_y1, label_t0_y0], ignore_index=1) 41 | label_train.sort_values(['user_id','product_id'], inplace=True) 42 | label_train.reset_index(drop=1, inplace=True) 43 | 44 | col = ['order_id', 'product_id', 'y'] 45 | 46 | print(label_train[col].isnull().sum()) 47 | utils.mkdir_p('../feature/trainT-{}'.format(T)) 48 | label_train[col].to_pickle('../feature/trainT-{}/label_reordered.p'.format(T)) 49 | 50 | make(0) # basically train is T=0, for validation, train;T=1 valid;T=0 51 | make(1) 52 | make(2) 53 | 54 | #============================================================================== 55 | # test 56 | #============================================================================== 57 | log_test = log.drop_duplicates(['user_id','product_id'])[['user_id','product_id']] 58 | log_test = log_test[log_test.user_id.isin(test_order.user_id)] 59 | 60 | log_test.sort_values(['user_id','product_id'],inplace=True) 61 | log_test.reset_index(drop=1, inplace=True) 62 | 63 | test_order = pd.merge(test_order, log_test, on='user_id', how='left') 64 | 65 | print(test_order[['order_id', 'product_id']].isnull().sum()) 66 | utils.mkdir_p('../feature/test') 67 | test_order[['order_id', 'product_id']].to_pickle('../feature/test/label_reordered.p') 68 | 69 | 70 | 71 | 72 | #============================================================================== 73 | utils.end(__file__) 74 | 75 | -------------------------------------------------------------------------------- /py_feature/005_inarow.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed May 31 00:29:00 2017 5 | 6 | @author: konodera 7 | 8 | 9 | """ 10 | import pandas as pd 11 | import numpy as np 12 | from tqdm import tqdm 13 | import utils 14 | utils.start(__file__) 15 | 16 | 17 | col = ['order_id', 'user_id', 'product_id', 'order_number', 'reordered'] 18 | log = utils.read_pickles('../input/mk/log', col) 19 | log.sort_values(['user_id', 'product_id', 'order_number'], inplace=True) 20 | 21 | 22 | 23 | uid_bk = pid_bk = onum_bk = None 24 | ret = [] 25 | miniters = int(log.shape[0]/50) 26 | col = ['user_id', 'product_id', 'order_number'] 27 | for uid,pid,onum in tqdm(log[col].values,miniters=miniters): 28 | if uid_bk is None: 29 | cnt = 1 30 | ret.append(cnt) 31 | elif uid == uid_bk and pid == pid_bk: 32 | if 
onum - onum_bk == 1:
33 |             cnt+=1
34 |             ret.append(cnt)
35 |         else:
36 |             cnt = 1
37 |             ret.append(cnt)
38 |         pass
39 |     elif uid == uid_bk and pid != pid_bk: # item change
40 |         cnt = 1
41 |         ret.append(cnt)
42 |     elif uid != uid_bk: # user change
43 |         cnt = 1
44 |         ret.append(cnt)
45 |     else:
46 |         raise Exception('?')
47 |
48 |     uid_bk = uid
49 |     pid_bk = pid
50 |     onum_bk = onum
51 | log['buy_item_inarow'] = ret
52 |
53 | log.reset_index(drop=1, inplace=True)
54 |
55 | log.to_pickle('../input/mk/log_inarow.p')
56 |
57 |
58 | utils.end(__file__)
59 |
60 |
--------------------------------------------------------------------------------
/py_feature/006_days_since_last_order.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Sat Jun 3 07:41:26 2017
5 |
6 | @author: konodera
7 |
8 | How many days ago did the user last order this item?
9 | *This is not a leak
10 |
11 | """
12 |
13 | import pandas as pd
14 | import numpy as np
15 | from tqdm import tqdm
16 | import gc
17 | import multiprocessing as mp
18 | import utils
19 | utils.start(__file__)
20 |
21 |
22 | kfold = 10
23 |
24 | X_base = pd.read_pickle('../feature/X_base_t3.p')
25 |
26 | label_train = pd.read_pickle('../feature/trainT-0/label_reordered.p')
27 | label_test = pd.read_pickle('../feature/test/label_reordered.p')
28 |
29 | train = pd.merge(X_base[X_base.is_train==1], label_train, on='order_id', how='inner')
30 | test = pd.merge(X_base[X_base.is_train==0], label_test, on='order_id', how='inner')
31 |
32 | #==============================================================================
33 | # mk train * test log
34 | #==============================================================================
35 | col = ['order_id', 'user_id', 'product_id']
36 | train_log = utils.read_pickles('../input/mk/log', col)
37 |
38 | order_tbl = pd.read_pickle('../input/mk/order_tbl.p')\
39 |             [['order_id', 'user_id', 'order_number', 'days_since_first_order']]
40 |
41 | # merge user_id -> ['order_id', 'user_id', 'product_id']
42 | train_log = pd.merge(train_log[['order_id', 'product_id']],
43 |                      order_tbl[['order_id','user_id']],
44 |                      on='order_id', how='left')[['order_id', 'user_id', 'product_id']]
45 | test_log = pd.merge(test[['order_id', 'product_id']],
46 |                     order_tbl[['order_id','user_id']],
47 |                     on='order_id', how='left')[['order_id', 'user_id', 'product_id']]
48 |
49 | log = pd.concat([train_log, test_log])
50 | del X_base, train_log, test_log; gc.collect()
51 | log.sort_values(['user_id', 'product_id'], inplace=True)
52 |
53 | user_item = log.drop_duplicates(['user_id', 'product_id'])[['user_id', 'product_id']]
54 | order_user = order_tbl[['order_id', 'user_id',]]
55 |
56 | log = pd.merge(order_user, user_item, on='user_id', how='left')
57 | del order_user, user_item; gc.collect()
58 |
59 | users = log[['user_id']].drop_duplicates().reset_index(drop=1)
60 | users['kfold'] = users.index%kfold
61 |
62 |
63 | usecols = [ 'order_id', 'product_id']
64 | buy_tbl = utils.read_pickles('../input/mk/log', usecols)
65 | buy_tbl['key'] = buy_tbl.order_id.map(str) + ' ' + buy_tbl.product_id.map(str)
66 |
67 | utils.mkdir_p('../input/mk/days_since_last_order')
68 |
69 |
70 | #==============================================================================
71 | # days_since_last_order_this_item
72 | #==============================================================================
73 | def multi(i):
74 |     target_users = users[users.kfold==i].user_id
75 |
76 |     tbl = pd.merge(log[log.user_id.isin(target_users)],
77 | order_tbl[['order_id','order_number', 'days_since_first_order']], 78 | on='order_id', how='left') 79 | 80 | tbl.sort_values(['user_id', 'product_id', 'order_number'], inplace=True) 81 | 82 | 83 | tbl['key'] = tbl.order_id.map(str) + ' ' + tbl.product_id.map(str) 84 | tbl['buy'] = tbl.key.isin(buy_tbl.key)*1 85 | 86 | tbl.days_since_first_order = tbl.days_since_first_order.fillna(0) 87 | 88 | tbl.sort_values(['user_id', 'product_id', 'order_number'], inplace=True) 89 | 90 | tbl.reset_index(drop=1, inplace=True) 91 | 92 | uid_bk = pid_bk = day_bk = last_date = None 93 | first_buy = False 94 | ret = [] 95 | miniters = int(tbl.shape[0]/50) 96 | for uid,pid,day,buy in tqdm(tbl[['user_id', 'product_id','days_since_first_order','buy']].values, 97 | miniters=miniters): 98 | if uid_bk is None: 99 | if buy==1 and first_buy is False: 100 | ret.append(None) 101 | last_date = day 102 | first_buy = True 103 | elif buy==1: 104 | ret.append(day-last_date) 105 | last_date = day 106 | elif buy==0 and first_buy is True: 107 | ret.append(day-last_date) 108 | else: 109 | ret.append(None) 110 | 111 | elif uid == uid_bk and pid == pid_bk: 112 | if buy==1 and first_buy is False: 113 | ret.append(None) 114 | last_date = day 115 | first_buy = True 116 | elif buy==1: 117 | ret.append(day-last_date) 118 | last_date = day 119 | elif buy==0 and first_buy is True: 120 | ret.append(day-last_date) 121 | else: 122 | ret.append(None) 123 | 124 | elif uid == uid_bk and pid != pid_bk: # item change 125 | last_date = None 126 | first_buy = False 127 | if buy==1 and first_buy is False: 128 | ret.append(None) 129 | last_date = day 130 | first_buy = True 131 | elif buy==1: 132 | ret.append(day-last_date) 133 | last_date = day 134 | elif buy==0 and first_buy is True: 135 | ret.append(day-last_date) 136 | else: 137 | ret.append(None) 138 | 139 | elif uid != uid_bk: # user change 140 | last_date = None 141 | first_buy = False 142 | if buy==1 and first_buy is False: 143 | ret.append(None) 144 | last_date = day 145 | first_buy = True 146 | elif buy==1: 147 | ret.append(day-last_date) 148 | last_date = day 149 | elif buy==0 and first_buy is True: 150 | ret.append(day-last_date) 151 | else: 152 | ret.append(None) 153 | uid_bk = uid 154 | pid_bk = pid 155 | day_bk = day 156 | tbl['days_since_last_order_this_item'] = ret 157 | 158 | col = ['order_id', 'product_id','days_since_last_order_this_item'] 159 | tbl[col].to_pickle('../input/mk/days_since_last_order/{}.p'.format(i)) 160 | 161 | #============================================================================== 162 | 163 | 164 | 165 | mp_pool = mp.Pool(kfold) 166 | mp_pool.map(multi, range(kfold)) 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | utils.end(__file__) 179 | 180 | -------------------------------------------------------------------------------- /py_feature/007_timezone.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon Jun 19 12:46:49 2017 5 | 6 | @author: konodera 7 | 8 | Time Zone 9 | 10 | """ 11 | import pandas as pd 12 | import numpy as np 13 | from tqdm import tqdm 14 | import utils 15 | utils.start(__file__) 16 | 17 | 18 | 19 | orders = pd.read_csv('../input/orders.csv.gz', usecols=['order_hour_of_day']) 20 | 21 | orders.sort_values('order_hour_of_day', inplace=True) 22 | orders.drop_duplicates(inplace=True) 23 | orders.reset_index(drop=True, inplace=True) 24 | 25 | def timezone(s): 26 | if s < 6: 27 | return 'midnight' 28 | 
elif s < 12: 29 | return 'morning' 30 | elif s < 18: 31 | return 'noon' 32 | else: 33 | return 'night' 34 | 35 | 36 | orders['timezone'] = orders.order_hour_of_day.map(timezone) 37 | 38 | orders.to_pickle('../input/mk/timezone.p') 39 | 40 | 41 | 42 | 43 | utils.end(__file__) 44 | 45 | -------------------------------------------------------------------------------- /py_feature/008_product_feature.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue Jun 20 17:41:54 2017 5 | 6 | @author: konodera 7 | """ 8 | 9 | import pandas as pd 10 | import numpy as np 11 | from tqdm import tqdm 12 | import utils 13 | utils.start(__file__) 14 | 15 | 16 | item = pd.read_csv('../input/products.csv') 17 | 18 | 19 | item['item_is_Organic'] = item.product_name.map(lambda x: 'organic' in x.lower())*1 20 | item['item_is_Gluten-Free'] = item.product_name.map(lambda x: 'gluten' in x.lower() and 'free' in x.lower())*1 21 | item['item_is_Asian'] = item.product_name.map(lambda x: 'asian' in x.lower())*1 22 | 23 | 24 | col = ['product_id', 'item_is_Organic', 'item_is_Gluten-Free', 'item_is_Asian'] 25 | item[col].to_pickle('../input/mk/products_feature.p') 26 | 27 | 28 | 29 | 30 | 31 | 32 | utils.end(__file__) 33 | 34 | -------------------------------------------------------------------------------- /py_feature/009_None.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sun Jun 25 07:55:48 2017 5 | 6 | @author: konodera 7 | 8 | None 9 | 10 | Leak! 11 | 12 | """ 13 | 14 | import pandas as pd 15 | import numpy as np 16 | from tqdm import tqdm 17 | import utils 18 | utils.start(__file__) 19 | 20 | #============================================================================== 21 | # load 22 | #============================================================================== 23 | col = ['order_id', 'user_id','order_number','product_name', 'eval_set'] 24 | order_tbl = pd.read_pickle('../input/mk/order_tbl.p')[col] 25 | order_tbl.sort_values(['user_id', 'order_number'], inplace=True) 26 | order_tbl = order_tbl[order_tbl.eval_set!='test'] 27 | 28 | #============================================================================== 29 | # main 30 | #============================================================================== 31 | 32 | uid_bk = None 33 | product_name_all = [] # 2d list 34 | pname_unq = [] # 1d list 35 | pname_unq_len = [] # 1d list 36 | for uid,pnames in tqdm(order_tbl[['user_id', 'product_name']].values): 37 | if uid_bk is None: 38 | pname_unq += pnames 39 | elif uid == uid_bk: 40 | pname_unq += pnames 41 | elif uid != uid_bk: 42 | pname_unq = pnames[:] 43 | 44 | uid_bk = uid 45 | pname_unq = list(set(pname_unq)) 46 | pname_unq_len.append(len(pname_unq)) 47 | product_name_all.append(pname_unq) 48 | 49 | order_tbl['product_name_all'] = product_name_all 50 | order_tbl['product_unq_len'] = pname_unq_len 51 | order_tbl['new_item_cnt'] = order_tbl.groupby('user_id').product_unq_len.diff() 52 | order_tbl['product_len'] = order_tbl['product_name'].map(len) 53 | order_tbl['is_None'] = (order_tbl.new_item_cnt == order_tbl.product_len)*1 54 | 55 | col = ['order_id', 'product_unq_len', 'is_None'] 56 | order_tbl[col].to_pickle('../input/mk/order_None.p') 57 | 58 | 59 | 60 | utils.end(__file__) 61 | 62 | -------------------------------------------------------------------------------- 
/py_feature/010_streak.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Fri Jul 28 23:31:30 2017 5 | 6 | @author: konodera 7 | 8 | LEAK 9 | 10 | """ 11 | import pandas as pd 12 | import numpy as np 13 | from tqdm import tqdm 14 | import multiprocessing as mp 15 | total_proc = 60 16 | import utils 17 | utils.start(__file__) 18 | 19 | usecols = [ 'order_id', 'user_id', 'product_id', 'order_number'] 20 | log = utils.read_pickles('../input/mk/log', usecols) 21 | 22 | 23 | def multi(uid): 24 | tmp = log[log.user_id==uid] 25 | ct = pd.crosstab(tmp.order_number, tmp.product_id).reset_index().set_index('order_number') 26 | li = [] 27 | for pid in ct.columns: 28 | streak = 0 29 | sw_odr = False 30 | for onb,odr in enumerate(ct[pid].values): 31 | onb+=1 32 | if sw_odr == False and odr == 1: 33 | sw_odr = True 34 | streak = 1 35 | li.append([uid, pid, onb, streak]) 36 | continue 37 | if sw_odr == True: 38 | if odr == 1 and streak>0: 39 | streak += 1 40 | li.append([uid, pid, onb, streak]) 41 | elif odr == 1 and streak<=0: 42 | streak = 1 43 | li.append([uid, pid, onb, streak]) 44 | elif odr == 0 and streak>0: 45 | streak = 0 46 | li.append([uid, pid, onb, streak]) 47 | elif odr == 0 and streak<=0: 48 | streak -= 1 49 | li.append([uid, pid, onb, streak]) 50 | return pd.DataFrame(li, columns=['user_id', 'product_id', 'order_number', 'streak']) 51 | 52 | 53 | user_id = log.user_id.unique() 54 | mp_pool = mp.Pool(total_proc) 55 | callback = mp_pool.map(multi, user_id) 56 | 57 | df = pd.concat(callback, ignore_index=True) 58 | 59 | order = log[['order_id', 'user_id', 'order_number']].drop_duplicates().reset_index(drop=True) 60 | df = pd.merge(df, order, on=['user_id', 'order_number'], how='left') 61 | 62 | df[['order_id', 'product_id', 'streak']].to_pickle('../input/mk/streak_order-product.p') 63 | 64 | 65 | utils.end(__file__) 66 | 67 | -------------------------------------------------------------------------------- /py_feature/011_replacement.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed Aug 9 09:26:45 2017 5 | 6 | @author: konodera 7 | 8 | nohup python -u 011_replacement.py & 9 | 10 | 11 | t-3 -> t-2 -> t-1 12 | a a a 13 | b d c 14 | c e d 15 | f 16 | 17 | pids_3notin2: b,c 18 | pids_2notin3: d,e 19 | pids_1notin2: c,f 20 | pids_skip: c 21 | 22 | c -> e -> c 23 | 24 | ratio: freq(c -> d -> c)/freq(c -> d) 25 | 26 | merge t-1: c->d 27 | 28 | """ 29 | 30 | import pandas as pd 31 | import gc 32 | import numpy as np 33 | from tqdm import tqdm 34 | from collections import defaultdict 35 | from itertools import product 36 | import utils 37 | utils.start(__file__) 38 | 39 | #============================================================================== 40 | # load 41 | #============================================================================== 42 | 43 | usecols = ['user_id', 'order_number', 'product_id', 'product_name', 'order_id', 'order_number_rev'] 44 | log = utils.read_pickles('../input/mk/log', usecols).sort_values(usecols[:3]) 45 | 46 | order_tbl = log[['order_id', 'user_id', 'order_number', 'order_number_rev']].drop_duplicates().reset_index(drop=True) 47 | for i in range(1, 4): 48 | order_tbl['t-{}_order_id'.format(i)] = order_tbl.groupby('user_id')['order_id'].shift(i) 49 | order_tbl.dropna(inplace=True) 50 | 51 | #order_pids = 
log.head(999999).groupby('order_id').product_id.apply(set).reset_index() 52 | order_pids = log.groupby('order_id').product_id.apply(set).reset_index() 53 | 54 | order_tbl = pd.merge(order_tbl, 55 | order_pids.add_prefix('t-1_'), 56 | on='t-1_order_id', how='inner') 57 | order_tbl = pd.merge(order_tbl, 58 | order_pids.add_prefix('t-2_'), 59 | on='t-2_order_id', how='inner') 60 | order_tbl = pd.merge(order_tbl, 61 | order_pids.add_prefix('t-3_'), 62 | on='t-3_order_id', how='inner') 63 | 64 | #============================================================================== 65 | # def 66 | #============================================================================== 67 | 68 | def make(T): 69 | """ 70 | T = 0 71 | folder = 'trainT-0' 72 | """ 73 | if T==-1: 74 | folder = 'test' 75 | else: 76 | folder = 'trainT-'+str(T) 77 | 78 | order_tbl_ = order_tbl[order_tbl.order_number_rev>T] 79 | 80 | pid_cnt = defaultdict(int) 81 | pid_chance = defaultdict(int) 82 | 83 | # for pids_bk3, pids_bk2, pids_bk1 in tqdm(order_tbl_[['t-3_product_id', 't-2_product_id', 't-1_product_id']].values): 84 | # for uid, onb, pid in tqdm(log_[['user_id', 'order_number', 'product_name']].head(1999999).values): 85 | # for uid, onb, pid in tqdm(log_[['user_id', 'order_number', 'product_id']].values, miniters=99999): 86 | for pids_bk3, pids_bk2, pids_bk1 in tqdm(order_tbl_[['t-3_product_id', 't-2_product_id', 't-1_product_id']].values, miniters=99999): 87 | 88 | pids_3notin2 = pids_bk3 - pids_bk2 89 | pids_2notin3 = pids_bk2 - pids_bk3 90 | pids_hub = pids_bk2 - pids_bk3 - pids_bk1 91 | pids_skip = (pids_bk3 & pids_bk1) - pids_bk2 92 | 93 | li = [] 94 | for i1, i2 in list(product(pids_3notin2, pids_2notin3)): 95 | key = str(i1)+' -> '+str(i2) 96 | li.append(key) 97 | pid_chance[key] +=1 98 | 99 | li = [] 100 | for i1, i2 in list(product(pids_skip, pids_hub)): 101 | key = str(i1)+' -> '+str(i2) 102 | li.append(key) 103 | pid_cnt[key] +=1 104 | 105 | 106 | 107 | pid_chance = pd.DataFrame.from_dict(pid_chance, orient='index').reset_index() 108 | pid_chance.columns = ['pids', 'chance'] 109 | 110 | pid_cnt = pd.DataFrame.from_dict(pid_cnt, orient='index').reset_index() 111 | pid_cnt.columns = ['pids', 'back'] 112 | 113 | df = pd.merge(pid_chance, pid_cnt, on='pids', how='left').fillna(0) 114 | 115 | df['ratio'] = df.back/df.chance 116 | df.sort_values('ratio', ascending=False, inplace=True) 117 | 118 | df.reset_index(drop=True, inplace=True) 119 | df['pid1'] = df.pids.map(lambda x: x.split(' -> ')[0]).astype(int) 120 | df['pid2'] = df.pids.map(lambda x: x.split(' -> ')[1]).astype(int) 121 | df[['pid1', 'pid2', 'back', 'chance', 'ratio']].to_pickle('../input/mk/replacement.p') 122 | 123 | #============================================================================== 124 | # main 125 | #============================================================================== 126 | 127 | make(2) 128 | 129 | #============================================================================== 130 | utils.end(__file__) 131 | 132 | -------------------------------------------------------------------------------- /py_feature/012_aisle_dep_cumsum.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sun Aug 13 05:30:59 2017 5 | 6 | @author: konodera 7 | """ 8 | 9 | import pandas as pd 10 | import numpy as np 11 | from tqdm import tqdm 12 | from collections import defaultdict 13 | import utils 14 | utils.start(__file__) 15 | 16 | 
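# f012: running (cumulative) counts of how many items the user has bought
# from each aisle and department, accumulated order by order within a user.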
#============================================================================== 17 | # load 18 | #============================================================================== 19 | col = ['user_id', 'order_number', 'order_id'] 20 | log = utils.read_pickles('../input/mk/log', col).drop_duplicates().sort_values(col) 21 | 22 | ai_dep = pd.read_pickle('../input/mk/order_aisle-department.p') 23 | 24 | log = pd.merge(log, ai_dep, on='order_id', how='left') 25 | 26 | #============================================================================== 27 | # calc 28 | #============================================================================== 29 | col = [c for c in log.columns if 'aisle_' in c or 'dep' in c] 30 | di = defaultdict(int) 31 | uid_bk = None 32 | 33 | li1 = [] 34 | for args in tqdm(log[['user_id']+col].values): 35 | uid = args[0] 36 | 37 | if uid_bk is None: 38 | pass 39 | elif uid == uid_bk: 40 | pass 41 | elif uid != uid_bk: 42 | di = defaultdict(int) # new user -> running totals restart 43 | li2 = [] 44 | for i,c in enumerate(col): 45 | di[c] += args[i+1] 46 | li2.append(di[c]) 47 | li1.append(li2) 48 | 49 | uid_bk = uid 50 | #============================================================================== 51 | df = pd.DataFrame(li1, columns=col).add_suffix('_cumsum') 52 | df['order_id'] = log['order_id'] # safe: the merge above gave log a fresh RangeIndex, so this aligns row by row 53 | 54 | df.to_pickle('../input/mk/order_aisle-department_cumsum.p') 55 | 56 | 57 | #============================================================================== 58 | utils.end(__file__) 59 | 60 | -------------------------------------------------------------------------------- /py_feature/100_======user_feature======: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KazukiOnodera/Instacart/416b6b0220d3aed62c8d323caa3ee46f4b614a72/py_feature/100_======user_feature====== -------------------------------------------------------------------------------- /py_feature/101_repeat_previous_ratio_T.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Thu May 25 22:45:03 2017 5 | 6 | @author: konodera 7 | 8 | This feature leaks, so shift it before use! 
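#==============================================================================
# aside: "shift before use" in practice -- a toy sketch of how a leaky
# per-order feature is turned into a safe t-1 feature (toy frame, not
# pipeline data)
#==============================================================================
import pandas as pd
toy = pd.DataFrame({'user_id': [1, 1, 1], 'order_id': [10, 11, 12],
                    'repeat_ratio': [0.0, 0.5, 0.7]})
toy['t-1_repeat_ratio'] = toy.groupby('user_id')['repeat_ratio'].shift(1)
# the row for order 12 now carries order 11's value (0.5), never its own
#==============================================================================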
9 | """ 10 | 11 | import pandas as pd 12 | import numpy as np 13 | from tqdm import tqdm 14 | import gc 15 | import utils 16 | utils.start(__file__) 17 | 18 | # setting T 19 | T = 3 20 | 21 | 22 | #============================================================================== 23 | # load base 24 | #============================================================================== 25 | X_base = pd.read_pickle('../feature/X_base_t{}.p'.format(T)) 26 | all_order = pd.concat([X_base[c] for c in X_base.columns if 't-' in c]).unique() 27 | 28 | order_tbl = pd.read_pickle('../input/mk/order_tbl.p') 29 | 30 | col = ['order_id', 'order_number', 'order_dow', 'order_hour_of_day', 31 | 'days_since_prior_order', 'days_since_first_order'] 32 | X = pd.merge(X_base, order_tbl[col], on='order_id', how='left') 33 | 34 | col_feature = [] 35 | 36 | #============================================================================== 37 | # repeat_previous_ratio 38 | #============================================================================== 39 | order_tbl['t-2_product_name'] = order_tbl.groupby('user_id')['product_name'].shift(2) 40 | order_tbl['t-3_product_name'] = order_tbl.groupby('user_id')['product_name'].shift(3) 41 | order_tbl['t-4_product_name'] = order_tbl.groupby('user_id')['product_name'].shift(4) 42 | order_tbl['t-5_product_name'] = order_tbl.groupby('user_id')['product_name'].shift(5) 43 | 44 | order_tbl = order_tbl[order_tbl.order_id.isin(all_order)] 45 | 46 | # fill list 47 | col = ['product_name'] + [c for c in order_tbl.columns if 't-' in c] 48 | 49 | def fill_list(s): 50 | if isinstance(s, float): 51 | return [] 52 | return s 53 | 54 | for c in col: 55 | order_tbl[c] = order_tbl[c].map(fill_list) 56 | 57 | def ratio(list1, list2): 58 | """ 59 | list1: previous 60 | list2: current 61 | 62 | return: intersection(previous & current) / current 63 | """ 64 | if len(list1)==0 or len(list2)==0: 65 | return 66 | ret = sum([1 for i in list2 if i in list1]) / len(list2) 67 | 68 | return ret 69 | 70 | # w means window size 71 | order_tbl['repeat_previous_ratio-w1'] = order_tbl.apply(\ 72 | lambda x: ratio(x['t-1_product_name'], x['product_name']), axis=1) 73 | 74 | order_tbl['repeat_previous_ratio-w2'] = order_tbl.apply(\ 75 | lambda x: ratio(x['t-1_product_name']+x['t-2_product_name'], 76 | x['product_name']), axis=1) 77 | 78 | order_tbl['repeat_previous_ratio-w3'] = order_tbl.apply(\ 79 | lambda x: ratio(x['t-1_product_name']+x['t-2_product_name']+x['t-3_product_name'], 80 | x['product_name']), axis=1) 81 | 82 | order_tbl['repeat_previous_ratio-w4'] = order_tbl.apply(\ 83 | lambda x: ratio(x['t-1_product_name']+x['t-2_product_name']+x['t-3_product_name']+\ 84 | x['t-4_product_name'], x['product_name']), axis=1) 85 | 86 | order_tbl['repeat_previous_ratio-w5'] = order_tbl.apply(\ 87 | lambda x: ratio(x['t-1_product_name']+x['t-2_product_name']+x['t-3_product_name']+\ 88 | x['t-4_product_name']+x['t-5_product_name'], x['product_name']), axis=1) 89 | 90 | col_feature += ['repeat_previous_ratio-w1','repeat_previous_ratio-w2', 91 | 'repeat_previous_ratio-w3','repeat_previous_ratio-w4', 92 | 'repeat_previous_ratio-w5'] 93 | 94 | #============================================================================== 95 | # reordered_ratio 96 | #============================================================================== 97 | log = utils.read_pickles('../input/mk/log') 98 | reordered_ratio = log.groupby(['order_id']).reordered.mean().reset_index() 99 | reordered_ratio.columns = ['order_id', 'reordered_ratio'] 100 | 
order_tbl = pd.merge(order_tbl, reordered_ratio, on='order_id', how='left') 101 | 102 | log['unreordered'] = 1-log.reordered 103 | unreordered_ratio = log.groupby(['order_id']).unreordered.mean().reset_index() 104 | unreordered_ratio.columns = ['order_id', 'unreordered_ratio'] 105 | 106 | order_tbl = pd.merge(order_tbl, unreordered_ratio, on='order_id', how='left') 107 | 108 | 109 | del reordered_ratio, unreordered_ratio; gc.collect() 110 | 111 | col_feature += ['reordered_ratio'] 112 | 113 | #============================================================================== 114 | # total_unique_item 115 | #============================================================================== 116 | 117 | order_unique_item = log.groupby('order_id').unreordered.sum().reset_index() 118 | order_unique_item.columns = ['order_id', 'unreordered_sum'] 119 | 120 | order_tbl = pd.merge(order_tbl, order_unique_item, on='order_id', how='left') 121 | 122 | order_tbl['total_unique_item'] = order_tbl.groupby('user_id').unreordered_sum.cumsum() 123 | order_tbl['total_unique_item_ratio'] = order_tbl['total_unique_item']/order_tbl['order_number'] 124 | 125 | del order_unique_item; gc.collect() 126 | 127 | col_feature += ['unreordered_sum','total_unique_item', 'total_unique_item_ratio'] 128 | 129 | #============================================================================== 130 | # ordered item 131 | #============================================================================== 132 | 133 | ordered_item = log.groupby('order_id').size().reset_index() 134 | ordered_item.columns = ['order_id', 'ordered_item'] 135 | 136 | order_tbl = pd.merge(order_tbl, ordered_item, on='order_id', how='left') 137 | 138 | order_tbl['total_ordered_item'] = order_tbl.groupby('user_id').ordered_item.cumsum() 139 | order_tbl['total_ordered_item_ratio'] = order_tbl['total_ordered_item']/order_tbl['order_number'] 140 | 141 | del ordered_item; gc.collect() 142 | 143 | col_feature += ['ordered_item','total_ordered_item', 'total_ordered_item_ratio'] 144 | 145 | 146 | 147 | #============================================================================== 148 | # merge & split 149 | #============================================================================== 150 | col = ['order_id', 'order_dow', 'order_hour_of_day', 151 | 'days_since_prior_order', 'days_since_first_order'] 152 | for i in range(1, 1+T): 153 | X = pd.merge(X, order_tbl[col+col_feature].add_prefix('t-{}_'.format(i)), 154 | on='t-{}_order_id'.format(i), how='left') 155 | 156 | 157 | train = X[X.is_train==1].drop(['user_id','is_train'], axis=1).reset_index(drop=1) 158 | test = X[X.is_train==0].drop(['user_id','is_train'], axis=1).reset_index(drop=1) 159 | 160 | #============================================================================== 161 | # write 162 | #============================================================================== 163 | col = [c for c in train.columns if not ('t-' in c and '_id' in c)] 164 | train[col].to_pickle('../feature/trainT-0/f101_order.p') 165 | test[col].to_pickle('../feature/test/f101_order.p') 166 | 167 | 168 | #============================================================================== 169 | utils.end(__file__) 170 | 171 | -------------------------------------------------------------------------------- /py_feature/102_orderspan_average.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue Jun 13 15:58:46 2017 5 | 6 | @author: konodera 7 
| 8 | order span 9 | 10 | """ 11 | 12 | import pandas as pd 13 | import numpy as np 14 | from tqdm import tqdm 15 | import utils 16 | utils.start(__file__) 17 | 18 | 19 | X_base = pd.read_pickle('../feature/X_base_t3.p') 20 | col = ['order_id', 'user_id', 'days_since_prior_order', 'eval_set', 'order_number_rev'] 21 | order_tbl = pd.read_pickle('../input/mk/order_tbl.p')[col] 22 | 23 | #============================================================================== 24 | # train 25 | #============================================================================== 26 | def make(T): 27 | order_tbl_tr = order_tbl[order_tbl.order_number_rev>T] 28 | 29 | user = order_tbl_tr.groupby('user_id')['days_since_prior_order'].mean().reset_index() 30 | user.columns = ['user_id', 'days_order_mean'] 31 | 32 | user.to_pickle('../feature/trainT-{}/f102_user.p'.format(T)) 33 | 34 | make(0) 35 | make(1) 36 | make(2) 37 | 38 | 39 | 40 | #============================================================================== 41 | # test 42 | #============================================================================== 43 | order_tbl_te = order_tbl[order_tbl.eval_set != 'test'] 44 | 45 | user = order_tbl_te.groupby('user_id')['days_since_prior_order'].mean().reset_index() 46 | user.columns = ['user_id', 'days_order_mean'] 47 | 48 | user.to_pickle('../feature/test/f102_user.p') 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | #============================================================================== 61 | utils.end(__file__) 62 | 63 | -------------------------------------------------------------------------------- /py_feature/103_visit_time.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sat Jun 17 08:45:57 2017 5 | 6 | @author: konodera 7 | 8 | visit time ratio 9 | 10 | """ 11 | 12 | import pandas as pd 13 | import numpy as np 14 | from tqdm import tqdm 15 | import utils 16 | utils.start(__file__) 17 | 18 | #============================================================================== 19 | # load 20 | #============================================================================== 21 | X_base = pd.read_pickle('../feature/X_base_t3.p') 22 | 23 | col = ['order_id', 'user_id', 'product_id', 'order_dow', 'order_hour_of_day', 'order_number_rev'] 24 | log = utils.read_pickles('../input/mk/log', col) 25 | log = pd.merge(log, pd.read_pickle('../input/mk/timezone.p'), 26 | on='order_hour_of_day', how='left') 27 | log['dow_tz'] = log.order_dow.map(str) + '_' + log.timezone 28 | 29 | #============================================================================== 30 | # train 31 | #============================================================================== 32 | def make(T): 33 | log_tr = log[log.order_number_rev>T] 34 | 35 | # dow 36 | dow = pd.crosstab(log_tr.user_id, log_tr.order_dow).add_prefix('user_dow_freq_') 37 | dow_ = pd.crosstab(log_tr.user_id, log_tr.order_dow, normalize='index').add_prefix('user_dow_norm_') 38 | 39 | # timezone 40 | timezone = pd.crosstab(log_tr.user_id, log_tr.timezone).add_prefix('user_timezone_freq_') 41 | timezone_ = pd.crosstab(log_tr.user_id, log_tr.timezone, normalize='index').add_prefix('user_timezone_norm_') 42 | 43 | # dow * timezone 44 | dow_tz = pd.crosstab(log_tr.user_id, log_tr.dow_tz).add_prefix('user_dow-tz_freq_') 45 | dow_tz_ = pd.crosstab(log_tr.user_id, log_tr.dow_tz, normalize='index').add_prefix('user_dow-tz_norm_') 46 | 47 | tab = pd.concat([dow, dow_, 
timezone, timezone_, dow_tz, dow_tz_], axis=1) 48 | 49 | tab.reset_index().to_pickle('../feature/trainT-{}/f103_user.p'.format(T)) 50 | 51 | make(0) 52 | make(1) 53 | make(2) 54 | 55 | #============================================================================== 56 | # test 57 | #============================================================================== 58 | 59 | # dow 60 | dow = pd.crosstab(log.user_id, log.order_dow).add_prefix('user_dow_freq_') 61 | dow_ = pd.crosstab(log.user_id, log.order_dow, normalize='index').add_prefix('user_dow_norm_') 62 | 63 | # timezone 64 | timezone = pd.crosstab(log.user_id, log.timezone).add_prefix('user_timezone_freq_') 65 | timezone_ = pd.crosstab(log.user_id, log.timezone, normalize='index').add_prefix('user_timezone_norm_') 66 | 67 | # dow * timezone 68 | dow_tz = pd.crosstab(log.user_id, log.dow_tz).add_prefix('user_dow-tz_freq_') 69 | dow_tz_ = pd.crosstab(log.user_id, log.dow_tz, normalize='index').add_prefix('user_dow-tz_norm_') 70 | 71 | tab = pd.concat([dow, dow_, timezone, timezone_, dow_tz, dow_tz_], axis=1) 72 | 73 | tab.reset_index().to_pickle('../feature/test/f103_user.p') 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | utils.end(__file__) 83 | 84 | -------------------------------------------------------------------------------- /py_feature/104_organic.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed Jun 21 07:51:30 2017 5 | 6 | @author: konodera 7 | """ 8 | 9 | import pandas as pd 10 | import numpy as np 11 | from tqdm import tqdm 12 | import utils 13 | utils.start(__file__) 14 | 15 | 16 | #============================================================================== 17 | # load 18 | #============================================================================== 19 | X_base = pd.read_pickle('../feature/X_base_t3.p') 20 | 21 | 22 | col = ['order_id', 'user_id', 'product_id', 'order_dow', 'order_hour_of_day', 'order_number_rev'] 23 | log = utils.read_pickles('../input/mk/log', col) 24 | log = pd.merge(log, pd.read_pickle('../input/mk/timezone.p'), 25 | on='order_hour_of_day', how='left') 26 | log['dow_tz'] = log.order_dow.map(str) + '_' + log.timezone 27 | 28 | log = pd.merge(log, pd.read_pickle('../input/mk/products_feature.p'), 29 | on='product_id', how='left') 30 | 31 | #============================================================================== 32 | # train 33 | #============================================================================== 34 | def make(T): 35 | log_tr = log[log.order_number_rev>T] 36 | 37 | user = log_tr.groupby(['user_id']).size().to_frame() 38 | user.columns = ['total'] 39 | user['organic_cnt'] = log_tr.groupby(['user_id'])['item_is_Organic'].sum() 40 | user['glutenfree_cnt'] = log_tr.groupby(['user_id'])['item_is_Gluten-Free'].sum() 41 | user['Asian_cnt'] = log_tr.groupby(['user_id'])['item_is_Asian'].sum() 42 | 43 | user['organic_ratio'] = user['organic_cnt'] / user.total 44 | user['glutenfree_ratio'] = user['glutenfree_cnt'] / user.total 45 | user['Asian_ratio'] = user['Asian_cnt'] / user.total 46 | 47 | user.drop('total', axis=1, inplace=True) 48 | user.reset_index().to_pickle('../feature/trainT-{}/f104_user.p'.format(T)) 49 | 50 | make(0) 51 | make(1) 52 | make(2) 53 | 54 | #============================================================================== 55 | # test 56 | #============================================================================== 57 | 58 | user = 
log.groupby(['user_id']).size().to_frame() 59 | user.columns = ['total'] 60 | user['organic_cnt'] = log.groupby(['user_id'])['item_is_Organic'].sum() 61 | user['glutenfree_cnt'] = log.groupby(['user_id'])['item_is_Gluten-Free'].sum() 62 | user['Asian_cnt'] = log.groupby(['user_id'])['item_is_Asian'].sum() 63 | 64 | user['organic_ratio'] = user['organic_cnt'] / user.total 65 | user['glutenfree_ratio'] = user['glutenfree_cnt'] / user.total 66 | user['Asian_ratio'] = user['Asian_cnt'] / user.total 67 | 68 | user.drop('total', axis=1, inplace=True) 69 | user.reset_index().to_pickle('../feature/test/f104_user.p') 70 | 71 | 72 | #============================================================================== 73 | utils.end(__file__) 74 | 75 | -------------------------------------------------------------------------------- /py_feature/105_delta_time.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sun Jun 25 09:41:08 2017 5 | 6 | @author: konodera 7 | 8 | delta order time 9 | 10 | """ 11 | 12 | import pandas as pd 13 | import numpy as np 14 | from tqdm import tqdm 15 | import utils 16 | utils.start(__file__) 17 | 18 | #============================================================================== 19 | # load 20 | #============================================================================== 21 | 22 | col = ['order_id', 'user_id','order_number', 'order_dow', 'order_hour_of_day', 23 | 'days_since_prior_order', 'eval_set'] 24 | order_tbl = pd.read_pickle('../input/mk/order_tbl.p')[col] 25 | order_tbl.sort_values(['user_id', 'order_number'], inplace=True) 26 | #order_tbl = order_tbl[order_tbl.eval_set!='test'] 27 | 28 | 29 | #============================================================================== 30 | # main 31 | #============================================================================== 32 | order_tbl['t-1_order_id'] = order_tbl.groupby('user_id')['order_id'].shift(1) 33 | order_tbl['t-2_order_id'] = order_tbl.groupby('user_id')['order_id'].shift(2) 34 | order_tbl['t-3_order_id'] = order_tbl.groupby('user_id')['order_id'].shift(3) 35 | 36 | col = ['order_id', 'order_dow', 'order_hour_of_day'] 37 | order_tbl = pd.merge(order_tbl, order_tbl[col].add_prefix('t-1_'), on='t-1_order_id', how='left') 38 | order_tbl = pd.merge(order_tbl, order_tbl[col].add_prefix('t-2_'), on='t-2_order_id', how='left') 39 | order_tbl = pd.merge(order_tbl, order_tbl[col].add_prefix('t-3_'), on='t-3_order_id', how='left') 40 | 41 | order_tbl['delta_hour_t-1'] = order_tbl['order_hour_of_day'] - order_tbl['t-1_order_hour_of_day'] 42 | order_tbl['delta_hour_t-2'] = order_tbl['order_hour_of_day'] - order_tbl['t-2_order_hour_of_day'] 43 | order_tbl['delta_hour_t-3'] = order_tbl['order_hour_of_day'] - order_tbl['t-3_order_hour_of_day'] 44 | 45 | 46 | col = ['order_id', 'delta_hour_t-1', 'delta_hour_t-2', 47 | 'delta_hour_t-3'] 48 | order_tbl[col].to_pickle('../feature/trainT-0/f105_order.p') 49 | order_tbl[col].to_pickle('../feature/test/f105_order.p') 50 | 51 | 52 | 53 | utils.end(__file__) 54 | 55 | -------------------------------------------------------------------------------- /py_feature/108_order_size.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sun Jul 16 21:04:09 2017 5 | 6 | @author: konodera 7 | 8 | 9 | """ 10 | 11 | import pandas as pd 12 | import numpy as np 13 | from tqdm import 
tqdm 14 | import utils 15 | utils.start(__file__) 16 | 17 | #============================================================================== 18 | # load 19 | #============================================================================== 20 | 21 | col = ['order_id', 'user_id', 'product_id', 'order_number', 'order_number_rev'] 22 | log = utils.read_pickles('../input/mk/log', col).sort_values('user_id') 23 | 24 | #============================================================================== 25 | # def 26 | #============================================================================== 27 | def make(T): 28 | """ 29 | T = 0 30 | folder = 'trainT-0' 31 | """ 32 | 33 | if T==-1: 34 | folder = 'test' 35 | else: 36 | folder = 'trainT-'+str(T) 37 | 38 | log_ = log[log.order_number_rev>T] 39 | 40 | order_tbl = log_.groupby('order_id').size().to_frame() 41 | order_tbl.columns = ['order_size'] 42 | order_tbl.reset_index(inplace=True) 43 | 44 | order_tbl = pd.merge(order_tbl, log_[['order_id', 'user_id']].drop_duplicates()) 45 | 46 | user_osz = order_tbl.groupby(['user_id']).order_size.min().to_frame() 47 | user_osz.columns = ['user_order_size-min'] 48 | user_osz['user_order_size-max'] = order_tbl.groupby(['user_id']).order_size.max() 49 | user_osz['user_order_size-median'] = order_tbl.groupby(['user_id']).order_size.median() 50 | user_osz['user_order_size-mean'] = order_tbl.groupby(['user_id']).order_size.mean() 51 | user_osz['user_order_size-std'] = order_tbl.groupby(['user_id']).order_size.std() 52 | user_osz.reset_index(inplace=True) 53 | 54 | user_osz.to_pickle('../feature/{}/f108_user.p'.format(folder)) 55 | 56 | #============================================================================== 57 | # main 58 | #============================================================================== 59 | make(0) 60 | make(1) 61 | make(2) 62 | 63 | make(-1) 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | utils.end(__file__) 81 | 82 | -------------------------------------------------------------------------------- /py_feature/109_have_you_bought.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Fri Jul 21 00:54:02 2017 5 | 6 | @author: konodera 7 | 8 | pid freq 9 | ------------- 10 | 24852 57186 11 | 13176 47063 12 | 21137 39871 13 | 21903 38095 14 | 47209 30047 15 | 47626 28741 16 | 47766 28478 17 | 26209 26199 18 | 16797 25621 19 | 24964 21090 20 | 22935 20824 21 | 27966 20193 22 | 39275 20134 23 | 45007 19652 24 | 49683 17508 25 | 4605 16176 26 | 27845 16134 27 | 40706 16054 28 | 5876 15765 29 | 4920 15150 30 | 28204 14802 31 | 42265 14766 32 | 30391 14089 33 | 31717 13949 34 | 8277 13900 35 | 8518 13770 36 | 27104 13719 37 | 17794 13642 38 | 46979 13491 39 | 45066 13289 40 | 41 | """ 42 | 43 | import pandas as pd 44 | import numpy as np 45 | from tqdm import tqdm 46 | import utils 47 | utils.start(__file__) 48 | 49 | #============================================================================== 50 | # load 51 | #============================================================================== 52 | 53 | col = [ 'order_id', 'user_id', 'product_id', 'order_number', 'reordered', 'order_number_rev'] 54 | log = utils.read_pickles('../input/mk/log', col) 55 | 56 | 57 | #============================================================================== 58 | # def 59 | #============================================================================== 60 | def make(T): 61 | """ 
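#==============================================================================
# aside: the two-stage aggregation of 108_order_size.py above on a toy log --
# order -> basket size, then user -> distribution stats of those sizes
# (toy frame, illustrative only)
#==============================================================================
import pandas as pd
toy = pd.DataFrame({'user_id': [1, 1, 1, 2, 2],
                    'order_id': [10, 10, 11, 20, 20]})
order_size = toy.groupby(['user_id', 'order_id']).size().rename('order_size')
user_stats = order_size.groupby('user_id').agg(['min', 'max', 'median', 'mean', 'std'])
# user 1: sizes [2, 1] -> min 1, max 2, mean 1.5; user 2: a single order of 2
#==============================================================================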
62 | T = 0 63 | folder = 'trainT-0' 64 | """ 65 | 66 | if T==-1: 67 | folder = 'test' 68 | else: 69 | folder = 'trainT-'+str(T) 70 | 71 | log_ = log[log.order_number_rev>T] 72 | 73 | user = log_.drop_duplicates('user_id')[['user_id']].reset_index(drop=True) 74 | 75 | # have you bought -> hyb 76 | tag_user = log_[log_.product_id==24852].user_id 77 | user['hyb_Banana'] = 0 78 | user.loc[user.user_id.isin(tag_user), 'hyb_Banana'] = 1 79 | 80 | tag_user = log_[log_.product_id==13176].user_id 81 | user['hyb_BoO-Bananas'] = 0 82 | user.loc[user.user_id.isin(tag_user), 'hyb_BoO-Bananas'] = 1 83 | 84 | tag_user = log_[log_.product_id==21137].user_id 85 | user['hyb_Organic-Strawberries'] = 0 86 | user.loc[user.user_id.isin(tag_user), 'hyb_Organic-Strawberries'] = 1 87 | 88 | tag_user = log_[log_.product_id==21903].user_id 89 | user['hyb_Organic-Baby-Spinach'] = 0 90 | user.loc[user.user_id.isin(tag_user), 'hyb_Organic-Baby-Spinach'] = 1 91 | 92 | tag_user = log_[log_.product_id==47209].user_id 93 | user['hyb_Organic-Hass-Avocado'] = 0 94 | user.loc[user.user_id.isin(tag_user), 'hyb_Organic-Hass-Avocado'] = 1 95 | 96 | user.to_pickle('../feature/{}/f109_user.p'.format(folder)) 97 | 98 | #============================================================================== 99 | # main 100 | #============================================================================== 101 | make(0) 102 | make(1) 103 | make(2) 104 | 105 | make(-1) 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | utils.end(__file__) 121 | 122 | -------------------------------------------------------------------------------- /py_feature/110_None.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon Jul 31 23:59:01 2017 5 | 6 | @author: konodera 7 | 8 | """ 9 | 10 | import pandas as pd 11 | import numpy as np 12 | from tqdm import tqdm 13 | import multiprocessing as mp 14 | import utils 15 | utils.start(__file__) 16 | 17 | LOOP = 20 18 | #============================================================================== 19 | # load 20 | #============================================================================== 21 | order_tbl = pd.read_pickle('../input/mk/order_tbl.p')[['order_id', 'user_id', 'order_number']].sort_values(['user_id', 'order_number', 'order_id']) 22 | for i in range(1, LOOP): 23 | order_tbl['t-{}_order_id'.format(i)] = order_tbl.groupby('user_id')['order_id'].shift(i) 24 | 25 | col = [c for c in order_tbl.columns if 'order_id' in c] 26 | order_tbl = order_tbl[col] 27 | 28 | order_None = pd.read_pickle('../input/mk/order_None.p') 29 | 30 | 31 | #============================================================================== 32 | # main 33 | #============================================================================== 34 | df = order_tbl.copy() 35 | 36 | for i in tqdm(range(1, LOOP)): 37 | df = pd.merge(df, order_None.add_prefix('t-{}_'.format(i)), 38 | on='t-{}_order_id'.format(i), how='left') 39 | 40 | col = [c for c in df.columns if c.endswith('_order_id')] 41 | df.drop(col, axis=1, inplace=True) 42 | 43 | df.fillna(-1, inplace=True) 44 | 45 | df.to_pickle('../feature/trainT-0/f110_order.p') 46 | df.to_pickle('../feature/test/f110_order.p') 47 | 48 | 49 | #============================================================================== 50 | 51 | utils.end(__file__) 52 | 53 | -------------------------------------------------------------------------------- 
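#==============================================================================
# aside: the shift-and-merge lag pattern used in 110_None.py above (and in
# several other scripts): per-user t-N order ids become merge keys for any
# per-order feature table. Toy frames, illustrative only.
#==============================================================================
import pandas as pd
orders = pd.DataFrame({'user_id': [1, 1, 1], 'order_id': [10, 11, 12]})
feat = pd.DataFrame({'order_id': [10, 11, 12], 'is_None': [0, 1, 0]})
for i in (1, 2):
    orders['t-{}_order_id'.format(i)] = orders.groupby('user_id')['order_id'].shift(i)
    orders = pd.merge(orders, feat.add_prefix('t-{}_'.format(i)),
                      on='t-{}_order_id'.format(i), how='left')
# order 12 now carries t-1_is_None (from order 11) and t-2_is_None (order 10)
#==============================================================================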
/py_feature/200_======item_feature======: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KazukiOnodera/Instacart/416b6b0220d3aed62c8d323caa3ee46f4b614a72/py_feature/200_======item_feature====== -------------------------------------------------------------------------------- /py_feature/202_buy_time.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon May 29 19:58:46 2017 5 | 6 | @author: konodera 7 | 8 | Time of day when the item is bought 9 | 10 | """ 11 | 12 | import pandas as pd 13 | import numpy as np 14 | import gc 15 | from tqdm import tqdm 16 | import utils 17 | utils.start(__file__) 18 | 19 | col = ['order_id', 'user_id', 'product_id', 'order_dow', 'order_hour_of_day', 'order_number_rev'] 20 | log = utils.read_pickles('../input/mk/log', col) 21 | log = pd.merge(log, pd.read_pickle('../input/mk/timezone.p'), on='order_hour_of_day', how='left') 22 | log['dow_tz'] = log.order_dow.map(str) + '_' + log.timezone 23 | 24 | 25 | # TODO: rolling mean 26 | def make(log, folder): 27 | #============================================================================== 28 | # hour 29 | #============================================================================== 30 | gc.collect() 31 | tbl = log.groupby(['product_id', 'order_hour_of_day']).size().reset_index() 32 | tbl.columns = ['product_id', 'order_hour_of_day', 'item_hour_cnt'] 33 | 34 | tbl['item_hour_ratio'] = tbl.item_hour_cnt / tbl.groupby('product_id').transform(np.sum).item_hour_cnt 35 | 36 | tbl.to_pickle('../feature/{}/f202_product_hour.p'.format(folder)) 37 | 38 | # unique 39 | tbl = log.drop_duplicates(['user_id', 'product_id', 'order_hour_of_day']).groupby(['product_id', 'order_hour_of_day']).size().reset_index() 40 | tbl.columns = ['product_id', 'order_hour_of_day', 'item_hour_cnt_unq'] 41 | 42 | tbl['item_hour_ratio_unq'] = tbl.item_hour_cnt_unq / tbl.groupby('product_id').transform(np.sum).item_hour_cnt_unq 43 | 44 | tbl.to_pickle('../feature/{}/f202_uniq_product_hour.p'.format(folder)) 45 | 46 | 47 | #============================================================================== 48 | # dow 49 | #============================================================================== 50 | gc.collect() 51 | tbl = log.groupby(['product_id', 'order_dow']).size().reset_index() 52 | tbl.columns = ['product_id', 'order_dow', 'item_dow_cnt'] 53 | 54 | tbl['item_dow_ratio'] = tbl.item_dow_cnt / tbl.groupby('product_id').transform(np.sum).item_dow_cnt 55 | 56 | tbl.to_pickle('../feature/{}/f202_product_dow.p'.format(folder)) 57 | 58 | # unique 59 | tbl = log.drop_duplicates(['user_id', 'product_id', 'order_dow']).groupby(['product_id', 'order_dow']).size().reset_index() 60 | tbl.columns = ['product_id', 'order_dow', 'item_dow_cnt_unq'] 61 | 62 | tbl['item_dow_ratio_unq'] = tbl.item_dow_cnt_unq / tbl.groupby('product_id').transform(np.sum).item_dow_cnt_unq 63 | 64 | tbl.to_pickle('../feature/{}/f202_uniq_product_dow.p'.format(folder)) 65 | 66 | 67 | #============================================================================== 68 | # timezone 69 | #============================================================================== 70 | gc.collect() 71 | tbl = log.groupby(['product_id', 'timezone']).size().reset_index() 72 | tbl.columns = ['product_id', 'timezone', 'item_timezone_cnt'] 73 | 74 | tbl['item_timezone_ratio'] = (tbl.item_timezone_cnt / 
tbl.groupby('product_id').transform(np.sum).item_timezone_cnt).map(float) 75 | 76 | tbl.to_pickle('../feature/{}/f202_product_timezone.p'.format(folder)) 77 | 78 | # unique 79 | tbl = log.drop_duplicates(['user_id', 'product_id', 'timezone']).groupby(['product_id', 'timezone']).size().reset_index() 80 | tbl.columns = ['product_id', 'timezone', 'item_timezone_cnt_uniq'] 81 | 82 | tbl['item_timezone_ratio_uniq'] = (tbl.item_timezone_cnt_uniq / tbl.groupby('product_id').transform(np.sum).item_timezone_cnt_uniq).map(float) 83 | 84 | tbl.to_pickle('../feature/{}/f202_uniq_product_timezone.p'.format(folder)) 85 | 86 | #============================================================================== 87 | # timezone * dow 88 | #============================================================================== 89 | gc.collect() 90 | 91 | tbl = log.groupby(['product_id', 'order_dow', 'timezone']).size().reset_index() 92 | tbl.columns = ['product_id', 'order_dow', 'timezone', 'item_dow-tz_cnt'] 93 | 94 | tbl['item_dow-tz_ratio'] = (tbl['item_dow-tz_cnt'] / tbl.groupby('product_id').transform(np.sum)['item_dow-tz_cnt']).map(float) 95 | 96 | tbl.to_pickle('../feature/{}/f202_product_dow-timezone.p'.format(folder)) 97 | 98 | # unique 99 | tbl = log.drop_duplicates(['user_id', 'product_id', 'order_dow', 'timezone']).groupby(['product_id', 'order_dow', 'timezone']).size().reset_index() 100 | tbl.columns = ['product_id', 'order_dow', 'timezone', 'item_dow-tz_cnt_uniq'] 101 | 102 | tbl['item_dow-tz_ratio_uniq'] = (tbl['item_dow-tz_cnt_uniq'] / tbl.groupby('product_id').transform(np.sum)['item_dow-tz_cnt_uniq']).map(float) 103 | 104 | tbl.to_pickle('../feature/{}/f202_uniq_product_dow-timezone.p'.format(folder)) 105 | 106 | 107 | #============================================================================== 108 | # flat 109 | #============================================================================== 110 | gc.collect() 111 | tbl = pd.crosstab(log.product_id, log.dow_tz, normalize='index').add_prefix('item_flat_dow-tz_') 112 | 113 | tbl.reset_index().to_pickle('../feature/{}/f202_flat_product.p'.format(folder)) 114 | #============================================================================== 115 | # main 116 | #============================================================================== 117 | make(log[log.order_number_rev>0], 'trainT-0') 118 | make(log[log.order_number_rev>1], 'trainT-1') 119 | make(log[log.order_number_rev>2], 'trainT-2') 120 | 121 | make(log, 'test') 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | #============================================================================== 135 | utils.end(__file__) 136 | 137 | -------------------------------------------------------------------------------- /py_feature/203_cycle.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sat Jun 3 06:46:06 2017 5 | 6 | @author: konodera 7 | 8 | Item buy cycle 9 | 10 | """ 11 | 12 | import pandas as pd 13 | import numpy as np 14 | from tqdm import tqdm 15 | #import gc 16 | import utils 17 | utils.start(__file__) 18 | 19 | usecols = [ 'order_id', 'user_id', 'product_id', 'order_number', 'reordered', 'order_number_rev'] 20 | log = pd.merge(utils.read_pickles('../input/mk/log', usecols), 21 | utils.read_pickles('../input/mk/days_since_last_order'), 22 | on=['order_id','product_id'], how='left') 23 | 24 | def make(log, folder): 25 | 26 | tbl = 
log.groupby('product_id').days_since_last_order_this_item.mean().to_frame() 27 | tbl.columns = ['item_order_days_mean'] 28 | tbl['item_order_days_min'] = log.groupby('product_id').days_since_last_order_this_item.min() 29 | tbl['item_order_days_max'] = log.groupby('product_id').days_since_last_order_this_item.max() 30 | tbl['item_order_days_median'] = log.groupby('product_id').days_since_last_order_this_item.median() 31 | 32 | tbl['item_order_freq'] = log.groupby('product_id').size() 33 | 34 | tbl['item_reorderd_freq'] = log.groupby('product_id').reordered.sum() 35 | tbl['item_reorder_ratio'] = (tbl.item_reorderd_freq / tbl.item_order_freq).astype(np.float32) 36 | 37 | tbl['item_unique_user'] = log.drop_duplicates(['user_id', 'product_id']).groupby('product_id').size() 38 | tbl['item_order_per-user'] = tbl['item_order_freq'] / tbl['item_unique_user'] 39 | 40 | tbl.reset_index(inplace=True) 41 | 42 | 43 | tbl.to_pickle('../feature/{}/f203_product.p'.format(folder)) 44 | #============================================================================== 45 | # main 46 | #============================================================================== 47 | make(log[log.order_number_rev>0], 'trainT-0') 48 | make(log[log.order_number_rev>1], 'trainT-1') 49 | make(log[log.order_number_rev>2], 'trainT-2') 50 | 51 | make(log, 'test') 52 | 53 | 54 | 55 | #============================================================================== 56 | utils.end(__file__) 57 | 58 | -------------------------------------------------------------------------------- /py_feature/205_co-occur.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed Jun 7 22:00:22 2017 5 | 6 | @author: konodera 7 | 8 | nohup python -u 205_co-occur.py & 9 | 10 | 11 | === co-occurrence across order_number === 12 | What bought at t-1 (other than the item itself) leads to a reorder at t-0?
13 | exp: 14 | 30% of the users who bought banana at t-1 buy strawberry at t-0 15 | 16 | takes 3 hours 17 | """ 18 | 19 | import pandas as pd 20 | import numpy as np 21 | #from tqdm import tqdm 22 | from collections import Counter 23 | from itertools import product 24 | from operator import itemgetter 25 | import gc 26 | import multiprocessing as mp 27 | import utils 28 | utils.start(__file__) 29 | 30 | #============================================================================== 31 | # load 32 | #============================================================================== 33 | col = ['order_id', 'user_id', 'product_name', 't-1_product_name', 'order_number', 'order_number_rev'] 34 | order_tbl = pd.read_pickle('../input/mk/order_tbl.p')[col] 35 | order_tbl.sort_values(['user_id','order_number'], inplace=True) 36 | order_tbl['t-1_order_id'] = order_tbl.groupby('user_id')['order_id'].shift(1) 37 | order_tbl.reset_index(drop=True, inplace=True) 38 | 39 | prods = pd.read_pickle('../input/mk/goods.p')[['product_id','product_name']] 40 | 41 | log = utils.read_pickles('../input/mk/log', ['order_id', 'product_id', 'order_number_rev']) 42 | order_item_array = log.groupby('order_id').product_id.apply(np.array).reset_index() 43 | del log; gc.collect() 44 | #============================================================================== 45 | # def 46 | #============================================================================== 47 | 48 | def make(T): 49 | """ 50 | T = 0 51 | folder = 'trainT-0' 52 | """ 53 | if T==-1: 54 | folder = 'test' 55 | else: 56 | folder = 'trainT-'+str(T) 57 | print("start T:{} folder:{}".format(T, folder)) 58 | order_tbl_ = order_tbl[order_tbl.order_number_rev>T].dropna() # drop first order 59 | 60 | item2item = [] 61 | item_bunbo = Counter() 62 | for item_prior, item_now in order_tbl_[['t-1_product_name', 'product_name']].values: 63 | item2item += [i1+' -> '+i2 for i1, i2 in list(product(item_prior, item_now))] 64 | item_bunbo += Counter(item_prior) 65 | item2item = Counter(item2item) 66 | 67 | df = pd.DataFrame.from_dict(item2item, orient='index').reset_index() 68 | df.columns = ['item', 'cnt'] 69 | del item2item; gc.collect() 70 | 71 | df_ = pd.DataFrame.from_dict(item_bunbo, orient='index').reset_index() 72 | df_.columns = ['before', 'total_cnt'] 73 | del item_bunbo; gc.collect() 74 | 75 | df.sort_values('cnt', ascending=False, inplace=True) 76 | 77 | df['before'] = df.item.map(lambda x: x.split(' -> ')[0]) 78 | df['after'] = df.item.map(lambda x: x.split(' -> ')[1]) 79 | df = df[df.before!=df.after] 80 | 81 | df = pd.merge(df, df_, on='before', how='left') 82 | 83 | df['before_to_after_ratio'] = df.cnt / df.total_cnt 84 | df = df[['before', 'after', 'before_to_after_ratio']] 85 | gc.collect() 86 | 87 | df = pd.merge(df, prods.rename(columns={'product_name':'before', 'product_id':'before_id'}), 88 | on='before', how='left') 89 | df = pd.merge(df, prods.rename(columns={'product_name':'after', 'product_id':'after_id'}), 90 | on='after', how='left') 91 | 92 | df = df[['before_id', 'after_id', 'before_to_after_ratio']] 93 | gc.collect() 94 | """ 95 | df.head() 96 | before_id after_id before_to_after_ratio 97 | 0 47209 13176 0.288618 98 | 1 13176 47209 0.175736 99 | 2 13176 21137 0.148974 100 | 3 21137 13176 0.188769 101 | """ 102 | #============================================================================== 103 | print('Merge', T) 104 | #============================================================================== 105 | label = pd.read_pickle('../feature/{}/label_reordered.p'.format(folder)) 106 
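#==============================================================================
# aside: the counting scheme above in miniature -- "i1 -> i2" pairs across
# consecutive baskets, normalized by how often i1 appeared at t-1 (toy data,
# illustrative only)
#==============================================================================
from collections import Counter
from itertools import product as iproduct
pairs, bunbo = Counter(), Counter()
baskets = [({'banana'}, {'banana', 'strawberry'}), ({'banana'}, {'milk'})]
for prior, now in baskets:
    pairs.update(i1 + ' -> ' + i2 for i1, i2 in iproduct(prior, now))
    bunbo.update(prior)
assert pairs['banana -> strawberry'] / bunbo['banana'] == 0.5
#==============================================================================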
| label = pd.merge(label, order_tbl[['order_id', 't-1_order_id']], 107 | on='order_id', how='left') 108 | print('items bought so far and items bought at t-1') 109 | order_b4after = pd.merge(label, order_item_array.add_prefix('t-1_'), 110 | on='t-1_order_id', how='left') 111 | gc.collect() 112 | 113 | col = ['order_id', 't-1_product_id', 'product_id'] 114 | order_b4after = order_b4after[col] 115 | gc.collect() 116 | """ 117 | order_b4after.head() 118 | Out[9]: 119 | order_id t-1_product_id product_id 120 | 0 1187899 [46149, 39657, 38928, 25133, 10258, 35951, 130... 196 121 | 1 1187899 [46149, 39657, 38928, 25133, 10258, 35951, 130... 10258 122 | 2 1187899 [46149, 39657, 38928, 25133, 10258, 35951, 130... 10326 123 | 3 1187899 [46149, 39657, 38928, 25133, 10258, 35951, 130... 12427 124 | 4 1187899 [46149, 39657, 38928, 25133, 10258, 35951, 130... 13032 125 | """ 126 | #============================================================================== 127 | print('search max ratio',T) 128 | #============================================================================== 129 | df['key'] = df.before_id.map(str) + 'to' + df.after_id.map(str) 130 | 131 | ratio_tbl = {} 132 | for k,v in df[['key','before_to_after_ratio']].values: 133 | ratio_tbl[k] = v 134 | 135 | del df; gc.collect() 136 | 137 | def get_ratio(key): 138 | try: 139 | return ratio_tbl[key] 140 | except KeyError: 141 | return -1 142 | 143 | def search_max_ratio(before_items, item): 144 | """ 145 | before_items = order_tr.loc[0,'t-1_product_id'] 146 | item = order_tr.loc[0,'product_id'] 147 | """ 148 | comb = list(product(before_items, [item])) 149 | comb = [str(x) + 'to' + str(y) for x,y in sorted(comb, key=itemgetter(1))] 150 | return np.max([get_ratio(k) for k in comb]) 151 | 152 | 153 | print('== before_to_after_ratio ==', T) 154 | ret = [] 155 | for before_items, item in order_b4after[['t-1_product_id', 'product_id']].values: 156 | ret.append(search_max_ratio(before_items, item)) 157 | order_b4after['before_to_after_ratio'] = ret 158 | 159 | col = ['order_id', 'product_id', 'before_to_after_ratio'] 160 | order_b4after[col].to_pickle('../feature/{}/f205_order_product.p'.format(folder)) 161 | 162 | #============================================================================== 163 | # main 164 | #============================================================================== 165 | 166 | mp_pool = mp.Pool(3) 167 | mp_pool.map(make, [-1, 0, 1, 2, #3,# 4, 5 168 | ]) 169 | 170 | 171 | 172 | 173 | #============================================================================== 174 | utils.end(__file__) 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | -------------------------------------------------------------------------------- /py_feature/207_mean_pos_cart.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Fri Jun 16 07:11:23 2017 5 | 6 | @author: konodera 7 | """ 8 | 9 | import pandas as pd 10 | import numpy as np 11 | from tqdm import tqdm 12 | import utils 13 | utils.start(__file__) 14 | 15 | 16 | #============================================================================== 17 | # load 18 | #============================================================================== 19 | 20 | col = ['order_id', 'product_id', 'add_to_cart_order', 'order_number_rev'] 21 | log = utils.read_pickles('../input/mk/log', col) 22 | 23 | #============================================================================== 24 | # def 25 | 
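#==============================================================================
# aside: search_max_ratio in 205 above, reduced to its core -- for one
# candidate item, take the best transition ratio from any t-1 item
# (hypothetical ratios, illustrative only)
#==============================================================================
ratio_tbl_toy = {'47209to13176': 0.29, '13176to47209': 0.18}
def max_ratio(before_items, item):
    keys = ['{}to{}'.format(b, item) for b in before_items]
    return max(ratio_tbl_toy.get(k, -1) for k in keys)
assert max_ratio([47209, 21137], 13176) == 0.29   # best over the t-1 basket
#==============================================================================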
#============================================================================== 26 | def make(T): 27 | """ 28 | T = 0 29 | folder = 'trainT-0' 30 | """ 31 | if T==-1: 32 | folder = 'test' 33 | else: 34 | folder = 'trainT-'+str(T) 35 | 36 | log_ = log[log.order_number_rev>T] 37 | 38 | gr = log_.groupby('product_id') 39 | 40 | items = gr.add_to_cart_order.mean().to_frame() 41 | items.columns = ['item_mean_pos_cart'] 42 | items['item_sum_pos_cart'] = gr.add_to_cart_order.sum() 43 | items['item_min_pos_cart'] = gr.add_to_cart_order.min() 44 | items['item_median_pos_cart'] = gr.add_to_cart_order.median() 45 | items['item_max_pos_cart'] = gr.add_to_cart_order.max() 46 | items['item_std_pos_cart'] = gr.add_to_cart_order.std() 47 | items.reset_index(inplace=True) 48 | 49 | items.to_pickle('../feature/{}/f207_product.p'.format(folder)) 50 | 51 | #============================================================================== 52 | # main 53 | #============================================================================== 54 | make(0) 55 | make(1) 56 | make(2) 57 | 58 | make(-1) 59 | 60 | #============================================================================== 61 | utils.end(__file__) 62 | 63 | -------------------------------------------------------------------------------- /py_feature/208_one-shot.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue Jul 4 03:58:09 2017 5 | 6 | @author: konodera 7 | 8 | Number of users who buy the item only once 9 | 10 | """ 11 | 12 | import pandas as pd 13 | import numpy as np 14 | from tqdm import tqdm 15 | import utils 16 | utils.start(__file__) 17 | 18 | #============================================================================== 19 | # load 20 | #============================================================================== 21 | 22 | col = ['order_id', 'user_id', 'product_id', 'order_dow', 'order_hour_of_day', 'order_number_rev'] 23 | log = utils.read_pickles('../input/mk/log', col).sort_values('user_id') 24 | log = pd.merge(log, pd.read_pickle('../input/mk/timezone.p'), 25 | on='order_hour_of_day', how='left') 26 | 27 | #============================================================================== 28 | # def 29 | #============================================================================== 30 | def make(T): 31 | """ 32 | T = 0 33 | folder = 'trainT-0' 34 | """ 35 | if T==-1: 36 | folder = 'test' 37 | else: 38 | folder = 'trainT-'+str(T) 39 | 40 | log_ = log[log.order_number_rev>T] 41 | 42 | item = log_.groupby(['product_id', 'user_id']).size().reset_index() 43 | item.columns = ['product_id', 'user_id', 'cnt'] 44 | 45 | item_one = item[item.cnt==1].groupby('product_id').size().reset_index() 46 | item_one.columns = ['product_id', 'item_only_one_user_cnt'] 47 | 48 | item_size = item.groupby('product_id').size().reset_index() 49 | item_size.columns = ['product_id', 'item_unique_user'] 50 | 51 | item = pd.merge(item_one, item_size, on='product_id', how='left') 52 | item['item_only_one_user_cnt_ratio'] = item['item_only_one_user_cnt']/item['item_unique_user'] 53 | 54 | col = ['product_id', 'item_only_one_user_cnt', 'item_only_one_user_cnt_ratio'] 55 | item[col].to_pickle('../feature/{}/f208_product.p'.format(folder)) 56 | 57 | #============================================================================== 58 | # main 59 | #============================================================================== 60 | make(0) 61 | make(1) 62 | make(2) 63 | 64 | make(-1) 65 | 66 | 67 
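#==============================================================================
# aside: 208's "one-shot" share on a toy log -- of the users who ever bought
# an item, the fraction that bought it exactly once (toy frame, illustrative
# only)
#==============================================================================
import pandas as pd
toy = pd.DataFrame({'user_id': [1, 1, 2, 3], 'product_id': [7, 7, 7, 8]})
cnt = toy.groupby(['product_id', 'user_id']).size()
one_shot_ratio = (cnt == 1).groupby('product_id').mean()
# item 7: 1 of its 2 buyers bought it once -> 0.5; item 8 -> 1.0
#==============================================================================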
| 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | utils.end(__file__) 79 | 80 | -------------------------------------------------------------------------------- /py_feature/209_together.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed Jul 5 22:36:10 2017 5 | 6 | @author: konodera 7 | """ 8 | 9 | import pandas as pd 10 | import numpy as np 11 | from tqdm import tqdm 12 | import utils 13 | utils.start(__file__) 14 | 15 | #============================================================================== 16 | # load 17 | #============================================================================== 18 | 19 | col = ['order_id', 'user_id', 'product_id', 'order_number_rev'] 20 | log = utils.read_pickles('../input/mk/log', col).sort_values('user_id') 21 | 22 | #============================================================================== 23 | # def 24 | #============================================================================== 25 | def make(T): 26 | """ 27 | T = 0 28 | folder = 'trainT-0' 29 | """ 30 | if T==-1: 31 | folder = 'test' 32 | else: 33 | folder = 'trainT-'+str(T) 34 | 35 | log_ = log[log.order_number_rev>T] 36 | 37 | order_size = log_.groupby('order_id').size().reset_index() 38 | order_size.columns = ['order_id', 'total'] 39 | 40 | log_ = pd.merge(log_, order_size, on='order_id', how='left') 41 | 42 | item = log_.groupby('product_id').total.mean().to_frame() 43 | item.columns = ['item_together_mean'] 44 | 45 | item['item_together_min'] = log_.groupby('product_id').total.min() 46 | item['item_together_max'] = log_.groupby('product_id').total.max() 47 | item['item_together_std'] = log_.groupby('product_id').total.std() 48 | 49 | item.reset_index().to_pickle('../feature/{}/f209_product.p'.format(folder)) 50 | 51 | #============================================================================== 52 | # main 53 | #============================================================================== 54 | make(0) 55 | make(1) 56 | make(2) 57 | 58 | make(-1) 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | utils.end(__file__) 74 | 75 | -------------------------------------------------------------------------------- /py_feature/210_streak.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed Jul 5 22:36:10 2017 5 | 6 | @author: konodera 7 | """ 8 | 9 | import pandas as pd 10 | import numpy as np 11 | from tqdm import tqdm 12 | import utils 13 | utils.start(__file__) 14 | 15 | #============================================================================== 16 | # load 17 | #============================================================================== 18 | 19 | col = ['order_id', 'user_id', 'product_id', 'order_number_rev'] 20 | log = utils.read_pickles('../input/mk/log', col).sort_values('user_id') 21 | 22 | streak = pd.read_pickle('../input/mk/streak_order-product.p') 23 | #============================================================================== 24 | # def 25 | #============================================================================== 26 | def make(T): 27 | """ 28 | T = 0 29 | folder = 'trainT-0' 30 | """ 31 | if T==-1: 32 | folder = 'test' 33 | else: 34 | folder = 'trainT-'+str(T) 35 | 36 | log_ = pd.merge(log[log.order_number_rev>T], streak, 37 | on=['order_id', 'product_id'], how='left') 38 | 39 | gr = 
log_.groupby('product_id') 40 | item = gr.streak.mean().to_frame() 41 | item.columns = ['item_streak_mean'] 42 | 43 | item['item_streak_min'] = gr.streak.min() 44 | item['item_streak_max'] = gr.streak.max() 45 | item['item_streak_std'] = gr.streak.std() 46 | 47 | item.reset_index().to_pickle('../feature/{}/f210_product.p'.format(folder)) 48 | 49 | #============================================================================== 50 | # main 51 | #============================================================================== 52 | make(0) 53 | make(1) 54 | make(2) 55 | 56 | make(-1) 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | utils.end(__file__) 72 | 73 | -------------------------------------------------------------------------------- /py_feature/212_withinN.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed Jul 5 22:36:10 2017 5 | 6 | @author: konodera 7 | 8 | 9 | nohup python -u 212_withinN.py & 10 | 11 | 12 | """ 13 | 14 | import pandas as pd 15 | import gc 16 | import numpy as np 17 | from collections import defaultdict 18 | import multiprocessing as mp 19 | total_proc = 3 20 | import utils 21 | utils.start(__file__) 22 | 23 | #============================================================================== 24 | # load 25 | #============================================================================== 26 | 27 | usecols = ['product_id', 'user_id', 'order_number', 'order_id', 'order_number_rev'] 28 | log = utils.read_pickles('../input/mk/log', usecols).sort_values(usecols[:3]) 29 | 30 | #============================================================================== 31 | # def 32 | #============================================================================== 33 | 34 | def make(T): 35 | """ 36 | T = 0 37 | folder = 'trainT-0' 38 | """ 39 | if T==-1: 40 | folder = 'test' 41 | else: 42 | folder = 'trainT-'+str(T) 43 | 44 | log_ = log[log.order_number_rev>T] 45 | log_['user_max_onb'] = log_.groupby('user_id').order_number.transform(np.max) 46 | 47 | item_N2_cnt = defaultdict(int) 48 | item_N2_chance = defaultdict(int) 49 | item_N3_cnt = defaultdict(int) 50 | item_N3_chance = defaultdict(int) 51 | item_N4_cnt = defaultdict(int) 52 | item_N4_chance = defaultdict(int) 53 | item_N5_cnt = defaultdict(int) 54 | item_N5_chance = defaultdict(int) 55 | pid_bk = uid_bk = onb_bk = None 56 | # for pid, uid, onb, max_onb in tqdm(log_[['product_id', 'user_id', 'order_number','user_max_onb']].values): 57 | for pid, uid, onb, max_onb in log_[['product_id', 'user_id', 'order_number','user_max_onb']].values: 58 | 59 | if pid==pid_bk and uid==uid_bk and (onb-onb_bk)<=2 and (max_onb-onb) >=2: 60 | item_N2_cnt[pid] +=1 61 | if pid==pid_bk and uid==uid_bk and (max_onb-onb) >=2: 62 | item_N2_chance[pid] +=1 63 | 64 | if pid==pid_bk and uid==uid_bk and (onb-onb_bk)<=3 and (max_onb-onb) >=3: 65 | item_N3_cnt[pid] +=1 66 | if pid==pid_bk and uid==uid_bk and (max_onb-onb) >=3: 67 | item_N3_chance[pid] +=1 68 | 69 | if pid==pid_bk and uid==uid_bk and (onb-onb_bk)<=4 and (max_onb-onb) >=4: 70 | item_N4_cnt[pid] +=1 71 | if pid==pid_bk and uid==uid_bk and (max_onb-onb) >=4: 72 | item_N4_chance[pid] +=1 73 | 74 | if pid==pid_bk and uid==uid_bk and (onb-onb_bk)<=5 and (max_onb-onb) >=5: 75 | item_N5_cnt[pid] +=1 76 | if pid==pid_bk and uid==uid_bk and (max_onb-onb) >=5: 77 | item_N5_chance[pid] +=1 78 | 79 | pid_bk = pid 80 | uid_bk = uid 81 | onb_bk = onb 82 | 83 | item_N2_cnt = 
pd.DataFrame.from_dict(item_N2_cnt, orient='index').reset_index() 84 | item_N2_cnt.columns = ['product_id', 'item_N2_cnt'] 85 | item_N2_chance = pd.DataFrame.from_dict(item_N2_chance, orient='index').reset_index() 86 | item_N2_chance.columns = ['product_id', 'item_N2_chance'] 87 | 88 | item_N3_cnt = pd.DataFrame.from_dict(item_N3_cnt, orient='index').reset_index() 89 | item_N3_cnt.columns = ['product_id', 'item_N3_cnt'] 90 | item_N3_chance = pd.DataFrame.from_dict(item_N3_chance, orient='index').reset_index() 91 | item_N3_chance.columns = ['product_id', 'item_N3_chance'] 92 | 93 | item_N4_cnt = pd.DataFrame.from_dict(item_N4_cnt, orient='index').reset_index() 94 | item_N4_cnt.columns = ['product_id', 'item_N4_cnt'] 95 | item_N4_chance = pd.DataFrame.from_dict(item_N4_chance, orient='index').reset_index() 96 | item_N4_chance.columns = ['product_id', 'item_N4_chance'] 97 | 98 | item_N5_cnt = pd.DataFrame.from_dict(item_N5_cnt, orient='index').reset_index() 99 | item_N5_cnt.columns = ['product_id', 'item_N5_cnt'] 100 | item_N5_chance = pd.DataFrame.from_dict(item_N5_chance, orient='index').reset_index() 101 | item_N5_chance.columns = ['product_id', 'item_N5_chance'] 102 | 103 | df2 = pd.merge(item_N2_cnt, item_N2_chance, on='product_id', how='outer') 104 | df3 = pd.merge(item_N3_cnt, item_N3_chance, on='product_id', how='outer') 105 | df4 = pd.merge(item_N4_cnt, item_N4_chance, on='product_id', how='outer') 106 | df5 = pd.merge(item_N5_cnt, item_N5_chance, on='product_id', how='outer') 107 | 108 | df = pd.merge(pd.merge(df2, df3, on='product_id', how='outer'), 109 | pd.merge(df4, df5, on='product_id', how='outer'), 110 | on='product_id', how='outer').fillna(0) 111 | 112 | df['item_N2_ratio'] = df['item_N2_cnt']/df['item_N2_chance'] 113 | df['item_N3_ratio'] = df['item_N3_cnt']/df['item_N3_chance'] 114 | df['item_N4_ratio'] = df['item_N4_cnt']/df['item_N4_chance'] 115 | df['item_N5_ratio'] = df['item_N5_cnt']/df['item_N5_chance'] 116 | 117 | df.fillna(0, inplace=True) 118 | df.reset_index(drop=True, inplace=True) 119 | df.to_pickle('../feature/{}/f212_product.p'.format(folder)) 120 | 121 | #============================================================================== 122 | # main 123 | #============================================================================== 124 | 125 | mp_pool = mp.Pool(total_proc) 126 | mp_pool.map(make, range(-1,3)) 127 | 128 | #============================================================================== 129 | utils.end(__file__) 130 | 131 | -------------------------------------------------------------------------------- /py_feature/213_dow_diff.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon Aug 7 13:58:58 2017 5 | 6 | @author: konodera 7 | """ 8 | 9 | import pandas as pd 10 | import numpy as np 11 | import utils 12 | utils.start(__file__) 13 | 14 | #============================================================================== 15 | # load 16 | #============================================================================== 17 | 18 | usecols = ['product_id', 'order_dow', 'order_number_rev'] 19 | log = utils.read_pickles('../input/mk/log', usecols) 20 | 21 | #============================================================================== 22 | # def 23 | #============================================================================== 24 | 25 | def make(T): 26 | """ 27 | T = 0 28 | folder = 'trainT-0' 29 | """ 30 | if T==-1: 31 | folder = 'test' 32 | 
else:
33 | folder = 'trainT-'+str(T)
34 | 
35 | log_ = log[log.order_number_rev>T]
36 | 
37 | all_item_dist = log_.order_dow.value_counts(normalize=True).reset_index()
38 | all_item_dist.columns = ['order_dow', 'dow_dist_ratio']
39 | 
40 | tbl = log_.groupby(['product_id', 'order_dow']).size().reset_index()
41 | tbl.columns = ['product_id', 'order_dow', 'item_dow_cnt']
42 | tbl['item_dow_ratio'] = tbl.item_dow_cnt / tbl.groupby('product_id').transform(np.sum).item_dow_cnt
43 | 
44 | tbl = pd.merge(tbl, all_item_dist, on='order_dow', how='left')
45 | 
46 | tbl['item_dow_ratio_diff'] = tbl.item_dow_ratio - tbl.dow_dist_ratio
47 | 
48 | tbl[['product_id','order_dow', 'item_dow_ratio_diff']].to_pickle('../feature/{}/f213_product-dow.p'.format(folder))
49 | 
50 | #==============================================================================
51 | # main
52 | #==============================================================================
53 | make(0)
54 | make(1)
55 | make(2)
56 | 
57 | make(-1)
58 | 
59 | #==============================================================================
60 | utils.end(__file__)
61 | 
62 | 
63 | 
64 | 
65 | 
66 | 
67 | 
68 | 
69 | 
70 | 
71 | 
72 | 
73 | 
74 | 
75 | 
76 | 
77 | 
78 | 
-------------------------------------------------------------------------------- /py_feature/214_first_order.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Mon Aug 7 15:52:10 2017
5 | 
6 | @author: konodera
7 | 
8 | if t-1 == first buy, what's the ratio of reordered?
9 | 
10 | """
11 | 
12 | import pandas as pd
13 | import numpy as np
14 | from collections import defaultdict
15 | from tqdm import tqdm
16 | import utils
17 | utils.start(__file__)
18 | 
19 | #==============================================================================
20 | # load
21 | #==============================================================================
22 | 
23 | usecols = ['user_id', 'product_id', 'order_number', 'reordered', 'order_number_rev']
24 | log = utils.read_pickles('../input/mk/log', usecols).sort_values(usecols[:3])
25 | 
26 | #==============================================================================
27 | # def
28 | #==============================================================================
29 | 
30 | def make(T):
31 | """
32 | T = 0
33 | folder = 'trainT-0'
34 | """
35 | if T==-1:
36 | folder = 'test'
37 | else:
38 | folder = 'trainT-'+str(T)
39 | 
40 | log_ = log[log.order_number_rev>T]
41 | log_['user_max_onb'] = log_.groupby('user_id').order_number.transform(np.max)
42 | log_ = log_.groupby(['user_id', 'product_id']).head(2)
43 | 
44 | item_cnt = defaultdict(int)
45 | item_chance = defaultdict(int)
46 | pid_bk = uid_bk = onb_bk = None
47 | 
48 | for uid, pid, onb, max_onb in log_[['user_id', 'product_id', 'order_number', 'user_max_onb']].values:
49 | 
50 | if uid==uid_bk and pid==pid_bk and (onb-onb_bk==1):
51 | item_cnt[pid] +=1
52 | if onb!=max_onb:
53 | item_chance[pid] +=1
54 | 
55 | pid_bk = pid
56 | uid_bk = uid
57 | onb_bk = onb
58 | 
59 | item_cnt = pd.DataFrame.from_dict(item_cnt, orient='index').reset_index()
60 | item_cnt.columns = ['product_id', 'item_first_cnt']
61 | item_chance = pd.DataFrame.from_dict(item_chance, orient='index').reset_index()
62 | item_chance.columns = ['product_id', 'item_first_chance']
63 | 
64 | df = pd.merge(item_cnt, item_chance, on='product_id', how='outer').fillna(0)
65 | df['item_first_ratio'] = df.item_first_cnt/df.item_first_chance
66 | 
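# item_first_ratio: of the first purchases that still had at least one later
# order in which a repeat was possible (item_first_chance), the share that
# were repurchased in the very next order (item_first_cnt) -- the docstring's
# "if t-1 == first buy" reorder rate.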
67 | df.to_pickle('../feature/{}/f214_product.p'.format(folder))
68 | 
69 | 
70 | #==============================================================================
71 | # main
72 | #==============================================================================
73 | 
74 | make(0)
75 | make(1)
76 | make(2)
77 | 
78 | make(-1)
79 | 
80 | #==============================================================================
81 | 
82 | utils.end(__file__)
83 | 
84 | 
-------------------------------------------------------------------------------- /py_feature/215_onb_diff.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Wed Jul 5 22:36:10 2017
5 | 
6 | @author: konodera
7 | 
8 | 
9 | """
10 | 
11 | import pandas as pd
12 | import gc
13 | import numpy as np
14 | from collections import defaultdict
15 | from scipy.stats import skew
16 | import utils
17 | utils.start(__file__)
18 | 
19 | #==============================================================================
20 | # load
21 | #==============================================================================
22 | 
23 | col = ['product_id', 'user_id', 'order_number', 'order_number_rev']
24 | log = utils.read_pickles('../input/mk/log', col).sort_values(col[:3])
25 | 
26 | """
27 | 1 1 1
28 | 1 1 2
29 | 1 1 4
30 | 1 2 3
31 | 1 2 4
32 | 2 2 5
33 | """
34 | #==============================================================================
35 | # def
36 | #==============================================================================
37 | def make(T):
38 | """
39 | T = 0
40 | folder = 'trainT-0'
41 | """
42 | if T==-1:
43 | folder = 'test'
44 | else:
45 | folder = 'trainT-'+str(T)
46 | 
47 | log_ = log[log.order_number_rev>T]
48 | log_['user_max_onb'] = log_.groupby('user_id').order_number.transform(np.max)
49 | 
50 | item_min = defaultdict(int)
51 | item_mean = defaultdict(int)
52 | item_median = defaultdict(int)
53 | item_max = defaultdict(int)
54 | item_std = defaultdict(int)
55 | item_skew = defaultdict(int)
56 | 
57 | pid_bk = uid_bk = onb_bk = None
58 | diff = []
59 | 
60 | for pid, uid, onb, max_onb in log_[['product_id', 'user_id', 'order_number', 'user_max_onb']].values:
61 | 
62 | if pid==pid_bk and uid==uid_bk:
63 | diff.append(onb-onb_bk)
64 | """
65 | pattern would be like:
66 | onb -> diff
67 | 1111 1,2,3,4 -> [1,1,1]
68 | 11101 1,2,3,5 -> [1,1,2]
69 | 111 1,2,3 -> [1,1]
70 | 1101 1,2,4 -> [1,2]
71 | 1011 1,3,4 -> [2,1]
72 | """
73 | 
74 | elif pid==pid_bk and uid!=uid_bk:
75 | pass
76 | elif pid!=pid_bk:
77 | if len(diff)>0:
78 | item_min[pid_bk] = np.min(diff) # key on pid_bk: diff holds the gaps of the product whose block just ended
79 | item_mean[pid_bk] = np.mean(diff)
80 | item_median[pid_bk] = np.median(diff)
81 | item_max[pid_bk] = np.max(diff)
82 | item_std[pid_bk] = np.std(diff)
83 | item_skew[pid_bk] = skew(diff)
84 | diff = []
85 | 
86 | pid_bk = pid
87 | uid_bk = uid
88 | onb_bk = onb
89 | if len(diff)>0: item_min[pid_bk] = np.min(diff); item_mean[pid_bk] = np.mean(diff); item_median[pid_bk] = np.median(diff); item_max[pid_bk] = np.max(diff); item_std[pid_bk] = np.std(diff); item_skew[pid_bk] = skew(diff) # flush the last product after the loop
90 | item_min = pd.DataFrame.from_dict(item_min, orient='index').reset_index()
91 | item_min.columns = ['product_id', 'item_onb_diff_min']
92 | item_mean = pd.DataFrame.from_dict(item_mean, orient='index').reset_index()
93 | item_mean.columns = ['product_id', 'item_onb_diff_mean']
94 | item_median = pd.DataFrame.from_dict(item_median, orient='index').reset_index()
95 | item_median.columns = ['product_id', 'item_onb_diff_median']
96 | item_max = pd.DataFrame.from_dict(item_max, orient='index').reset_index()
97 | item_max.columns = ['product_id', 'item_onb_diff_max']
98 | item_std = pd.DataFrame.from_dict(item_std, orient='index').reset_index()
99 | item_std.columns = ['product_id', 'item_onb_diff_std']
100 | item_skew = pd.DataFrame.from_dict(item_skew, orient='index').reset_index()
101 | item_skew.columns = ['product_id', 'item_onb_diff_skew']
102 | 
103 | df1 = pd.merge(item_min, item_mean, on='product_id', how='outer')
104 | df2 = pd.merge(item_median, item_max, on='product_id', how='outer')
105 | df3 = pd.merge(item_std, item_skew, on='product_id', how='outer')
106 | 
107 | df = pd.merge(pd.merge(df1, df2, on='product_id', how='outer'),
108 | df3, on='product_id', how='outer')
109 | 
110 | df.fillna(-99, inplace=True)
111 | df.to_pickle('../feature/{}/f215_product.p'.format(folder))
112 | 
113 | 
114 | #==============================================================================
115 | # main
116 | #==============================================================================
117 | make(0)
118 | make(1)
119 | make(2)
120 | 
121 | make(-1)
122 | 
123 | 
124 | #==============================================================================
125 | utils.end(__file__)
126 | 
127 | 
-------------------------------------------------------------------------------- /py_feature/300_======user x item======: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KazukiOnodera/Instacart/416b6b0220d3aed62c8d323caa3ee46f4b614a72/py_feature/300_======user x item====== -------------------------------------------------------------------------------- /py_feature/301_total_buy.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Fri May 26 22:12:32 2017
5 | 
6 | @author: konodera
7 | 
8 | How many times has this user bought this item?
9 | *no leakage
10 | 
11 | """
12 | 
13 | import pandas as pd
14 | import numpy as np
15 | from tqdm import tqdm
16 | from collections import defaultdict
17 | import utils
18 | utils.start(__file__)
19 | 
20 | 
21 | col = ['order_id', 'user_id', 'product_id', 'order_number_rev']
22 | log = utils.read_pickles('../input/mk/log', col)
23 | 
24 | orders = pd.read_csv('../input/orders.csv.gz',usecols=['order_id','user_id','order_number'])
25 | 
26 | #==============================================================================
27 | # def
28 | #==============================================================================
29 | def make(T):
30 | """
31 | T = 0
32 | folder = 'trainT-0'
33 | """
34 | if T==-1:
35 | folder = 'test'
36 | else:
37 | folder = 'trainT-'+str(T)
38 | 
39 | label = pd.read_pickle('../feature/{}/label_reordered.p'.format(folder))
40 | df = pd.merge(label, orders, on='order_id', how='left')
41 | 
42 | total_buy = log[log.order_number_rev>T].groupby(['user_id', 'product_id']).size().reset_index()
43 | total_buy.columns = ['user_id', 'product_id','total_buy']
44 | 
45 | df = pd.merge(df, total_buy, on=['user_id', 'product_id'], how='left')
46 | df['total_buy_ratio'] = df.total_buy / (df.order_number-1)
47 | 
48 | col = ['order_id', 'product_id','total_buy', 'total_buy_ratio']
49 | df[col].to_pickle('../feature/{}/f301_order-product.p'.format(folder))
50 | 
51 | # near5
52 | df = pd.merge(label, orders, on='order_id', how='left')
53 | total_buy = log[log.order_number_rev>T][log.order_number_rev<=(T+5)].groupby(['user_id', 'product_id']).size().reset_index()
54 | total_buy.columns = ['user_id', 'product_id','total_buy_n5']
55 | 
56 | df = pd.merge(df, total_buy, on=['user_id', 'product_id'], how='left').fillna(0)
57 | df['total_buy_ratio_n5'] = df['total_buy_n5'] / 
df.order_number.map(lambda x: min(5, x)) 58 | 59 | col = ['order_id', 'product_id','total_buy_n5', 'total_buy_ratio_n5'] 60 | df[col].to_pickle('../feature/{}/f301_order-product_n5.p'.format(folder)) 61 | 62 | 63 | #============================================================================== 64 | # main 65 | #============================================================================== 66 | make(0) 67 | make(1) 68 | make(2) 69 | 70 | make(-1) 71 | 72 | #============================================================================== 73 | utils.end(__file__) 74 | 75 | -------------------------------------------------------------------------------- /py_feature/302-1_reorderd_all.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed Jun 14 09:42:55 2017 5 | 6 | @author: konodera 7 | 8 | 9 | """ 10 | 11 | import pandas as pd 12 | import numpy as np 13 | from tqdm import tqdm 14 | import multiprocessing as mp 15 | import utils 16 | utils.start(__file__) 17 | 18 | LOOP = 20 19 | #============================================================================== 20 | # load 21 | #============================================================================== 22 | order_tbl = pd.read_pickle('../input/mk/order_tbl.p')[['order_id', 'user_id', 'order_number']].sort_values(['user_id', 'order_number', 'order_id']) 23 | for i in range(1, LOOP): 24 | order_tbl['t-{}_order_id'.format(i)] = order_tbl.groupby('user_id')['order_id'].shift(i) 25 | 26 | col = [c for c in order_tbl.columns if 'order_id' in c] 27 | order_tbl = order_tbl[col] 28 | 29 | col = ['order_id', 'user_id', 'order_number', 'product_id', 'reordered'] 30 | log = utils.read_pickles('../input/mk/log', col) 31 | log.sort_values(['user_id', 'order_number', 'product_id'], inplace=True) 32 | 33 | 34 | #============================================================================== 35 | # def 36 | #============================================================================== 37 | def multi(T): 38 | """ 39 | T = 0 40 | folder = 'trainT-0' 41 | """ 42 | if T==-1: 43 | folder = 'test' 44 | else: 45 | folder = 'trainT-'+str(T) 46 | 47 | label = pd.read_pickle('../feature/{}/label_reordered.p'.format(folder)) 48 | df = pd.merge(label, order_tbl, on='order_id', how='left') 49 | 50 | for i in tqdm(range(1, LOOP)): 51 | oid = 't-{}_order_id'.format(i) 52 | v = 't-{}_reordered'.format(i) 53 | log_ = log.rename(columns={'order_id':oid, 54 | 'reordered':v})[[oid, 'product_id', v]] 55 | df = pd.merge(df, log_, on=[oid, 'product_id'], how='left') 56 | 57 | col = ['order_id', 'product_id'] + [c for c in df.columns if '_reordered' in c] 58 | 59 | df[col].fillna(-1).to_pickle('../feature/{}/f302_order-product_all.p'.format(folder)) 60 | #============================================================================== 61 | # main 62 | #============================================================================== 63 | mp_pool = mp.Pool(7) 64 | mp_pool.map(multi, [0, 1, 2, #3, 4, 5, 65 | -1]) 66 | 67 | 68 | 69 | #============================================================================== 70 | 71 | utils.end(__file__) 72 | 73 | -------------------------------------------------------------------------------- /py_feature/303_last_order_date.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sun May 28 18:06:05 2017 5 | 6 | @author: konodera 7 | 8 | 
How many days ago did this user last order this item?
9 | *not a leak
10 | 
11 | """
12 | 
13 | import pandas as pd
14 | import numpy as np
15 | from tqdm import tqdm
16 | import gc
17 | import utils
18 | utils.start(__file__)
19 | 
20 | 
21 | #==============================================================================
22 | # mk train * test log
23 | #==============================================================================
24 | tbl = utils.read_pickles('../input/mk/days_since_last_order')
25 | 
26 | #==============================================================================
27 | # def
28 | #==============================================================================
29 | def make(T):
30 | """
31 | T = 0
32 | folder = 'trainT-0'
33 | """
34 | if T==-1:
35 | folder = 'test'
36 | else:
37 | folder = 'trainT-'+str(T)
38 | 
39 | label = pd.read_pickle('../feature/{}/label_reordered.p'.format(folder))
40 | 
41 | df = pd.merge(label[['order_id', 'product_id']],
42 | tbl[['order_id', 'product_id','days_since_last_order_this_item']],
43 | on=['order_id', 'product_id'], how='left')
44 | 
45 | df.to_pickle('../feature/{}/f303_order-product.p'.format(folder))
46 | #==============================================================================
47 | # main
48 | #==============================================================================
49 | make(0)
50 | make(1)
51 | make(2)
52 | 
53 | make(-1)
54 | 
55 | 
56 | 
57 | #==============================================================================
58 | utils.end(__file__)
59 | 
60 | 
-------------------------------------------------------------------------------- /py_feature/304_buy_item_inarow.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Wed May 31 02:10:45 2017
5 | 
6 | @author: konodera
7 | 
8 | Current consecutive-purchase streak (as of this order)
9 | *leaky
10 | 
11 | """
12 | 
13 | import pandas as pd
14 | import numpy as np
15 | from tqdm import tqdm
16 | import utils
17 | utils.start(__file__)
18 | 
19 | 
20 | log = pd.read_pickle('../input/mk/log_inarow.p')
21 | X_base = pd.read_pickle('../feature/X_base_t3.p')
22 | 
23 | #==============================================================================
24 | # def
25 | #==============================================================================
26 | def make(T):
27 | """
28 | T = 0
29 | folder = 'trainT-0'
30 | """
31 | if T==-1:
32 | folder = 'test'
33 | else:
34 | folder = 'trainT-'+str(T)
35 | 
36 | label = pd.read_pickle('../feature/{}/label_reordered.p'.format(folder))
37 | label = pd.merge(label, X_base, on='order_id', how='left') # TODO: change to inner
38 | 
39 | # ======== T-1~3 ========
40 | for t in range(1,4):
41 | col = ['order_id', 'product_id', 'buy_item_inarow']
42 | df = pd.merge(label, log[col].rename(columns={'order_id':'t-{}_order_id'.format(t)}),
43 | on=['t-{}_order_id'.format(t),'product_id'], how='left')
44 | 
45 | col = ['order_id', 'order_number']
46 | df = pd.merge(df, log[col].rename(columns={'order_id':'t-{}_order_id'.format(t)}).drop_duplicates(),
47 | on=['t-{}_order_id'.format(t)], how='left')
48 | 
49 | df['buy_item_inarow_ratio'] = df['buy_item_inarow']/df['order_number']
50 | df = df.rename(columns={'buy_item_inarow':'t-{}_buy_item_inarow'.format(t),
51 | 'buy_item_inarow_ratio':'t-{}_buy_item_inarow_ratio'.format(t)})
52 | print(df.isnull().sum())
53 | df.fillna(0, inplace=1)
54 | df.reset_index(drop=1, inplace=1)
55 | 
56 | col = ['order_id', 'product_id', 't-{}_buy_item_inarow'.format(t),'t-{}_buy_item_inarow_ratio'.format(t)]
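# one pickle per lag t (t-1..t-3): the streak observed at that past order,
# plus its ratio to the order_number of that order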
57 | df[col].to_pickle('../feature/{}/f304-{}_order-product.p'.format(folder, t))
58 | 
59 | #==============================================================================
60 | # main
61 | #==============================================================================
62 | make(0)
63 | make(1)
64 | make(2)
65 | 
66 | make(-1)
67 | 
68 | #==============================================================================
69 | utils.end(__file__)
70 | 
71 | 
-------------------------------------------------------------------------------- /py_feature/305_last_order_num.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Wed Jun 14 00:00:43 2017
5 | 
6 | @author: konodera
7 | 
8 | 
9 | order_num - last_order_num
10 | """
11 | 
12 | 
13 | import pandas as pd
14 | import numpy as np
15 | from tqdm import tqdm
16 | import utils
17 | utils.start(__file__)
18 | 
19 | 
20 | col = ['order_id', 'user_id', 'product_id', 'order_number', 'order_number_rev']
21 | log = utils.read_pickles('../input/mk/log', col).sort_values(['user_id', 'order_number'])
22 | 
23 | orders = pd.read_csv('../input/orders.csv.gz', usecols=['order_id', 'order_number'])
24 | 
25 | X_base = pd.read_pickle('../feature/X_base_t3.p')
26 | X_base = pd.merge(X_base, orders, on='order_id', how='left')
27 | 
28 | 
29 | #==============================================================================
30 | # def
31 | #==============================================================================
32 | def make(T):
33 | """
34 | T = 0
35 | folder = 'trainT-0'
36 | """
37 | if T==-1:
38 | folder = 'test'
39 | else:
40 | folder = 'trainT-'+str(T)
41 | 
42 | label = pd.read_pickle('../feature/{}/label_reordered.p'.format(folder))
43 | label = pd.merge(label, X_base, on='order_id', how='left')
44 | 
45 | log_ = log[log.order_number_rev>T]
46 | log_.drop_duplicates(['user_id', 'product_id'], keep='last', inplace=True)
47 | log_.drop(['order_id','order_number_rev'], axis=1, inplace=1)
48 | log_.columns = ['user_id', 'product_id', 'last_order_number']
49 | 
50 | df = pd.merge(label, log_, on=['user_id', 'product_id'], how='left')
51 | df['order_number_diff'] = df.order_number - df.last_order_number
52 | 
53 | col = ['order_id', 'product_id', 'last_order_number', 'order_number_diff']
54 | df[col].to_pickle('../feature/{}/f305_order-product.p'.format(folder))
55 | 
56 | #==============================================================================
57 | # main
58 | #==============================================================================
59 | make(0)
60 | make(1)
61 | make(2)
62 | 
63 | make(-1)
64 | 
65 | 
66 | 
67 | 
68 | utils.end(__file__)
69 | 
70 | 
-------------------------------------------------------------------------------- /py_feature/306_mean_pos_cart.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Fri Jun 16 06:42:38 2017
5 | 
6 | @author: konodera
7 | 
8 | Mean add-to-cart position (pos_cart)
9 | 
10 | """
11 | 
12 | import pandas as pd
13 | import numpy as np
14 | from tqdm import tqdm
15 | import utils
16 | utils.start(__file__)
17 | 
18 | 
19 | col = ['order_id', 'user_id', 'product_id', 'add_to_cart_order', 'order_number_rev']
20 | log = utils.read_pickles('../input/mk/log', col)
21 | 
22 | #==============================================================================
23 | # def
24 | #==============================================================================
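# f306: summary stats of add_to_cart_order per (user, product), over the full
# history and, below, over the 5 most recent prior orders (*_n5); a low mean
# position suggests a habitual first pick.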
25 | def make(T):
26 | """
27 | T = 0
28 | folder = 'trainT-0'
29 | """
30 | if T==-1:
31 | folder = 'test'
32 | else:
33 | folder = 'trainT-'+str(T)
34 | 
35 | log_ = log[log.order_number_rev>T]
36 | 
37 | gr = log_.groupby(['user_id', 'product_id'])
38 | 
39 | user = gr.add_to_cart_order.mean().to_frame()
40 | user.columns = ['useritem_mean_pos_cart']
41 | user['useritem_sum_pos_cart'] = gr.add_to_cart_order.sum()
42 | user['useritem_min_pos_cart'] = gr.add_to_cart_order.min()
43 | user['useritem_median_pos_cart'] = gr.add_to_cart_order.median()
44 | user['useritem_max_pos_cart'] = gr.add_to_cart_order.max()
45 | user['useritem_std_pos_cart'] = gr.add_to_cart_order.std()
46 | user.reset_index(inplace=True)
47 | 
48 | user.to_pickle('../feature/{}/f306_user-product.p'.format(folder))
49 | 
50 | # === near5 ===
51 | log_ = log[log.order_number_rev>T][log.order_number_rev<=(T+5)]
52 | 
53 | gr = log_.groupby(['user_id', 'product_id'])
54 | 
55 | user = gr.add_to_cart_order.mean().to_frame()
56 | user.columns = ['useritem_mean_pos_cart_n5']
57 | user['useritem_sum_pos_cart_n5'] = gr.add_to_cart_order.sum()
58 | user['useritem_min_pos_cart_n5'] = gr.add_to_cart_order.min()
59 | user['useritem_median_pos_cart_n5'] = gr.add_to_cart_order.median()
60 | user['useritem_max_pos_cart_n5'] = gr.add_to_cart_order.max()
61 | user['useritem_std_pos_cart_n5'] = gr.add_to_cart_order.std()
62 | user.reset_index(inplace=True)
63 | 
64 | user.to_pickle('../feature/{}/f306_user-product_n5.p'.format(folder))
65 | 
66 | 
67 | #==============================================================================
68 | # main
69 | #==============================================================================
70 | make(0)
71 | make(1)
72 | make(2)
73 | 
74 | make(-1)
75 | 
76 | 
77 | 
78 | 
79 | #==============================================================================
80 | utils.end(__file__)
81 | 
82 | 
-------------------------------------------------------------------------------- /py_feature/307_timezone_dow.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Fri Jun 16 15:50:03 2017
5 | 
6 | @author: konodera
7 | 
8 | Share of this user's purchases of this item by time slot (and by day of week)
9 | 
10 | """
11 | 
12 | import pandas as pd
13 | import numpy as np
14 | from tqdm import tqdm
15 | from collections import defaultdict
16 | import utils
17 | utils.start(__file__)
18 | 
19 | #==============================================================================
20 | # load
21 | #==============================================================================
22 | 
23 | col = ['order_id', 'user_id', 'product_id', 'order_dow', 'order_hour_of_day', 'order_number_rev']
24 | log = utils.read_pickles('../input/mk/log', col).sort_values('user_id')
25 | log = pd.merge(log, pd.read_pickle('../input/mk/timezone.p'),
26 | on='order_hour_of_day', how='left')
27 | 
28 | #==============================================================================
29 | # def
30 | #==============================================================================
31 | def make(T):
32 | """
33 | T = 0
34 | folder = 'trainT-0'
35 | """
36 | if T==-1:
37 | folder = 'test'
38 | else:
39 | folder = 'trainT-'+str(T)
40 | 
41 | log_ = log[log.order_number_rev>T]
42 | 
43 | cnt = log_.groupby(['user_id', 'product_id', 'timezone']).size()
44 | cnt.name = 'useritem_buy_timezone_cnt'
45 | cnt = cnt.reset_index()
46 | 
47 | sum_ = log_.groupby(['user_id', 'product_id']).size()
48 | sum_.name = 'total'
49 | sum_ = sum_.reset_index()
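# the denominator is the user x item purchase total, so the timezone ratios
# sum to 1 within each (user, product); f308 instead divides by the user's
# order count per slot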
50 | 
51 | df = pd.merge(cnt, sum_, on=['user_id', 'product_id'], how='left')
52 | 
53 | df['useritem_buy_timezone_ratio'] = df.useritem_buy_timezone_cnt / df.total
54 | 
55 | col = ['user_id', 'product_id', 'timezone',
56 | 'useritem_buy_timezone_cnt', 'useritem_buy_timezone_ratio']
57 | 
58 | df[col].to_pickle('../feature/{}/f307_user-product-timezone.p'.format(folder))
59 | 
60 | #==============================================================================
61 | 
62 | 
63 | cnt = log_.groupby(['user_id', 'product_id', 'order_dow']).size()
64 | cnt.name = 'useritem_buy_dow_cnt'
65 | cnt = cnt.reset_index()
66 | 
67 | sum_ = log_.groupby(['user_id', 'product_id']).size()
68 | sum_.name = 'total'
69 | sum_ = sum_.reset_index()
70 | 
71 | df = pd.merge(cnt, sum_, on=['user_id', 'product_id'], how='left')
72 | 
73 | df['useritem_buy_dow_ratio'] = df.useritem_buy_dow_cnt / df.total
74 | 
75 | col = ['user_id', 'product_id', 'order_dow',
76 | 'useritem_buy_dow_cnt', 'useritem_buy_dow_ratio']
77 | 
78 | df[col].to_pickle('../feature/{}/f307_user-product-dow.p'.format(folder))
79 | 
80 | #==============================================================================
81 | # main
82 | #==============================================================================
83 | make(0)
84 | make(1)
85 | make(2)
86 | 
87 | make(-1)
88 | 
89 | 
90 | 
91 | 
92 | 
93 | 
94 | 
95 | #==============================================================================
96 | utils.end(__file__)
97 | 
-------------------------------------------------------------------------------- /py_feature/308_timezone_dow.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Sat Jun 17 23:28:15 2017
5 | 
6 | @author: konodera
7 | 
8 | Rate at which this user buys this item when ordering in a given time slot (or day of week)
9 | 
10 | """
11 | 
12 | import pandas as pd
13 | import numpy as np
14 | from tqdm import tqdm
15 | import utils
16 | utils.start(__file__)
17 | 
18 | #==============================================================================
19 | # load
20 | #==============================================================================
21 | 
22 | col = ['order_id', 'user_id', 'product_id', 'order_dow', 'order_hour_of_day', 'order_number_rev']
23 | log = utils.read_pickles('../input/mk/log', col).sort_values('user_id')
24 | log = pd.merge(log, pd.read_pickle('../input/mk/timezone.p'),
25 | on='order_hour_of_day', how='left')
26 | 
27 | #==============================================================================
28 | # def
29 | #==============================================================================
30 | def make(T):
31 | """
32 | T = 0
33 | folder = 'trainT-0'
34 | """
35 | if T==-1:
36 | folder = 'test'
37 | else:
38 | folder = 'trainT-'+str(T)
39 | 
40 | log_ = log[log.order_number_rev>T]
41 | 
42 | # timezone
43 | cnt = log_.groupby(['user_id', 'product_id', 'timezone']).size()
44 | cnt.name = 'useritem_buy_timezone_cnt'
45 | cnt = cnt.reset_index()
46 | 
47 | chance = log_.drop_duplicates('order_id').groupby(['user_id', 'timezone']).size()
48 | chance.name = 'total'
49 | chance = chance.reset_index()
50 | 
51 | df = pd.merge(cnt, chance, on=['user_id', 'timezone'], how='left')
52 | df['useritem_buy_timezone_ratio2'] = df.useritem_buy_timezone_cnt / df.total
53 | 
54 | col = ['user_id', 'product_id', 'timezone', 'useritem_buy_timezone_ratio2']
55 | 
56 | df[col].to_pickle('../feature/{}/f308_user-product-timezone.p'.format(folder))
57 | 
58 | # dow
59 | cnt = log_.groupby(['user_id', 'product_id', 'order_dow']).size()
60 | cnt.name = 'useritem_buy_dow_cnt'
61 | cnt = cnt.reset_index()
62 | 
63 | chance = log_.drop_duplicates('order_id').groupby(['user_id', 'order_dow']).size()
64 | chance.name = 'total'
65 | chance = chance.reset_index()
66 | 
67 | df = pd.merge(cnt, chance, on=['user_id', 'order_dow'], how='left')
68 | df['useritem_buy_dow_ratio2'] = df.useritem_buy_dow_cnt / df.total
69 | 
70 | col = ['user_id', 'product_id', 'order_dow', 'useritem_buy_dow_ratio2']
71 | 
72 | df[col].to_pickle('../feature/{}/f308_user-product-dow.p'.format(folder))
73 | 
74 | #==============================================================================
75 | # main
76 | #==============================================================================
77 | make(0)
78 | make(1)
79 | make(2)
80 | 
81 | make(-1)
82 | 
83 | 
84 | 
85 | 
86 | 
87 | 
88 | #==============================================================================
89 | utils.end(__file__)
90 | 
-------------------------------------------------------------------------------- /py_feature/309_order_ratio_by-chance.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Sun Jun 18 12:55:38 2017
5 | 
6 | @author: konodera
7 | 
8 | item order ratio divided by chance
9 | 
10 | 
11 | ex1:
12 | onb_buy = [5,8,9]
13 | onb_visit = [1,2,5,8,9]
14 | return: 3/3
15 | 
16 | ex2:
17 | onb_buy = [5,9]
18 | onb_visit = [1,2,5,8,9]
19 | return: 2/3
20 | 
21 | """
22 | 
23 | import pandas as pd
24 | import numpy as np
25 | from tqdm import tqdm
26 | import utils
27 | utils.start(__file__)
28 | 
29 | 
30 | #==============================================================================
31 | # load
32 | #==============================================================================
33 | col = ['order_id', 'user_id', 'product_id', 'order_number', 'order_number_rev']
34 | log = utils.read_pickles('../input/mk/log', col).sort_values(['user_id', 'product_id', 'order_number'])
35 | 
36 | 
37 | #==============================================================================
38 | # def
39 | #==============================================================================
40 | def make(T):
41 | """
42 | T = 0
43 | folder = 'trainT-0'
44 | """
45 | if T==-1:
46 | folder = 'test'
47 | else:
48 | folder = 'trainT-'+str(T)
49 | 
50 | log_ = log[log.order_number_rev>T]
51 | 
52 | cnt = log_.groupby(['user_id', 'product_id']).size()
53 | cnt.name = 'cnt'
54 | cnt = cnt.reset_index()
55 | 
56 | # chance
57 | user_onb_max = log_.groupby('user_id').order_number.max().reset_index()
58 | user_onb_max.columns = ['user_id', 'onb_max']
59 | 
60 | user_item_min = log_.groupby(['user_id', 'product_id']).order_number.min().reset_index()
61 | user_item_min.columns = ['user_id', 'product_id', 'onb_min']
62 | 
63 | chance = pd.merge(user_item_min, user_onb_max, on='user_id', how='left')
64 | chance['chance'] = chance.onb_max - chance.onb_min +1
65 | 
66 | df = pd.merge(cnt, chance, on=['user_id', 'product_id'], how='left')
67 | 
68 | df['order_ratio_bychance'] = df.cnt / df.chance
69 | 
70 | col = ['user_id', 'product_id', 'chance', 'order_ratio_bychance']
71 | df[col].to_pickle('../feature/{}/f309_user-product.p'.format(folder))
72 | 
73 | # === near5 ===
74 | log_ = log[log.order_number_rev>T][log.order_number_rev<=(T+5)]
75 | 
76 | cnt = log_.groupby(['user_id', 'product_id']).size()
77 | cnt.name = 'cnt'
78 | cnt = cnt.reset_index()
79 | 
80 | # chance
81 | user_onb_max = log_.groupby('user_id').order_number.max().reset_index()
82 | user_onb_max.columns = ['user_id', 'onb_max']
83 | 
84 | user_item_min = log_.groupby(['user_id', 'product_id']).order_number.min().reset_index()
85 | user_item_min.columns = ['user_id', 'product_id', 'onb_min']
86 | 
87 | chance = pd.merge(user_item_min, user_onb_max, on='user_id', how='left')
88 | chance['chance_n5'] = chance.onb_max - chance.onb_min +1
89 | 
90 | df = pd.merge(cnt, chance, on=['user_id', 'product_id'], how='left')
91 | 
92 | df['order_ratio_bychance_n5'] = df.cnt / df.chance_n5
93 | 
94 | col = ['user_id', 'product_id', 'chance_n5', 'order_ratio_bychance_n5']
95 | df[col].to_pickle('../feature/{}/f309_user-product_n5.p'.format(folder))
96 | 
97 | 
98 | #==============================================================================
99 | # main
100 | #==============================================================================
101 | make(0)
102 | make(1)
103 | make(2)
104 | 
105 | make(-1)
106 | 
107 | 
108 | 
109 | #==============================================================================
110 | utils.end(__file__)
111 | 
112 | 
-------------------------------------------------------------------------------- /py_feature/310_repeat_within_today.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Sun Jun 18 15:58:38 2017
5 | 
6 | @author: konodera
7 | 
8 | Has the user ever bought the same item again on the same day?
9 | 
10 | 
11 | """
12 | 
13 | import pandas as pd
14 | import numpy as np
15 | from tqdm import tqdm
16 | from collections import defaultdict
17 | import utils
18 | utils.start(__file__)
19 | 
20 | #==============================================================================
21 | # load
22 | #==============================================================================
23 | col = ['order_id', 'user_id', 'product_id', 'order_number','days_since_prior_order', 'order_number_rev']
24 | log = utils.read_pickles('../input/mk/log', col).sort_values(['user_id', 'product_id', 'order_number'])
25 | log.user_id = log.user_id.map(str)
26 | log.product_id = log.product_id.map(str)
27 | 
28 | 
29 | #==============================================================================
30 | # def
31 | #==============================================================================
32 | def make(T):
33 | """
34 | T = 0
35 | folder = 'trainT-0'
36 | """
37 | if T==-1:
38 | folder = 'test'
39 | else:
40 | folder = 'trainT-'+str(T)
41 | 
42 | log_ = log[log.order_number_rev>T]
43 | 
44 | uid_pid = {}
45 | uid_bk = pid_bk = onb_bk = None
46 | col = ['user_id', 'product_id', 'order_number', 'days_since_prior_order']
47 | 
48 | for uid,pid,onb,days in log_[col].values:
49 | # uid = str(uid)
50 | # pid = str(pid)
51 | if uid_bk is None:
52 | pass
53 | elif uid+'@'+pid in uid_pid:
54 | continue
55 | elif days == 0 and uid == uid_bk and pid == pid_bk and onb-onb_bk==1:
56 | uid_pid[uid+'@'+pid] = 1
57 | 
58 | uid_bk = uid
59 | pid_bk = pid
60 | onb_bk = onb
61 | 
62 | df = pd.DataFrame().from_dict(uid_pid, orient='index').reset_index()
63 | df.columns = ['uidpid', 'buy_within_sameday']
64 | df['user_id'] = df.uidpid.map(lambda x:x.split('@')[0])
65 | df['product_id'] = df.uidpid.map(lambda x:x.split('@')[1])
66 | 
67 | df = df[['user_id', 'product_id', 'buy_within_sameday']]
68 | for c in df.columns:
69 | df[c] = df[c].map(int)
70 | df.sort_values(df.columns.tolist(), inplace=True)
71 | df.reset_index(drop=1, inplace=1)
72 | 
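# buy_within_sameday == 1 iff the user once bought the item in two consecutive
# orders placed on the same day (days_since_prior_order == 0); only such
# (user, product) pairs end up in df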
73 | df.to_pickle('../feature/{}/f310_user-product.p'.format(folder))
74 | 
75 | #==============================================================================
76 | # main
77 | #==============================================================================
78 | make(0)
79 | make(1)
80 | make(2)
81 | 
82 | make(-1)
83 | 
84 | #==============================================================================
85 | utils.end(__file__)
86 | 
87 | 
-------------------------------------------------------------------------------- /py_feature/312_cycle.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Mon Jun 26 10:35:09 2017
5 | 
6 | @author: konodera
7 | 
8 | The user's reorder cycle (in days) for this item
9 | 
10 | """
11 | 
12 | import pandas as pd
13 | import numpy as np
14 | from tqdm import tqdm
15 | import utils
16 | utils.start(__file__)
17 | 
18 | 
19 | #==============================================================================
20 | # load
21 | #==============================================================================
22 | usecols = [ 'order_id', 'user_id', 'product_id', 'order_number', 'reordered', 'order_number_rev']
23 | log = pd.merge(utils.read_pickles('../input/mk/log', usecols),
24 | utils.read_pickles('../input/mk/days_since_last_order'),
25 | on=['order_id','product_id'], how='left')
26 | 
27 | 
28 | #==============================================================================
29 | # def
30 | #==============================================================================
31 | def make(T):
32 | """
33 | T = 0
34 | folder = 'trainT-0'
35 | """
36 | if T==-1:
37 | folder = 'test'
38 | else:
39 | folder = 'trainT-'+str(T)
40 | 
41 | log_ = log[log.order_number_rev>T]
42 | 
43 | key = ['user_id', 'product_id']
44 | tbl = log_.groupby(key).days_since_last_order_this_item.mean().to_frame()
45 | tbl.columns = ['useritem_order_days_mean']
46 | tbl['useritem_order_days_min'] = log_.groupby(key).days_since_last_order_this_item.min()
47 | tbl['useritem_order_days_max'] = log_.groupby(key).days_since_last_order_this_item.max()
48 | tbl['useritem_order_days_median'] = log_.groupby(key).days_since_last_order_this_item.median()
49 | 
50 | tbl.reset_index().to_pickle('../feature/{}/f312_user_product.p'.format(folder))
51 | 
52 | # === near5 ===
53 | log_ = log[log.order_number_rev>T][log.order_number_rev<=(T+5)]
54 | 
55 | key = ['user_id', 'product_id']
56 | tbl = log_.groupby(key).days_since_last_order_this_item.mean().to_frame()
57 | tbl.columns = ['useritem_order_days_mean_n5']
58 | tbl['useritem_order_days_min_n5'] = log_.groupby(key).days_since_last_order_this_item.min()
59 | tbl['useritem_order_days_max_n5'] = log_.groupby(key).days_since_last_order_this_item.max()
60 | tbl['useritem_order_days_median_n5'] = log_.groupby(key).days_since_last_order_this_item.median()
61 | 
62 | tbl.reset_index().to_pickle('../feature/{}/f312_user_product_n5.p'.format(folder))
63 | 
64 | #==============================================================================
65 | # main
66 | #==============================================================================
67 | make(0)
68 | make(1)
69 | make(2)
70 | 
71 | make(-1)
72 | 
73 | 
74 | 
75 | #==============================================================================
76 | utils.end(__file__)
77 | 
78 | 
-------------------------------------------------------------------------------- /py_feature/313_aisle_dep.py: --------------------------------------------------------------------------------
1 | 
#!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed Jul 12 00:48:08 2017 5 | 6 | @author: konodera 7 | 8 | aisle & department 9 | 10 | """ 11 | 12 | import pandas as pd 13 | import numpy as np 14 | from tqdm import tqdm 15 | import utils 16 | utils.start(__file__) 17 | 18 | 19 | #============================================================================== 20 | # load 21 | #============================================================================== 22 | usecols = [ 'order_id', 'user_id', 'product_id', 'order_number', 'order_number_rev'] 23 | log = utils.read_pickles('../input/mk/log', usecols) 24 | 25 | goods = pd.read_pickle('../input/mk/goods.p')[['product_id', 'aisle_id', 'department_id']] 26 | 27 | log = pd.merge(log, goods, on='product_id', how='left') 28 | 29 | #============================================================================== 30 | # def 31 | #============================================================================== 32 | def make(T): 33 | """ 34 | T = 0 35 | folder = 'trainT-0' 36 | """ 37 | if T==-1: 38 | folder = 'test' 39 | else: 40 | folder = 'trainT-'+str(T) 41 | 42 | log_ = log[log.order_number_rev>T] 43 | 44 | user = log_.groupby(['user_id']).size().to_frame() 45 | user.columns = ['total'] 46 | user.reset_index(inplace=True) 47 | 48 | user_aisle = log_.groupby(['user_id', 'aisle_id']).size().to_frame() 49 | user_aisle.columns = ['user_aisle_cnt'] 50 | user_aisle.reset_index(inplace=True) 51 | user_aisle = pd.merge(user_aisle, user, on='user_id', how='left') 52 | user_aisle['user_aisle_ratio'] = user_aisle.user_aisle_cnt / user_aisle.total 53 | user_aisle.drop('total', axis=1, inplace=True) 54 | user_aisle.to_pickle('../feature/{}/f313_user_aisle.p'.format(folder)) 55 | 56 | user_dep = log_.groupby(['user_id', 'department_id']).size().to_frame() 57 | user_dep.columns = ['user_dep_cnt'] 58 | user_dep.reset_index(inplace=True) 59 | user_dep = pd.merge(user, user_dep, on='user_id', how='left') 60 | user_dep['user_dep_ratio'] = user_dep.user_dep_cnt / user_dep.total 61 | user_dep.drop('total', axis=1, inplace=True) 62 | user_dep.to_pickle('../feature/{}/f313_user_dep.p'.format(folder)) 63 | 64 | #============================================================================== 65 | # main 66 | #============================================================================== 67 | make(0) 68 | make(1) 69 | make(2) 70 | 71 | make(-1) 72 | 73 | 74 | 75 | #============================================================================== 76 | utils.end(__file__) 77 | 78 | -------------------------------------------------------------------------------- /py_feature/314_co-occur.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sun Jul 16 16:02:01 2017 5 | 6 | @author: konodera 7 | """ 8 | 9 | import pandas as pd 10 | import numpy as np 11 | from tqdm import tqdm 12 | import utils 13 | utils.start(__file__) 14 | 15 | 16 | col = ['order_id', 'user_id', 'product_id', 'order_number', 'order_number_rev'] 17 | log = utils.read_pickles('../input/mk/log', col) 18 | 19 | #============================================================================== 20 | # def 21 | #============================================================================== 22 | def make(T): 23 | """ 24 | T = 0 25 | folder = 'trainT-0' 26 | """ 27 | if T==-1: 28 | folder = 'test' 29 | else: 30 | folder = 'trainT-'+str(T) 31 | 32 | log_ = log[log.order_number_rev>T] 33 | 34 | 
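# "co-occur" here means basket size: for each (user, product), summarize the
# sizes of the user's baskets that contained the product, then (below) compare
# them with the user's overall min/max basket size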
order_tbl = log_.groupby('order_id').size().to_frame()
35 | order_tbl.columns = ['order_size']
36 | order_tbl.reset_index(inplace=True)
37 | 
38 | order_tbl = pd.merge(order_tbl, log_[['order_id', 'user_id', 'product_id']])
39 | 
40 | col = ['user_id', 'product_id']
41 | tbl = log_.sort_values(col).drop_duplicates(col)[col]
42 | tbl = tbl.set_index(col)
43 | 
44 | gr = order_tbl.groupby(['user_id', 'product_id'])
45 | 
46 | tbl['useritem_cooccur-min'] = gr.order_size.min()
47 | tbl['useritem_cooccur-max'] = gr.order_size.max()
48 | tbl['useritem_cooccur-mean'] = gr.order_size.mean()
49 | tbl['useritem_cooccur-median'] = gr.order_size.median()
50 | tbl['useritem_cooccur-std'] = gr.order_size.std()
51 | tbl.reset_index(inplace=True)
52 | 
53 | user_osz = order_tbl.groupby(['user_id']).order_size.min().to_frame()
54 | user_osz.columns = ['user_order_size-min']
55 | user_osz['user_order_size-max'] = order_tbl.groupby(['user_id']).order_size.max()
56 | user_osz.reset_index(inplace=True)
57 | 
58 | tbl = pd.merge(tbl, user_osz, on='user_id', how='left')
59 | 
60 | tbl['useritem_cooccur-min-min'] = tbl['user_order_size-min'] - tbl['useritem_cooccur-min']
61 | tbl['useritem_cooccur-max-min'] = tbl['useritem_cooccur-max'] - tbl['useritem_cooccur-min']
62 | tbl['useritem_cooccur-max-max'] = tbl['user_order_size-max'] - tbl['useritem_cooccur-max']
63 | tbl.drop(['user_order_size-min', 'user_order_size-max'], axis=1, inplace=True)
64 | 
65 | tbl.to_pickle('../feature/{}/f314_user-product.p'.format(folder))
66 | 
67 | #==============================================================================
68 | # main
69 | #==============================================================================
70 | make(0)
71 | make(1)
72 | make(2)
73 | 
74 | make(-1)
75 | 
76 | 
77 | utils.end(__file__)
78 | 
79 | 
-------------------------------------------------------------------------------- /py_feature/315_streak.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Wed May 31 02:10:45 2017
5 | 
6 | @author: konodera
7 | 
8 | Current consecutive-purchase streak (as of this order)
9 | *leaky
10 | 
11 | """
12 | 
13 | import pandas as pd
14 | import numpy as np
15 | from tqdm import tqdm
16 | import multiprocessing as mp
17 | import utils
18 | utils.start(__file__)
19 | 
20 | 
21 | streak = pd.read_pickle('../input/mk/streak_order-product.p')
22 | X_base = pd.read_pickle('../feature/X_base_t3.p')
23 | 
24 | #==============================================================================
25 | # def
26 | #==============================================================================
27 | def multi(T):
28 | """
29 | T = 0
30 | folder = 'trainT-0'
31 | """
32 | if T==-1:
33 | folder = 'test'
34 | else:
35 | folder = 'trainT-'+str(T)
36 | 
37 | label = pd.read_pickle('../feature/{}/label_reordered.p'.format(folder))
38 | label = pd.merge(label, X_base, on='order_id', how='inner')
39 | 
40 | # ======== T-1~3 ========
41 | for t in range(1,4):
42 | 
43 | df = pd.merge(label, streak.rename(columns={'order_id':'t-{}_order_id'.format(t),
44 | 'streak':'t-{}_streak'.format(t)}),
45 | on=['t-{}_order_id'.format(t),'product_id'], how='left')
46 | 
47 | print(df.isnull().sum())
48 | df.fillna(-99, inplace=1)
49 | df.reset_index(drop=1, inplace=1)
50 | 
51 | col = ['order_id', 'product_id', 't-{}_streak'.format(t)]
52 | df[col].to_pickle('../feature/{}/f315-{}_order-product.p'.format(folder, t))
53 | 
54 | #==============================================================================
55 | # main
56 | #============================================================================== 57 | mp_pool = mp.Pool(3) 58 | callback = mp_pool.map(multi, list(range(-1,3))) 59 | 60 | #============================================================================== 61 | utils.end(__file__) 62 | 63 | -------------------------------------------------------------------------------- /py_feature/316_replacement.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed Jul 5 22:36:10 2017 5 | 6 | @author: konodera 7 | 8 | 9 | nohup python -u 316_replacement.py & 10 | 11 | 12 | """ 13 | 14 | import pandas as pd 15 | import gc 16 | import numpy as np 17 | from tqdm import tqdm 18 | from collections import defaultdict 19 | from itertools import product 20 | import utils 21 | utils.start(__file__) 22 | 23 | #============================================================================== 24 | # load 25 | #============================================================================== 26 | 27 | usecols = ['user_id', 'order_number', 'product_id', 'product_name', 'order_id', 'order_number_rev'] 28 | log = utils.read_pickles('../input/mk/log', usecols).sort_values(usecols[:3]) 29 | order_pids = log.groupby('order_id').product_id.apply(set).reset_index() 30 | 31 | #item = pd.read_pickle('../input/mk/replacement2.p').head(999) 32 | item = pd.read_pickle('../input/mk/replacement.p') 33 | item = item[item.back>9] 34 | 35 | # parse 36 | item_di = defaultdict(int) 37 | for pid1,pid2,ratio in item[['pid1', 'pid2', 'ratio']].values: 38 | item_di['{} {}'.format(int(pid1),int(pid2))] = ratio 39 | #============================================================================== 40 | # def 41 | #============================================================================== 42 | def make(T): 43 | """ 44 | T = 0 45 | folder = 'trainT-0' 46 | """ 47 | if T==-1: 48 | folder = 'test' 49 | else: 50 | folder = 'trainT-'+str(T) 51 | 52 | X_base = pd.read_pickle('../feature/X_base_t3.p') 53 | label = pd.read_pickle('../feature/{}/label_reordered.p'.format(folder)) 54 | 55 | # 'inner' for removing t-n_order_id == NaN 56 | if 'train' in folder: 57 | df = pd.merge(X_base[X_base.is_train==1], label, on='order_id', how='inner') 58 | elif folder == 'test': 59 | df = pd.merge(X_base[X_base.is_train==0], label, on='order_id', how='inner') 60 | 61 | df = pd.merge(df, 62 | order_pids.add_prefix('t-1_'), 63 | on='t-1_order_id', how='left') 64 | df = pd.merge(df, 65 | order_pids.add_prefix('t-2_'), 66 | on='t-2_order_id', how='left') 67 | 68 | ratio_min = [] 69 | ratio_mean = [] 70 | ratio_max = [] 71 | ratio_sum = [] 72 | ratio_len = [] 73 | for t_2,t_1,pid in tqdm(df[['t-2_product_id', 't-1_product_id', 'product_id']].values, miniters=99999): 74 | rep = t_1 - t_2 75 | if pid not in t_1 and pid in t_2 and len(rep)>0: 76 | ratios = [item_di['{} {}'.format(i1,i2)] for i1,i2 in list(product([pid], rep))] 77 | ratio_min.append(np.min(ratios)) 78 | ratio_mean.append(np.mean(ratios)) 79 | ratio_max.append(np.max(ratios)) 80 | ratio_sum.append(np.sum(ratios)) 81 | ratio_len.append(len(ratios)) 82 | else: 83 | ratio_min.append(-1) 84 | ratio_mean.append(-1) 85 | ratio_max.append(-1) 86 | ratio_sum.append(-1) 87 | ratio_len.append(-1) 88 | 89 | df['comeback_ratio_min'] = ratio_min 90 | df['comeback_ratio_mean'] = ratio_mean 91 | df['comeback_ratio_max'] = ratio_max 92 | df['comeback_ratio_sum'] = ratio_sum 93 | df['comeback_ratio_len'] = ratio_len 94 | 
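# comeback_ratio_*: for a product seen at t-2 but dropped at t-1, aggregate the
# replacement-table ratios between it and the items that newly appeared at t-1
# (rep = t_1 - t_2); -1 marks rows with no such replacement evidence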
95 | col = ['order_id', 'product_id', 'comeback_ratio_min', 'comeback_ratio_mean', 96 | 'comeback_ratio_max', 'comeback_ratio_sum', 'comeback_ratio_len'] 97 | df[col].to_pickle('../feature/{}/f316_order_product.p'.format(folder)) 98 | del df 99 | gc.collect() 100 | 101 | #============================================================================== 102 | # main 103 | #============================================================================== 104 | make(0) 105 | make(1) 106 | make(2) 107 | 108 | make(-1) 109 | 110 | #============================================================================== 111 | utils.end(__file__) 112 | 113 | -------------------------------------------------------------------------------- /py_feature/400_===== daytime =====: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KazukiOnodera/Instacart/416b6b0220d3aed62c8d323caa3ee46f4b614a72/py_feature/400_===== daytime ===== -------------------------------------------------------------------------------- /py_feature/401_how_many_come.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sun Jun 18 01:09:41 2017 5 | 6 | @author: konodera 7 | """ 8 | 9 | import pandas as pd 10 | import numpy as np 11 | from tqdm import tqdm 12 | import utils 13 | utils.start(__file__) 14 | 15 | 16 | #============================================================================== 17 | # load 18 | #============================================================================== 19 | col = ['order_id', 'user_id', 'product_id', 'order_number', 'order_dow', 'order_hour_of_day', 'order_number_rev'] 20 | log = utils.read_pickles('../input/mk/log', col).sort_values(['user_id', 'product_id', 'order_number']) 21 | 22 | 23 | #============================================================================== 24 | # def 25 | #============================================================================== 26 | def make(T): 27 | """ 28 | T = 0 29 | folder = 'trainT-0' 30 | """ 31 | if T==-1: 32 | folder = 'test' 33 | else: 34 | folder = 'trainT-'+str(T) 35 | 36 | log_ = log[log.order_number_rev>T] 37 | 38 | # dow 39 | dow = log_.drop_duplicates('order_id').groupby('order_dow').size() 40 | dow.name = 'dow_order_cnt' 41 | dow = dow.to_frame() 42 | 43 | dow['dow_item_cnt'] = log_.groupby('order_dow').size() 44 | 45 | dow /= dow.sum() 46 | 47 | dow['dow_rank_diff'] = dow.dow_order_cnt.rank() - dow.dow_item_cnt.rank() 48 | 49 | dow.reset_index().to_pickle('../feature/{}/f401_dow.p'.format(folder)) 50 | 51 | 52 | # hour 53 | hour = log_.drop_duplicates('order_id').groupby('order_hour_of_day').size() 54 | hour.name = 'hour_order_cnt' 55 | hour = hour.to_frame() 56 | 57 | hour['hour_item_cnt'] = log_.groupby('order_hour_of_day').size() 58 | 59 | hour /= hour.sum() 60 | 61 | hour['hour_rank_diff'] = hour.hour_order_cnt.rank() - hour.hour_item_cnt.rank() 62 | 63 | hour.reset_index().to_pickle('../feature/{}/f401_hour.p'.format(folder)) 64 | 65 | #============================================================================== 66 | # main 67 | #============================================================================== 68 | make(0) 69 | make(1) 70 | make(2) 71 | 72 | make(-1) 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | #============================================================================== 81 | utils.end(__file__) 82 | 83 | 
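# f401 contrasts, per dow/hour slot, the share of orders with the share of
# items bought; a positive rank_diff marks slots where people order often but
# buy few items per visit. Toy illustration (hypothetical numbers):
#   dow_order_cnt = [0.20, 0.10] -> ranks [2, 1]
#   dow_item_cnt  = [0.15, 0.15] -> ranks [1.5, 1.5] (ties get average rank)
#   dow_rank_diff = [+0.5, -0.5]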
-------------------------------------------------------------------------------- /py_feature/500_===== concat =====: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KazukiOnodera/Instacart/416b6b0220d3aed62c8d323caa3ee46f4b614a72/py_feature/500_===== concat ===== -------------------------------------------------------------------------------- /py_feature/900_===== run =====: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KazukiOnodera/Instacart/416b6b0220d3aed62c8d323caa3ee46f4b614a72/py_feature/900_===== run ===== -------------------------------------------------------------------------------- /py_feature/901_run_feature.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Thu May 18 23:13:37 2017 5 | 6 | @author: konodera 7 | 8 | 9 | nohup python -u 901_run_feature.py > log_run_feature.txt & 10 | 11 | 12 | from glob import glob 13 | files = [f for f in sorted(glob('*.py')) if f[0].isdigit()] 14 | for f in files: 15 | print("os.system('python -u {}')".format(f)) 16 | 17 | 18 | """ 19 | 20 | import os 21 | import utils 22 | utils.start(__file__) 23 | 24 | utils.mkdir_p('../input/mk') 25 | utils.mkdir_p('../output') 26 | utils.mkdir_p('../output/model') 27 | utils.mkdir_p('../output/sub') 28 | utils.mkdir_p('../output/imp') 29 | utils.mkdir_p('../feature') 30 | utils.mkdir_p('../feature/trainT-0') 31 | utils.mkdir_p('../feature/trainT-1') 32 | utils.mkdir_p('../feature/trainT-2') 33 | utils.mkdir_p('../feature/test') 34 | 35 | 36 | os.system('python -u 000_mk.py') 37 | os.system('python -u 003_X_base_T.py') 38 | os.system('python -u 004_label.py') 39 | os.system('python -u 005_inarow.py') 40 | os.system('python -u 006_days_since_last_order.py') 41 | os.system('python -u 007_timezone.py') 42 | os.system('python -u 008_product_feature.py') 43 | os.system('python -u 009_None.py') 44 | os.system('python -u 010_streak.py') 45 | os.system('python -u 011_replacement.py') 46 | os.system('python -u 012_aisle_dep_cumsum.py') 47 | 48 | os.system('nohup python -u 101_repeat_previous_ratio_T.py &') 49 | os.system('python -u 102_orderspan_average.py') 50 | os.system('nohup python -u 103_visit_time.py &') 51 | os.system('python -u 104_organic.py') 52 | os.system('python -u 105_delta_time.py') 53 | os.system('python -u 108_order_size.py') 54 | os.system('python -u 109_have_you_bought.py') 55 | os.system('python -u 110_None.py') 56 | 57 | os.system('nohup python -u 202_buy_time.py &') 58 | os.system('python -u 203_cycle.py') 59 | os.system('nohup python -u 205_co-occur.py &') 60 | os.system('python -u 207_mean_pos_cart.py') 61 | os.system('python -u 208_one-shot.py') 62 | os.system('python -u 209_together.py') 63 | os.system('nohup python -u 210_streak.py &') 64 | os.system('nohup python -u 211_1to1.py &') 65 | os.system('nohup python -u 212_withinN.py &') 66 | os.system('nohup python -u 213_dow_diff.py &') 67 | os.system('nohup python -u 214_first_order.py &') 68 | os.system('nohup python -u 215_onb_diff.py &') 69 | 70 | os.system('python -u 301_total_buy.py') 71 | os.system('nohup python -u 302-1_reorderd_all.py &') 72 | os.system('nohup python -u 303_last_order_date.py &') 73 | os.system('nohup python -u 304_buy_item_inarow.py &') 74 | os.system('nohup python -u 305_last_order_num.py &') 75 | os.system('nohup python -u 306_mean_pos_cart.py &') 76 | 
os.system('nohup python -u 307_timezone_dow.py &') 77 | os.system('nohup python -u 308_timezone_dow.py &') 78 | os.system('nohup python -u 309_order_ratio_by-chance.py &') 79 | os.system('python -u 310_repeat_within_today.py') 80 | os.system('python -u 312_cycle.py') 81 | os.system('python -u 313_aisle_dep.py') 82 | os.system('python -u 314_co-occur.py') 83 | os.system('nohup python -u 315_streak.py &') 84 | os.system('nohup python -u 316_replacement.py &') 85 | 86 | os.system('python -u 401_how_many_come.py') 87 | 88 | 89 | 90 | #============================================================================== 91 | utils.end(__file__) 92 | -------------------------------------------------------------------------------- /py_feature/902_run_concat.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Thu May 18 23:13:37 2017 5 | 6 | @author: konodera 7 | 8 | 9 | 10 | nohup python -u 902_run_concat.py > log_run_concat.txt & 11 | 12 | 13 | """ 14 | 15 | import os 16 | import utils 17 | utils.start(__file__) 18 | 19 | os.system('python -u 501_concat.py') 20 | os.system('python -u 502_concat.py') 21 | 22 | 23 | 24 | utils.end(__file__) -------------------------------------------------------------------------------- /py_feature/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sat Mar 17 19:37:49 2018 5 | 6 | @author: Kazuki 7 | """ 8 | 9 | import os 10 | from time import sleep 11 | import sys 12 | argv = sys.argv 13 | 14 | file = argv[1] 15 | if len(argv)>2: 16 | sec = 60 * int(argv[2]) 17 | print(f'wait {sec} sec') 18 | else: 19 | sec = 0 20 | 21 | sleep(sec) 22 | os.system(f'nohup python -u {file} > LOG/log_{file}.txt &') 23 | 24 | -------------------------------------------------------------------------------- /py_feature/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed May 17 01:21:53 2017 4 | 5 | @author: konodera 6 | """ 7 | 8 | import warnings 9 | warnings.filterwarnings("ignore") 10 | import pandas as pd 11 | import numpy as np 12 | from glob import glob 13 | import os 14 | from tqdm import tqdm 15 | from sklearn.model_selection import KFold 16 | #import pickle 17 | from time import time 18 | from datetime import datetime 19 | import gc 20 | #from itertools import chain 21 | 22 | 23 | # ============================================================================= 24 | # def 25 | # ============================================================================= 26 | def start(fname): 27 | global st_time 28 | st_time = time() 29 | print(""" 30 | #============================================================================== 31 | # START!!! {} PID: {} time: {} 32 | #============================================================================== 33 | """.format( fname, os.getpid(), datetime.today() )) 34 | 35 | # send_line(f'START {fname} time: {elapsed_minute():.2f}min') 36 | 37 | return 38 | 39 | def end(fname): 40 | 41 | print(""" 42 | #============================================================================== 43 | # SUCCESS !!! 
{} 44 | #============================================================================== 45 | """.format(fname)) 46 | print('time: {:.2f}min'.format( elapsed_minute() )) 47 | 48 | # send_line(f'FINISH {fname} time: {elapsed_minute():.2f}min') 49 | 50 | return 51 | 52 | def elapsed_minute(): 53 | return (time() - st_time)/60 54 | 55 | def mkdir_p(path): 56 | try: 57 | os.stat(path) 58 | except: 59 | os.mkdir(path) 60 | 61 | def to_pickles(df, path, split_size=3, inplace=True): 62 | """ 63 | path = '../output/mydf' 64 | 65 | write '../output/mydf/0.p' 66 | '../output/mydf/1.p' 67 | '../output/mydf/2.p' 68 | 69 | """ 70 | if inplace==True: 71 | df.reset_index(drop=True, inplace=True) 72 | else: 73 | df = df.reset_index(drop=True) 74 | gc.collect() 75 | mkdir_p(path) 76 | 77 | kf = KFold(n_splits=split_size) 78 | for i, (train_index, val_index) in enumerate(tqdm(kf.split(df))): 79 | df.iloc[val_index].to_pickle(f'{path}/{i:03d}.p') 80 | return 81 | 82 | def read_pickles(path, col=None): 83 | if col is None: 84 | df = pd.concat([pd.read_pickle(f) for f in tqdm(sorted(glob(path+'/*')))]) 85 | else: 86 | df = pd.concat([pd.read_pickle(f)[col] for f in tqdm(sorted(glob(path+'/*')))]) 87 | return df 88 | 89 | def reduce_memory(df, ix_start=0): 90 | df.fillna(-1, inplace=True) 91 | df_ = df.sample(9999, random_state=71) 92 | ## int 93 | col_int8 = [] 94 | col_int16 = [] 95 | col_int32 = [] 96 | for c in tqdm(df.columns[ix_start:], miniters=20): 97 | if df[c].dtype=='O': 98 | continue 99 | if (df_[c] == df_[c].astype(np.int8)).all(): 100 | col_int8.append(c) 101 | elif (df_[c] == df_[c].astype(np.int16)).all(): 102 | col_int16.append(c) 103 | elif (df_[c] == df_[c].astype(np.int32)).all(): 104 | col_int32.append(c) 105 | 106 | df[col_int8] = df[col_int8].astype(np.int8) 107 | df[col_int16] = df[col_int16].astype(np.int16) 108 | df[col_int32] = df[col_int32].astype(np.int32) 109 | 110 | ## float 111 | col = [c for c in df.dtypes[df.dtypes==np.float64].index if '_id' not in c] 112 | df[col] = df[col].astype(np.float32) 113 | 114 | gc.collect() 115 | 116 | #============================================================================== 117 | # main 118 | #============================================================================== 119 | if __name__ == "__main__": 120 | 121 | files = sorted(glob('../input/*')) 122 | data = {} 123 | for f in files: 124 | if os.path.isfile(f): 125 | data[f.split('/')[-1]] = pd.read_csv(f) 126 | 127 | print(""" 128 | #============================================================================== 129 | # SUCCESS !!! 
{} 130 | #============================================================================== 131 | """.format(__file__)) 132 | 133 | -------------------------------------------------------------------------------- /py_model/000_====== user x item prediction ======: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KazukiOnodera/Instacart/416b6b0220d3aed62c8d323caa3ee46f4b614a72/py_model/000_====== user x item prediction ====== -------------------------------------------------------------------------------- /py_model/002_xgb_holdout_item_812_1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue May 30 23:28:19 2017 5 | 6 | @author: konodera 7 | 8 | nohup python -u 002_xgb_holdout_item_812_1.py > LOG/_xgb_item.txt & 9 | 10 | 11 | """ 12 | 13 | import warnings 14 | warnings.filterwarnings("ignore") 15 | import pandas as pd 16 | import numpy as np 17 | import gc 18 | import xgboost as xgb 19 | import utils 20 | 21 | utils.start(__file__) 22 | 23 | 24 | 25 | # setting 26 | DATE = '812_1' 27 | LOOP = 2 28 | ESR = 40 29 | 30 | #seed = np.random.randint(99999) 31 | seed = 71 32 | 33 | np.random.seed(seed) 34 | 35 | valid_size = 0.05 36 | 37 | 38 | # XGB param 39 | nround = 10000 40 | #nround = 10 41 | 42 | param = {'max_depth':10, 43 | 'eta':0.02, 44 | 'colsample_bytree':0.4, 45 | 'subsample':0.75, 46 | 'silent':1, 47 | 'nthread':27, 48 | 'eval_metric':'logloss', 49 | 'objective':'binary:logistic', 50 | 'tree_method':'hist' 51 | } 52 | 53 | print("""#==== print param ======""") 54 | print('DATE:', DATE) 55 | print('seed:', seed) 56 | 57 | #============================================================================== 58 | # prepare 59 | #============================================================================== 60 | train = pd.concat([utils.load_pred_item('trainT-0'), 61 | utils.load_pred_item('trainT-1'), 62 | utils.load_pred_item('trainT-2') 63 | ], ignore_index=True) 64 | 65 | y_train = train['y'] 66 | X_train = train.drop('y', axis=1) 67 | del train 68 | gc.collect() 69 | 70 | # drop id 71 | col = [c for c in X_train.columns if '_id' in c] + ['is_train'] 72 | col.remove('user_id') 73 | print('drop1',col) 74 | X_train.drop(col, axis=1, inplace=True) # keep user_id 75 | 76 | # drop obj 77 | col = X_train.dtypes[X_train.dtypes=='object'].index.tolist() 78 | print('drop2',col) 79 | X_train.drop(col, axis=1, inplace=True) 80 | 81 | X_train.fillna(-1, inplace=1) 82 | 83 | #============================================================================== 84 | # SPLIT! 
85 | print('split by user') 86 | #============================================================================== 87 | train_user = X_train[['user_id']].drop_duplicates() 88 | 89 | def split_build_valid(): 90 | 91 | train_user['is_valid'] = np.random.choice([0,1], size=len(train_user), 92 | p=[1-valid_size, valid_size]) 93 | valid_n = train_user['is_valid'].sum() 94 | build_n = (train_user.shape[0] - valid_n) 95 | 96 | print('build user:{}, valid user:{}'.format(build_n, valid_n)) 97 | valid_user = train_user[train_user['is_valid']==1].user_id 98 | is_valid = X_train.user_id.isin(valid_user) 99 | 100 | dbuild = xgb.DMatrix(X_train[~is_valid].drop('user_id', axis=1), y_train[~is_valid]) 101 | dvalid = xgb.DMatrix(X_train[is_valid].drop('user_id', axis=1), label=y_train[is_valid]) 102 | watchlist = [(dbuild, 'build'),(dvalid, 'valid')] 103 | 104 | print('FINAL SHAPE') 105 | print('dbuild.shape:{} dvalid.shape:{}\n'.format((dbuild.num_row(), dbuild.num_col()), 106 | (dvalid.num_row(), dvalid.num_col()))) 107 | 108 | return dbuild, dvalid, watchlist 109 | 110 | #============================================================================== 111 | print('hold out') 112 | #============================================================================== 113 | utils.mkdir_p('../output/model/{}/'.format(DATE)) 114 | utils.mkdir_p('../output/imp/{}/'.format(DATE)) 115 | utils.mkdir_p('../output/sub/{}/'.format(DATE)) 116 | 117 | # hold out 118 | models = [] 119 | for i in range(LOOP): 120 | print('LOOP',i) 121 | dbuild, dvalid, watchlist = split_build_valid() 122 | 123 | if i==0: 124 | col_train = dbuild.feature_names 125 | 126 | model = xgb.train(param, dbuild, nround, watchlist, 127 | early_stopping_rounds=ESR, verbose_eval=5) 128 | models.append(model) 129 | model.save_model('../output/model/{}/xgb_item_{}.model'.format(DATE, i)) 130 | # VALID 131 | valid_yhat = model.predict(dvalid) 132 | print('Valid Mean:', np.mean(valid_yhat)) 133 | del dbuild, dvalid, watchlist 134 | gc.collect() 135 | 136 | del train_user, X_train, y_train 137 | gc.collect() 138 | 139 | #============================================================================== 140 | print('test') 141 | #============================================================================== 142 | test = utils.load_pred_item('test').fillna(-1) 143 | 144 | sub_test = test[['order_id', 'product_id']] 145 | 146 | dtest = xgb.DMatrix(test[col_train]) 147 | sub_test['yhat'] = 0 148 | for model in models: 149 | sub_test['yhat'] += model.predict(dtest) 150 | sub_test['yhat'] /= LOOP 151 | print('Test Mean:', sub_test['yhat'].mean()) 152 | 153 | sub_test.to_pickle('../output/sub/{}/sub_test.p'.format(DATE)) 154 | 155 | 156 | #============================================================================== 157 | utils.end(__file__) 158 | 159 | 160 | 161 | -------------------------------------------------------------------------------- /py_model/002_xgb_holdout_item_813_1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue May 30 23:28:19 2017 5 | 6 | @author: konodera 7 | 8 | nohup python -u 002_xgb_holdout_item_813_1.py > LOG/_xgb_item.txt & 9 | 10 | 11 | """ 12 | 13 | import warnings 14 | warnings.filterwarnings("ignore") 15 | import pandas as pd 16 | import numpy as np 17 | import gc 18 | import xgboost as xgb 19 | import utils 20 | 21 | utils.start(__file__) 22 | 23 | 24 | 25 | # setting 26 | DATE = '813_1' 27 | LOOP = 2 28 | ESR = 60 29 | 30 | 
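# DATE tags the output folders (model/imp/sub) for this run,
# LOOP is the number of holdout models trained and averaged,
# and ESR is passed to xgb.train as early_stopping_rounds.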
#seed = np.random.randint(99999) 31 | seed = 72 32 | 33 | np.random.seed(seed) 34 | 35 | valid_size = 0.05 36 | 37 | 38 | # XGB param 39 | nround = 10000 40 | #nround = 10 41 | 42 | param = {'max_depth':10, 43 | 'eta':0.02, 44 | 'colsample_bytree':0.4, 45 | 'subsample':0.75, 46 | 'silent':1, 47 | 'nthread':27, 48 | 'eval_metric':'logloss', 49 | 'objective':'binary:logistic', 50 | 'tree_method':'hist' 51 | } 52 | 53 | print("""#==== print param ======""") 54 | print('DATE:', DATE) 55 | print('seed:', seed) 56 | 57 | #============================================================================== 58 | # prepare 59 | #============================================================================== 60 | train = pd.concat([utils.load_pred_item('trainT-0'), 61 | utils.load_pred_item('trainT-1'), 62 | utils.load_pred_item('trainT-2') 63 | ], ignore_index=True) 64 | 65 | y_train = train['y'] 66 | X_train = train.drop('y', axis=1) 67 | del train 68 | gc.collect() 69 | 70 | # drop id 71 | col = [c for c in X_train.columns if '_id' in c] + ['is_train'] 72 | col.remove('user_id') 73 | print('drop1',col) 74 | X_train.drop(col, axis=1, inplace=True) # keep user_id 75 | 76 | # drop obj 77 | col = X_train.dtypes[X_train.dtypes=='object'].index.tolist() 78 | print('drop2',col) 79 | X_train.drop(col, axis=1, inplace=True) 80 | 81 | X_train.fillna(-1, inplace=1) 82 | 83 | #============================================================================== 84 | # SPLIT! 85 | print('split by user') 86 | #============================================================================== 87 | train_user = X_train[['user_id']].drop_duplicates() 88 | 89 | def split_build_valid(): 90 | 91 | train_user['is_valid'] = np.random.choice([0,1], size=len(train_user), 92 | p=[1-valid_size, valid_size]) 93 | valid_n = train_user['is_valid'].sum() 94 | build_n = (train_user.shape[0] - valid_n) 95 | 96 | print('build user:{}, valid user:{}'.format(build_n, valid_n)) 97 | valid_user = train_user[train_user['is_valid']==1].user_id 98 | is_valid = X_train.user_id.isin(valid_user) 99 | 100 | dbuild = xgb.DMatrix(X_train[~is_valid].drop('user_id', axis=1), y_train[~is_valid]) 101 | dvalid = xgb.DMatrix(X_train[is_valid].drop('user_id', axis=1), label=y_train[is_valid]) 102 | watchlist = [(dbuild, 'build'),(dvalid, 'valid')] 103 | 104 | print('FINAL SHAPE') 105 | print('dbuild.shape:{} dvalid.shape:{}\n'.format((dbuild.num_row(), dbuild.num_col()), 106 | (dvalid.num_row(), dvalid.num_col()))) 107 | 108 | return dbuild, dvalid, watchlist 109 | 110 | #============================================================================== 111 | print('hold out') 112 | #============================================================================== 113 | utils.mkdir_p('../output/model/{}/'.format(DATE)) 114 | utils.mkdir_p('../output/imp/{}/'.format(DATE)) 115 | utils.mkdir_p('../output/sub/{}/'.format(DATE)) 116 | 117 | # hold out 118 | models = [] 119 | for i in range(LOOP): 120 | print('LOOP',i) 121 | dbuild, dvalid, watchlist = split_build_valid() 122 | 123 | if i==0: 124 | col_train = dbuild.feature_names 125 | 126 | model = xgb.train(param, dbuild, nround, watchlist, 127 | early_stopping_rounds=ESR, verbose_eval=5) 128 | models.append(model) 129 | model.save_model('../output/model/{}/xgb_item_{}.model'.format(DATE, i)) 130 | # VALID 131 | valid_yhat = model.predict(dvalid) 132 | print('Valid Mean:', np.mean(valid_yhat)) 133 | del dbuild, dvalid, watchlist 134 | gc.collect() 135 | 136 | del train_user, X_train, y_train 137 | gc.collect() 138 | 139 | 
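# col_train, the feature names captured from the first build DMatrix,
# is reused in the test step so the test DMatrix carries the same
# columns, in the same order, that the models were trained on.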
#============================================================================== 140 | print('test') 141 | #============================================================================== 142 | test = utils.load_pred_item('test').fillna(-1) 143 | 144 | sub_test = test[['order_id', 'product_id']] 145 | 146 | dtest = xgb.DMatrix(test[col_train]) 147 | sub_test['yhat'] = 0 148 | for model in models: 149 | sub_test['yhat'] += model.predict(dtest) 150 | sub_test['yhat'] /= LOOP 151 | print('Test Mean:', sub_test['yhat'].mean()) 152 | 153 | sub_test.to_pickle('../output/sub/{}/sub_test.p'.format(DATE)) 154 | 155 | 156 | #============================================================================== 157 | utils.end(__file__) 158 | 159 | 160 | 161 | -------------------------------------------------------------------------------- /py_model/002_xgb_holdout_item_813_3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue May 30 23:28:19 2017 5 | 6 | @author: konodera 7 | 8 | nohup python -u 002_xgb_holdout_item_813_3.py > LOG/_xgb_item.txt & 9 | 10 | 11 | """ 12 | 13 | import warnings 14 | warnings.filterwarnings("ignore") 15 | import pandas as pd 16 | import numpy as np 17 | import gc 18 | import xgboost as xgb 19 | import utils 20 | 21 | utils.start(__file__) 22 | 23 | 24 | 25 | # setting 26 | DATE = '813_3' 27 | LOOP = 2 28 | ESR = 60 29 | 30 | #seed = np.random.randint(99999) 31 | seed = 73 32 | 33 | np.random.seed(seed) 34 | 35 | valid_size = 0.05 36 | 37 | 38 | # XGB param 39 | nround = 10000 40 | #nround = 10 41 | 42 | param = {'max_depth':10, 43 | 'eta':0.02, 44 | 'colsample_bytree':0.4, 45 | 'subsample':0.75, 46 | 'silent':1, 47 | 'nthread':27, 48 | 'eval_metric':'logloss', 49 | 'objective':'binary:logistic', 50 | 'tree_method':'hist' 51 | } 52 | 53 | print("""#==== print param ======""") 54 | print('DATE:', DATE) 55 | print('seed:', seed) 56 | 57 | #============================================================================== 58 | # prepare 59 | #============================================================================== 60 | train = pd.concat([utils.load_pred_item('trainT-0'), 61 | utils.load_pred_item('trainT-1'), 62 | utils.load_pred_item('trainT-2') 63 | ], ignore_index=True) 64 | 65 | y_train = train['y'] 66 | X_train = train.drop('y', axis=1) 67 | del train 68 | gc.collect() 69 | 70 | # drop id 71 | col = [c for c in X_train.columns if '_id' in c] + ['is_train'] 72 | col.remove('user_id') 73 | print('drop1',col) 74 | X_train.drop(col, axis=1, inplace=True) # keep user_id 75 | 76 | # drop obj 77 | col = X_train.dtypes[X_train.dtypes=='object'].index.tolist() 78 | print('drop2',col) 79 | X_train.drop(col, axis=1, inplace=True) 80 | 81 | X_train.fillna(-1, inplace=1) 82 | 83 | #============================================================================== 84 | # SPLIT! 
85 | print('split by user') 86 | #============================================================================== 87 | train_user = X_train[['user_id']].drop_duplicates() 88 | 89 | def split_build_valid(): 90 | 91 | train_user['is_valid'] = np.random.choice([0,1], size=len(train_user), 92 | p=[1-valid_size, valid_size]) 93 | valid_n = train_user['is_valid'].sum() 94 | build_n = (train_user.shape[0] - valid_n) 95 | 96 | print('build user:{}, valid user:{}'.format(build_n, valid_n)) 97 | valid_user = train_user[train_user['is_valid']==1].user_id 98 | is_valid = X_train.user_id.isin(valid_user) 99 | 100 | dbuild = xgb.DMatrix(X_train[~is_valid].drop('user_id', axis=1), y_train[~is_valid]) 101 | dvalid = xgb.DMatrix(X_train[is_valid].drop('user_id', axis=1), label=y_train[is_valid]) 102 | watchlist = [(dbuild, 'build'),(dvalid, 'valid')] 103 | 104 | print('FINAL SHAPE') 105 | print('dbuild.shape:{} dvalid.shape:{}\n'.format((dbuild.num_row(), dbuild.num_col()), 106 | (dvalid.num_row(), dvalid.num_col()))) 107 | 108 | return dbuild, dvalid, watchlist 109 | 110 | #============================================================================== 111 | print('hold out') 112 | #============================================================================== 113 | utils.mkdir_p('../output/model/{}/'.format(DATE)) 114 | utils.mkdir_p('../output/imp/{}/'.format(DATE)) 115 | utils.mkdir_p('../output/sub/{}/'.format(DATE)) 116 | 117 | # hold out 118 | models = [] 119 | for i in range(LOOP): 120 | print('LOOP',i) 121 | dbuild, dvalid, watchlist = split_build_valid() 122 | 123 | if i==0: 124 | col_train = dbuild.feature_names 125 | 126 | model = xgb.train(param, dbuild, nround, watchlist, 127 | early_stopping_rounds=ESR, verbose_eval=5) 128 | models.append(model) 129 | model.save_model('../output/model/{}/xgb_item_{}.model'.format(DATE, i)) 130 | # VALID 131 | valid_yhat = model.predict(dvalid) 132 | print('Valid Mean:', np.mean(valid_yhat)) 133 | del dbuild, dvalid, watchlist 134 | gc.collect() 135 | 136 | del train_user, X_train, y_train 137 | gc.collect() 138 | 139 | 140 | 141 | #============================================================================== 142 | print('test') 143 | #============================================================================== 144 | test = utils.load_pred_item('test').fillna(-1) 145 | 146 | sub_test = test[['order_id', 'product_id']] 147 | 148 | dtest = xgb.DMatrix(test[col_train]) 149 | sub_test['yhat'] = 0 150 | for model in models: 151 | sub_test['yhat'] += model.predict(dtest) 152 | sub_test['yhat'] /= LOOP 153 | print('Test Mean:', sub_test['yhat'].mean()) 154 | 155 | sub_test.to_pickle('../output/sub/{}/sub_test.p'.format(DATE)) 156 | 157 | #============================================================================== 158 | utils.end(__file__) 159 | 160 | 161 | 162 | -------------------------------------------------------------------------------- /py_model/100_====== None prediction ======: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KazukiOnodera/Instacart/416b6b0220d3aed62c8d323caa3ee46f4b614a72/py_model/100_====== None prediction ====== -------------------------------------------------------------------------------- /py_model/102_xgb_holdout_None_813_3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue May 30 23:28:19 2017 5 | 6 | @author: konodera 7 | 8 | nohup python -u 
102_xgb_holdout_None_813_3.py > LOG/_xgb_None.txt & 9 | 10 | """ 11 | 12 | import warnings 13 | warnings.filterwarnings("ignore") 14 | import pandas as pd 15 | import numpy as np 16 | import gc 17 | import xgboost as xgb 18 | import utils 19 | utils.start(__file__) 20 | 21 | 22 | 23 | # setting 24 | DATE = '813_3' 25 | LOOP = 5 26 | ESR = 40 27 | 28 | #seed = np.random.randint(99999) 29 | seed = 71 30 | 31 | np.random.seed(seed) 32 | 33 | valid_size = 0.05 34 | 35 | 36 | # XGB param 37 | nround = 10000 38 | #nround = 10 39 | 40 | param = {'max_depth':10, 41 | 'eta':0.01, 42 | 'colsample_bytree':0.5, 43 | 'subsample':0.75, 44 | 'silent':1, 45 | 'nthread':27, 46 | 'eval_metric':'logloss', 47 | 'objective':'binary:logistic', 48 | 'tree_method':'hist' 49 | } 50 | 51 | print("""#==== print param ======""") 52 | print('DATE:', DATE) 53 | print('seed:', seed) 54 | 55 | #============================================================================== 56 | # prepare 57 | #============================================================================== 58 | train = pd.concat([utils.load_pred_None('trainT-0', 3), 59 | utils.load_pred_None('trainT-1', 3), 60 | utils.load_pred_None('trainT-2', 3) 61 | ], ignore_index=True) 62 | 63 | sub_train = train[['order_id', 'y']] 64 | y_train = train['y'] 65 | X_train = train.drop('y', axis=1) 66 | del train; gc.collect() 67 | 68 | # drop id 69 | col = [c for c in X_train.columns if '_id' in c] + ['is_train'] 70 | col.remove('user_id') 71 | print('drop1',col) 72 | X_train.drop(col, axis=1, inplace=True) # keep user_id 73 | 74 | # drop obj 75 | col = X_train.dtypes[X_train.dtypes=='object'].index.tolist() 76 | print('drop2',col) 77 | X_train.drop(col, axis=1, inplace=True) 78 | 79 | X_train.fillna(-1, inplace=1) 80 | 81 | #============================================================================== 82 | # SPLIT! 
83 | print('split by user') 84 | #============================================================================== 85 | train_user = X_train[['user_id']].drop_duplicates() 86 | #utils.to_pickles(X_train, 'X_train', 10) 87 | #del X_train; gc.collect() 88 | 89 | 90 | def split_build_valid(): 91 | 92 | train_user['is_valid'] = np.random.choice([0,1], size=len(train_user), 93 | p=[1-valid_size, valid_size]) 94 | valid_n = train_user['is_valid'].sum() 95 | build_n = (train_user.shape[0] - valid_n) 96 | 97 | print('build user:{}, valid user:{}'.format(build_n, valid_n)) 98 | valid_user = train_user[train_user['is_valid']==1].user_id 99 | is_valid = X_train.user_id.isin(valid_user) 100 | 101 | dbuild = xgb.DMatrix(X_train[~is_valid].drop('user_id', axis=1), y_train[~is_valid]) 102 | dvalid = xgb.DMatrix(X_train[is_valid].drop('user_id', axis=1), label=y_train[is_valid]) 103 | watchlist = [(dbuild, 'build'),(dvalid, 'valid')] 104 | 105 | label = dbuild.get_label() 106 | scale_pos_weight = float(np.sum(label == 0)) / np.sum(label==1) 107 | 108 | print('scale_pos_weight', scale_pos_weight) 109 | print('FINAL SHAPE') 110 | print('dbuild.shape:{} dvalid.shape:{}\n'.format((dbuild.num_row(), dbuild.num_col()), 111 | (dvalid.num_row(), dvalid.num_col()))) 112 | 113 | return dbuild, dvalid, watchlist, scale_pos_weight 114 | 115 | dbuild, dvalid, watchlist, weight = split_build_valid() 116 | 117 | col_train = dbuild.feature_names 118 | #============================================================================== 119 | print('hold out') 120 | #============================================================================== 121 | utils.mkdir_p('../output/model/{}/'.format(DATE)) 122 | utils.mkdir_p('../output/imp/{}/'.format(DATE)) 123 | utils.mkdir_p('../output/sub/{}/'.format(DATE)) 124 | 125 | # hold out 126 | models = [] 127 | for i in range(LOOP): 128 | print('LOOP',i) 129 | # param['scale_pos_weight'] = weight 130 | model = xgb.train(param, dbuild, nround, watchlist, 131 | early_stopping_rounds=ESR, verbose_eval=5) 132 | models.append(model) 133 | model.save_model('../output/model/{}/xgb_None_{}.model'.format(DATE, i)) 134 | 135 | # VALID 136 | yhat = model.predict(dvalid) 137 | print('Valid Mean:', np.mean(yhat)) 138 | 139 | if i != (LOOP-1): 140 | del dbuild, dvalid, watchlist 141 | gc.collect() 142 | dbuild, dvalid, watchlist, weight = split_build_valid() 143 | 144 | 145 | del train_user, sub_train, X_train, y_train 146 | del dbuild, dvalid 147 | gc.collect() 148 | 149 | 150 | #============================================================================== 151 | print('test') 152 | #============================================================================== 153 | test = utils.load_pred_None('test', 3).fillna(-1) 154 | sub_test = test[['order_id']] 155 | 156 | dtest = xgb.DMatrix(test[col_train]) 157 | sub_test['yhat'] = 0 158 | for model in models: 159 | sub_test['yhat'] += model.predict(dtest) 160 | sub_test['yhat'] /= LOOP 161 | print('Test Mean:', sub_test['yhat'].mean()) 162 | 163 | sub_test.to_pickle('../output/sub/{}/sub_test_None.p'.format(DATE)) 164 | 165 | 166 | #============================================================================== 167 | utils.end(__file__) 168 | 169 | 170 | 171 | -------------------------------------------------------------------------------- /py_model/102_xgb_holdout_None_814_1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue May 30 23:28:19 2017 
5 | 6 | @author: konodera 7 | 8 | nohup python -u 102_xgb_holdout_None_814_1.py > LOG/_xgb_None.txt & 9 | 10 | """ 11 | 12 | import warnings 13 | warnings.filterwarnings("ignore") 14 | import pandas as pd 15 | import numpy as np 16 | import gc 17 | import xgboost as xgb 18 | import utils 19 | utils.start(__file__) 20 | 21 | 22 | 23 | # setting 24 | DATE = '814_1' 25 | LOOP = 6 26 | ESR = 50 27 | 28 | #seed = np.random.randint(99999) 29 | seed = 72 30 | 31 | np.random.seed(seed) 32 | 33 | valid_size = 0.05 34 | 35 | 36 | # XGB param 37 | nround = 10000 38 | #nround = 10 39 | 40 | param = {'max_depth':10, 41 | 'eta':0.01, 42 | 'colsample_bytree':0.5, 43 | 'subsample':0.75, 44 | 'silent':1, 45 | 'nthread':28, 46 | 'eval_metric':'logloss', 47 | 'objective':'binary:logistic', 48 | 'tree_method':'hist' 49 | } 50 | 51 | print("""#==== print param ======""") 52 | print('DATE:', DATE) 53 | print('seed:', seed) 54 | 55 | #============================================================================== 56 | # prepare 57 | #============================================================================== 58 | train = pd.concat([utils.load_pred_None('trainT-0', 3), 59 | utils.load_pred_None('trainT-1', 3), 60 | utils.load_pred_None('trainT-2', 3) 61 | ], ignore_index=True) 62 | 63 | sub_train = train[['order_id', 'y']] 64 | y_train = train['y'] 65 | X_train = train.drop('y', axis=1) 66 | del train; gc.collect() 67 | 68 | # drop id 69 | col = [c for c in X_train.columns if '_id' in c] + ['is_train'] 70 | col.remove('user_id') 71 | print('drop1',col) 72 | X_train.drop(col, axis=1, inplace=True) # keep user_id 73 | 74 | # drop obj 75 | col = X_train.dtypes[X_train.dtypes=='object'].index.tolist() 76 | print('drop2',col) 77 | X_train.drop(col, axis=1, inplace=True) 78 | 79 | X_train.fillna(-1, inplace=1) 80 | 81 | #============================================================================== 82 | # SPLIT! 
83 | print('split by user') 84 | #============================================================================== 85 | train_user = X_train[['user_id']].drop_duplicates() 86 | #utils.to_pickles(X_train, 'X_train', 10) 87 | #del X_train; gc.collect() 88 | 89 | 90 | def split_build_valid(): 91 | 92 | train_user['is_valid'] = np.random.choice([0,1], size=len(train_user), 93 | p=[1-valid_size, valid_size]) 94 | valid_n = train_user['is_valid'].sum() 95 | build_n = (train_user.shape[0] - valid_n) 96 | 97 | print('build user:{}, valid user:{}'.format(build_n, valid_n)) 98 | valid_user = train_user[train_user['is_valid']==1].user_id 99 | is_valid = X_train.user_id.isin(valid_user) 100 | 101 | dbuild = xgb.DMatrix(X_train[~is_valid].drop('user_id', axis=1), y_train[~is_valid]) 102 | dvalid = xgb.DMatrix(X_train[is_valid].drop('user_id', axis=1), label=y_train[is_valid]) 103 | watchlist = [(dbuild, 'build'),(dvalid, 'valid')] 104 | 105 | label = dbuild.get_label() 106 | scale_pos_weight = float(np.sum(label == 0)) / np.sum(label==1) 107 | 108 | print('scale_pos_weight', scale_pos_weight) 109 | print('FINAL SHAPE') 110 | print('dbuild.shape:{} dvalid.shape:{}\n'.format((dbuild.num_row(), dbuild.num_col()), 111 | (dvalid.num_row(), dvalid.num_col()))) 112 | 113 | return dbuild, dvalid, watchlist, scale_pos_weight 114 | 115 | dbuild, dvalid, watchlist, weight = split_build_valid() 116 | 117 | col_train = dbuild.feature_names 118 | #============================================================================== 119 | print('hold out') 120 | #============================================================================== 121 | utils.mkdir_p('../output/model/{}/'.format(DATE)) 122 | utils.mkdir_p('../output/imp/{}/'.format(DATE)) 123 | utils.mkdir_p('../output/sub/{}/'.format(DATE)) 124 | 125 | # hold out 126 | models = [] 127 | for i in range(LOOP): 128 | print('LOOP',i) 129 | # param['scale_pos_weight'] = weight 130 | model = xgb.train(param, dbuild, nround, watchlist, 131 | early_stopping_rounds=ESR, verbose_eval=5) 132 | models.append(model) 133 | model.save_model('../output/model/{}/xgb_None_{}.model'.format(DATE, i)) 134 | 135 | # VALID 136 | yhat = model.predict(dvalid) 137 | print('Valid Mean:', np.mean(yhat)) 138 | 139 | if i != (LOOP-1): 140 | del dbuild, dvalid, watchlist 141 | gc.collect() 142 | dbuild, dvalid, watchlist, weight = split_build_valid() 143 | 144 | 145 | del train_user, sub_train, X_train, y_train 146 | del dbuild, dvalid 147 | gc.collect() 148 | 149 | 150 | #============================================================================== 151 | print('test') 152 | #============================================================================== 153 | test = utils.load_pred_None('test', 3).fillna(-1) 154 | sub_test = test[['order_id']] 155 | 156 | dtest = xgb.DMatrix(test[col_train]) 157 | sub_test['yhat'] = 0 158 | for model in models: 159 | sub_test['yhat'] += model.predict(dtest) 160 | sub_test['yhat'] /= LOOP 161 | print('Test Mean:', sub_test['yhat'].mean()) 162 | 163 | sub_test.to_pickle('../output/sub/{}/sub_test_None.p'.format(DATE)) 164 | 165 | 166 | #============================================================================== 167 | utils.end(__file__) 168 | 169 | 170 | 171 | -------------------------------------------------------------------------------- /py_model/102_xgb_holdout_None_814_2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue May 30 23:28:19 2017 
5 | 6 | @author: konodera 7 | 8 | nohup python -u 102_xgb_holdout_None_814_2.py > LOG/_xgb_None.txt & 9 | 10 | """ 11 | 12 | import warnings 13 | warnings.filterwarnings("ignore") 14 | import pandas as pd 15 | import numpy as np 16 | import gc 17 | import xgboost as xgb 18 | import utils 19 | utils.start(__file__) 20 | 21 | 22 | 23 | # setting 24 | DATE = '814_2' 25 | LOOP = 3 26 | ESR = 60 27 | 28 | #seed = np.random.randint(99999) 29 | seed = 73 30 | 31 | np.random.seed(seed) 32 | 33 | valid_size = 0.05 34 | 35 | 36 | # XGB param 37 | nround = 20000 38 | #nround = 10 39 | 40 | param = {'max_depth':10, 41 | 'eta':0.002, 42 | 'colsample_bytree':0.5, 43 | 'subsample':0.75, 44 | 'silent':1, 45 | 'nthread':28, 46 | 'eval_metric':'logloss', 47 | 'objective':'binary:logistic', 48 | 'tree_method':'hist' 49 | } 50 | 51 | print("""#==== print param ======""") 52 | print('DATE:', DATE) 53 | print('seed:', seed) 54 | 55 | #============================================================================== 56 | # prepare 57 | #============================================================================== 58 | train = pd.concat([utils.load_pred_None('trainT-0', 3), 59 | utils.load_pred_None('trainT-1', 3), 60 | utils.load_pred_None('trainT-2', 3) 61 | ], ignore_index=True) 62 | 63 | sub_train = train[['order_id', 'y']] 64 | y_train = train['y'] 65 | X_train = train.drop('y', axis=1) 66 | del train; gc.collect() 67 | 68 | # drop id 69 | col = [c for c in X_train.columns if '_id' in c] + ['is_train'] 70 | col.remove('user_id') 71 | print('drop1',col) 72 | X_train.drop(col, axis=1, inplace=True) # keep user_id 73 | 74 | # drop obj 75 | col = X_train.dtypes[X_train.dtypes=='object'].index.tolist() 76 | print('drop2',col) 77 | X_train.drop(col, axis=1, inplace=True) 78 | 79 | X_train.fillna(-1, inplace=1) 80 | 81 | #============================================================================== 82 | # SPLIT! 
83 | print('split by user') 84 | #============================================================================== 85 | train_user = X_train[['user_id']].drop_duplicates() 86 | #utils.to_pickles(X_train, 'X_train', 10) 87 | #del X_train; gc.collect() 88 | 89 | 90 | def split_build_valid(): 91 | 92 | train_user['is_valid'] = np.random.choice([0,1], size=len(train_user), 93 | p=[1-valid_size, valid_size]) 94 | valid_n = train_user['is_valid'].sum() 95 | build_n = (train_user.shape[0] - valid_n) 96 | 97 | print('build user:{}, valid user:{}'.format(build_n, valid_n)) 98 | valid_user = train_user[train_user['is_valid']==1].user_id 99 | is_valid = X_train.user_id.isin(valid_user) 100 | 101 | dbuild = xgb.DMatrix(X_train[~is_valid].drop('user_id', axis=1), y_train[~is_valid]) 102 | dvalid = xgb.DMatrix(X_train[is_valid].drop('user_id', axis=1), label=y_train[is_valid]) 103 | watchlist = [(dbuild, 'build'),(dvalid, 'valid')] 104 | 105 | label = dbuild.get_label() 106 | scale_pos_weight = float(np.sum(label == 0)) / np.sum(label==1) 107 | 108 | print('scale_pos_weight', scale_pos_weight) 109 | print('FINAL SHAPE') 110 | print('dbuild.shape:{} dvalid.shape:{}\n'.format((dbuild.num_row(), dbuild.num_col()), 111 | (dvalid.num_row(), dvalid.num_col()))) 112 | 113 | return dbuild, dvalid, watchlist, scale_pos_weight 114 | 115 | dbuild, dvalid, watchlist, weight = split_build_valid() 116 | 117 | col_train = dbuild.feature_names 118 | #============================================================================== 119 | print('hold out') 120 | #============================================================================== 121 | utils.mkdir_p('../output/model/{}/'.format(DATE)) 122 | utils.mkdir_p('../output/imp/{}/'.format(DATE)) 123 | utils.mkdir_p('../output/sub/{}/'.format(DATE)) 124 | 125 | # hold out 126 | models = [] 127 | for i in range(LOOP): 128 | print('LOOP',i) 129 | # param['scale_pos_weight'] = weight 130 | model = xgb.train(param, dbuild, nround, watchlist, 131 | early_stopping_rounds=ESR, verbose_eval=5) 132 | models.append(model) 133 | model.save_model('../output/model/{}/xgb_None_{}.model'.format(DATE, i)) 134 | 135 | # VALID 136 | yhat = model.predict(dvalid) 137 | print('Valid Mean:', np.mean(yhat)) 138 | 139 | if i != (LOOP-1): 140 | del dbuild, dvalid, watchlist 141 | gc.collect() 142 | dbuild, dvalid, watchlist, weight = split_build_valid() 143 | 144 | 145 | del train_user, sub_train, X_train, y_train 146 | del dbuild, dvalid 147 | gc.collect() 148 | 149 | #============================================================================== 150 | print('test') 151 | #============================================================================== 152 | test = utils.load_pred_None('test', 3).fillna(-1) 153 | sub_test = test[['order_id']] 154 | 155 | dtest = xgb.DMatrix(test[col_train]) 156 | sub_test['yhat'] = 0 157 | for model in models: 158 | sub_test['yhat'] += model.predict(dtest) 159 | sub_test['yhat'] /= LOOP 160 | print('Test Mean:', sub_test['yhat'].mean()) 161 | 162 | sub_test.to_pickle('../output/sub/{}/sub_test_None.p'.format(DATE)) 163 | 164 | #============================================================================== 165 | utils.end(__file__) 166 | 167 | 168 | 169 | -------------------------------------------------------------------------------- /py_model/102_xgb_holdout_None_814_3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue May 30 23:28:19 2017 5 | 6 | 
@author: konodera 7 | 8 | nohup python -u 102_xgb_holdout_None_814_3.py > LOG/_xgb_None.txt & 9 | 10 | """ 11 | 12 | import warnings 13 | warnings.filterwarnings("ignore") 14 | import pandas as pd 15 | import numpy as np 16 | import gc 17 | import xgboost as xgb 18 | import utils 19 | utils.start(__file__) 20 | 21 | 22 | 23 | # setting 24 | DATE = '814_3' 25 | LOOP = 3 26 | ESR = 60 27 | 28 | #seed = np.random.randint(99999) 29 | seed = 74 30 | 31 | np.random.seed(seed) 32 | 33 | valid_size = 0.05 34 | 35 | 36 | # XGB param 37 | nround = 20000 38 | #nround = 10 39 | 40 | param = {'max_depth':10, 41 | 'eta':0.002, 42 | 'colsample_bytree':0.5, 43 | 'subsample':0.75, 44 | 'silent':1, 45 | 'nthread':28, 46 | 'eval_metric':'logloss', 47 | 'objective':'binary:logistic', 48 | 'tree_method':'hist' 49 | } 50 | 51 | print("""#==== print param ======""") 52 | print('DATE:', DATE) 53 | print('seed:', seed) 54 | 55 | #============================================================================== 56 | # prepare 57 | #============================================================================== 58 | train = pd.concat([utils.load_pred_None('trainT-0', 3), 59 | utils.load_pred_None('trainT-1', 3), 60 | utils.load_pred_None('trainT-2', 3) 61 | ], ignore_index=True) 62 | 63 | sub_train = train[['order_id', 'y']] 64 | y_train = train['y'] 65 | X_train = train.drop('y', axis=1) 66 | del train; gc.collect() 67 | 68 | # drop id 69 | col = [c for c in X_train.columns if '_id' in c] + ['is_train'] 70 | col.remove('user_id') 71 | print('drop1',col) 72 | X_train.drop(col, axis=1, inplace=True) # keep user_id 73 | 74 | # drop obj 75 | col = X_train.dtypes[X_train.dtypes=='object'].index.tolist() 76 | print('drop2',col) 77 | X_train.drop(col, axis=1, inplace=True) 78 | 79 | X_train.fillna(-1, inplace=1) 80 | 81 | #============================================================================== 82 | # SPLIT! 
83 | print('split by user') 84 | #============================================================================== 85 | train_user = X_train[['user_id']].drop_duplicates() 86 | #utils.to_pickles(X_train, 'X_train', 10) 87 | #del X_train; gc.collect() 88 | 89 | 90 | def split_build_valid(): 91 | 92 | train_user['is_valid'] = np.random.choice([0,1], size=len(train_user), 93 | p=[1-valid_size, valid_size]) 94 | valid_n = train_user['is_valid'].sum() 95 | build_n = (train_user.shape[0] - valid_n) 96 | 97 | print('build user:{}, valid user:{}'.format(build_n, valid_n)) 98 | valid_user = train_user[train_user['is_valid']==1].user_id 99 | is_valid = X_train.user_id.isin(valid_user) 100 | 101 | dbuild = xgb.DMatrix(X_train[~is_valid].drop('user_id', axis=1), y_train[~is_valid]) 102 | dvalid = xgb.DMatrix(X_train[is_valid].drop('user_id', axis=1), label=y_train[is_valid]) 103 | watchlist = [(dbuild, 'build'),(dvalid, 'valid')] 104 | 105 | label = dbuild.get_label() 106 | scale_pos_weight = float(np.sum(label == 0)) / np.sum(label==1) 107 | 108 | print('scale_pos_weight', scale_pos_weight) 109 | print('FINAL SHAPE') 110 | print('dbuild.shape:{} dvalid.shape:{}\n'.format((dbuild.num_row(), dbuild.num_col()), 111 | (dvalid.num_row(), dvalid.num_col()))) 112 | 113 | return dbuild, dvalid, watchlist, scale_pos_weight 114 | 115 | dbuild, dvalid, watchlist, weight = split_build_valid() 116 | 117 | col_train = dbuild.feature_names 118 | #============================================================================== 119 | print('hold out') 120 | #============================================================================== 121 | utils.mkdir_p('../output/model/{}/'.format(DATE)) 122 | utils.mkdir_p('../output/imp/{}/'.format(DATE)) 123 | utils.mkdir_p('../output/sub/{}/'.format(DATE)) 124 | 125 | # hold out 126 | models = [] 127 | for i in range(LOOP): 128 | print('LOOP',i) 129 | # param['scale_pos_weight'] = weight 130 | model = xgb.train(param, dbuild, nround, watchlist, 131 | early_stopping_rounds=ESR, verbose_eval=5) 132 | models.append(model) 133 | model.save_model('../output/model/{}/xgb_None_{}.model'.format(DATE, i)) 134 | 135 | # VALID 136 | yhat = model.predict(dvalid) 137 | print('Valid Mean:', np.mean(yhat)) 138 | 139 | if i != (LOOP-1): 140 | del dbuild, dvalid, watchlist 141 | gc.collect() 142 | dbuild, dvalid, watchlist, weight = split_build_valid() 143 | 144 | 145 | del train_user, sub_train, X_train, y_train 146 | del dbuild, dvalid 147 | gc.collect() 148 | 149 | #============================================================================== 150 | print('test') 151 | #============================================================================== 152 | test = utils.load_pred_None('test', 3).fillna(-1) 153 | sub_test = test[['order_id']] 154 | 155 | dtest = xgb.DMatrix(test[col_train]) 156 | sub_test['yhat'] = 0 157 | for model in models: 158 | sub_test['yhat'] += model.predict(dtest) 159 | sub_test['yhat'] /= LOOP 160 | print('Test Mean:', sub_test['yhat'].mean()) 161 | 162 | sub_test.to_pickle('../output/sub/{}/sub_test_None.p'.format(DATE)) 163 | 164 | #============================================================================== 165 | utils.end(__file__) 166 | 167 | 168 | 169 | -------------------------------------------------------------------------------- /py_model/200_===== threshold estimation =====: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/KazukiOnodera/Instacart/416b6b0220d3aed62c8d323caa3ee46f4b614a72/py_model/200_===== threshold estimation ===== -------------------------------------------------------------------------------- /py_model/201_Faron_opt_bagging_815_3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sat Jul 29 18:59:46 2017 5 | 6 | @author: konodera 7 | 8 | nohup python -u 201_Faron_opt_bagging_815_3.py > LOG/_Faron-opt.txt & 9 | 10 | """ 11 | 12 | import pandas as pd 13 | from opt_fscore import get_best_prediction 14 | import multiprocessing as mp 15 | import time 16 | import utils 17 | utils.start(__file__) 18 | 19 | 20 | # setting 21 | DATE_item = ['812_1', '813_1', '813_3'] 22 | 23 | DATE_None = ['813_3', '814_1', '814_2', '814_3'] 24 | 25 | total_proc = 60 26 | 27 | OUTF = "../output/sub/final/Faron-opt_bagging-v3.csv.gz" 28 | 29 | print("""#==== print param ======""") 30 | print('OUTF:', OUTF) 31 | print('DATE_item:', DATE_item) 32 | print('DATE_None:', DATE_None) 33 | print('total_proc:', total_proc) 34 | 35 | utils.mkdir_p('../output/sub/final') 36 | #============================================================================== 37 | # load 38 | #============================================================================== 39 | sub_item = pd.concat([pd.read_pickle('../output/sub/{}/sub_test.p'.format(d)) for d in DATE_item]) 40 | sub_item = sub_item.groupby(['order_id','product_id']).yhat.mean().reset_index() 41 | sub = sub_item.groupby('order_id').product_id.apply(list).to_frame() 42 | sub['yhat'] = sub_item.groupby('order_id').yhat.apply(list) 43 | 44 | # weighted 45 | for i,(w,d) in enumerate(zip([0.1, 0.1, 0.4, 0.4], DATE_None)): 46 | tmp = pd.read_pickle('../output/sub/{}/sub_test_None.p'.format(d)).rename(columns={'yhat':'yhat_None'}) 47 | tmp.yhat_None *= w 48 | if i==0: 49 | sub_None = tmp 50 | else: 51 | sub_None = pd.concat([sub_None, tmp]) 52 | 53 | sub_None = sub_None.groupby('order_id').yhat_None.sum().reset_index() 54 | 55 | sub = pd.merge(sub.reset_index(), sub_None, on='order_id', how='left') 56 | 57 | #============================================================================== 58 | # optimize 59 | #============================================================================== 60 | def multi(i): 61 | if i%1000==0: 62 | print('{:.3f} min'.format((time.time()-st_time)/60)) 63 | items = sub.loc[i,'product_id'] 64 | preds = sub.loc[i,'yhat'] 65 | pNone = sub.loc[i,'yhat_None'] 66 | ret = get_best_prediction(items, preds, pNone) 67 | return ret 68 | 69 | # start!!! 
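# mp.Pool.map returns results in input order, so callback[i] lines up
# with row i of `sub` (the merge above left a fresh RangeIndex), and the
# optimized product strings can be assigned back as a column directly.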
70 | st_time = time.time() 71 | pool = mp.Pool(total_proc) 72 | callback = pool.map(multi, range(sub.shape[0])) 73 | 74 | sub['products'] = callback 75 | 76 | print('writing...') 77 | sub[['order_id', 'products']].to_csv(OUTF, index=0, compression='gzip') 78 | 79 | #============================================================================== 80 | utils.end(__file__) 81 | 82 | -------------------------------------------------------------------------------- /py_model/999_run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon Aug 21 13:13:57 2017 5 | 6 | @author: konodera 7 | """ 8 | 9 | import os 10 | import utils 11 | utils.start(__file__) 12 | 13 | 14 | os.system('python -u 002_xgb_holdout_item_812_1.py') 15 | os.system('python -u 002_xgb_holdout_item_813_1.py') 16 | os.system('python -u 002_xgb_holdout_item_813_3.py') 17 | 18 | os.system('python -u 102_xgb_holdout_None_813_3.py') 19 | os.system('python -u 102_xgb_holdout_None_814_1.py') 20 | os.system('python -u 102_xgb_holdout_None_814_2.py') 21 | os.system('python -u 102_xgb_holdout_None_814_3.py') 22 | 23 | os.system('python -u 201_Faron_opt_bagging_815_3.py') 24 | 25 | utils.end(__file__) 26 | 27 | -------------------------------------------------------------------------------- /py_model/opt_fscore.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Faron 4 | """ 5 | import numpy as np 6 | from operator import itemgetter 7 | 8 | ''' 9 | This kernel implements the O(n²) F1-Score expectation maximization algorithm presented in 10 | "Ye, N., Chai, K., Lee, W., and Chieu, H. Optimizing F-measures: A Tale of Two Approaches. In ICML, 2012." 11 | 12 | It solves argmax_(0 <= k <= n,[[None]]) E[F1(P,k,[[None]])] 13 | with [[None]] being the indicator for predicting label "None" 14 | given posteriors P = [p_1, p_2, ... , p_n], where p_1 > p_2 > ... > p_n 15 | under label independence assumption by means of dynamic programming in O(n²). 16 | ''' 17 | 18 | 19 | class F1Optimizer(): 20 | def __init__(self): 21 | pass 22 | 23 | @staticmethod 24 | def get_expectations(P, pNone=None): 25 | expectations = [] 26 | P = np.sort(P)[::-1] 27 | 28 | n = np.array(P).shape[0] 29 | DP_C = np.zeros((n + 2, n + 1)) 30 | if pNone is None: 31 | pNone = (1.0 - P).prod() 32 | 33 | DP_C[0][0] = 1.0 34 | for j in range(1, n): 35 | DP_C[0][j] = (1.0 - P[j - 1]) * DP_C[0, j - 1] 36 | 37 | for i in range(1, n + 1): 38 | DP_C[i, i] = DP_C[i - 1, i - 1] * P[i - 1] 39 | for j in range(i + 1, n + 1): 40 | DP_C[i, j] = P[j - 1] * DP_C[i - 1, j - 1] + (1.0 - P[j - 1]) * DP_C[i, j - 1] 41 | 42 | DP_S = np.zeros((2 * n + 1,)) 43 | DP_SNone = np.zeros((2 * n + 1,)) 44 | for i in range(1, 2 * n + 1): 45 | DP_S[i] = 1. / (1. * i) 46 | DP_SNone[i] = 1. / (1. 
* i + 1) 47 | for k in range(n + 1)[::-1]: 48 | f1 = 0 49 | f1None = 0 50 | for k1 in range(n + 1): 51 | f1 += 2 * k1 * DP_C[k1][k] * DP_S[k + k1] 52 | f1None += 2 * k1 * DP_C[k1][k] * DP_SNone[k + k1] 53 | for i in range(1, 2 * k - 1): 54 | DP_S[i] = (1 - P[k - 1]) * DP_S[i] + P[k - 1] * DP_S[i + 1] 55 | DP_SNone[i] = (1 - P[k - 1]) * DP_SNone[i] + P[k - 1] * DP_SNone[i + 1] 56 | expectations.append([f1None + 2 * pNone / (2 + k), f1]) 57 | 58 | return np.array(expectations[::-1]).T 59 | 60 | @staticmethod 61 | def maximize_expectation(P, pNone=None): 62 | expectations = F1Optimizer.get_expectations(P, pNone) 63 | 64 | ix_max = np.unravel_index(expectations.argmax(), expectations.shape) 65 | max_f1 = expectations[ix_max] 66 | 67 | predNone = True if ix_max[0] == 0 else False 68 | best_k = ix_max[1] 69 | 70 | return best_k, predNone, max_f1 71 | 72 | @staticmethod 73 | def _F1(tp, fp, fn): 74 | return 2 * tp / (2 * tp + fp + fn) 75 | 76 | @staticmethod 77 | def _Fbeta(tp, fp, fn, beta=1.0): 78 | beta_squared = beta ** 2 79 | return (1.0 + beta_squared) * tp / ((1.0 + beta_squared) * tp + fp + beta_squared * fn) 80 | 81 | 82 | def get_best_prediction(items, preds, pNone=None): 83 | # print("Maximize F1-Expectation") 84 | # print("=" * 23) 85 | items_preds = sorted(list(zip(items, preds)), key=itemgetter(1), reverse=True) 86 | P = [p for i,p in items_preds] 87 | L = [i for i,p in items_preds] 88 | 89 | opt = F1Optimizer.maximize_expectation(P, pNone) 90 | best_prediction = ['None'] if opt[1] else [] 91 | best_prediction += (L[:opt[0]]) 92 | # f1_max = opt[2] 93 | 94 | # print("Prediction {} yields best E[F1] of {}\n".format(best_prediction, f1_max)) 95 | return ' '.join(list(map(str,best_prediction))) 96 | 97 | if __name__ == '__main__': 98 | get_best_prediction(['a', 'b'], [0.9, 0.3], 0.5) 99 | 100 | 101 | 102 | -------------------------------------------------------------------------------- /py_model/pyx_get_best_items.pyx: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Fri Jun 30 15:09:33 2017 3 | 4 | @author: konodera 5 | """ 6 | from operator import itemgetter 7 | import numpy as np 8 | 9 | LOOP = 9999 10 | np.random.seed(71) 11 | 12 | cdef int __tp__(y_true, y_pred): 13 | return len(y_true & y_pred) 14 | 15 | cdef int __tpfp__(y_pred): 16 | return len(y_pred) 17 | 18 | cdef int __tpfn__(y_true): 19 | return len(y_true) 20 | 21 | cdef double multilabel_fscore(y_true, y_pred): 22 | cdef double precision, recall 23 | cdef double tp, tpfp, tpfn 24 | 25 | tp = __tp__(y_true, y_pred) 26 | tpfp = __tpfp__(y_pred) 27 | tpfn = __tpfn__(y_true) 28 | 29 | precision = tp/tpfp 30 | recall = tp/tpfn 31 | 32 | if precision + recall == 0: 33 | return 0 34 | return (2 * precision * recall) / (precision + recall) 35 | 36 | cdef get_y_true(items): 37 | """ 38 | items: dict 39 | {A:0.9, B:0.3} 40 | """ 41 | cdef list y_true = [] 42 | for k in items.keys(): 43 | if items[k]>np.random.uniform(): 44 | y_true.append(k) 45 | if len(y_true)==0 or 'None' in y_true: 46 | y_true = ['None'] 47 | return y_true 48 | 49 | def get_best_items(items, preds): 50 | """ 51 | items: list 52 | [1, 2, 3...] 53 | 54 | preds: list 55 | [0.3, 0.9, 0.2...] 
56 | 57 | items = [1, 2, 3, 4, 5, 6, 7] 58 | preds = [0.2, 0.19, 0.18, 0.17, 0.16, 0.15, 0.14] 59 | 60 | """ 61 | items_true = dict(zip(items, preds)) 62 | cdef list items_pred = sorted(list(zip(items, preds)), key=itemgetter(1), reverse=True) 63 | items_pred = [k for k,v in items_pred] 64 | cdef list y_trues = [set(get_y_true(items_true)) for i in range(LOOP)] 65 | cdef list best_items 66 | 67 | cdef double best_score = 0 68 | for i in range(1,len(items_pred)+1): 69 | score = np.mean([multilabel_fscore(y_trues[j], set(items_pred[:i])) for j in range(LOOP)]) 70 | if best_score < score: 71 | best_score = score 72 | elif best_score > score: 73 | best_items = items_pred[:i-1] 74 | break 75 | if i==len(items_pred): 76 | # last 77 | best_items = items_pred[:] 78 | break 79 | 80 | if 'None' in best_items: 81 | return ' '.join(map(str, best_items)) 82 | 83 | # search None 84 | best_items = best_items[::-1] # low is head 85 | for i in range(len(best_items)+1): 86 | score = np.mean([multilabel_fscore(y_trues[j], set(best_items[i:]+['None'])) for j in range(LOOP)]) 87 | if best_score < score: 88 | best_score = score 89 | elif best_score > score and i==0: 90 | break 91 | elif best_score > score: 92 | best_items = best_items[i-1:]+['None'] 93 | break 94 | elif i==len(best_items): 95 | # last 96 | best_items = ['None'] 97 | break 98 | 99 | return ' '.join(map(str, best_items)) 100 | 101 | def get_best_items2(items, preds): 102 | """ 103 | items: list 104 | [1, 2, 3...] 105 | 106 | preds: list 107 | [0.3, 0.9, 0.2...] 108 | 109 | ex: 110 | items = [1, 2, 3, 4, 5, 6, 7] 111 | preds = [0.2, 0.19, 0.18, 0.17, 0.16, 0.15, 0.14] 112 | 113 | """ 114 | items_true = dict(zip(items, preds)) 115 | cdef list items_pred = sorted(list(zip(items, preds)), key=itemgetter(1), reverse=True) 116 | items_pred = [k for k,v in items_pred] 117 | cdef list y_trues = [set(get_y_true(items_true)) for i in range(LOOP)] 118 | cdef list best_items 119 | 120 | cdef double best_score = 0 121 | for i in range(1,len(items_pred)+1): 122 | score = np.mean([multilabel_fscore(y_trues[j], set(items_pred[:i])) for j in range(LOOP)]) 123 | if best_score < score: 124 | best_score = score 125 | elif best_score > score: 126 | best_items = items_pred[:i-1] 127 | break 128 | if i==len(items_pred): 129 | # last 130 | best_items = items_pred[:] 131 | break 132 | 133 | return ' '.join(map(str, best_items)) 134 | 135 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.12.1 2 | pandas==0.19.2 3 | scipy==0.19.0 4 | tqdm==4.11.2 5 | xgboost==0.6 6 | --------------------------------------------------------------------------------
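For reference, the threshold-estimation step reduces to one call per order into opt_fscore.get_best_prediction. The sketch below shows that call pattern in isolation; the product ids and probabilities are made-up toy values, not output of the models above.

# Toy demonstration of opt_fscore.get_best_prediction (run from py_model/).
# The candidate items and probabilities here are invented for illustration.
from opt_fscore import get_best_prediction

items = [24852, 13176, 21137]   # candidate product_ids for one order
preds = [0.82, 0.35, 0.04]      # per-item reorder probabilities
pNone = 0.10                    # probability the order reorders nothing

# Returns a space-joined string of the product ids (highest probability
# first) that maximize expected F1, with 'None' prepended when predicting
# an empty basket is optimal.
print(get_best_prediction(items, preds, pNone))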