├── setup.sh ├── src ├── constant.py ├── picklization.py ├── country2utc.py ├── config.py ├── nn.py ├── utils.py ├── run_nn.py ├── data.py ├── run_xgb.py └── run_lgb.py ├── README.md └── LICENSE /setup.sh: -------------------------------------------------------------------------------- 1 | mkdir input 2 | mkdir output 3 | mkdir weights -------------------------------------------------------------------------------- /src/constant.py: -------------------------------------------------------------------------------- 1 | input_dir = '../input' 2 | output_dir = '../output' 3 | weights_dir = '../weights' 4 | 5 | DUMMY_ACTION = 'DUMMMY_A' 6 | 7 | DUMMY_USER = -1 8 | 9 | DUMMY_ITEM = -1 10 | 11 | DUMMY_PRICE_RANK=25 12 | 13 | DUMMY_IMPRESSION_INDEX = 25 -------------------------------------------------------------------------------- /src/picklization.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Transform files from csv to pickle for faster reading 3 | ''' 4 | import pickle 5 | import pandas as pd 6 | 7 | df = pd.read_csv('../input/train.csv') 8 | with open('../input/train_v2.p','wb') as f: 9 | pickle.dump(df, f) 10 | 11 | df = pd.read_csv('../input/test.csv') 12 | with open('../input/test_v2.p','wb') as f: 13 | pickle.dump(df, f) 14 | 15 | df = pd.read_csv('../input/item_metadata.csv') 16 | with open('../input/item_metadata.p','wb') as f: 17 | pickle.dump(df, f) -------------------------------------------------------------------------------- /src/country2utc.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import pickle 4 | from utils import * 5 | import pycountry 6 | from timezonefinder import TimezoneFinder 7 | from geopy.geocoders import Nominatim 8 | import datetime 9 | import pytz 10 | 11 | tzf = TimezoneFinder() 12 | geolocator = Nominatim(user_agent="recsys") 13 | 14 | with open('../input/train_v2.p', 'rb') as f: 15 | train = pickle.load(f) 16 | 17 | 18 | with open('../input/test_v2.p', 'rb') as f: 19 | test = pickle.load(f) 20 | 21 | 22 | def location2utc_offset(location): 23 | ''' 24 | return the utc offset given the location 25 | ''' 26 | location = geolocator.geocode(location) 27 | 28 | if location == None: 29 | return np.nan 30 | 31 | lat = location.latitude 32 | lon = location.longitude 33 | offset_sec = datetime.datetime.now(pytz.timezone(tzf.timezone_at(lng=lon, lat=lat))) 34 | return offset_sec.utcoffset().total_seconds()/60/60 35 | 36 | 37 | all_countries = [platform2country(s) for s in set(train.platform.tolist() + test.platform.tolist())] 38 | 39 | offsets= [location2utc_offset(c) for c in all_countries ] 40 | 41 | # map country to offsets 42 | country2offsets_dict = dict(set(zip(all_countries, offsets))) 43 | with open('../input/country2offsets_dict.p','wb') as f: 44 | pickle.dump(country2offsets_dict, f) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # The 4th Place Solution to the 2019 ACM RecSys Challenge 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ## Team Members 10 | _Kung-hsiang (Steeve), Huang_ __(Rosetta.ai)__; _Yi-fu, Fu_; _Yi-ting, Lee_; _Tzong-hann, Lee_; _Yao-chun, Chan_ __(National Taiwan University)__; _Yi-hui, Lee_ __(University of Texas, Dallas)__; _Shou-de, Lin_ __(National Taiwan University)__ 11 | 12 | Contact: steeve@rosetta.ai 13 | 14 | 15 | 16 | ## Introduction 17 | This repository contains RosettaAI's 
approach to the 2019 ACM RecSys Challenge ([paper](https://dl.acm.org/citation.cfm?id=3359560), [writeup](https://medium.com/@huangkh19951228/the-5th-place-approach-to-the-2019-acm-recsys-challenge-by-team-rosettaai-eb3c4e6178c4)). Instead of treating it as a ranking problem, we use __Binary Cross Entropy__ as our loss function. Three different models were implemented: 18 | 1. Neural Networks (based on [DeepFM](https://arxiv.org/pdf/1804.04950.pdf) and this [YouTube paper](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45530.pdf)) 19 | 2. LightGBM 20 | 3. XGBoost 21 | 22 | 23 | 24 | ## Environment 25 | * Ubuntu 16.04 26 | * CUDA 9.0 27 | * Python==3.6.8 28 | * Numpy==1.16 29 | * Pandas==0.24.2 30 | * PyTorch==1.1.0 31 | * Sklearn==0.21.2 32 | * Scipy==1.3.0 33 | * LightGBM==2.2.4 34 | * XGBoost==0.9 35 | * timezonefinder==4.0.3 36 | * geopy==1.20.0 37 | 38 | ## Project Structure 39 | 40 | ``` 41 | ├── input 42 | ├── output 43 | ├── src 44 | └── weights 45 | ``` 46 | 47 | ## Setup 48 | Run the following command to create the directories that conform to the structure of the project, then place the unzipped data into the ```input``` directory: 49 | 50 | ```. setup.sh``` 51 | 52 | 53 | 54 | Run the two Python scripts below to convert the input CSVs to pickle files and to map each platform's country to its UTC offset: 55 | ``` 56 | cd src 57 | python picklization.py 58 | python country2utc.py 59 | ``` 60 | 61 | To train the models on the full dataset, set ```debug``` and ```sub_sample``` to ```False``` in the ```config.py``` file. 62 | 63 | ``` 64 | class Configuration(object): 65 | 66 | def __init__(self): 67 | ... 68 | self.debug = False 69 | self.sub_sample = False 70 | ... 71 | ``` 72 | 73 | 74 | ## Training & Submission 75 | 76 | The models are all trained in an end-to-end fashion. To train each of the three models and generate its predictions, run the following commands: 77 | ``` 78 | python run_nn.py 79 | python run_lgb.py 80 | python run_xgb.py 81 | ``` 82 | The submission files are stored in the ```output``` directory. 83 | 84 | The results generated from LightGBM alone would place us at the 5th position on the public leaderboard. To ensemble the three models, change the output name of each model in ```Merge.ipynb``` and run it. 
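```Merge.ipynb``` is not included in this listing, so the snippet below is only a minimal sketch of a score-level blend rather than the exact notebook. It assumes each run script dumps a ```*_test_score.p``` pickle whose rows are the same candidates in the same order and whose columns include ```score```, ```session_id``` and ```step``` (as ```run_nn.py``` does); the file names are placeholders.

```
import pickle

# Placeholder file names -- point these at the actual *_test_score.p outputs.
score_files = ['../output/nn_test_score.p',
               '../output/lgb_test_score.p',
               '../output/xgb_test_score.p']

dfs = [pickle.load(open(path, 'rb')) for path in score_files]

# Assumes all three models scored identical candidate rows in identical order.
blend = dfs[0][['session_id', 'step']].copy()
blend['score'] = sum(df['score'].values for df in dfs) / len(dfs)

# The blended scores can then be re-ranked per session exactly as in run_nn.py:
# sort candidates by score within each session_id and join back to the item ids.
```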
85 | 86 | 87 | ## Performance 88 | 89 | | Model | Local Validation MRR | Public Leaderboard MRR | 90 | | ------------- |-------------:| -----:| 91 | | LightGBM | 0.685787 | N/A | 92 | | XGBoost | 0.684521 | 0.681128 | 93 | | NN | 0.675206 | 0.672117 | 94 | -------------------------------------------------------------------------------- /src/config.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from datetime import datetime 3 | import torch 4 | import inspect 5 | 6 | 7 | class Configuration(object): 8 | 9 | def __init__(self): 10 | self.alias = None 11 | self.num_epochs = 1 12 | self.batch_size = 1024 13 | self.optimizer = 'adam' 14 | self.use_cuda = True 15 | self.device_id = 0 16 | self.early_stopping = 1 17 | self.loss = torch.nn.BCELoss 18 | self.debug = True 19 | self.sub_sample = False 20 | self.slack = True 21 | self.use_test = True if not self.sub_sample else False 22 | 23 | def __getitem__(cls, x): 24 | '''make configuration subscriptable''' 25 | return getattr(cls, x) 26 | 27 | def __setitem__(cls, x, v): 28 | '''make configuration subscriptable''' 29 | return setattr(cls, x, v) 30 | 31 | def get_attributes(self): 32 | attributes = inspect.getmembers(self, lambda a: not (inspect.isroutine(a))) 33 | 34 | # store only not the default attribute __xx__ 35 | attribute_tuple_list = [a for a in attributes if not (a[0].startswith('__') and a[0].endswith('__'))] 36 | 37 | attribute_dict = {} 38 | for tup in attribute_tuple_list: 39 | key = tup[0] 40 | value = tup[1] 41 | if key == 'loss': 42 | value = str(value) 43 | # convert numpy value to float 44 | if type(value) == np.float64: 45 | value = float(value) 46 | attribute_dict[key] = value 47 | 48 | return attribute_dict 49 | 50 | def set_model_dir(self): 51 | now = datetime.now() 52 | 53 | time_info = f'{now.year}{now.month:02d}{now.day:02d}{now.hour:02d}{now.minute:02d}' 54 | self.model_dir = f'model_weights/{self.alias}-{time_info}.model' 55 | 56 | def attribute_to_integer(self): 57 | '''Convert the attributes in self.integer_attribute_list to integer''' 58 | 59 | for attribute in self.integer_attribute_list: 60 | self[attribute] = int(self[attribute]) 61 | 62 | def set_config(self, config): 63 | for key in config: 64 | self[key] = config[key] 65 | 66 | 67 | class NNConfiguration(Configuration): 68 | 69 | def __init__(self): 70 | super(NNConfiguration, self).__init__() 71 | self.categorical_emb_dim = 128 72 | 73 | self.alias = 'NN' 74 | self.optimizer = 'adam' 75 | self.learning_rate = 0.001 76 | self.weight_decay = 0 77 | self.sequence_length = 10 78 | self.sess_length = 30 79 | self.num_embeddings = {} 80 | self.verbose = True 81 | self.hidden_dims = [256 , 128] 82 | self.dropout_rate = 0 83 | self.loss = torch.nn.BCELoss 84 | 85 | 86 | class LGBConfiguration(Configuration): 87 | 88 | def __init__(self): 89 | super(LGBConfiguration, self).__init__() 90 | self.categorical_emb_dim = 128 91 | self.alias = 'LGB' 92 | self.sequence_length = 10 93 | self.sess_length = 30 94 | 95 | class XGBConfiguration(Configuration): 96 | 97 | def __init__(self): 98 | super(XGBConfiguration, self).__init__() 99 | self.categorical_emb_dim = 128 100 | self.alias = 'XGB' 101 | self.sequence_length = 10 102 | self.sess_length = 30 103 | 104 | -------------------------------------------------------------------------------- /src/nn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from utils import * 3 | from data import * 4 | import torch.nn 
as nn 5 | 6 | 7 | class Net(torch.nn.Module): 8 | def __init__(self, config): 9 | super(Net, self).__init__() 10 | self.config = config 11 | self.all_cat_columns = self.config.all_cat_columns 12 | self.categorical_emb_dim = config.categorical_emb_dim 13 | self.hidden_dims = config.hidden_dims 14 | self.num_embeddings = config.num_embeddings 15 | 16 | # embedding part 17 | self.emb_dict = torch.nn.ModuleDict() 18 | for cat_col in self.config.all_cat_columns: 19 | if cat_col =='item_id': 20 | 21 | self.emb_dict[cat_col] = torch.nn.Embedding(num_embeddings=self.num_embeddings[cat_col], 22 | embedding_dim=self.categorical_emb_dim, padding_idx = self.config.transformed_dummy_item) 23 | else: 24 | self.emb_dict[cat_col] = torch.nn.Embedding(num_embeddings=self.num_embeddings[cat_col], 25 | embedding_dim=self.categorical_emb_dim) 26 | # gru for extracting session and user interest 27 | self.gru_sess = torch.nn.GRU(input_size = self.categorical_emb_dim *2, hidden_size = self.categorical_emb_dim//2, bidirectional=True , num_layers=2, batch_first=True) 28 | self.other_item_gru = torch.nn.GRU(input_size = self.categorical_emb_dim, hidden_size = self.categorical_emb_dim//2, bidirectional=True , num_layers=1, batch_first=True) 29 | 30 | # linear layer on top of continuous features 31 | self.cont_linear = torch.nn.Linear(config.continuous_size,self.categorical_emb_dim ) 32 | 33 | # hidden layerrs 34 | self.hidden1 = torch.nn.Linear(self.categorical_emb_dim*17 , self.hidden_dims[0]) 35 | self.hidden2 = torch.nn.Linear(self.hidden_dims[0] + config.continuous_size*2 + 3 + config.neighbor_size, self.hidden_dims[1] ) 36 | 37 | # output layer 38 | self.output = torch.nn.Linear(self.hidden_dims[1] , 1) 39 | 40 | # batch normalization 41 | self.bn = torch.nn.BatchNorm1d(self.categorical_emb_dim*17) 42 | self.bn_hidden = torch.nn.BatchNorm1d(self.hidden_dims[0] + config.continuous_size*2+ 3 + config.neighbor_size ) 43 | 44 | def forward(self, item_id, past_interactions, mask, price_rank, city, last_item, impression_index, cont_features, star, past_interactions_sess, past_actions_sess, last_click_item, last_click_impression, last_interact_index, neighbor_prices, other_item_ids, city_platform): 45 | embeddings = [] 46 | user_embeddings = [] 47 | batch_size = item_id.size(0) 48 | 49 | # embedding of all categorical features 50 | emb_item = self.emb_dict['item_id'](item_id) 51 | emb_past_interactions = self.emb_dict['item_id'](past_interactions) 52 | emb_price_rank = self.emb_dict['price_rank'](price_rank) 53 | emb_city = self.emb_dict['city'](city) 54 | emb_last_item = self.emb_dict['item_id'](last_item) 55 | emb_impression_index = self.emb_dict['impression_index'](impression_index) 56 | emb_star = self.emb_dict['star'](star) 57 | emb_past_interactions_sess = self.emb_dict['item_id'](past_interactions_sess) 58 | emb_past_actions_sess = self.emb_dict['action'](past_actions_sess) 59 | emb_last_click_item = self.emb_dict['item_id'](last_click_item) 60 | emb_last_click_impression = self.emb_dict['impression_index'](last_click_impression) 61 | emb_last_interact_index = self.emb_dict['impression_index'](last_interact_index) 62 | emb_city_platform = self.emb_dict['city_platform'](city_platform) 63 | emb_other_item_ids = self.emb_dict['item_id'](other_item_ids) 64 | 65 | # other items processed by gru 66 | emb_other_item_ids_gru, _ = self.other_item_gru(emb_other_item_ids) 67 | pooled_other_item_ids = F.max_pool1d(emb_other_item_ids_gru.permute(0,2,1), kernel_size=emb_other_item_ids_gru.size(1)).squeeze(2) 68 | 69 | # 
user's past clicked-out item 70 | emb_past_interactions = emb_past_interactions.permute(0,2,1) 71 | pooled_interaction = F.max_pool1d(emb_past_interactions, kernel_size=self.config.sequence_length).squeeze(2) 72 | 73 | 74 | # concatenate sequence of item ids and actions to model session dynamics 75 | emb_past_interactions_sess = torch.cat( [emb_past_interactions_sess, emb_past_actions_sess], dim=2) 76 | emb_past_interactions_sess , _ = self.gru_sess(emb_past_interactions_sess) 77 | emb_past_interactions_sess = emb_past_interactions_sess.permute(0,2,1) 78 | pooled_interaction_sess = F.max_pool1d(emb_past_interactions_sess, kernel_size=self.config.sess_length).squeeze(2) 79 | 80 | 81 | # categorical feature interactions 82 | item_interaction = emb_item * pooled_interaction 83 | item_last_item = emb_item * emb_last_item 84 | item_last_click_item = emb_item * emb_last_click_item 85 | imp_last_idx = emb_impression_index * emb_last_interact_index 86 | 87 | 88 | 89 | # efficiently compute the aggregation of feature interactions 90 | emb_list = [emb_item, pooled_interaction, emb_price_rank, emb_city, emb_last_item, emb_impression_index, emb_star] 91 | emb_concat = torch.cat(emb_list, dim=1) 92 | sum_squared = torch.pow( torch.sum( emb_concat, dim=1) , 2).unsqueeze(1) 93 | squared_sum = torch.sum( torch.pow( emb_concat, 2) , dim=1).unsqueeze(1) 94 | second_order = 0.5 * (sum_squared - squared_sum) 95 | 96 | # compute the square of continuous features 97 | squared_cont = torch.pow(cont_features, 2) 98 | 99 | 100 | # DNN part 101 | concat = torch.cat([emb_item, pooled_interaction, emb_price_rank, emb_city, emb_last_item, emb_impression_index, item_interaction, item_last_item, emb_star, pooled_interaction_sess, emb_last_click_item, emb_last_click_impression, emb_last_interact_index, item_last_click_item, imp_last_idx, pooled_other_item_ids, emb_city_platform] , dim=1) 102 | concat = self.bn(concat) 103 | 104 | hidden = torch.nn.ReLU()(self.hidden1(concat)) 105 | 106 | hidden = torch.cat( [cont_features, hidden, sum_squared, squared_sum, second_order, squared_cont, neighbor_prices] , dim=1) 107 | 108 | hidden = self.bn_hidden(hidden) 109 | hidden = torch.nn.ReLU()(self.hidden2(hidden)) 110 | 111 | 112 | output = torch.sigmoid(self.output(hidden)).squeeze() 113 | 114 | 115 | return output 116 | -------------------------------------------------------------------------------- /src/utils.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from constant import * 3 | import torch 4 | import os 5 | import random 6 | import time 7 | from contextlib import contextmanager 8 | import numpy as np 9 | import pandas as pd 10 | from sklearn.preprocessing import MinMaxScaler 11 | import torch.nn.functional as F 12 | from scipy.special import erfinv 13 | from ordered_set import OrderedSet 14 | import scipy 15 | from collections import Counter 16 | from timezonefinder import TimezoneFinder 17 | from geopy.geocoders import Nominatim 18 | import pycountry 19 | import datetime 20 | import pytz 21 | 22 | tzf = TimezoneFinder() 23 | 24 | 25 | activation_getter = {'iden': lambda x: x, 'relu': F.relu, 'tanh': torch.tanh, 'sigm': torch.sigmoid} 26 | 27 | 28 | def platform2country(platform): 29 | ''' 30 | return country name given platform 31 | ''' 32 | 33 | if pycountry.countries.get(alpha_2=platform) != None: 34 | try: 35 | return pycountry.countries.get(alpha_2=platform).common_name 36 | except: 37 | return pycountry.countries.get(alpha_2=platform).name 38 | 39 | 40 | 
else: 41 | return np.nan 42 | 43 | 44 | def location2utc_offset(location): 45 | ''' 46 | return the utc offset given the location 47 | ''' 48 | geolocator = Nominatim(user_agent=str(location)) 49 | # print(location) 50 | location = geolocator.geocode(location) 51 | 52 | if location == None: 53 | return np.nan 54 | try: 55 | lat = location.latitude 56 | lon = location.longitude 57 | offset_sec = datetime.datetime.now(pytz.timezone(tzf.timezone_at(lng=lon, lat=lat))) 58 | return offset_sec.utcoffset().total_seconds()/60/60 59 | except: 60 | return np.nan 61 | 62 | def find_longest_repetitive_sequences(sequence): 63 | ''' 64 | returns a dict that maps each element with the length of its longest repetitive sequneces in the list 65 | args: 66 | sequence: list 67 | 68 | ''' 69 | counter = Counter() 70 | current_element = None 71 | 72 | # iterate the sequence 73 | for element in sequence: 74 | 75 | if current_element == None: 76 | current_element = element 77 | current_rep = 1 78 | elif element == current_element: 79 | current_rep += 1 80 | elif element != current_element: 81 | # update the element with the longest rep 82 | if counter[current_element] < current_rep: 83 | counter[current_element] = current_rep 84 | current_rep = 1 85 | current_element = element 86 | # update the element with the longest rep outside the loop 87 | if len(sequence) > 0 and counter[current_element] < current_rep: 88 | counter[current_element] = current_rep 89 | 90 | return counter 91 | 92 | 93 | 94 | 95 | def qcut_safe(prices, q): 96 | nbins=min(q, len(prices)) 97 | result = pd.qcut(prices, nbins, labels=np.arange(nbins) ) 98 | 99 | return result 100 | 101 | 102 | 103 | class GaussRankScaler(): 104 | 105 | def __init__( self ): 106 | self.epsilon = 1e-9 107 | self.lower = -1 + self.epsilon 108 | self.upper = 1 - self.epsilon 109 | self.range = self.upper - self.lower 110 | 111 | def fit_transform( self, X ): 112 | 113 | i = np.argsort( X, axis = 0 ) 114 | j = np.argsort( i, axis = 0 ) 115 | 116 | assert ( j.min() == 0 ).all() 117 | assert ( j.max() == len( j ) - 1 ).all() 118 | 119 | j_range = len( j ) - 1 120 | self.divider = j_range / self.range 121 | 122 | transformed = j / self.divider 123 | transformed = transformed - self.upper 124 | transformed = scipy.special.erfinv( transformed ) 125 | ############ 126 | # transformed = transformed - np.mean(transformed) 127 | 128 | return transformed 129 | 130 | def seed_everything(seed=42): 131 | random.seed(seed) 132 | torch.manual_seed(seed) 133 | torch.cuda.manual_seed_all(seed) 134 | np.random.seed(seed) 135 | os.environ['PYTHONHASHSEED'] = str(seed) 136 | 137 | def compute_rank(inp, to_np=False): 138 | sorted_inp = sorted(inp) 139 | out = [sorted_inp.index(i) for i in inp] 140 | if to_np: 141 | out = np.array(out) 142 | return out 143 | 144 | def set_seed(seed, cuda=False): 145 | 146 | np.random.seed(seed) 147 | random.seed(seed) 148 | if cuda: 149 | torch.cuda.manual_seed(seed) 150 | else: 151 | torch.manual_seed(seed) 152 | 153 | 154 | class CategoricalEncoder(): 155 | ''' 156 | This class is for those operating on large data, in which sklearn's LabelEncoder class may take too much time. 157 | This encoder is only suitable for 1-d array/ list. You may modify it to become n-d compatible. 
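Illustrative usage (not from the original source):
    enc = CategoricalEncoder()
    enc.fit(['a', 'b', 'a'])        # indices assigned in first-seen order: 'a' -> 0, 'b' -> 1
    enc.transform(['b', 'a'])       # [1, 0]
    enc.reverse_transform([1, 0])   # ['b', 'a']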
158 | ''' 159 | def __init__(self): 160 | self.f_dict = {} 161 | self.r_dict = {} 162 | 163 | def fit(self, array): 164 | ''' 165 | 166 | :param array: list or np array 167 | :return: None 168 | ''' 169 | 170 | unique_elements = OrderedSet(array) 171 | # unique_elements = sorted(unique_elements) 172 | # print(DUMMY_ITEM in unique_elements) 173 | # print('-1' in unique_elements) 174 | self.n_elements = 0 175 | self.f_dict = {} 176 | self.r_dict = {} 177 | 178 | for e in unique_elements: 179 | self.f_dict[e] = self.n_elements 180 | self.r_dict[self.n_elements] = e 181 | self.n_elements += 1 182 | 183 | 184 | def continue_fit(self, array): 185 | ''' 186 | Do not refresh n_elements, count from the latest n_elements. 187 | :param array: 188 | :return: None 189 | ''' 190 | unique_elements = set(array) 191 | for e in unique_elements: 192 | if e not in self.f_dict: 193 | self.f_dict[e] = self.n_elements 194 | self.r_dict[self.n_elements] = e 195 | self.n_elements += 1 196 | 197 | 198 | def reverse_transform(self, transformed_array, to_np=False): 199 | ''' 200 | 201 | :param transformed_array: list or np array 202 | :return: array: np array with the same shape as input 203 | ''' 204 | 205 | 206 | array = [self.r_dict[e] for e in transformed_array] 207 | if to_np: 208 | array = np.array(array) 209 | return array 210 | 211 | 212 | def transform(self, array, to_np=False): 213 | ''' 214 | 215 | :param array: array list or np array 216 | :return: list or np array with the same shape as the input 217 | ''' 218 | transformed_array = [self.f_dict[e] for e in array] 219 | if to_np: 220 | transformed_array = np.array(transformed_array) 221 | return transformed_array 222 | 223 | def fit_transform(self, array, to_np=False): 224 | ''' 225 | 226 | :param array: array list or np array 227 | :return: list or np array with the same shape as the input 228 | ''' 229 | self.fit(array) 230 | return self.transform(array, to_np) 231 | 232 | def str2bool(v): 233 | return v.lower() in ('true') 234 | 235 | def use_optimizer(network, params): 236 | if params['optimizer'] == 'adam': 237 | optimizer = torch.optim.Adam(network.parameters(), lr=params['learning_rate'] , weight_decay=params['weight_decay'], eps=1e-07, amsgrad=True) 238 | elif params['optimizer'] == 'rmsprop': 239 | optimizer = torch.optim.RMSprop(network.parameters(), 240 | lr=params['learning_rate'],) 241 | elif params['optimizer'] == 'sgd': 242 | optimizer = torch.optim.SGD(network.parameters(), lr=params['learning_rate'], weight_decay=params['weight_decay']) 243 | return optimizer 244 | 245 | def get_attn_key_pad_mask(seq_k, seq_q, transformed_dummy_value): 246 | ''' For masking out the padding part of key sequence. ''' 247 | 248 | # Expand to fit the shape of key query attention matrix. 249 | len_q = seq_q.size(1) 250 | padding_mask = seq_k.eq(transformed_dummy_value) 251 | padding_mask = padding_mask.unsqueeze(1).expand(-1, len_q, -1) # b x lq x lk 252 | 253 | return padding_mask 254 | 255 | def compute_mean_reciprocal_rank(rs): 256 | ''' 257 | rs: 2d array 258 | 259 | >>> rs = [[0, 0, 1], [0, 1, 0], [1, 0, 0]] 260 | >>> mean_reciprocal_rank(rs) 261 | 0.61111111111111105 262 | >>> rs = np.array([[0, 0, 0], [0, 1, 0], [1, 0, 0]]) 263 | >>> mean_reciprocal_rank(rs) 264 | 0.5 265 | >>> rs = [[0, 0, 0, 1], [1, 0, 0], [1, 0, 0]] 266 | >>> mean_reciprocal_rank(rs) 267 | 0.75 268 | ''' 269 | 270 | rs = (np.asarray(r).nonzero()[0] for r in rs) 271 | return np.mean([1. / (r[0] + 1) if r.size else 0. 
for r in rs]) 272 | 273 | @contextmanager 274 | def timer(name): 275 | t0 = time.time() 276 | yield 277 | print('[{}] done in {:.5f} s'.format(name,(time.time() - t0))) 278 | 279 | 280 | -------------------------------------------------------------------------------- /src/run_nn.py: -------------------------------------------------------------------------------- 1 | from config import * 2 | from data import * 3 | from utils import * 4 | from constant import * 5 | from nn import * 6 | from torch.autograd import Variable 7 | from tqdm import tqdm 8 | import numpy as np 9 | import os 10 | from datetime import datetime 11 | import pytz 12 | 13 | 14 | 15 | 16 | 17 | 18 | model_name = 'nn_xnn_time_diff_v2' 19 | 20 | 21 | torch.backends.cudnn.deterministic = True 22 | seed_everything(42) 23 | 24 | configuration = NNConfiguration() 25 | 26 | 27 | os.environ["CUDA_VISIBLE_DEVICES"] = str(configuration.device_id) 28 | print("CUDA_VISIBLE_DEVICES: ", os.environ["CUDA_VISIBLE_DEVICES"]) 29 | 30 | if configuration.sub_sample: 31 | model_name += '_140k' 32 | else: 33 | model_name += '_all' 34 | 35 | if configuration.use_test: 36 | model_name += '_ut' 37 | 38 | if configuration.debug: 39 | model_name += '_db' 40 | 41 | model_name += f'_{configuration.device_id}' 42 | 43 | 44 | weight_path = f"../weights/{model_name}.model" 45 | 46 | 47 | 48 | 49 | print(configuration.get_attributes()) 50 | 51 | 52 | data_gen = NNDataGenerator(configuration) 53 | 54 | 55 | 56 | print(configuration.get_attributes()) 57 | 58 | 59 | 60 | valid_data = data_gen.val_data 61 | train_data= data_gen.train_data 62 | 63 | 64 | 65 | if configuration.use_cuda: 66 | net = Net(configuration).cuda() 67 | else: 68 | net = Net(configuration) 69 | 70 | optim = use_optimizer(net, configuration) 71 | scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optim, 'min',min_lr=0.0005, factor=0.7, verbose=True) 72 | print(net) 73 | 74 | def get_prediction(loader, net): 75 | net.eval() 76 | all_scores = [] 77 | validation_loss = [] 78 | for batch_id, data in enumerate(loader): 79 | with torch.no_grad(): 80 | item_ids = Variable(data[0]).to(device=device_type) 81 | targets = Variable(data[1]).to(device=device_type) 82 | past_interactions = Variable(data[2]).to(device=device_type) 83 | 84 | past_interaction_masks = (data[3]) 85 | 86 | price_rank = Variable(data[4]).to(device=device_type) 87 | city = Variable(data[5]).to(device=device_type) 88 | last_item = Variable(data[6]).to(device=device_type) 89 | impression_index = Variable(data[7]).to(device=device_type) 90 | continuous_features = Variable(data[8]).to(device=device_type) 91 | 92 | star = Variable(data[9]).to(device=device_type) 93 | 94 | past_interactions_sess = Variable(data[10]).to(device=device_type) 95 | past_actions_sess = Variable(data[11]).to(device=device_type) 96 | 97 | 98 | last_click_item = Variable(data[12]).to(device=device_type) 99 | last_click_impression = Variable(data[13]).to(device=device_type) 100 | last_interact_index = Variable(data[14]).to(device=device_type) 101 | neighbor_prices = Variable(data[15]).to(device=device_type) 102 | other_item_ids = Variable(data[16]).to(device=device_type) 103 | city_platform = Variable(data[17]).to(device=device_type) 104 | 105 | prediction = net(item_ids, past_interactions, past_interaction_masks, price_rank, city, last_item, impression_index, continuous_features, star, past_interactions_sess, past_actions_sess, last_click_item, last_click_impression, last_interact_index, neighbor_prices, other_item_ids, city_platform) 106 | loss = 
crit(prediction,targets).item() 107 | prediction = prediction.detach().cpu().numpy().tolist() 108 | all_scores += prediction 109 | validation_loss.append(loss) 110 | validation_loss = np.mean(validation_loss) 111 | return all_scores, validation_loss 112 | 113 | def evaluate_valid(val_loader, val_df, net ): 114 | 115 | 116 | val_df['score'], val_loss = get_prediction(val_loader, net) 117 | 118 | 119 | grouped_val = val_df.groupby('session_id') 120 | rss = [] 121 | rss_group = {i:[] for i in range(1,26)} 122 | incorrect_session = {} 123 | for session_id, group in grouped_val: 124 | 125 | scores = group['score'] 126 | sorted_arg = np.flip(np.argsort(scores)) 127 | 128 | if group['label'].values[sorted_arg][0] != 1: 129 | incorrect_session[session_id] = (sorted_arg.values, group['label'].values[sorted_arg]) 130 | 131 | rss.append( group['label'].values[sorted_arg]) 132 | rss_group[len(group)].append(group['label'].values[sorted_arg]) 133 | 134 | mrr = compute_mean_reciprocal_rank(rss) 135 | mrr_group = {i:(len(rss_group[i]), compute_mean_reciprocal_rank(rss_group[i])) for i in range(1,26)} 136 | # print(mrr_group) 137 | pickle.dump( incorrect_session, open(f'../output/{model_name}_val_incorrect_order.p','wb')) 138 | 139 | return mrr, mrr_group, val_loss 140 | 141 | 142 | 143 | device_type='cuda' 144 | 145 | 146 | 147 | crit = configuration.loss() 148 | 149 | 150 | best_mrr = 0 151 | early_stopping = configuration.early_stopping 152 | not_improve_round = 0 153 | val_loader = data_gen.evaluate_data_valid() 154 | test_loader =data_gen.instance_a_test_loader() 155 | train_loader = data_gen.instance_a_train_loader() 156 | n_iter = 0 157 | stopped = False 158 | for i in range(configuration.num_epochs): 159 | 160 | 161 | net.train() 162 | for batch_id, data in enumerate(tqdm(train_loader)): 163 | optim.zero_grad() 164 | n_iter += 1 165 | 166 | item_ids = Variable(data[0]).to(device=device_type) 167 | targets = Variable(data[1]).to(device=device_type) 168 | past_interactions = Variable(data[2]).to(device=device_type) 169 | 170 | past_interaction_masks = (data[3]) 171 | 172 | price_rank = Variable(data[4]).to(device=device_type) 173 | city = Variable(data[5]).to(device=device_type) 174 | last_item = Variable(data[6]).to(device=device_type) 175 | impression_index = Variable(data[7]).to(device=device_type) 176 | continuous_features = Variable(data[8]).to(device=device_type) 177 | star = Variable(data[9]).to(device=device_type) 178 | 179 | past_interactions_sess = Variable(data[10]).to(device=device_type) 180 | past_actions_sess = Variable(data[11]).to(device=device_type) 181 | 182 | # other_item_impressions = Variable(data[13]).to(device=device_type) 183 | last_click_item = Variable(data[12]).to(device=device_type) 184 | last_click_impression = Variable(data[13]).to(device=device_type) 185 | last_interact_index = Variable(data[14]).to(device=device_type) 186 | neighbor_prices = Variable(data[15]).to(device=device_type) 187 | other_item_ids = Variable(data[16]).to(device=device_type) 188 | city_platform = Variable(data[17]).to(device=device_type) 189 | prediction = net(item_ids, past_interactions, past_interaction_masks, price_rank, city, last_item, impression_index, continuous_features, star, past_interactions_sess, past_actions_sess, last_click_item, last_click_impression, last_interact_index, neighbor_prices, other_item_ids, city_platform) 190 | 191 | loss = crit(prediction,targets) 192 | loss.backward() 193 | optim.step() 194 | 195 | mrr, mrr_group, val_loss = evaluate_valid(val_loader, valid_data, 
net) 196 | if mrr > best_mrr: 197 | print(f"improve from {best_mrr} to {mrr}") 198 | best_mrr = mrr 199 | not_improve_round = 0 200 | torch.save(net.state_dict(), weight_path) 201 | else: 202 | print(f"didn't improve from {best_mrr} to {mrr}") 203 | not_improve_round += 1 204 | if not_improve_round >= early_stopping: 205 | break 206 | 207 | 208 | net.load_state_dict(torch.load(weight_path)) 209 | 210 | 211 | print("BEST mrr", best_mrr) 212 | 213 | 214 | 215 | if configuration.debug: 216 | exit(0) 217 | 218 | 219 | 220 | 221 | test_df = data_gen.test_data 222 | test_df['score'], _ = get_prediction(test_loader, net) 223 | 224 | 225 | 226 | with open(f'../output/{model_name}_test_score.p', 'wb') as f: 227 | pickle.dump( test_df.loc[:,['score', 'session_id', 'step']],f, protocol=4) 228 | 229 | grouped_test = test_df.groupby('session_id') 230 | predictions = [] 231 | session_ids = [] 232 | for session_id, group in grouped_test: 233 | 234 | scores = group['score'] 235 | sorted_arg = np.flip(np.argsort(scores)) 236 | sorted_item_ids = group['item_id'].values[sorted_arg] 237 | sorted_item_ids = data_gen.cat_encoders['item_id'].reverse_transform(sorted_item_ids) 238 | sorted_item_string = ' '.join([str(i) for i in sorted_item_ids]) 239 | predictions.append(sorted_item_string) 240 | session_ids.append(session_id) 241 | 242 | prediction_df = pd.DataFrame() 243 | prediction_df['session_id'] = session_ids 244 | prediction_df['item_recommendations'] = predictions 245 | 246 | print("pred df shape", prediction_df.shape) 247 | sub_df = pd.read_csv('../input/submission_popular.csv') 248 | sub_df.drop('item_recommendations', axis=1, inplace=True) 249 | sub_df = sub_df.merge(prediction_df, on="session_id") 250 | # sub_df['item_recommendations'] = predictions 251 | 252 | sub_df.to_csv(f'../output/{model_name}.csv', index=None) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [2019] [Rosetta.ai] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /src/data.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import pandas as pd 4 | import pickle 5 | import gc 6 | from constant import * 7 | from utils import * 8 | from config import * 9 | import torch 10 | from torch.utils.data import DataLoader, Dataset 11 | # from sklearn.preprocessing import LabelEncoder 12 | from joblib import Parallel, delayed 13 | from tqdm import tqdm 14 | from collections import defaultdict 15 | from ordered_set import OrderedSet 16 | from sklearn.decomposition import TruncatedSVD 17 | from sklearn.preprocessing import MinMaxScaler, StandardScaler 18 | from sklearn.feature_extraction.text import TfidfVectorizer 19 | 20 | 21 | class NNDataLoader(): 22 | def __init__(self, data, config, shuffle=True, batch_size=128, continuous_features=None): 23 | self.item_id = torch.LongTensor(data.item_id.values) 24 | self.config = config 25 | self.label = torch.FloatTensor(data.label.values) 26 | self.past_interactions = torch.LongTensor(np.vstack(data.past_interactions.values)) 27 | self.batch_size = batch_size 28 | self.shuffle = shuffle 29 | self.indices = np.arange(len(self.item_id)) 30 | self.past_interaction_masks = self.past_interactions != self.config.transformed_dummy_item 31 | self.price_rank = torch.LongTensor(data.price_rank.values) 32 | self.city = torch.LongTensor(data.city.values) 33 | self.last_item = torch.LongTensor(data.last_item.values) 34 | self.impression_index = torch.LongTensor(data.impression_index) 35 | 36 | self.continuous_features = torch.FloatTensor(data.loc[:,continuous_features].values) 37 | 38 | self.neighbor_prices = torch.FloatTensor(np.vstack(data.neighbor_prices)) 39 | 40 | 41 | 42 | self.star = torch.LongTensor(data.star) 43 | 44 | self.past_interactions_sess = torch.LongTensor(np.vstack(data.past_interactions_sess.values)) 45 | self.past_actions_sess = torch.LongTensor(np.vstack(data.past_actions_sess.values)) 46 | self.last_click_item = torch.LongTensor(data.last_click_item.values) 47 | self.last_click_impression = torch.LongTensor(data.last_click_impression.values) 48 | self.last_interact_index = torch.LongTensor(data.last_interact_index.values) 49 | self.other_item_ids = torch.LongTensor(np.vstack(data.other_item_ids.values)) 50 | self.city_platform = torch.LongTensor(data.city_platform.values) 51 | 52 | 53 | assert len(self.item_id) == len(self.past_interactions) 54 | assert len(self.past_interactions) == len(self.label) 55 | def __len__(self): 56 | return len(self.item_id) // self.batch_size 57 | 58 | def __iter__(self): 59 | self.batch_id = 0 60 | if self.shuffle: 61 | np.random.shuffle(self.indices) 62 | return self 63 | 64 | def __next__(self): 65 | if self.batch_id * self.batch_size <= len(self.indices): 66 | current_indices = self.indices[self.batch_id * self.batch_size: (self.batch_id + 1) * self.batch_size] 67 | result = [self.item_id[current_indices], self.label[current_indices], self.past_interactions[current_indices]\ 68 | , self.past_interaction_masks[current_indices], self.price_rank[current_indices], self.city[current_indices]\ 69 | , self.last_item[current_indices], self.impression_index[current_indices], self.continuous_features[current_indices]\ 70 | , self.star[current_indices], self.past_interactions_sess[current_indices], self.past_actions_sess[current_indices]\ 71 | , self.last_click_item[current_indices], self.last_click_impression[current_indices], 
self.last_interact_index[current_indices]\ 72 | , self.neighbor_prices[current_indices], self.other_item_ids[current_indices], self.city_platform[current_indices]] 73 | self.batch_id += 1 74 | return result 75 | else: 76 | raise StopIteration 77 | 78 | 79 | 80 | class NNDataGenerator(): 81 | """Construct dataset for NN""" 82 | def __init__(self, config): 83 | """ 84 | args: 85 | target_action: the target action at the next timestep. Can be 'buy', 'select', 'click', 'view' 86 | monitor_actions: the action that we should keep track with 87 | """ 88 | 89 | self.config = config 90 | 91 | self.target_action = self.config.target_action = 'clickout item' 92 | # self.config.keep_columns = self.keep_columns = ['session_id', 'user_id','item_id', 'impressions','prices', 'city', 'step', 'last_item'] 93 | self.config.all_cat_columns = self.all_cat_columns = ['user_id', 'item_id', 'city','action', 'city_platform'] 94 | 95 | with open( f'{input_dir}/train_v2.p', 'rb') as f: 96 | train = pickle.load(f) 97 | train['id']= np.arange(len(train)) 98 | 99 | with open(f'{input_dir}/test_v2.p', 'rb') as f: 100 | test = pickle.load(f) 101 | test['id'] = np.arange( len(train), len(train)+ len(test)) 102 | 103 | with open('../input/item_metadata.p', 'rb') as f: 104 | item_meta = pickle.load(f) 105 | item_meta['properties'] = item_meta.properties.apply(lambda x: x.split('|')) 106 | item_meta['item_id'] = item_meta['item_id'].apply(str) 107 | 108 | 109 | 110 | if config.sub_sample: 111 | with open('../input/selected_users_140k.p', 'rb') as f: 112 | selected_users = pickle.load(f) 113 | 114 | train = train.loc[train.user_id.isin(selected_users),:] 115 | 116 | if config.debug: 117 | train = train.sample(1000) 118 | test = test.sample(1000) 119 | 120 | train.rename(columns={'reference': 'item_id', 'action_type':'action'}, inplace=True) 121 | test.rename(columns={'reference': 'item_id', 'action_type':'action'}, inplace=True) 122 | 123 | 124 | 125 | 126 | # fill item_id with DUMMY 127 | train.loc[train.action=='change of sort order','action'] = train.loc[train.action=='change of sort order'].apply(lambda row: row.action + str(row.item_id), axis=1) 128 | test.loc[test.action=='change of sort order','action'] = test.loc[test.action=='change of sort order'].apply(lambda row: row.action + str(row.item_id), axis=1) 129 | 130 | 131 | train.loc[train.action=='filter selection','action'] = train.loc[train.action=='filter selection'].apply(lambda row: row.action + str(row.item_id), axis=1) 132 | test.loc[test.action=='filter selection','action'] = test.loc[test.action=='filter selection'].apply(lambda row: row.action + str(row.item_id), axis=1) 133 | 134 | 135 | 136 | 137 | 138 | 139 | train.loc[train.action.str.contains('change of sort order'), 'item_id'] = DUMMY_ITEM 140 | test.loc[test.action.str.contains('change of sort order'), 'item_id'] = DUMMY_ITEM 141 | 142 | train.loc[train.action.str.contains('search for poi'), 'item_id'] = DUMMY_ITEM 143 | test.loc[test.action.str.contains('search for poi'), 'item_id'] = DUMMY_ITEM 144 | 145 | train.loc[train.action.str.contains('filter selection'), 'item_id'] = DUMMY_ITEM 146 | test.loc[test.action.str.contains('filter selection'), 'item_id'] = DUMMY_ITEM 147 | 148 | train.loc[train.action.str.contains('search for destination'), 'item_id'] = DUMMY_ITEM 149 | test.loc[test.action.str.contains('search for destination'), 'item_id'] = DUMMY_ITEM 150 | 151 | 152 | 153 | 154 | 155 | # filter out rows where reference doesn't present in impression 156 | train['in_impressions'] = True 157 | 
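# Flag whether the referenced item actually appears in the pipe-separated impressions
# list (rows without an impression list keep the default True); rows that fail the
# check are dropped immediately afterwards.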
train.loc[~train.impressions.isna(), 'in_impressions'] = train.loc[~train.impressions.isna()].apply(lambda row:row.item_id in row.impressions.split('|'), axis=1) 158 | train = train.loc[train.in_impressions].drop('in_impressions', axis=1).reset_index(drop=True) 159 | 160 | test['in_impressions'] = True 161 | test.loc[(~test.impressions.isna()) & (~test.item_id.isna()), 'in_impressions'] = test.loc[(~test.impressions.isna())& (~test.item_id.isna())].apply(lambda row:row.item_id in row.impressions.split('|'), axis=1) 162 | test = test.loc[test.in_impressions].drop('in_impressions', axis=1).reset_index(drop=True) 163 | 164 | 165 | train['item_id'] = train['item_id'].apply(str) 166 | train.loc[~train.impressions.isna(),'impressions'] = train.loc[~train.impressions.isna()].impressions.apply(lambda x: x.split('|')) 167 | train.loc[~train.prices.isna(), 'prices'] = train.loc[~train.prices.isna()].prices.apply(lambda x: x.split('|')).apply(lambda x: [int(p) for p in x]) 168 | 169 | 170 | 171 | test['item_id'] = test['item_id'].apply(str) 172 | test.loc[~test.impressions.isna(),'impressions'] = test.loc[~test.impressions.isna()].impressions.apply(lambda x: x.split('|')) 173 | test.loc[~test.prices.isna(),'prices'] = test.loc[~test.prices.isna()].prices.apply(lambda x: x.split('|')).apply(lambda x: [int(p) for p in x]) 174 | 175 | 176 | 177 | data = pd.concat([train, test], axis=0) 178 | data = data.reset_index(drop=True) 179 | all_items = [] 180 | 181 | for imp in data.loc[~data.impressions.isna()].impressions.tolist() + [data.item_id.apply(str).tolist()]: 182 | all_items += imp 183 | 184 | unique_items = OrderedSet(all_items) 185 | unique_actions = OrderedSet(data.action.values) 186 | 187 | train_session_interactions = dict(train.groupby('session_id')['item_id'].apply(list)) 188 | test_session_interactions = dict(test.groupby('session_id')['item_id'].apply(list)) 189 | 190 | 191 | train_session_actions = dict(train.groupby('session_id')['action'].apply(list)) 192 | test_session_actions = dict(test.groupby('session_id')['action'].apply(list)) 193 | 194 | 195 | train['sess_step'] = train.groupby('session_id')['timestamp'].rank(method='max').apply(int) 196 | test['sess_step'] = test.groupby('session_id')['timestamp'].rank(method='max').apply(int) 197 | 198 | 199 | train['city_platform'] = train.apply(lambda x: x['city'] + x['platform'], axis=1) 200 | test['city_platform'] = test.apply(lambda x: x['city'] + x['platform'], axis=1) 201 | # get last item 202 | train['last_item'] = np.nan 203 | test['last_item'] = np.nan 204 | 205 | train_shifted_item_id = [DUMMY_ITEM] + train.item_id.values[:-1].tolist() 206 | test_shifted_item_id = [DUMMY_ITEM] + test.item_id.values[:-1].tolist() 207 | train['last_item'] = train_shifted_item_id 208 | test['last_item'] = test_shifted_item_id 209 | 210 | 211 | train_shifted_item_id = [DUMMY_ITEM] *2 + train.item_id.values[:-2].tolist() 212 | test_shifted_item_id = [DUMMY_ITEM] *2 + test.item_id.values[:-2].tolist() 213 | 214 | train['second_last_item'] = train_shifted_item_id 215 | test['second_last_item'] = test_shifted_item_id 216 | 217 | 218 | train['step_rank'] = train.groupby('session_id')['timestamp'].rank(method='max', ascending=True) 219 | test['step_rank'] = test.groupby('session_id')['timestamp'].rank(method='max', ascending=True) 220 | 221 | 222 | train.loc[(train.step_rank == 1) , 'last_item'] = DUMMY_ITEM 223 | test.loc[(test.step_rank == 1) , 'last_item'] = DUMMY_ITEM 224 | 225 | 226 | train.loc[(train.step_rank == 2) , 'second_last_item'] = DUMMY_ITEM 227 | 
test.loc[(test.step_rank == 2) , 'second_last_item'] = DUMMY_ITEM 228 | 229 | 230 | 231 | data = pd.concat([train, test], axis=0) 232 | data = data.reset_index(drop=True) 233 | 234 | data_feature = data.loc[:,['id','session_id','timestamp', 'step']].copy() 235 | data_feature['time_diff'] = data_feature.groupby('session_id')['timestamp'].diff() 236 | data_feature['time_diff_diff'] = data_feature.groupby('session_id')['time_diff'].diff() 237 | data_feature['time_diff'] = GaussRankScaler().fit_transform(data_feature['time_diff'].values) 238 | data_feature['time_diff_diff'] = GaussRankScaler().fit_transform(data_feature['time_diff_diff'].values) 239 | data_feature['mm_step'] = GaussRankScaler().fit_transform(data_feature['step'].values) 240 | data_feature['day'] = MinMaxScaler().fit_transform(pd.to_datetime(data.timestamp, unit='s').dt.day.values.reshape(-1,1) ) 241 | data_feature['rg_timestamp'] = GaussRankScaler().fit_transform(data_feature['timestamp'].values) 242 | 243 | 244 | data_feature = data_feature.drop( ['session_id','timestamp','step'],axis=1) 245 | 246 | 247 | # get time diff 248 | train = train.merge(data_feature, on='id', how='left') 249 | test = test.merge(data_feature, on='id', how='left') 250 | 251 | train_session_time_diff = dict(train.groupby('session_id')['time_diff'].apply(list)) 252 | test_session_time_diff = dict(test.groupby('session_id')['time_diff'].apply(list)) 253 | 254 | self.cat_encoders = {} 255 | 256 | for col in self.all_cat_columns: 257 | self.cat_encoders[col] = CategoricalEncoder() 258 | 259 | self.cat_encoders['item_id'].fit(list(unique_items) + [DUMMY_ITEM] ) 260 | self.cat_encoders['city'].fit(data.city.values) 261 | self.cat_encoders['city_platform'].fit(data.city_platform.values) 262 | self.cat_encoders['action'].fit( list(unique_actions) + [DUMMY_ACTION]) 263 | self.cat_encoders['user_id'].fit(data.user_id.values) 264 | # with open('../input/user_encoder.p','rb') as f: 265 | # self.cat_encoders['user_id'] = pickle.load(f) 266 | # self.cat_encoders['user_id'].fit(data.user_id.tolist() ) 267 | 268 | 269 | for col in self.all_cat_columns: 270 | 271 | train[col] = self.cat_encoders[col].transform(train[col].values) 272 | test[col] = self.cat_encoders[col].transform(test[col].values) 273 | self.config.num_embeddings[col] = self.cat_encoders[col].n_elements 274 | 275 | 276 | #this is an integer 277 | self.config.transformed_clickout_action = self.transformed_clickout_action = self.cat_encoders['action'].transform(['clickout item'])[0] 278 | self.config.transformed_dummy_action = self.transformed_dummy_action = self.cat_encoders['action'].transform([DUMMY_ACTION])[0] 279 | self.transformed_interaction_image = self.cat_encoders['action'].transform(['interaction item image'])[0] 280 | self.transformed_interaction_deals = self.cat_encoders['action'].transform(['interaction item deals'])[0] 281 | self.transformed_interaction_info = self.cat_encoders['action'].transform(['interaction item info'])[0] 282 | self.transformed_interaction_rating = self.cat_encoders['action'].transform(['interaction item rating'])[0] 283 | 284 | self.config.transformed_dummy_item = self.transformed_dummy_item = self.cat_encoders['item_id'].transform([DUMMY_ITEM])[0] 285 | self.config.transformed_nan_item = self.transformed_nan_item = self.cat_encoders['item_id'].transform(['nan'])[0] 286 | 287 | 288 | # transform last item 289 | train['last_item'] = self.cat_encoders['item_id'].transform(train['last_item'].values) 290 | test['last_item'] = 
self.cat_encoders['item_id'].transform(test['last_item'].values) 291 | 292 | train['second_last_item'] = self.cat_encoders['item_id'].transform(train.second_last_item.values) 293 | test['second_last_item'] = self.cat_encoders['item_id'].transform(test.second_last_item.values) 294 | 295 | # transform session interactions and pad dummy in front of all of them 296 | for session_id, item_list in train_session_interactions.items(): 297 | train_session_interactions[session_id] = [self.transformed_dummy_item] * self.config.sess_length + self.cat_encoders['item_id'].transform(item_list) 298 | 299 | for session_id, item_list in test_session_interactions.items(): 300 | test_session_interactions[session_id] = [self.transformed_dummy_item] * self.config.sess_length + self.cat_encoders['item_id'].transform(item_list) 301 | 302 | for session_id, action_list in train_session_actions.items(): 303 | train_session_actions[session_id] = [self.transformed_dummy_action] * self.config.sess_length + self.cat_encoders['action'].transform(action_list) 304 | 305 | for session_id, action_list in test_session_actions.items(): 306 | test_session_actions[session_id] = [self.transformed_dummy_action] * self.config.sess_length + self.cat_encoders['action'].transform(action_list) 307 | 308 | 309 | implicit_train = train.loc[train.action != self.transformed_clickout_action, :] 310 | implicit_test = test.loc[test.action != self.transformed_clickout_action, :] 311 | 312 | 313 | 314 | 315 | 316 | # get interaction count for all item 317 | interaction_item_ids = implicit_train.drop_duplicates(subset=['session_id','item_id','action']).item_id.tolist() + implicit_test.drop_duplicates(subset=['session_id','item_id','action']).item_id.tolist() 318 | unique_interaction_items, counts = np.unique(interaction_item_ids, return_counts=True) 319 | self.interaction_count_dict = dict(zip(unique_interaction_items, counts)) 320 | 321 | # get interaction count for all item 322 | interaction_image_item_ids = train.loc[train.action == self.transformed_interaction_image, :].drop_duplicates(subset=['session_id','item_id','action']).item_id.tolist() + test.loc[test.action == self.transformed_interaction_image, :].drop_duplicates(subset=['session_id','item_id','action']).item_id.tolist() 323 | unique_interaction_image_items, counts = np.unique(interaction_image_item_ids, return_counts=True) 324 | self.image_count_dict = dict(zip(unique_interaction_image_items, counts)) 325 | 326 | 327 | # get only the clickout 328 | train = train.loc[train.action ==self.transformed_clickout_action,:] 329 | test = test.loc[test.action == self.transformed_clickout_action,:] 330 | 331 | 332 | train['step_rank'] = train.groupby('session_id')['step'].rank(method='max', ascending=False) 333 | 334 | 335 | 336 | # compute global item-price DataFrame 337 | # prices = np.hstack([np.hstack(train['prices'].values), np.hstack(test.prices.values)]) 338 | item_ids = np.hstack([np.hstack(train['impressions'].values), np.hstack(test.impressions.values)]) 339 | 340 | unique_items, counts = np.unique(item_ids, return_counts=True) 341 | self.item_popularity_dict = dict(zip(unique_items, counts)) 342 | 343 | clickout_item_ids = train.drop_duplicates(subset=['session_id','item_id','action']).item_id.tolist() + test.drop_duplicates(subset=['session_id','item_id','action']).item_id.tolist() 344 | unique_clickout_items, counts = np.unique(clickout_item_ids, return_counts=True) 345 | 346 | self.clickout_count_dict = dict(zip(unique_clickout_items, counts)) 347 | 348 | 
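# clickout count per (platform, item_id) pair across train and test: the groupby().size()
# below yields a Series indexed by (platform, item_id) tuples; its consumer further down
# is currently commented out.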
self.platform_clickout_count = pd.concat([train, test], axis=0).groupby(['platform','item_id']).size() 349 | 350 | 351 | 352 | 353 | 354 | if config.debug: 355 | val = train.loc[train.step_rank == 1,:].iloc[:5] 356 | else: 357 | val = train.loc[train.step_rank == 1,:].iloc[:50000] 358 | 359 | val_index = val.index 360 | train = train.loc[~train.index.isin(val_index),:] 361 | 362 | 363 | 364 | # {'user_id':[11,2,5,9,]} 365 | self.past_interaction_dict = {} 366 | self.past_interaction_dict_sess = {} 367 | self.last_click_sess_dict = {} 368 | self.last_impressions_dict = {} 369 | self.sess_impressions_dict = {} 370 | self.sess_last_step_dict = {} 371 | self.sess_last_imp_idx_dict = {} 372 | self.sess_last_price_dict = {} 373 | self.sess_time_diff_dict = {} 374 | 375 | 376 | # split the interaction df into train/ val and construct training sequences 377 | self.train_data = self.build_user_item_interactions(train, train_session_interactions, train_session_actions, train_session_time_diff) 378 | self.val_data = self.build_user_item_interactions(val, train_session_interactions, train_session_actions, train_session_time_diff) 379 | self.test_data, labeled_test = self.build_user_item_interactions(test, test_session_interactions, test_session_actions, test_session_time_diff, training=False) 380 | 381 | # standard scale price 382 | 383 | price_sc = StandardScaler() 384 | 385 | 386 | self.train_data['price_diff'] = price_sc.fit_transform(self.train_data.price_diff.values.reshape(-1,1)) 387 | self.val_data['price_diff'] = price_sc.transform(self.val_data.price_diff.values.reshape(-1,1)) 388 | self.test_data['price_diff'] = price_sc.transform(self.test_data.price_diff.values.reshape(-1,1)) 389 | 390 | 391 | 392 | price_mm = MinMaxScaler() 393 | self.train_data['price_ratio'] = price_mm.fit_transform(self.train_data.price_ratio.values.reshape(-1,1)) 394 | self.val_data['price_ratio'] = price_mm.transform(self.val_data.price_ratio.values.reshape(-1,1)) 395 | self.test_data['price_ratio'] = price_mm.transform(self.test_data.price_ratio.values.reshape(-1,1)) 396 | 397 | 398 | 399 | 400 | 401 | price_mm.fit(np.hstack([np.hstack(self.train_data.neighbor_prices.values), np.hstack(self.val_data.neighbor_prices.values),\ 402 | np.hstack(self.test_data.neighbor_prices.values)]).reshape(-1,1) ) 403 | # print(self.train_data['neighbor_prices'].head(5)) 404 | self.train_data['neighbor_prices'] = self.train_data['neighbor_prices'].apply(lambda x: price_mm.transform(np.array(x).reshape(-1,1)).reshape(-1)) 405 | self.val_data['neighbor_prices'] = self.val_data['neighbor_prices'].apply(lambda x: price_mm.transform(np.array(x).reshape(-1,1)).reshape(-1)) 406 | self.test_data['neighbor_prices'] = self.test_data['neighbor_prices'].apply(lambda x: price_mm.transform(np.array(x).reshape(-1,1)).reshape(-1)) 407 | 408 | 409 | 410 | if config.use_test: 411 | self.train_data = pd.concat([self.train_data, labeled_test], axis=0) 412 | 413 | sampled_test_session = self.test_data.session_id.sample(frac=0.3) 414 | 415 | # self.train_data = pd.concat([self.train_data, self.test_data.loc[self.test_data.session_id.isin(sampled_test_session)]], axis=0) 416 | # item_meta multi-hot 417 | item_meta = item_meta.loc[item_meta.item_id.isin(unique_items),:] 418 | item_meta['item_id'] = self.cat_encoders['item_id'].transform(item_meta['item_id'].values) 419 | item_meta['star'] = 0 420 | item_meta.loc[item_meta.properties.apply(lambda x: '1 Star' in x), 'star'] = 1 421 | item_meta.loc[item_meta.properties.apply(lambda x: '2 Star' in x), 'star'] = 
2 422 | item_meta.loc[item_meta.properties.apply(lambda x: '3 Star' in x), 'star'] = 3 423 | item_meta.loc[item_meta.properties.apply(lambda x: '4 Star' in x), 'star'] = 4 424 | item_meta.loc[item_meta.properties.apply(lambda x: '5 Star' in x), 'star'] = 5 425 | 426 | unique_property = list(OrderedSet(np.hstack(item_meta.properties.tolist()))) 427 | self.unique_property = unique_property 428 | 429 | self.cat_encoders['item_property'] = CategoricalEncoder() 430 | self.cat_encoders['item_property'].fit(unique_property) 431 | item_properties_array = [] 432 | for row in item_meta.itertuples(): 433 | current_row = np.zeros(len(unique_property) + 2) 434 | one_indices = self.cat_encoders['item_property'].transform(row.properties) 435 | current_row[one_indices] = 1 436 | current_row[-1] = row.item_id 437 | current_row[-2] = row.star 438 | item_properties_array.append(current_row) 439 | 440 | item_properties_array = np.vstack(item_properties_array) 441 | item_properties_df = pd.DataFrame(item_properties_array, columns=unique_property + ['star', 'item_id']) 442 | 443 | 444 | item_properties_item_id = item_properties_df.item_id.values 445 | item_properties_star = item_properties_df.star.values 446 | 447 | tsvd = TruncatedSVD(n_components=30, n_iter=10, random_state=None) 448 | svd_matrix = tsvd.fit_transform(item_properties_df.drop( ['star', 'item_id'],axis=1).values) 449 | print("explained ratio", tsvd.explained_variance_ratio_.sum()) 450 | svd_ip_columns = [ f'svd_ip_{i}' for i in np.arange(30)] 451 | item_properties_df = pd.DataFrame(svd_matrix, columns=svd_ip_columns) 452 | item_properties_df['item_id'] = item_properties_item_id 453 | item_properties_df['star'] = item_properties_star 454 | item_properties_df['pet_friendly'] = item_meta.properties.apply(lambda x: 'Pet Friendly' in x) 455 | item_properties_df['parking'] = item_meta.properties.apply(lambda x: 'Car Park' in x) 456 | item_properties_df = item_properties_df.astype(dtype= {"item_id":"int32","pet_friendly":"float32", "parking":"float32"}) 457 | 458 | 459 | filter_df = data.loc[ ~data.current_filters.isna(), ['id', 'current_filters']] 460 | filter_df['current_filters'] = filter_df.current_filters.apply(lambda x:x.split('|')) 461 | filter_set = list(OrderedSet(np.hstack(filter_df['current_filters'].to_list()))) 462 | 463 | self.cat_encoders['filters'] = CategoricalEncoder() 464 | self.cat_encoders['filters'].fit(filter_set) 465 | all_filter_array = [] 466 | 467 | for row in filter_df.itertuples(): 468 | current_row = np.zeros(len(filter_set) + 1, dtype=object) 469 | current_filters = row.current_filters 470 | one_indices = self.cat_encoders['filters'].transform(row.current_filters) 471 | current_row[one_indices] = 1 472 | current_row[-1] = row.id 473 | all_filter_array.append(current_row) 474 | 475 | 476 | all_filter_array = np.vstack(all_filter_array) 477 | filters_df = pd.DataFrame(all_filter_array, columns= [f'ft_{f}' for f in filter_set] + ['id']) 478 | dtype_dict = {"id":"int32"} 479 | for f in filter_set: 480 | dtype_dict[f'ft_{f}'] = "int32" 481 | filters_df = filters_df.astype(dtype= dtype_dict) 482 | 483 | filters_id = filters_df.id.values 484 | 485 | 486 | tsvd = TruncatedSVD(n_components=10, n_iter=10, random_state=None) 487 | svd_matrix = tsvd.fit_transform(filters_df.drop( ['id'],axis=1).values) 488 | print("explained ratio", tsvd.explained_variance_ratio_.sum()) 489 | svd_ft_columns = [ f'svd_ft_{i}' for i in np.arange(10)] 490 | filters_df = pd.DataFrame(svd_matrix, columns=svd_ft_columns) 491 | for c in svd_ft_columns: 492 
| filters_df[c] = MinMaxScaler().fit_transform(filters_df[c].values.reshape(-1,1)) 493 | filters_df['id'] = filters_id 494 | 495 | del train, test, data 496 | gc.collect() 497 | 498 | self.train_data = self.train_data.merge(item_properties_df, on="item_id", how="left") 499 | self.val_data = self.val_data.merge(item_properties_df, on="item_id", how="left") 500 | self.test_data = self.test_data.merge(item_properties_df, on="item_id", how="left") 501 | 502 | self.train_data = self.train_data.merge(filters_df, on=['id'], how="left") 503 | self.val_data = self.val_data.merge(filters_df, on=['id'], how="left") 504 | self.test_data = self.test_data.merge(filters_df, on=['id'], how="left") 505 | 506 | self.train_data = self.train_data.merge(data_feature, on=['id'], how="left") 507 | self.val_data = self.val_data.merge(data_feature, on=['id'], how="left") 508 | self.test_data = self.test_data.merge(data_feature, on=['id'], how="left") 509 | 510 | self.train_data['interaction_image_count'] = self.train_data.item_id.map(self.image_count_dict) 511 | self.val_data['interaction_image_count'] = self.val_data.item_id.map(self.image_count_dict) 512 | self.test_data['interaction_image_count'] = self.test_data.item_id.map(self.image_count_dict) 513 | 514 | train_other_is_interacted = np.vstack(self.train_data.other_is_interacted.values).astype(np.float32) 515 | val_other_is_interacted = np.vstack(self.val_data.other_is_interacted.values).astype(np.float32) 516 | test_other_is_interacted = np.vstack(self.test_data.other_is_interacted.values).astype(np.float32) 517 | 518 | is_interacted_columns = [] 519 | for i in range(train_other_is_interacted.shape[1]): 520 | col = f'is_int_{i}' 521 | is_interacted_columns.append(col) 522 | self.train_data[col] = train_other_is_interacted[:,i] 523 | self.val_data[col] = val_other_is_interacted[:,i] 524 | self.test_data[col] = test_other_is_interacted[:,i] 525 | 526 | self.train_data.drop('other_is_interacted',axis=1, inplace=True) 527 | self.val_data.drop('other_is_interacted',axis=1, inplace=True) 528 | self.test_data.drop('other_is_interacted',axis=1, inplace=True) 529 | 530 | train_other_is_clicked = np.vstack(self.train_data.other_is_clicked.values).astype(np.float32) 531 | val_other_is_clicked = np.vstack(self.val_data.other_is_clicked.values).astype(np.float32) 532 | test_other_is_clicked = np.vstack(self.test_data.other_is_clicked.values).astype(np.float32) 533 | 534 | 535 | is_clicked_columns = [] 536 | for i in range(train_other_is_clicked.shape[1]): 537 | col = f'is_cl_{i}' 538 | is_clicked_columns.append(col) 539 | self.train_data[col] = train_other_is_clicked[:,i] 540 | self.val_data[col] = val_other_is_clicked[:,i] 541 | self.test_data[col] = test_other_is_clicked[:,i] 542 | 543 | self.train_data.drop('other_is_clicked',axis=1, inplace=True) 544 | self.val_data.drop('other_is_clicked',axis=1, inplace=True) 545 | self.test_data.drop('other_is_clicked',axis=1, inplace=True) 546 | 547 | # rank gauss transform 548 | train_len = self.train_data.shape[0] 549 | val_len = self.val_data.shape[0] 550 | 551 | 552 | 553 | self.continuous_features = svd_ip_columns + svd_ft_columns + is_interacted_columns + is_clicked_columns + ['mm_step','time_diff', 'day', 'mm_price', 'equal_last_impressions', 
'price_diff','price','last_price','price_ratio','is_clicked','is_interacted','item_popularity','is_interacted_image','is_interacted_deals','interaction_count','clickout_count','interaction_image_count','click_diff','rg_timestamp','equal_last_item','global_clickout_count_rank','rg_price','interaction_count_avg','avg_is_interacted_image','avg_is_interacted'] 554 | 555 | 556 | # normalize num_impressions 557 | 558 | 559 | # target encoding 560 | agg_cols = ['impression_index','price_rank'] 561 | for c in agg_cols: 562 | gp = self.train_data.groupby(c)['label'] 563 | mean = gp.mean() 564 | self.train_data[f'{c}_label_avg'] = self.train_data[c].map(mean) 565 | self.val_data[f'{c}_label_avg'] = self.val_data[c].map(mean) 566 | self.test_data[f'{c}_label_avg'] = self.test_data[c].map(mean) 567 | 568 | self.continuous_features.append(f'{c}_label_avg') 569 | 570 | 571 | 572 | agg_cols = ['city'] 573 | for c in agg_cols: 574 | gp = self.train_data.groupby(c)['price'] 575 | mean = gp.mean() 576 | self.train_data[f'{c}_price_avg'] = self.train_data[c].map(mean) 577 | self.val_data[f'{c}_price_avg'] = self.val_data[c].map(mean) 578 | self.test_data[f'{c}_price_avg'] = self.test_data[c].map(mean) 579 | 580 | self.continuous_features.append(f'{c}_price_avg') 581 | 582 | agg_cols = ['city'] 583 | for c in agg_cols: 584 | gp = self.train_data.groupby(c)['price'] 585 | mean = gp.std() 586 | self.train_data[f'{c}_price_std'] = self.train_data[c].map(mean) 587 | self.val_data[f'{c}_price_std'] = self.val_data[c].map(mean) 588 | self.test_data[f'{c}_price_std'] = self.test_data[c].map(mean) 589 | 590 | self.continuous_features.append(f'{c}_price_std') 591 | 592 | #normalize 593 | self.train_data['global_clickout_count_rank'] /= 25 594 | self.val_data['global_clickout_count_rank'] /= 25 595 | self.test_data['global_clickout_count_rank'] /= 25 596 | 597 | 598 | 599 | 600 | 601 | # fill zero 602 | for col in ['star','time_diff']: 603 | 604 | self.train_data.loc[:,col].fillna(0, inplace=True) 605 | self.val_data.loc[:,col].fillna(0, inplace=True) 606 | self.test_data.loc[:,col].fillna(0, inplace=True) 607 | 608 | 609 | 610 | for up in self.continuous_features : 611 | mean_value = self.train_data.loc[ ~self.train_data[up].isna() , up].mean() 612 | self.train_data.loc[:,up].fillna(mean_value, inplace=True) 613 | self.val_data.loc[:,up].fillna(mean_value, inplace=True) 614 | self.test_data.loc[:,up].fillna(mean_value, inplace=True) 615 | 616 | 617 | for c in self.continuous_features: 618 | if self.train_data[c].isna().sum() >0 or self.val_data[c].isna().sum() >0 or self.test_data[c].isna().sum() >0: 619 | print("is null!!", c) 620 | 621 | self.config.num_embeddings['price_rank'] = 25 622 | self.config.num_embeddings['impression_index'] = 26 623 | 624 | # self.config.num_embeddings['day_of_week'] = 7 625 | self.config.num_embeddings['star'] = 6 626 | 627 | self.config.all_cat_columns+= ['price_rank', 'impression_index', 'star'] 628 | 629 | self.config.continuous_size = len(self.continuous_features) 630 | self.config.neighbor_size = 5 631 | 632 | self.all_cat_columns = self.config.all_cat_columns 633 | 634 | if self.config.verbose: 635 | print(f"Number of training data: {self.train_data.shape}") 636 | print(f"Number of validation data: {self.val_data.shape}") 637 | print(f"Number of test data: {self.test_data.shape}") 638 | 639 | def get_features(self): 640 | return ', '.join([c for c in self.continuous_features if 'svd' not in c]) 641 | 642 | def build_user_item_interactions(self, df, session_interactions, 
session_actions, session_time_diff, training=True): 643 | df_list = [] 644 | label_test_df_list = [] 645 | # parse impressions for train set 646 | for idx, row in enumerate(tqdm(df.itertuples())): 647 | if row.user_id not in self.past_interaction_dict: 648 | self.past_interaction_dict[row.user_id] = [self.transformed_dummy_item] * self.config.sequence_length 649 | # if row.session_id not in self.past_interaction_dict_sess: 650 | # self.past_interaction_dict_sess[row.session_id] = [self.transformed_dummy_item] * self.config.sess_length 651 | if row.session_id not in self.last_click_sess_dict: 652 | self.last_click_sess_dict[row.session_id] = self.transformed_dummy_item 653 | 654 | if row.session_id not in self.last_impressions_dict: 655 | self.last_impressions_dict[row.session_id] = None 656 | 657 | if row.session_id not in self.sess_last_imp_idx_dict: 658 | self.sess_last_imp_idx_dict[row.session_id] = DUMMY_IMPRESSION_INDEX 659 | 660 | if row.session_id not in self.sess_last_price_dict: 661 | self.sess_last_price_dict[row.session_id] = None 662 | 663 | if row.session_id not in self.sess_time_diff_dict: 664 | self.sess_time_diff_dict[row.session_id] = None 665 | 666 | transformed_impressions = self.cat_encoders['item_id'].transform(row.impressions, to_np=True) 667 | 668 | # compute session_interaction 669 | sess_step = row.sess_step 670 | session_id = row.session_id 671 | 672 | current_session_interactions = session_interactions[session_id][:self.config.sess_length+ sess_step -1] # -1 for excluding the current row 673 | current_session_interactions = current_session_interactions[-self.config.sess_length:] 674 | 675 | current_session_actions = session_actions[session_id][:self.config.sess_length+ sess_step -1] 676 | current_session_actions = current_session_actions[-self.config.sess_length:] 677 | 678 | assert len(current_session_interactions) == self.config.sess_length 679 | 680 | if row.last_item in transformed_impressions: 681 | last_interact_index = transformed_impressions.tolist().index(row.last_item) 682 | else: 683 | last_interact_index = DUMMY_IMPRESSION_INDEX 684 | 685 | if row.second_last_item in transformed_impressions: 686 | second_last_interact_index = transformed_impressions.tolist().index(row.second_last_item) 687 | else: 688 | second_last_interact_index = DUMMY_IMPRESSION_INDEX 689 | 690 | # if row.item_id != self.transformed_nan_item: 691 | # training 692 | label = transformed_impressions == row.item_id 693 | # else: 694 | 695 | # last3_impression_idices = [ transformed_impressions.index(imp) for imp in session_interactions[session_id][self.config.sess_length+ sess_step -4:self.config.sess_length+ sess_step -1] if imp in transformed_impressions else DUMMY_IMPRESSION_INDEX] 696 | # # # test 697 | # label = row.pseudo_label 698 | # if len(transformed_impressions) < 25: 699 | # padded_transformed_impressions = np.array(transformed_impressions.tolist() + [self.transformed_dummy_item] * (25 - len(transformed_impressions))) 700 | # else: 701 | # padded_transformed_impressions = transformed_impressions.copy() 702 | interaction_image_indices = np.array(session_actions[session_id][:self.config.sess_length+ sess_step -1]) == self.transformed_interaction_image 703 | interaction_image_item = np.array(session_interactions[session_id][:self.config.sess_length+ sess_step -1])[interaction_image_indices] 704 | sess_unique_items, counts = np.unique(interaction_image_item, return_counts=True) 705 | interaction_image_count_dict = dict(zip(sess_unique_items, counts)) 706 | 707 | 708 | 
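# the same mask-and-count pattern as the image block above is repeated below for
# 'interaction item deals' and for clickouts: pick the session steps with that action,
# take the corresponding item ids, and count them with np.unique(..., return_counts=True).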
interaction_deals_indices = np.array(session_actions[session_id][:self.config.sess_length+ sess_step -1]) == self.transformed_interaction_deals 709 | interaction_deals_item = np.array(session_interactions[session_id][:self.config.sess_length+ sess_step -1])[interaction_deals_indices] 710 | sess_unique_deals_items, counts = np.unique(interaction_deals_item, return_counts=True) 711 | interaction_deals_count_dict = dict(zip(sess_unique_deals_items, counts)) 712 | 713 | 714 | interaction_clickout_indices = np.array(session_actions[session_id][:self.config.sess_length+ sess_step -1]) == self.transformed_clickout_action 715 | interaction_clickout_item = np.array(session_interactions[session_id][:self.config.sess_length+ sess_step -1])[interaction_clickout_indices] 716 | sess_unique_clickout_items, counts = np.unique(interaction_clickout_item, return_counts=True) 717 | interaction_clickout_count_dict = dict(zip(sess_unique_clickout_items, counts)) 718 | 719 | finite_time_diff_indices = np.isfinite(session_time_diff[session_id][:sess_step -1]) 720 | finite_time_diff_array = np.array(session_time_diff[session_id][:sess_step -1])[finite_time_diff_indices] 721 | 722 | # don't leak the current clickout info 723 | unleaked_clickout_count = [self.clickout_count_dict[imp] if imp in self.clickout_count_dict else 0 for imp in transformed_impressions] 724 | unleaked_clickout_count = [unleaked_clickout_count[idx] -1 if imp == row.item_id else unleaked_clickout_count[idx] for idx, imp in enumerate(transformed_impressions)] 725 | 726 | # unleaked_platform_clickout_count = [self.platform_clickout_count[row.platform, imp] if (row.platform, imp) in self.platform_clickout_count else 0 for imp in transformed_impressions] 727 | # unleaked_platform_clickout_count = [unleaked_platform_clickout_count[idx] -1 if imp == row.item_id else unleaked_platform_clickout_count[idx] for idx, imp in enumerate(transformed_impressions)] 728 | 729 | other_is_interacted = [imp in session_interactions[session_id][:self.config.sess_length+ sess_step -1] for imp in transformed_impressions] 730 | padded_other_is_interacted = other_is_interacted + [False] * (25 - len(other_is_interacted)) 731 | 732 | other_is_clicked = [imp in self.past_interaction_dict[row.user_id] for imp in transformed_impressions] 733 | padded_other_is_clicked = other_is_clicked + [False] * (25 - len(other_is_clicked)) 734 | 735 | 736 | unpad_interactions = session_interactions[session_id][self.config.sess_length:self.config.sess_length+ sess_step -1] 737 | 738 | 739 | unique_interaction = pd.unique(session_interactions[session_id][:self.config.sess_length+ sess_step -1]) 740 | 741 | # time elapse of within two steps for each item before the clickout 742 | item_time_elapse_dict = {} 743 | 744 | for it, elapse in zip(unpad_interactions[:-1], session_time_diff[session_id][1:sess_step -1]): 745 | if it not in item_time_elapse_dict: #or elapse > item_time_elapse_dict[it]: 746 | 747 | item_time_elapse_dict[it] = elapse 748 | else: 749 | item_time_elapse_dict[it] += elapse 750 | 751 | 752 | if len(transformed_impressions) < 25: 753 | padded_transformed_impressions = np.array(transformed_impressions.tolist() + [self.transformed_dummy_item] * (25 - len(transformed_impressions))) 754 | else: 755 | padded_transformed_impressions = transformed_impressions.copy() 756 | # padded_transformed_impressions = np.array([transformed_impressions[0]] * 2 + transformed_impressions.tolist() + [transformed_impressions[-1]] * 2) 757 | padded_prices = [ row.prices[0]] * 2 + row.prices + 
[row.prices[-1]]*2 758 | price_rank = compute_rank(row.prices) 759 | current_rows = np.zeros([len(row.impressions), 41], dtype=object) 760 | current_rows[:, 0] = row.user_id 761 | current_rows[:, 1] = transformed_impressions 762 | current_rows[:, 2] = label 763 | current_rows[:, 3] = row.session_id 764 | current_rows[:, 4] = [np.array(self.past_interaction_dict[row.user_id])] * len(row.impressions) 765 | current_rows[:, 5] = price_rank 766 | current_rows[:, 6] = row.city 767 | current_rows[:, 7] = row.last_item 768 | 769 | # impression index 770 | current_rows[:, 8] = np.arange(len(transformed_impressions)) 771 | current_rows[:, 9] = row.step 772 | current_rows[:, 10] = row.id 773 | 774 | current_rows[:, 11] = [np.array(current_session_interactions)] * len(row.impressions) 775 | current_rows[:, 12] = [np.array(current_session_actions)] * len(row.impressions) 776 | current_rows[:, 13] = MinMaxScaler().fit_transform(np.array(row.prices).reshape(-1,1)).reshape(-1) 777 | current_rows[:, 14] = row.prices 778 | 779 | # last click item id 780 | current_rows[:, 15] = self.last_click_sess_dict[row.session_id] 781 | 782 | # equal_last_impressions 783 | current_rows[:, 16] = self.last_impressions_dict[row.session_id] == transformed_impressions.tolist() 784 | 785 | # impression index of last clicked item 786 | current_rows[:, 17] = self.sess_last_imp_idx_dict[row.session_id] 787 | 788 | #impression index of last interacted item 789 | current_rows[:, 18] = last_interact_index 790 | 791 | # price difference with last interacted item 792 | current_rows[:, 19] = row.prices - self.sess_last_price_dict[row.session_id] if self.sess_last_price_dict[row.session_id] else 0 793 | 794 | 795 | current_rows[:, 20] = self.sess_last_price_dict[row.session_id] if self.sess_last_price_dict[row.session_id] else 0 796 | current_rows[:, 21] = row.prices / self.sess_last_price_dict[row.session_id] if self.sess_last_price_dict[row.session_id] else 0 797 | current_rows[:, 22] = [ padded_prices[i:i+5] for i in range(len(row.impressions))] 798 | 799 | current_rows[:, 23] = [np.concatenate([padded_transformed_impressions[:i], padded_transformed_impressions[i+1:]]) for i in range(len(row.impressions))] 800 | current_rows[:, 24] = row.city_platform 801 | 802 | # if that item has been clicked by the current user 803 | current_rows[:, 25] = [imp in self.past_interaction_dict[row.user_id] for imp in transformed_impressions] 804 | 805 | # if that item has been interaced in the current session 806 | current_rows[:, 26] = [imp in session_interactions[session_id][:self.config.sess_length+ sess_step -1] for imp in transformed_impressions] 807 | 808 | # note that the impressions here was not transformed 809 | current_rows[:, 27] = [self.item_popularity_dict[imp] for imp in row.impressions] 810 | 811 | current_rows[:, 28] = [1 if imp in interaction_image_count_dict else 0 for imp in transformed_impressions] 812 | current_rows[:, 29] = [1 if imp in interaction_deals_count_dict else 0 for imp in transformed_impressions] 813 | 814 | current_rows[:, 30] = [self.interaction_count_dict[imp] if imp in self.interaction_count_dict else 0 for imp in transformed_impressions] 815 | current_rows[:, 31] = unleaked_clickout_count 816 | current_rows[:, 32] = [self.past_interaction_dict[row.user_id][::-1].index(imp) if imp in self.past_interaction_dict[row.user_id] else 0 for imp in transformed_impressions] 817 | current_rows[:, 33] = [np.array(padded_other_is_interacted)] * len(row.impressions) 818 | current_rows[:, 34] = 
[np.array(padded_other_is_clicked)] * len(row.impressions) 819 | current_rows[:, 35] = transformed_impressions == row.last_item 820 | current_rows[:, 36] = np.argsort(np.argsort(unleaked_clickout_count)) 821 | current_rows[:, 37] = GaussRankScaler().fit_transform(row.prices) 822 | current_rows[:, 38] = np.mean(current_rows[:, 30]) 823 | current_rows[:, 39] = np.mean(current_rows[:, 28]) 824 | current_rows[:, 40] = np.mean(current_rows[:, 26]) 825 | 826 | # current_rows[:, 41] = np.mean(finite_time_diff_array) 827 | # current_rows[:, 41] = np.std(current_rows[:, 30]) 828 | # current_rows[:, 41] = 2 * last_interact_index - second_last_interact_index 829 | 830 | 831 | # current_rows[:, 41] = second_last_interact_index 832 | 833 | #TODO: Rank of statistics 834 | 835 | # print(unleaked_platform_clickout_count) 836 | 837 | # current_rows[:, 35] = [session_interactions[session_id][:self.config.sess_length+ sess_step -1][::-1].index(imp) if imp in session_interactions[session_id][:self.config.sess_length+ sess_step -1] else 0 for imp in transformed_impressions] 838 | # for i in range(35, 42): 839 | # current_rows[:, i] = np.mean(current_rows[:, i-10]) 840 | 841 | # current_rows[:, 29] = [interaction_clickout_count_dict[imp] if imp in interaction_clickout_count_dict else 0 for imp in transformed_impressions] 842 | 843 | 844 | # neighboring item 845 | # current_rows[:, 23] = [np.concatenate([padded_transformed_impressions[:i], padded_transformed_impressions[i+1:]]) for i in range(len(row.impressions))] 846 | 847 | 848 | # current_rows[:, 20] = row.prices - np.concatenate([row.prices[1:], [row.prices[-1]]], axis=0) 849 | # current_rows[:, 21] = row.prices - np.concatenate([[row.prices[0]], row.prices[:-1]], axis=0) 850 | 851 | # current_rows[:, 17] = row.step - self.sess_last_step_dict[row.session_id] if self.sess_last_step_dict[row.session_id] else 0 852 | 853 | # back pad transformed impressions 854 | 855 | 856 | 857 | # current_rows[:, 16] = [np.delete(np.arange(25), i) for i in range(len(row.impressions))] 858 | # print(self.last_click_sess_dict[row.session_id], self.last_impressions_dict[row.session_id] == transformed_impressions.tolist()) 859 | 860 | if training or row.item_id == self.transformed_nan_item: 861 | 862 | df_list.append(current_rows) 863 | else: 864 | label_test_df_list.append(current_rows) 865 | #include both labeled and pseudo-labelled 866 | 867 | 868 | 869 | # pad current item_id to default dict 870 | self.past_interaction_dict[row.user_id] = self.past_interaction_dict[row.user_id][1:] 871 | self.past_interaction_dict[row.user_id].append(row.item_id) 872 | 873 | 874 | self.last_click_sess_dict[row.session_id] = row.item_id 875 | self.last_impressions_dict[row.session_id] = transformed_impressions.tolist() 876 | self.sess_last_step_dict[row.session_id] = row.step 877 | self.sess_time_diff_dict[row.session_id] = row.timestamp 878 | 879 | 880 | 881 | # update last impression index 882 | if row.item_id != self.transformed_nan_item: 883 | self.sess_last_imp_idx_dict[row.session_id] = (transformed_impressions == row.item_id).tolist().index(True) 884 | self.sess_last_price_dict[row.session_id] = np.array(row.prices)[ transformed_impressions == row.item_id ][0] 885 | 886 | 887 | data = np.vstack(df_list) 888 | dtype_dict = {"city":"int32", "last_item":"int32", 'impression_index':'int32', "step":"int32","id":"int32", "user_id":"int32", 889 | "item_id":"int32", "label": "int32", "price_rank":"int32", "mm_price":"float32", 'price':'float32', "last_click_item":"int32", 
"equal_last_impressions":"int8", 'last_click_impression':'int16', 'last_interact_index':'int16', 'price_diff':'float32','last_price':'float32','price_ratio':'float32','city_platform':'int32', 'is_clicked':'int8', 'is_interacted':'int8','item_popularity':'int32', 'is_interacted_image':'int8','is_interacted_deals':'int8','interaction_count':'int32','clickout_count':'int32','click_diff':'float32','equal_last_item':'int8','global_clickout_count_rank':'int8','rg_price':'float32','interaction_count_avg':'float32','avg_is_interacted_image':'float32','avg_is_interacted':'float32'} 890 | df_columns= ['user_id', 'item_id', 'label', 'session_id', 'past_interactions', 'price_rank', 'city', 'last_item', 'impression_index', 'step', 'id', 'past_interactions_sess', 'past_actions_sess', 'mm_price','price','last_click_item','equal_last_impressions', 'last_click_impression','last_interact_index','price_diff','last_price','price_ratio','neighbor_prices','other_item_ids','city_platform', 'is_clicked', 'is_interacted', 'item_popularity','is_interacted_image','is_interacted_deals','interaction_count','clickout_count','click_diff','other_is_interacted','other_is_clicked','equal_last_item','global_clickout_count_rank','rg_price','interaction_count_avg','avg_is_interacted_image', 'avg_is_interacted'] 891 | df = pd.DataFrame(data, columns=df_columns) 892 | df = df.astype(dtype= dtype_dict) 893 | if training: 894 | return df 895 | else: 896 | label_test = np.vstack(label_test_df_list) 897 | label_test = pd.DataFrame(label_test, columns=df_columns) 898 | label_test = label_test.astype(dtype= dtype_dict) 899 | return df, label_test 900 | def instance_a_train_loader(self): 901 | 902 | 903 | train_data = self.train_data 904 | 905 | return NNDataLoader(train_data, self.config, shuffle=True, batch_size=self.config.batch_size, continuous_features=self.continuous_features) 906 | def evaluate_data_valid(self): 907 | val_data = self.val_data 908 | return NNDataLoader(val_data, self.config, shuffle=False, batch_size=self.config.batch_size, continuous_features=self.continuous_features) 909 | 910 | def instance_a_test_loader(self): 911 | test_data = self.test_data 912 | return NNDataLoader(test_data, self.config, shuffle=False, batch_size=self.config.batch_size,continuous_features=self.continuous_features) 913 | 914 | 915 | if __name__ =='__main__': 916 | conf = NNConfiguration() 917 | data_gen = NNDataGenerator(conf) 918 | with timer("gen"): 919 | for result in data_gen.instance_a_train_loader(128): 920 | print(result[-1]) 921 | print(torch.LongTensor(result[-1])) 922 | 923 | for result in data_gen.instance_a_train_loader(128): 924 | print(result[-1]) 925 | print(torch.LongTensor(result[-1])) 926 | -------------------------------------------------------------------------------- /src/run_xgb.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import pandas as pd 4 | import pickle 5 | import gc 6 | from constant import * 7 | from utils import * 8 | from config import * 9 | import torch 10 | from joblib import Parallel, delayed 11 | from tqdm import tqdm 12 | import xgboost as xgb 13 | import scipy 14 | from sklearn.decomposition import TruncatedSVD 15 | import multiprocessing 16 | from ordered_set import OrderedSet 17 | import os 18 | import itertools 19 | from scipy.sparse import csr_matrix 20 | 21 | # os.environ['CUDA_VISIBLE_DEVICES'] = '2' 22 | configuration = XGBConfiguration() 23 | 24 | model_name='xgb_gic_lic_wosh_lf350_lr002_v2' 25 | 26 | if 
configuration.sub_sample: 27 | model_name += '_140k' 28 | else: 29 | model_name += '_all' 30 | 31 | if configuration.use_test: 32 | model_name += '_ut' 33 | 34 | seed_everything(42) 35 | 36 | ########################################################### Load data ###################################################################### 37 | with open( f'{input_dir}/train_v2.p', 'rb') as f: 38 | train = pickle.load(f) 39 | train['id']= np.arange(len(train)) 40 | 41 | with open(f'{input_dir}/test_v2.p', 'rb') as f: 42 | test = pickle.load(f) 43 | test['id'] = np.arange( len(train), len(train)+ len(test)) 44 | 45 | with open('../input/item_metadata.p', 'rb') as f: 46 | item_meta = pickle.load(f) 47 | item_meta['properties'] = item_meta.properties.apply(lambda x: x.split('|')) 48 | item_meta['item_id'] = item_meta['item_id'].apply(str) 49 | 50 | # whether to use a subsample of the data to speed up evaluation 51 | if configuration.sub_sample: 52 | with open('../input/selected_users_140k.p', 'rb') as f: 53 | selected_users = pickle.load(f) 54 | 55 | train = train.loc[train.user_id.isin(selected_users),:] 56 | 57 | # debug mode uses a tiny sample to check that the code runs end to end 58 | if configuration.debug: 59 | train = train.sample(1000) 60 | test = test.sample(1000) 61 | 62 | with timer("preprocessing"): 63 | 64 | # rename columns 65 | train.rename(columns={'reference': 'item_id', 'action_type': 'action'}, inplace=True) 66 | test.rename(columns={'reference': 'item_id', 'action_type': 'action'}, inplace=True) 67 | 68 | # concatenate the action and the reference as strings, since these references are not actually item ids 69 | train.loc[train.action=='change of sort order','action'] = train.loc[train.action=='change of sort order'].apply(lambda row: row.action + str(row.item_id), axis=1) 70 | test.loc[test.action=='change of sort order','action'] = test.loc[test.action=='change of sort order'].apply(lambda row: row.action + str(row.item_id), axis=1) 71 | 72 | 73 | train.loc[train.action=='filter selection','action'] = train.loc[train.action=='filter selection'].apply(lambda row: row.action + str(row.item_id), axis=1) 74 | test.loc[test.action=='filter selection','action'] = test.loc[test.action=='filter selection'].apply(lambda row: row.action + str(row.item_id), axis=1) 75 | 76 | 77 | # wipe out the item id associated with these actions, for the same reason as above 78 | train.loc[train.action.str.contains('change of sort order'), 'item_id'] = DUMMY_ITEM 79 | test.loc[test.action.str.contains('change of sort order'), 'item_id'] = DUMMY_ITEM 80 | 81 | train.loc[train.action.str.contains('search for poi'), 'item_id'] = DUMMY_ITEM 82 | test.loc[test.action.str.contains('search for poi'), 'item_id'] = DUMMY_ITEM 83 | 84 | train.loc[train.action.str.contains('filter selection'), 'item_id'] = DUMMY_ITEM 85 | test.loc[test.action.str.contains('filter selection'), 'item_id'] = DUMMY_ITEM 86 | 87 | train.loc[train.action.str.contains('search for destination'), 'item_id'] = DUMMY_ITEM 88 | test.loc[test.action.str.contains('search for destination'), 'item_id'] = DUMMY_ITEM 89 | 90 | # remove training examples where the clicked item is not in the impression list 91 | train['in_impressions'] = True 92 | train.loc[~train.impressions.isna(), 'in_impressions'] = train.loc[~train.impressions.isna()].apply(lambda row:row.item_id in row.impressions.split('|'), axis=1) 93 | train = train.loc[train.in_impressions].drop('in_impressions', axis=1).reset_index(drop=True) 94 | 95 | test['in_impressions'] = True 96 | test.loc[(~test.impressions.isna()) & (~test.item_id.isna()), 'in_impressions'] = test.loc[(~test.impressions.isna())& (~test.item_id.isna())].apply(lambda row:row.item_id in row.impressions.split('|'), axis=1) 97 | test = test.loc[test.in_impressions].drop('in_impressions', axis=1).reset_index(drop=True) 98 | 99 | # parse the impressions and prices lists from strings to lists 100 | train['item_id'] = train['item_id'].apply(str) 101 | train.loc[~train.impressions.isna(),'impressions'] = train.loc[~train.impressions.isna()].impressions.apply(lambda x: x.split('|')) 102 | train.loc[~train.prices.isna(), 'prices'] = train.loc[~train.prices.isna()].prices.apply(lambda x: x.split('|')).apply(lambda x: [int(p) for p in x]) 103 | 104 | test['item_id'] = test['item_id'].apply(str) 105 | test.loc[~test.impressions.isna(),'impressions'] = test.loc[~test.impressions.isna()].impressions.apply(lambda x: x.split('|')) 106 | test.loc[~test.prices.isna(),'prices'] = test.loc[~test.prices.isna()].prices.apply(lambda x: x.split('|')).apply(lambda x: [int(p) for p in x]) 107 | 108 | # compute the last interacted item by shifting the item_id by 1 position 109 | train['last_item'] = np.nan 110 | test['last_item'] = np.nan 111 | 112 | train_shifted_item_id = [DUMMY_ITEM] + train.item_id.values[:-1].tolist() 113 | test_shifted_item_id = [DUMMY_ITEM] + test.item_id.values[:-1].tolist() 114 | 115 | # assign the 1-step shifted ids as the last interacted item 116 | train['last_item'] = train_shifted_item_id 117 | test['last_item'] = test_shifted_item_id 118 | 119 | train_shifted_item_id = [DUMMY_ITEM] *2 + train.item_id.values[:-2].tolist() 120 | test_shifted_item_id = [DUMMY_ITEM] *2 + test.item_id.values[:-2].tolist() 121 | 122 | # assign the 2-step shifted ids as the second last interacted item 123 | train['second_last_item'] = train_shifted_item_id 124 | test['second_last_item'] = test_shifted_item_id 125 | 126 | train_shifted_item_id = [DUMMY_ITEM] *3 + train.item_id.values[:-3].tolist() 127 | test_shifted_item_id = [DUMMY_ITEM] *3 + test.item_id.values[:-3].tolist() 128 | 129 | train['third_last_item'] = train_shifted_item_id 130 | test['third_last_item'] = test_shifted_item_id 131 | 132 | # mask out the shifted last items when the interaction comes first in its session 133 | train['step_rank'] = train.groupby('session_id')['step'].rank(method='max', ascending=True) 134 | test['step_rank'] = test.groupby('session_id')['step'].rank(method='max', ascending=True) 135 | 136 | # fill the invalid shifted last-n items with a dummy constant 137 | train.loc[(train.step_rank == 1) & (train.action == 'clickout item'), 'last_item'] = DUMMY_ITEM 138 | test.loc[(test.step_rank == 1) & (test.action == 'clickout item'), 'last_item'] = DUMMY_ITEM 139 | 140 | train.loc[(train.step_rank == 2) & (train.action == 'clickout item'), 'second_last_item'] = DUMMY_ITEM 141 | test.loc[(test.step_rank == 2) & (test.action == 'clickout item'), 'second_last_item'] = DUMMY_ITEM 142 | 143 | train.loc[(train.step_rank == 3) & (train.action == 'clickout item'), 'third_last_item'] = DUMMY_ITEM 144 | test.loc[(test.step_rank == 3) & (test.action == 'clickout item'), 'third_last_item'] = DUMMY_ITEM 145 | 146 | 147 | # ignore this 148 | keep_columns = ['session_id', 'user_id','item_id', 'impressions','prices', 'city', 'step', 'last_item'] 149 | all_cat_columns = ['item_id', 'city', 'platform', 'device','country','country_platform','action','device_platform'] 150 | 151 | 152 | # generate country from city 153 | train['country'] = train.city.apply(lambda x:x.split(',')[-1]) 154 | test['country'] = test.city.apply(lambda x:x.split(',')[-1]) 155 | 156 | # concatenate country and platform strings as a new feature 157 | train['country_platform'] = train.apply(lambda row: row.country + row.platform, axis=1) 158 | test['country_platform'] = test.apply(lambda row: row.country + row.platform, axis=1) 159 | 160 | train['device_platform'] = train.apply(lambda row: row.device + row.platform, axis=1) 161 | test['device_platform'] = test.apply(lambda row: row.device + row.platform, axis=1) 162 | # filter out rows where the reference is not present in the impressions 163 | # train = train.loc[train.apply(lambda row:row.item_id in row.impressions, axis=1),:] 164 | 165 | print("train shape",train.shape) 166 | 167 | 168 | # concat train and test 169 | data = pd.concat([train, test], axis=0) 170 | data = data.reset_index(drop=True) 171 | 172 | # compute a dictionary that maps session id to the sequence of item ids in that session 173 | train_session_interactions = dict(train.groupby('session_id')['item_id'].apply(list)) 174 | test_session_interactions = dict(test.groupby('session_id')['item_id'].apply(list)) 175 | 176 | 177 | # compute a dictionary that maps session id to the sequence of actions in that session 178 | train_session_actions = dict(train.groupby('session_id')['action'].apply(list)) 179 | test_session_actions = dict(test.groupby('session_id')['action'].apply(list)) 180 | 181 | # compute the session step, since the "step" column in some sessions is not correctly ordered 182 | train['sess_step'] = train.groupby('session_id')['timestamp'].rank(method='max').apply(int) 183 | test['sess_step'] = test.groupby('session_id')['timestamp'].rank(method='max').apply(int) 184 | 185 | 186 | 187 | 188 | data_feature = data.loc[:,['id','step','session_id', 'timestamp','platform','country']].copy() 189 | 190 | # compute the time difference between each step 191 | data_feature['time_diff'] = data.groupby('session_id')['timestamp'].diff() 192 | 193 | # compute the difference of time difference between each step 194 | data_feature['time_diff_diff'] = data_feature.groupby('session_id')['time_diff'].diff() 195 | 196 | # compute the difference of the difference of time difference between each step 197 | data_feature['time_diff_diff_diff'] = data_feature.groupby('session_id')['time_diff_diff'].diff() 198 | 199 | # compute the time difference from 2 steps back 200 | data_feature['time_diff_2'] = data.groupby('session_id')['timestamp'].diff().shift(1) 201 | 202 | # compute the time difference from 3 steps back 203 | data_feature['time_diff_3'] = data.groupby('session_id')['timestamp'].diff().shift(2) 204 | 205 | data_feature['hour']= pd.to_datetime(data_feature.timestamp, unit='s').dt.hour//4 206 | 207 | # map platform to country 208 | data_feature['mapped_country'] = data_feature.platform.apply(platform2country) 209 | 210 | 211 | # load the precomputed country-to-UTC-offset mapping obtained with geopy 212 | with open('../input/country2offsets_dict.p','rb') as f: 213 | platform_country2offsets_dict = pickle.load(f) 214 | data_feature['platform2country_utc_offsets'] = data_feature.mapped_country.map(platform_country2offsets_dict) 215 | 216 | 217 | # transform the time difference with rank gauss 218 | data_feature['rg_time_diff'] = GaussRankScaler().fit_transform(data_feature['time_diff'].values) 219 | 220 | # compute the log of step 221 | data_feature['step_log'] = np.log1p(data_feature['step']) 222 | 223 | # drop the columns that are no longer needed 224 | data_feature = data_feature.drop(['session_id','step','timestamp','hour','platform','country','mapped_country'], axis=1) 225 | 226 | 227 | 228 | 229 | # merge train, test with data_feature 230 | train = train.merge(data_feature, on='id', how='left') 231 | test = test.merge(data_feature, on='id', how='left') 232 | 233 | 234 | # compute the sequence of time differences in each session 235 | train_session_time_diff = dict(train.groupby('session_id')['time_diff'].apply(list)) 236 | test_session_time_diff = dict(test.groupby('session_id')['time_diff'].apply(list)) 237 | 238 | # encode the categorical features 239 | cat_encoders = {} 240 | for col in all_cat_columns: 241 | cat_encoders[col] = CategoricalEncoder() 242 | 243 | 244 | all_items = [] 245 | for imp in data.loc[~data.impressions.isna()].impressions.tolist() + [data.item_id.apply(str).tolist()] : 246 | all_items += imp 247 | 248 | unique_items = OrderedSet(all_items) 249 | unique_actions = OrderedSet(data.action.values) 250 | 251 | cat_encoders['item_id'].fit(list(unique_items) + [DUMMY_ITEM]) 252 | cat_encoders['action'].fit( list(unique_actions) + [DUMMY_ACTION]) 253 | for col in ['city', 'platform', 'device','country','country_platform', 'device_platform']: 254 | 255 | cat_encoders[col].fit(data[col].tolist() ) 256 | 257 | 258 | # transform all the categorical columns to integer codes 259 | for col in all_cat_columns: 260 | train[col] = cat_encoders[col].transform(train[col].values) 261 | test[col] = cat_encoders[col].transform(test[col].values) 262 | 263 | 264 | # get the encoded ids of specific actions 265 | transformed_clickout_action = cat_encoders['action'].transform(['clickout item'])[0] 266 | transformed_dummy_item = cat_encoders['item_id'].transform([DUMMY_ITEM])[0] 267 | transformed_dummy_action = cat_encoders['action'].transform([DUMMY_ACTION])[0] 268 | transformed_interaction_image = cat_encoders['action'].transform(['interaction item image'])[0] 269 | transformed_interaction_deals = cat_encoders['action'].transform(['interaction item deals'])[0] 270 | transformed_interaction_info = cat_encoders['action'].transform(['interaction item info'])[0] 271 | transformed_interaction_rating = cat_encoders['action'].transform(['interaction item rating'])[0] 272 | 273 | # transform session interactions and pad dummy in front of all of them 274 | for session_id, item_list in train_session_interactions.items(): 275 | train_session_interactions[session_id] = [transformed_dummy_item] * configuration.sess_length + cat_encoders['item_id'].transform(item_list) 276 | 277 | for session_id, item_list in test_session_interactions.items(): 278 | test_session_interactions[session_id] = [transformed_dummy_item] * configuration.sess_length + cat_encoders['item_id'].transform(item_list) 279 | 280 | for session_id, action_list in train_session_actions.items(): 281 | train_session_actions[session_id] = [transformed_dummy_action] * configuration.sess_length + cat_encoders['action'].transform(action_list) 282 | 283 | for session_id, action_list in test_session_actions.items(): 284 | test_session_actions[session_id] = [transformed_dummy_action] * configuration.sess_length + cat_encoders['action'].transform(action_list) 285 | 286 | 287 | ### compute co-occurrence matrix 288 | implicit_train = train.loc[train.action != transformed_clickout_action, :] 289 | implicit_test = test.loc[test.action != transformed_clickout_action, :] 290 | 291 | # get all interacted items in a session 292 | implicit_all = pd.concat([implicit_train , implicit_test], axis=0) 293 | # a list of lists containing items in the same session 294 | co_occ_items = implicit_all.groupby('session_id').item_id.apply(list).to_dict().values() 295 | co_occ_permutes = [list(itertools.permutations(set(items), 2)) for items in co_occ_items] 296 | 297 | # aggregate co-occurrence across sessions 298 | co_occ_coordinates = [] 299 | for coordinates in co_occ_permutes: 300 | co_occ_coordinates += coordinates 301 | 302 | # construct the csr matrix 303 | row, col, values = zip(*((i,j,1) for i,j in co_occ_coordinates )) 304 | co_occ_matrix= csr_matrix((values, (row, col)), shape=(cat_encoders['item_id'].n_elements, cat_encoders['item_id'].n_elements), dtype=np.float32) 305 | 306 | co_occ_matrix_csc = co_occ_matrix.tocsc() 307 | 308 | print("max entry: ", co_occ_matrix.max()) 309 | 310 | 311 | ### compute co-occurrence matrix for the impression list 312 | 313 | # imp_co_occ_items = train.loc[~train.impressions.isna()].impressions.apply(lambda x: cat_encoders['item_id'].transform(x)).values.tolist() + test.loc[~test.impressions.isna()].impressions.apply(lambda x: cat_encoders['item_id'].transform(x)).values.tolist() 314 | # imp_co_occ_permutes = [list(itertools.permutations(set(items), 2)) for items in imp_co_occ_items] 315 | 316 | # # aggregate co-occurrence across sessions 317 | # imp_co_occ_coordinates = [] 318 | # for coordinates in imp_co_occ_permutes: 319 | # imp_co_occ_coordinates += coordinates 320 | 321 | # # construct csr 322 | # row, col, values = zip(*((i,j,1) for i,j in imp_co_occ_coordinates )) 323 | # imp_co_occ_matrix= csr_matrix((values, (row, col)), shape=(cat_encoders['item_id'].n_elements, cat_encoders['item_id'].n_elements), dtype=np.float32) 324 | 325 | # imp_co_occ_matrix_csc = imp_co_occ_matrix.tocsc() 326 | 327 | # print("max entry: ", imp_co_occ_matrix.max()) 328 | 329 | # categorically encode the last, second last, and third last items 330 | train['last_item'] = cat_encoders['item_id'].transform(train['last_item'].values) 331 | test['last_item'] = cat_encoders['item_id'].transform(test['last_item'].values) 332 | 333 | train['second_last_item'] = cat_encoders['item_id'].transform(train.second_last_item.values) 334 | test['second_last_item'] = cat_encoders['item_id'].transform(test.second_last_item.values) 335 | 336 | train['third_last_item'] = cat_encoders['item_id'].transform(train.third_last_item.values) 337 | test['third_last_item'] = cat_encoders['item_id'].transform(test.third_last_item.values) 338 | 339 | 340 | 341 | 342 | # generate item properties features 343 | item_meta = item_meta.loc[item_meta.item_id.isin(unique_items),:] 344 | # item_meta multi-hot 345 | item_meta['item_id'] = cat_encoders['item_id'].transform(item_meta['item_id'].values) 346 | item_meta['star'] = np.nan 347 | item_meta.loc[item_meta.properties.apply(lambda x: '1 Star' in x), 'star'] = 1 348 | item_meta.loc[item_meta.properties.apply(lambda x: '2 Star' in x), 'star'] = 2 349 | item_meta.loc[item_meta.properties.apply(lambda x: '3 Star' in x), 'star'] = 3 350 | item_meta.loc[item_meta.properties.apply(lambda x: '4 Star' in x), 'star'] = 4 351 | item_meta.loc[item_meta.properties.apply(lambda x: '5 Star' in x), 'star'] = 5 352 | item_meta.loc[(item_meta.star.isna()) & (item_meta.properties.apply(lambda y: 'Excellent Rating' in y) ), 'star'] = 9 353 | item_meta.loc[(item_meta.star.isna()) & (item_meta.properties.apply(lambda y: 'Very Good Rating' in y) ), 'star'] = 8 354 | item_meta.loc[(item_meta.star.isna()) & (item_meta.properties.apply(lambda y: 'Good Rating' in y) ), 'star'] = 7 355 | item_meta.loc[(item_meta.star.isna()) & 
(item_meta.properties.apply(lambda y: 'Satisfactory Rating' in y) ), 'star'] = 6 356 | 357 | item_meta['rating'] = np.nan 358 | item_meta.loc[item_meta.properties.apply(lambda x: 'Satisfactory Rating' in x), 'rating'] = 7.0 359 | item_meta.loc[item_meta.properties.apply(lambda x: 'Good Rating' in x), 'rating'] = 7.5 360 | item_meta.loc[item_meta.properties.apply(lambda x: 'Very Good Rating' in x), 'rating'] = 8.0 361 | item_meta.loc[item_meta.properties.apply(lambda x: 'Excellent Rating' in x), 'rating'] = 8.5 362 | 363 | # get binary properties feature 364 | item_properties_df = pd.DataFrame() 365 | item_properties_df['item_id'] = item_meta.item_id 366 | item_properties_df['num_properties'] = item_meta.properties.apply(len) 367 | item_properties_df['star'] = item_meta.star 368 | item_properties_df['item_Beach'] = item_meta.properties.apply(lambda x: 'Beach' in x).astype(np.float16) 369 | item_properties_df['item_Bed & Breakfast'] = item_meta.properties.apply(lambda x: 'Bed & Breakfast' in x).astype(np.float16) 370 | item_properties_df['rating'] = item_meta['rating'] 371 | 372 | 373 | item_star_map = item_properties_df.loc[:,['item_id','star']].set_index('item_id').to_dict()['star'] 374 | item_rating_map = item_properties_df.loc[:,['item_id','rating']].set_index('item_id').to_dict()['rating'] 375 | 376 | 377 | 378 | del item_meta 379 | gc.collect() 380 | 381 | # ignore filter_df , not using, consume huge memory yet increase a little 382 | filter_df = data.loc[ ~data.current_filters.isna(), ['id', 'current_filters']] 383 | filter_df['current_filters'] = filter_df.current_filters.apply(lambda x:x.split('|')) 384 | 385 | # filter_df.loc[filter_df.current_filters.apply(lambda x: '3 Star' in x), 'nights'] = 3 386 | filter_df['nights']=np.nan 387 | filter_df.loc[filter_df.current_filters.apply(lambda x: '2 Nights' in x), 'nights'] = 1 388 | filter_df.loc[filter_df.current_filters.apply(lambda x: '3 Nights' in x), 'nights'] = 2 389 | 390 | filter_set = list(set(np.hstack(filter_df['current_filters'].to_list()))) 391 | 392 | cat_encoders['filters'] = CategoricalEncoder() 393 | cat_encoders['filters'].fit(filter_set) 394 | 395 | # get binary filter feature 396 | filters_df = pd.DataFrame() 397 | filters_df['id'] = filter_df.id 398 | filters_df['num_filters'] = filter_df.current_filters.apply(len) 399 | filters_df['breakfast_included'] = filter_df.current_filters.apply( lambda x: 'Breakfast Included' in x).astype(np.float16) 400 | filters_df['filters_Sort By Price'] = filter_df.current_filters.apply( lambda x: 'Sort by Price' in x).astype(np.float16) 401 | filters_df['filters_Sort By Popularity'] = filter_df.current_filters.apply( lambda x: 'Sort By Popularity' in x).astype(np.float16) 402 | 403 | 404 | 405 | # compute interaction image count for each item across train/ test 406 | interaction_image_item_ids = train.loc[train.action == transformed_interaction_image, :].drop_duplicates(subset=['session_id','item_id','action']).item_id.tolist() + test.loc[test.action == transformed_interaction_image, :].drop_duplicates(subset=['session_id','item_id','action']).item_id.tolist() 407 | unique_interaction_image_items, counts = np.unique(interaction_image_item_ids, return_counts=True) 408 | global_image_count_dict = dict(zip(unique_interaction_image_items, counts)) 409 | 410 | # compute interaction count for each item across train/ test 411 | interaction_item_ids = train.loc[train.action != transformed_clickout_action, :].drop_duplicates(subset=['session_id','item_id','action']).item_id.tolist() + 
test.loc[test.action != transformed_clickout_action, :].drop_duplicates(subset=['session_id','item_id','action']).item_id.tolist() 412 | unique_interaction_items, counts = np.unique(interaction_item_ids, return_counts=True) 413 | global_interaction_count_dict = dict(zip(unique_interaction_items, counts)) 414 | 415 | # compute interaction deals count for each item across train/ test 416 | interaction_deals_item_ids = train.loc[train.action == transformed_interaction_deals, :].drop_duplicates(subset=['session_id','item_id','action']).item_id.tolist() + test.loc[test.action == transformed_interaction_deals, :].drop_duplicates(subset=['session_id','item_id','action']).item_id.tolist() 417 | unique_interaction_deals_items, counts = np.unique(interaction_deals_item_ids, return_counts=True) 418 | global_deals_count_dict = dict(zip(unique_interaction_deals_items, counts)) 419 | 420 | 421 | # compute step rank to identify the last row in each session for train/ val split 422 | train = train.loc[train.action == transformed_clickout_action,:] 423 | test = test.loc[test.action == transformed_clickout_action,:] 424 | train['step_rank'] = train.groupby('session_id')['step'].rank(method='max', ascending=False) 425 | 426 | # compute the impression count for each item 427 | item_ids = np.hstack([np.hstack(train['impressions'].values), np.hstack(test.impressions.values)]) 428 | unique_items, counts = np.unique(item_ids, return_counts=True) 429 | impression_count_dict = dict(zip(unique_items, counts)) 430 | 431 | # compute the rank gauss transformed prices 432 | unique_prices = np.unique(np.hstack([np.hstack(train.prices.values), np.hstack(test.prices.values)]) ) 433 | rg_unique_prices = GaussRankScaler().fit_transform(unique_prices) 434 | price_rg_price_dict = dict(zip(unique_prices, rg_unique_prices)) 435 | 436 | 437 | #train/ val split 438 | if configuration.debug: 439 | val = train.loc[train.step_rank == 1,:].iloc[:5] 440 | else: 441 | val = train.loc[train.step_rank == 1,:].iloc[:50000] 442 | 443 | val_index = val.index 444 | train = train.loc[~train.index.isin(val_index),:] 445 | 446 | train = train.drop('step_rank', axis=1) 447 | val = val.drop('step_rank', axis=1) 448 | 449 | 450 | # get the encoded nan item 451 | transformed_nan_item = cat_encoders['item_id'].transform(['nan'])[0] 452 | 453 | 454 | 455 | 456 | from collections import defaultdict, Counter 457 | session_clickout_count_dict = {} 458 | past_interaction_dict = {} 459 | last_click_sess_dict = {} 460 | last_impressions_dict = {} 461 | sess_last_imp_idx_dict={} 462 | sess_last_price_dict = {} 463 | sess_time_diff_dict ={} 464 | sess_step_diff_dict = {} 465 | 466 | cumulative_click_dict = defaultdict(lambda : 0) 467 | 468 | 469 | 470 | 471 | def parse_impressions(df, session_interactions, session_actions, session_time_diff, training=True): 472 | # parse the data into a binary classification task, generate 1 example for each item in the impression list 473 | df_list = [] 474 | label_test_df_list = [] 475 | # parse impressions for train set 476 | past_interaction_rows = [] 477 | past_interaction_columns = [] 478 | for idx, row in enumerate(tqdm(df.itertuples())): 479 | 480 | if row.session_id not in session_clickout_count_dict: 481 | session_clickout_count_dict[row.session_id] = 0 482 | 483 | if row.user_id not in past_interaction_dict: 484 | past_interaction_dict[row.user_id] = [] 485 | 486 | 487 | sess_step = row.sess_step 488 | session_id = row.session_id 489 | 490 | # compute the categorically encoded impression list 491 | 
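# (to_np=True presumably makes CategoricalEncoder, defined in utils.py, return a numpy
# array rather than a list, so the fancy indexing into co_occ_matrix and the element-wise
# comparisons further down can operate on it directly)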
transformed_impressions = cat_encoders['item_id'].transform(row.impressions, to_np=True) 492 | 493 | current_rows = np.zeros([len(row.impressions), 66], dtype=object) 494 | 495 | # compute rank of price this clickout 496 | price_rank = compute_rank(row.prices) 497 | 498 | #compute the number of interactions associated with the last interacted item in this session 499 | equal_last_item_indices = np.array(session_interactions[session_id][:configuration.sess_length+ sess_step -1]) == row.last_item 500 | last_item_interaction = len(set(np.array(session_actions[session_id][:configuration.sess_length+ sess_step -1])[equal_last_item_indices])) 501 | 502 | #compute the local interaction count for each item id 503 | interaction_indices = np.array(session_actions[session_id][:configuration.sess_length+ sess_step -1]) != transformed_clickout_action 504 | interaction_item = np.array(session_interactions[session_id][:configuration.sess_length+ sess_step -1])[interaction_indices] 505 | sess_unique_items, counts = np.unique(interaction_item, return_counts=True) 506 | interaction_count_dict = dict(zip(sess_unique_items, counts)) 507 | 508 | #compute the local interaction image count for each item id 509 | interaction_image_indices = np.array(session_actions[session_id][:configuration.sess_length+ sess_step -1]) == transformed_interaction_image 510 | interaction_image_item = np.array(session_interactions[session_id][:configuration.sess_length+ sess_step -1])[interaction_image_indices] 511 | sess_unique_image_items, counts = np.unique(interaction_image_item, return_counts=True) 512 | interaction_image_count_dict = dict(zip(sess_unique_image_items, counts)) 513 | 514 | #compute the local interaction deals count for each item id 515 | interaction_deals_indices = np.array(session_actions[session_id][:configuration.sess_length+ sess_step -1]) == transformed_interaction_deals 516 | interaction_deals_item = np.array(session_interactions[session_id][:configuration.sess_length+ sess_step -1])[interaction_deals_indices] 517 | sess_unique_deals_items, counts = np.unique(interaction_deals_item, return_counts=True) 518 | interaction_deals_count_dict = dict(zip(sess_unique_deals_items, counts)) 519 | 520 | #compute the local clickout count for each item id 521 | interaction_clickout_indices = np.array(session_actions[session_id][:configuration.sess_length+ sess_step -1]) == transformed_clickout_action 522 | interaction_clickout_item = np.array(session_interactions[session_id][:configuration.sess_length+ sess_step -1])[interaction_clickout_indices] 523 | sess_unique_clickout_items, counts = np.unique(interaction_clickout_item, return_counts=True) 524 | interaction_clickout_count_dict = dict(zip(sess_unique_clickout_items, counts)) 525 | 526 | #compute the local interaction rating count for each item id 527 | interaction_rating_indices = np.array(session_actions[session_id][:configuration.sess_length+ sess_step -1]) == transformed_interaction_rating 528 | interaction_rating_item = np.array(session_interactions[session_id][:configuration.sess_length+ sess_step -1])[interaction_rating_indices] 529 | sess_unique_rating_items, counts = np.unique(interaction_rating_item, return_counts=True) 530 | interaction_rating_count_dict = dict(zip(sess_unique_rating_items, counts)) 531 | 532 | 533 | # get the time diffference array in this session for later computing the average of it 534 | finite_time_diff_indices = np.isfinite(session_time_diff[session_id][:sess_step -1]) 535 | finite_time_diff_array = 
np.array(session_time_diff[session_id][:sess_step -1])[finite_time_diff_indices] 536 | 537 | # unpad the interactions 538 | unpad_interactions = session_interactions[session_id][configuration.sess_length:configuration.sess_length+ sess_step -1] 539 | unique_interaction = pd.unique(session_interactions[session_id][:configuration.sess_length+ sess_step -1]) 540 | 541 | # time elapse of within two steps for each item before the clickout 542 | item_time_elapse_dict = {} 543 | for it, elapse in zip(unpad_interactions[:-1], session_time_diff[session_id][1:sess_step -1]): 544 | if it not in item_time_elapse_dict: 545 | item_time_elapse_dict[it] = [elapse] 546 | 547 | else: 548 | item_time_elapse_dict[it].append(elapse) 549 | 550 | # compute time_diff for each item in the session 551 | interact_diff = [unpad_interactions[::-1].index(imp) if imp in unpad_interactions else np.nan for imp in transformed_impressions] 552 | item_time_diff = np.array([ sum(session_time_diff[session_id][sess_step - diff -1 :sess_step]) if np.isfinite(diff) else np.nan for diff in interact_diff]) 553 | 554 | target_index = transformed_impressions.tolist().index(row.item_id) if training else np.nan 555 | 556 | #(imp len, num items) 557 | current_co_occ = co_occ_matrix[transformed_impressions,:] 558 | 559 | 560 | #(imp len, num unique items in the session b4 this clickout) 561 | current_co_occ = current_co_occ[:,sess_unique_items].toarray() 562 | 563 | # (1, num unique items in the session b4 this clickout) 564 | # print(current_co_occ.dtype) 565 | 566 | norm = (1 + co_occ_matrix_csc[:, sess_unique_items].sum(axis=0).reshape(-1)) 567 | 568 | # #(imp len, num items) 569 | # imp_current_co_occ = imp_co_occ_matrix[transformed_impressions,:] 570 | 571 | 572 | # #(imp len, num unique items in the session b4 this clickout) 573 | # imp_current_co_occ = imp_current_co_occ[:,sess_unique_items].toarray() 574 | 575 | # # (1, num unique items in the session b4 this clickout) 576 | # # print(current_co_occ.dtype) 577 | 578 | # imp_norm = (1 + imp_co_occ_matrix_csc[:, sess_unique_items].sum(axis=0).reshape(-1)) 579 | 580 | # norm_imp_current_co_occ = imp_current_co_occ / imp_norm 581 | 582 | # the position of the last interacted item in the current impression list 583 | if row.last_item in transformed_impressions: 584 | last_interact_index = transformed_impressions.tolist().index(row.last_item) 585 | else: 586 | last_interact_index = np.nan 587 | 588 | # the position of the second last interacted item in the current impression list 589 | if row.second_last_item in transformed_impressions: 590 | second_last_interact_index = transformed_impressions.tolist().index(row.second_last_item) 591 | else: 592 | second_last_interact_index = np.nan 593 | 594 | # the position of the third last interacted item in the current impression list 595 | if row.third_last_item in transformed_impressions: 596 | third_last_interact_index = transformed_impressions.tolist().index(row.third_last_item) 597 | else: 598 | third_last_interact_index = np.nan 599 | 600 | # initialize dictionaries 601 | if row.session_id not in last_click_sess_dict: 602 | last_click_sess_dict[row.session_id] = transformed_dummy_item 603 | 604 | if row.session_id not in last_impressions_dict: 605 | last_impressions_dict[row.session_id] = None 606 | 607 | if row.session_id not in sess_last_imp_idx_dict: 608 | sess_last_imp_idx_dict[row.session_id] = DUMMY_IMPRESSION_INDEX 609 | 610 | if row.session_id not in sess_last_price_dict: 611 | sess_last_price_dict[row.session_id] = None 612 | 613 | 
if row.session_id not in sess_time_diff_dict: 614 | sess_time_diff_dict[row.session_id] = None 615 | 616 | if row.session_id not in sess_step_diff_dict: 617 | sess_step_diff_dict[row.session_id] = None 618 | 619 | 620 | # item id 621 | current_rows[:, 0] = transformed_impressions 622 | 623 | # label 624 | current_rows[:, 1] = transformed_impressions == row.item_id 625 | current_rows[:, 2] = row.session_id 626 | 627 | # whether current item id equal to the last interacted item id 628 | current_rows[:, 3] = transformed_impressions == row.last_item 629 | current_rows[:, 4] = price_rank 630 | current_rows[:, 5] = row.platform 631 | current_rows[:, 6] = row.device 632 | current_rows[:, 7] = row.city 633 | current_rows[:, 8] = row.prices 634 | current_rows[:, 9] = row.country 635 | 636 | # impression index 637 | current_rows[:, 10] = np.arange(len(row.impressions)) 638 | current_rows[:, 11] = row.step 639 | current_rows[:, 12] = row.id 640 | 641 | # last_click_item: last clickout item id 642 | current_rows[:, 13] = last_click_sess_dict[row.session_id] 643 | 644 | # equal_last_impressions: current impression list is eactly the same as the last one that the user encountered 645 | current_rows[:, 14] = last_impressions_dict[row.session_id] == transformed_impressions.tolist() 646 | 647 | 648 | current_rows[:, 15] = sess_last_imp_idx_dict[row.session_id] 649 | # last_interact_index 650 | current_rows[:, 16] = last_interact_index 651 | 652 | # price_diff 653 | current_rows[:, 17] = row.prices - sess_last_price_dict[row.session_id] if sess_last_price_dict[row.session_id] else np.nan 654 | 655 | # last_price 656 | current_rows[:, 18] = sess_last_price_dict[row.session_id] if sess_last_price_dict[row.session_id] else np.nan 657 | 658 | # price_ratio 659 | current_rows[:, 19] = row.prices / sess_last_price_dict[row.session_id] if sess_last_price_dict[row.session_id] else np. 
nan 660 | 661 | # clickout_time_diff 662 | current_rows[:, 20] = row.timestamp - sess_time_diff_dict[row.session_id] if sess_time_diff_dict[row.session_id] else np.nan 663 | 664 | # country_platform 665 | current_rows[:, 21] = row.country_platform 666 | 667 | # impression_count 668 | current_rows[:, 22] = [impression_count_dict[imp] for imp in row.impressions] 669 | 670 | # is_interacted: whether that item has been interacted with in the current session 671 | current_rows[:, 23] = [imp in session_interactions[session_id][:configuration.sess_length+ sess_step -1] for imp in transformed_impressions] 672 | 673 | # local_interaction_image_count 674 | current_rows[:, 24] = [interaction_image_count_dict[imp] if imp in interaction_image_count_dict else 0 for imp in transformed_impressions] 675 | # local_interaction_deals_count 676 | current_rows[:, 25] = [interaction_deals_count_dict[imp] if imp in interaction_deals_count_dict else 0 for imp in transformed_impressions] 677 | 678 | # local_interaction_clickout_count 679 | current_rows[:, 26] = [interaction_clickout_count_dict[imp] if imp in interaction_clickout_count_dict else 0 for imp in transformed_impressions] 680 | 681 | # global_interaction_image_count 682 | current_rows[:, 27] = [global_image_count_dict[imp] if imp in global_image_count_dict else 0 for imp in transformed_impressions] 683 | 684 | # global_interaction_deals_count 685 | current_rows[:, 28] = [global_deals_count_dict[imp] if imp in global_deals_count_dict else 0 for imp in transformed_impressions] 686 | 687 | # is_clicked 688 | current_rows[:, 29] = [imp in past_interaction_dict[row.user_id] for imp in transformed_impressions] 689 | 690 | # click_diff 691 | current_rows[:, 30] = [past_interaction_dict[row.user_id][::-1].index(imp) if imp in past_interaction_dict[row.user_id] else np.nan for imp in transformed_impressions] 692 | 693 | # column-wise averages of the previous features (columns 23-29, is_interacted through is_clicked) 694 | for i in range(31, 38): 695 | current_rows[:, i] = np.mean(current_rows[:, i-8]) 696 | 697 | # impression_avg_prices 698 | current_rows[:, 38] = np.mean(row.prices) 699 | current_rows[:, 39] = row.device_platform 700 | 701 | # equal_max_liic: equals the maximum of the local interaction image count 702 | current_rows[:, 40] = np.array(current_rows[:, 24]) == np.max(current_rows[:, 24]) if sum(current_rows[:, 24]) >0 else False 703 | 704 | # num_interacted_items 705 | current_rows[:, 41] = len(np.unique(session_interactions[session_id][:configuration.sess_length+ sess_step -1])) 706 | 707 | # equal_second_last_item 708 | current_rows[:, 42] = transformed_impressions == row.second_last_item 709 | 710 | # last_action 711 | current_rows[:, 43] = session_actions[session_id][configuration.sess_length+ sess_step -2] 712 | 713 | # last_second_last_imp_idx_diff 714 | current_rows[:, 44] = last_interact_index - second_last_interact_index 715 | 716 | # predicted_next_imp_idx (extrapolate the user's gaze: last_interact_index + (last_interact_index - second_last_interact_index)) 717 | current_rows[:, 45] = 2 * last_interact_index - second_last_interact_index 718 | 719 | # list_len 720 | current_rows[:, 46] = len(row.impressions) 721 | 722 | # imp_idx_velocity 723 | current_rows[:, 47] = last_interact_index - 2 * second_last_interact_index + third_last_interact_index 724 | 725 | # time_diff_sess_avg 726 | current_rows[:, 48] = np.mean(finite_time_diff_array) 727 | 728 | # max_time_elapse 729 | current_rows[:, 49] = [ max(item_time_elapse_dict[imp]) if imp in item_time_elapse_dict else np.nan for imp in transformed_impressions] 730 | 
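The max/sum/avg time-elapse features computed around this point aggregate, for each impression item, the time gaps recorded immediately after the user's interactions with that item (via the `item_time_elapse_dict` built earlier in `parse_impressions`); items that were never interacted with get `np.nan`. A minimal, self-contained sketch of that aggregation with toy values — the names and numbers below are illustrative only, not part of `run_xgb.py`:

```
import numpy as np

# Toy session: items interacted with before the clickout, paired with the
# time gap (seconds) observed at the step right after each interaction.
unpad_interactions = [101, 102, 101, 103]
step_time_diffs = [5.0, 12.0, 3.0]  # aligned with unpad_interactions[:-1]

item_time_elapse = {}
for item, elapse in zip(unpad_interactions[:-1], step_time_diffs):
    item_time_elapse.setdefault(item, []).append(elapse)
# item_time_elapse == {101: [5.0, 3.0], 102: [12.0]}

# Per-impression dwell-time features; np.nan when the item was never interacted with.
impressions = [101, 102, 999]
max_elapse = [max(item_time_elapse[i]) if i in item_time_elapse else np.nan for i in impressions]
sum_elapse = [sum(item_time_elapse[i]) if i in item_time_elapse else np.nan for i in impressions]
avg_elapse = [np.mean(item_time_elapse[i]) if i in item_time_elapse else np.nan for i in impressions]

print(max_elapse)  # [5.0, 12.0, nan]
print(sum_elapse)  # [8.0, 12.0, nan]
print(avg_elapse)  # [4.0, 12.0, nan]
```

Each gap is attributed to the item interacted with just before it, so a long pause after viewing an item shows up as a longer dwell time for that item.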
731 | # sum_time_elapse 732 | current_rows[:, 50] = [ sum(item_time_elapse_dict[imp]) if imp in item_time_elapse_dict else np.nan for imp in transformed_impressions] 733 | 734 | # avg_time_elapse 735 | current_rows[:, 51] = [ np.mean(item_time_elapse_dict[imp]) if imp in item_time_elapse_dict else np.nan for imp in transformed_impressions] 736 | 737 | # item_time_diff 738 | current_rows[:, 52] = item_time_diff 739 | 740 | # global_interaction_count 741 | current_rows[:, 53] = [global_interaction_count_dict[imp] if imp in global_interaction_count_dict else 0 for imp in transformed_impressions] 742 | 743 | # average global_interaction_count 744 | current_rows[:, 54] = np.mean(current_rows[:, 53]) 745 | 746 | # std of global interaction image count 747 | current_rows[:, 55] = np.std(current_rows[:, 27]) 748 | 749 | # std of glocal interaction conut 750 | current_rows[:, 56] = np.std(current_rows[:, 53]) 751 | 752 | # local_interaction_count 753 | current_rows[:, 57] = [interaction_count_dict[imp] if imp in interaction_count_dict else 0 for imp in transformed_impressions] 754 | current_rows[:, 58] = target_index 755 | 756 | # target price 757 | current_rows[:, 59] = row.prices[target_index] if not np.isnan(target_index) else np.nan 758 | 759 | # normalized co-occurence statistics 760 | current_rows[:, 60] = np.mean(current_co_occ/ norm, axis=1).reshape(-1) 761 | current_rows[:, 61] = np.min(current_co_occ/ norm, axis=1).reshape(-1) 762 | current_rows[:, 62] = np.max(current_co_occ/norm, axis=1).reshape(-1) 763 | current_rows[:, 63] = np.median(current_co_occ/norm, axis=1).reshape(-1) 764 | 765 | # last_item_interaction 766 | current_rows[:, 64] = last_item_interaction 767 | 768 | # target price rank 769 | current_rows[:, 65] = price_rank[target_index] if not np.isnan(target_index) else np.nan 770 | # current_rows[:, 66] = np.mean(norm_imp_current_co_occ, axis=1).reshape(-1) 771 | # current_rows[:, 67] = np.min(norm_imp_current_co_occ, axis=1).reshape(-1) 772 | # current_rows[:, 68] = np.max(norm_imp_current_co_occ, axis=1).reshape(-1) 773 | # current_rows[:, 69] = np.median(norm_imp_current_co_occ, axis=1).reshape(-1) 774 | 775 | 776 | 777 | 778 | 779 | if training or row.item_id == transformed_nan_item: 780 | df_list.append(current_rows) 781 | else: 782 | label_test_df_list.append(current_rows) 783 | # cumulative_click_dict[row.item_id] += 1 784 | past_interaction_dict[row.user_id].append(row.item_id) 785 | last_click_sess_dict[row.session_id] = row.item_id 786 | last_impressions_dict[row.session_id] = transformed_impressions.tolist() 787 | sess_time_diff_dict[row.session_id] = row.timestamp 788 | sess_step_diff_dict[row.session_id] = row.step 789 | if row.item_id != transformed_nan_item: 790 | sess_last_imp_idx_dict[row.session_id] = (transformed_impressions == row.item_id).tolist().index(True) 791 | sess_last_price_dict[row.session_id] = np.array(row.prices)[ transformed_impressions == row.item_id ][0] 792 | # cumulative_click_dict[row.item_id] += 1 793 | data = np.vstack(df_list) 794 | df_columns = ['item_id', 'label', 'session_id', 'equal_last_item', 'price_rank', 'platform', 'device', 'city', 'price', 'country', 'impression_index','step', 'id','last_click_item','equal_last_impressions', 
'last_click_impression','last_interact_index','price_diff','last_price','price_ratio','clickout_time_diff','country_platform','impression_count','is_interacted','local_interaction_image_count','local_interaction_deals_count','local_interaction_clickout_count','global_interaction_image_count','global_interaction_deals_count','is_clicked','click_diff', 'avg_is_interacted','avg_liic', 'avg_lidc','avg_licc','avg_giic','avg_gdc','avg_is_clicked','impression_avg_prices','device_platform','equal_max_liic','num_interacted_items','equal_second_last_item','last_action','last_second_last_imp_idx_diff','predicted_next_imp_idx', 'list_len','imp_idx_velocity','time_diff_sess_avg','max_time_elapse','sum_time_elapse','avg_time_elapse','item_time_diff','global_interaction_count','avg_gic','std_giic','std_gic','local_interaction_count','target_index','target_price','co_occ_mean_norm','co_occ_min_norm','co_occ_max_norm','co_occ_median_norm','last_item_interaction','target_price_rank'] 795 | dtype_dict = {"item_id":"int32", "label": "int8", "equal_last_item":"int8", "step":"int16", "price_rank": "int32","impression_index":"int32", "platform":"int32","device":"int32","city":"int32", "id":"int32", "country":"int32", "price":"int16", "last_click_item":"int32", "equal_last_impressions":"int8", 'last_click_impression':'int16', 'last_interact_index':'float32', 'price_diff':'float16','last_price':'float16','price_ratio':'float32','clickout_time_diff':'float16','country_platform':'int32','impression_count':'int32','is_interacted':'int8','local_interaction_image_count':'int32','local_interaction_deals_count':'int32','local_interaction_clickout_count':'int32','global_interaction_image_count':'int32','global_interaction_deals_count':'int32','is_clicked':'int8','click_diff':'float32'\ 796 | , 'avg_is_interacted':'float16' ,'avg_liic':'float16', 'avg_lidc':'float32','avg_licc':'float32','avg_giic':'float32','avg_gdc':'float32','avg_is_clicked':'float32','impression_avg_prices':'float32','device_platform':'int32','equal_max_liic':'int8','num_interacted_items':'int32','equal_second_last_item':'int8','last_action':'int32','last_second_last_imp_idx_diff':'float32', 'predicted_next_imp_idx': 'float32','list_len':'int16','imp_idx_velocity':'float32','time_diff_sess_avg':'float32','max_time_elapse':'float32','sum_time_elapse':'float32','avg_time_elapse':'float32','item_time_diff':'float32','global_interaction_count':'float32','avg_gic':'float32','std_giic':'float32','std_gic':'float32','local_interaction_count':'int32','target_index':'float32','target_price':'float32','co_occ_mean_norm':'float32','co_occ_min_norm':'float32','co_occ_max_norm':'float32','co_occ_median_norm':'float32','last_item_interaction':'int32','target_price_rank':'float32'} 797 | df = pd.DataFrame(data, columns=df_columns) 798 | df = df.astype(dtype=dtype_dict ) 799 | if training: 800 | return df 801 | else: 802 | label_test = np.vstack(label_test_df_list) 803 | label_test = pd.DataFrame(label_test, columns=df_columns) 804 | label_test = label_test.astype(dtype= dtype_dict) 805 | return df, label_test 806 | 807 | 808 | 809 | 810 | train.sort_values('timestamp',inplace=True) 811 | val.sort_values('timestamp',inplace=True) 812 | test.sort_values('timestamp',inplace=True) 813 | 814 | # print("sorted!!") 815 | train = parse_impressions(train, train_session_interactions, train_session_actions, train_session_time_diff) 816 | val = parse_impressions(val, train_session_interactions, train_session_actions, train_session_time_diff) 817 | test, label_test = 
parse_impressions(test, test_session_interactions, test_session_actions, test_session_time_diff, training=False) 818 | 819 | if configuration.use_test: 820 | train = pd.concat([train, label_test], axis=0) 821 | 822 | 823 | 824 | 825 | 826 | 827 | print("test before merge", test.shape) 828 | train = train.merge(item_properties_df, on="item_id", how="left") 829 | val = val.merge(item_properties_df, on="item_id", how="left") 830 | test = test.merge(item_properties_df, on="item_id", how="left") 831 | 832 | 833 | print("test ", test.shape) 834 | train = train.merge(filters_df, on='id', how="left") 835 | val = val.merge(filters_df, on='id', how="left") 836 | test = test.merge(filters_df, on='id', how="left") 837 | 838 | 839 | # print("test ", test.shape) 840 | # print("test before merge data_feature", test.shape) 841 | 842 | train = train.merge(data_feature, on='id', how="left") 843 | val = val.merge(data_feature, on='id', how="left") 844 | test = test.merge(data_feature, on='id', how="left") 845 | print("test ", test.shape) 846 | 847 | del filters_df, data_feature 848 | del data 849 | gc.collect() 850 | 851 | # target encoding 852 | agg_cols = [ 'price_rank', 'city', 'platform', 'device', 'country', 'impression_index','star'] 853 | for c in agg_cols: 854 | gp = train.groupby(c)['label'] 855 | mean = gp.mean() 856 | train[f'{c}_label_avg'] = train[c].map(mean) 857 | val[f'{c}_label_avg'] = val[c].map(mean) 858 | test[f'{c}_label_avg'] = test[c].map(mean) 859 | 860 | 861 | 862 | 863 | 864 | 865 | 866 | agg_cols = ['city','impression_index', 'platform'] 867 | for c in agg_cols: 868 | gp = train.groupby(c)['price'] 869 | mean = gp.mean() 870 | train[f'{c}_price_avg'] = train[c].map(mean) 871 | val[f'{c}_price_avg'] = val[c].map(mean) 872 | test[f'{c}_price_avg'] = test[c].map(mean) 873 | 874 | 875 | 876 | agg_cols = ['city'] 877 | for c in agg_cols: 878 | gp = train.groupby(c)['rg_time_diff'] 879 | mean = gp.mean() 880 | train[f'{c}_td_avg'] = train[c].map(mean) 881 | val[f'{c}_td_avg'] = val[c].map(mean) 882 | test[f'{c}_td_avg'] = test[c].map(mean) 883 | 884 | 885 | 886 | train['rg_price'] = train.price.map(price_rg_price_dict) 887 | val['rg_price'] = val.price.map(price_rg_price_dict) 888 | test['rg_price'] = test.price.map(price_rg_price_dict) 889 | 890 | 891 | 892 | #price cut within city 893 | 894 | data = pd.concat([train,val,test], axis=0).reset_index() 895 | data = data.loc[:,['city','price']].drop_duplicates(['city','price']) 896 | data['city_price_bin'] = data.groupby('city').price.apply(lambda x: qcut_safe(x, q = 40).astype(str)) 897 | data['city_price_bin'] = data.apply( lambda x: str(x.city) + x.city_price_bin,axis=1) 898 | data['city_price_bin'] = data['city_price_bin'].factorize()[0] 899 | 900 | 901 | train = train.merge(data, on=['city','price'], how='left') 902 | val = val.merge(data, on=['city','price'], how='left') 903 | test = test.merge(data, on=['city','price'], how='left') 904 | 905 | 906 | 907 | print("train", train.shape) 908 | print("val", val.shape) 909 | print("test", test.shape) 910 | # test = test.merge(item_properties_df, on="item_id", how="left") 911 | 912 | 913 | 914 | 915 | 916 | data_drop_columns= ['label', 'session_id', 'step', 'id'] 917 | data_drop_columns+= ['target_index','target_price','target_price_rank'] 918 | # data_drop_columns+= ['avg_lidc','avg_licc'] 919 | 920 | train_label = train.label 921 | 922 | val_label = val.label 923 | 924 | 925 | d_train = xgb.DMatrix(data=train.drop(data_drop_columns, axis=1), label=train_label.values, silent=True, 
nthread=-1, feature_names=train.drop(data_drop_columns, axis=1).columns.tolist()) 926 | d_val = xgb.DMatrix(data=val.drop(data_drop_columns, axis=1), label=val_label.values, silent=True, nthread= -1, feature_names=train.drop(data_drop_columns, axis=1).columns.tolist()) 927 | d_test = xgb.DMatrix(test.drop(data_drop_columns, axis=1), nthread=-1, feature_names=train.drop(data_drop_columns, axis=1).columns.tolist()) 928 | 929 | cat_cols = [ 'item_id', "price_rank", 'city', 'platform', 'device', 'country', 'impression_index','star','last_click_impression','last_click_item','last_interact_index','country_platform'] 930 | 931 | for col in cat_cols: 932 | if (train[col] < 0).sum() > 0: 933 | print("contains negative ", col) 934 | 935 | del train 936 | gc.collect() 937 | 938 | # params = { 939 | # 'objective': 'binary', 940 | # 'boosting_type': 'gbdt', 941 | # 'nthread': multiprocessing.cpu_count() //2, 942 | # 'num_leaves': 200, 943 | # 'max_depth':10, 944 | # 'learning_rate': 0.05, 945 | # 'bagging_fraction': 0.8, 946 | # 'bagging_freq': 5, 947 | # 'feature_fraction':0.7, 948 | # 'seed': 0, 949 | # 'verbose': -1, 950 | 951 | # } 952 | 953 | params={ 954 | 'eta': 0.02, # 0.03, 955 | "booster": "gbtree", 956 | 'tree_method':'hist', 957 | 'max_leaves': 350, 958 | 'max_depth': 10, # 18 959 | "nthread": multiprocessing.cpu_count() -1, 960 | 'subsample': 0.9, 961 | 'colsample_bytree': 0.8, 962 | 'colsample_bylevel': 0.8, 963 | 'min_child_weight': 2, 964 | 'alpha': 1, 965 | 'objective': 'binary:logistic', 966 | 'eval_metric': 'logloss', 967 | 'random_state': 5478, 968 | 'verbosity': 0, 969 | } 970 | 971 | 972 | watchlist = [ (d_train, 'train'), (d_val, 'valid')] 973 | clf = xgb.train( 974 | params=params, 975 | dtrain=d_train, 976 | num_boost_round=50000, #11927 977 | evals= watchlist, 978 | early_stopping_rounds=500, 979 | verbose_eval=500, 980 | # categorical_feature= cat_cols 981 | ) 982 | 983 | 984 | # clf.save_model('../weights/lgb-10000-200-01.model') 985 | 986 | def evaluate(val_df, clf): 987 | val_df['scores'] = clf.predict(d_val) 988 | grouped_val = val_df.groupby('session_id') 989 | rss = [] 990 | for _, group in grouped_val: 991 | 992 | scores = group.scores 993 | sorted_arg = np.flip(np.argsort(scores)) 994 | rss.append( group['label'].values[sorted_arg]) 995 | 996 | mrr = compute_mean_reciprocal_rank(rss) 997 | return mrr 998 | 999 | 1000 | 1001 | mrr = evaluate(val, clf) 1002 | 1003 | print("MRR score: ", mrr) 1004 | 1005 | 1006 | 1007 | imp = clf.get_score( importance_type='gain') 1008 | imp_df = pd.DataFrame.from_dict(imp, orient='index').reset_index() 1009 | 1010 | imp_df.columns=['name','importance'] 1011 | imp_df.sort_values('importance', ascending=False, inplace=True) 1012 | 1013 | 1014 | 1015 | print(imp_df.head(20)) 1016 | 1017 | 1018 | # del d_train 1019 | # gc.collect() 1020 | 1021 | if configuration.debug: 1022 | exit(0) 1023 | 1024 | predictions = [] 1025 | session_ids = [] 1026 | 1027 | test['score'] = clf.predict(d_test) 1028 | save_test = test.copy() 1029 | save_test['item_id'] = cat_encoders['item_id'].reverse_transform(save_test.item_id.values) 1030 | with open(f'../output/{model_name}_test_score.p', 'wb') as f: 1031 | pickle.dump( save_test.loc[:,['score', 'session_id', 'item_id', 'step']],f, protocol=4) 1032 | 1033 | grouped_test = test.groupby('session_id') 1034 | for session_id, group in grouped_test: 1035 | scores = group['score'] 1036 | sorted_arg = np.flip(np.argsort(scores)) 1037 | sorted_item_ids = group['item_id'].values[sorted_arg] 1038 | sorted_item_ids = 
cat_encoders['item_id'].reverse_transform(sorted_item_ids) 1039 | sorted_item_string = ' '.join([str(i) for i in sorted_item_ids]) 1040 | predictions.append(sorted_item_string) 1041 | session_ids.append(session_id) 1042 | 1043 | prediction_df = pd.DataFrame() 1044 | prediction_df['session_id'] = session_ids 1045 | prediction_df['item_recommendations'] = predictions 1046 | 1047 | print("pred df shape", prediction_df.shape) 1048 | sub_df = pd.read_csv('../input/submission_popular.csv') 1049 | sub_df.drop('item_recommendations', axis=1, inplace=True) 1050 | sub_df = sub_df.merge(prediction_df, on="session_id") 1051 | # sub_df['item_recommendations'] = predictions 1052 | 1053 | sub_df.to_csv(f'../output/{model_name}.csv', index=None) 1054 | 1055 | 1056 | 1057 | 1058 | 1059 | 1060 | -------------------------------------------------------------------------------- /src/run_lgb.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import pandas as pd 4 | import pickle 5 | import gc 6 | from constant import * 7 | from utils import * 8 | from config import * 9 | import torch 10 | from joblib import Parallel, delayed 11 | from tqdm import tqdm 12 | import lightgbm as lgb 13 | import scipy 14 | from sklearn.decomposition import TruncatedSVD 15 | import multiprocessing 16 | from ordered_set import OrderedSet 17 | from sklearn.metrics.pairwise import cosine_similarity 18 | from sklearn.metrics import log_loss 19 | import pycountry 20 | from sklearn.feature_extraction.text import TfidfVectorizer 21 | from scipy.spatial.distance import cosine 22 | import itertools 23 | from scipy.sparse import csr_matrix 24 | 25 | configuration = LGBConfiguration() 26 | 27 | model_name='lgb_imp_cooc_v2' 28 | 29 | if configuration.sub_sample: 30 | model_name += '_140k' 31 | else: 32 | model_name += '_all' 33 | 34 | if configuration.use_test: 35 | model_name += '_ut' 36 | 37 | seed_everything(42) 38 | 39 | ########################################################### Load data ###################################################################### 40 | with open( f'{input_dir}/train_v2.p', 'rb') as f: 41 | train = pickle.load(f) 42 | train['id']= np.arange(len(train)) 43 | 44 | with open(f'{input_dir}/test_v2.p', 'rb') as f: 45 | test = pickle.load(f) 46 | test['id'] = np.arange( len(train), len(train)+ len(test)) 47 | 48 | with open('../input/item_metadata.p', 'rb') as f: 49 | item_meta = pickle.load(f) 50 | item_meta['properties'] = item_meta.properties.apply(lambda x: x.split('|')) 51 | item_meta['item_id'] = item_meta['item_id'].apply(str) 52 | 53 | # whether to use sub sample of the data to speed up the evaluation 54 | if configuration.sub_sample: 55 | with open('../input/selected_users_140k.p', 'rb') as f: 56 | selected_users = pickle.load(f) 57 | 58 | train = train.loc[train.user_id.isin(selected_users),:] 59 | 60 | # check if the code can run with debug mode 61 | if configuration.debug: 62 | train = train.sample(1000) 63 | test = test.sample(1000) 64 | 65 | with timer("preprocessing"): 66 | 67 | # change columns name 68 | train.rename(columns={'reference': 'item_id', 'action_type': 'action'}, inplace=True) 69 | test.rename(columns={'reference': 'item_id', 'action_type': 'action'}, inplace=True) 70 | 71 | # concatenate the action and reference in string format as these refernce are not actually item id 72 | train.loc[train.action=='change of sort order','action'] = train.loc[train.action=='change of sort order'].apply(lambda row: row.action + 
str(row.item_id), axis=1) 73 | test.loc[test.action=='change of sort order','action'] = test.loc[test.action=='change of sort order'].apply(lambda row: row.action + str(row.item_id), axis=1) 74 | 75 | 76 | train.loc[train.action=='filter selection','action'] = train.loc[train.action=='filter selection'].apply(lambda row: row.action + str(row.item_id), axis=1) 77 | test.loc[test.action=='filter selection','action'] = test.loc[test.action=='filter selection'].apply(lambda row: row.action + str(row.item_id), axis=1) 78 | 79 | 80 | # wipe out the item id associated with these actions, reason same as the above 81 | train.loc[train.action.str.contains('change of sort order'), 'item_id'] = DUMMY_ITEM 82 | test.loc[test.action.str.contains('change of sort order'), 'item_id'] = DUMMY_ITEM 83 | 84 | train.loc[train.action.str.contains('search for poi'), 'item_id'] = DUMMY_ITEM 85 | test.loc[test.action.str.contains('search for poi'), 'item_id'] = DUMMY_ITEM 86 | 87 | train.loc[train.action.str.contains('filter selection'), 'item_id'] = DUMMY_ITEM 88 | test.loc[test.action.str.contains('filter selection'), 'item_id'] = DUMMY_ITEM 89 | 90 | train.loc[train.action.str.contains('search for destination'), 'item_id'] = DUMMY_ITEM 91 | test.loc[test.action.str.contains('search for destination'), 'item_id'] = DUMMY_ITEM 92 | 93 | # remove training example where clicked item is not in the impressions 94 | train['in_impressions'] = True 95 | train.loc[~train.impressions.isna(), 'in_impressions'] = train.loc[~train.impressions.isna()].apply(lambda row:row.item_id in row.impressions.split('|'), axis=1) 96 | train = train.loc[train.in_impressions].drop('in_impressions', axis=1).reset_index(drop=True) 97 | 98 | test['in_impressions'] = True 99 | test.loc[(~test.impressions.isna()) & (~test.item_id.isna()), 'in_impressions'] = test.loc[(~test.impressions.isna())& (~test.item_id.isna())].apply(lambda row:row.item_id in row.impressions.split('|'), axis=1) 100 | test = test.loc[test.in_impressions].drop('in_impressions', axis=1).reset_index(drop=True) 101 | 102 | # parse impressions and prices list from string to list 103 | train['item_id'] = train['item_id'].apply(str) 104 | train.loc[~train.impressions.isna(),'impressions'] = train.loc[~train.impressions.isna()].impressions.apply(lambda x: x.split('|')) 105 | train.loc[~train.prices.isna(), 'prices'] = train.loc[~train.prices.isna()].prices.apply(lambda x: x.split('|')).apply(lambda x: [int(p) for p in x]) 106 | 107 | test['item_id'] = test['item_id'].apply(str) 108 | test.loc[~test.impressions.isna(),'impressions'] = test.loc[~test.impressions.isna()].impressions.apply(lambda x: x.split('|')) 109 | test.loc[~test.prices.isna(),'prices'] = test.loc[~test.prices.isna()].prices.apply(lambda x: x.split('|')).apply(lambda x: [int(p) for p in x]) 110 | 111 | # compute the last interacted item by shifted the item_id by 1 position 112 | train['last_item'] = np.nan 113 | test['last_item'] = np.nan 114 | 115 | train_shifted_item_id = [DUMMY_ITEM] + train.item_id.values[:-1].tolist() 116 | test_shifted_item_id = [DUMMY_ITEM] + test.item_id.values[:-1].tolist() 117 | 118 | # compute the last interacted item by shifted the item_id by 2 position 119 | train['last_item'] = train_shifted_item_id 120 | test['last_item'] = test_shifted_item_id 121 | 122 | train_shifted_item_id = [DUMMY_ITEM] *2 + train.item_id.values[:-2].tolist() 123 | test_shifted_item_id = [DUMMY_ITEM] *2 + test.item_id.values[:-2].tolist() 124 | 125 | # compute the last interacted item by shifted the item_id by 
3 positions 126 | train['second_last_item'] = train_shifted_item_id 127 | test['second_last_item'] = test_shifted_item_id 128 | 129 | train_shifted_item_id = [DUMMY_ITEM] *3 + train.item_id.values[:-3].tolist() 130 | test_shifted_item_id = [DUMMY_ITEM] *3 + test.item_id.values[:-3].tolist() 131 | 132 | train['third_last_item'] = train_shifted_item_id 133 | test['third_last_item'] = test_shifted_item_id 134 | 135 | # mask out the last interacted item if that interaction comes first in its session 136 | train['step_rank'] = train.groupby('session_id')['step'].rank(method='max', ascending=True) 137 | test['step_rank'] = test.groupby('session_id')['step'].rank(method='max', ascending=True) 138 | 139 | # fill the invalid shifted last-n items with a constant value 140 | train.loc[(train.step_rank == 1) & (train.action == 'clickout item'), 'last_item'] = DUMMY_ITEM 141 | test.loc[(test.step_rank == 1) & (test.action == 'clickout item'), 'last_item'] = DUMMY_ITEM 142 | 143 | train.loc[(train.step_rank == 2) & (train.action == 'clickout item'), 'second_last_item'] = DUMMY_ITEM 144 | test.loc[(test.step_rank == 2) & (test.action == 'clickout item'), 'second_last_item'] = DUMMY_ITEM 145 | 146 | train.loc[(train.step_rank == 3) & (train.action == 'clickout item'), 'third_last_item'] = DUMMY_ITEM 147 | test.loc[(test.step_rank == 3) & (test.action == 'clickout item'), 'third_last_item'] = DUMMY_ITEM 148 | 149 | 150 | # ignore this 151 | keep_columns = ['session_id', 'user_id','item_id', 'impressions','prices', 'city', 'step', 'last_item'] 152 | all_cat_columns = ['item_id', 'city', 'platform', 'device','country','country_platform','action','device_platform'] 153 | 154 | 155 | # generate country from city 156 | train['country'] = train.city.apply(lambda x:x.split(',')[-1]) 157 | test['country'] = test.city.apply(lambda x:x.split(',')[-1]) 158 | 159 | # concatenate country and platform as a new string feature 160 | train['country_platform'] = train.apply(lambda row: row.country + row.platform, axis=1) 161 | test['country_platform'] = test.apply(lambda row: row.country + row.platform, axis=1) 162 | 163 | train['device_platform'] = train.apply(lambda row: row.device + row.platform, axis=1) 164 | test['device_platform'] = test.apply(lambda row: row.device + row.platform, axis=1) 165 | # filter out rows where the reference is not present in the impressions 166 | # train = train.loc[train.apply(lambda row:row.item_id in row.impressions, axis=1),:] 167 | 168 | print("train shape",train.shape) 169 | 170 | 171 | # concat train and test 172 | data = pd.concat([train, test], axis=0) 173 | data = data.reset_index(drop=True) 174 | 175 | # compute a dictionary that maps session id to the sequence of item ids in that session 176 | train_session_interactions = dict(train.groupby('session_id')['item_id'].apply(list)) 177 | test_session_interactions = dict(test.groupby('session_id')['item_id'].apply(list)) 178 | 179 | 180 | # compute a dictionary that maps session id to the sequence of actions in that session 181 | train_session_actions = dict(train.groupby('session_id')['action'].apply(list)) 182 | test_session_actions = dict(test.groupby('session_id')['action'].apply(list)) 183 | 184 | # compute the per-session step, since the "step" column in some sessions is not correctly ordered 185 | train['sess_step'] = train.groupby('session_id')['timestamp'].rank(method='max').apply(int) 186 | test['sess_step'] = test.groupby('session_id')['timestamp'].rank(method='max').apply(int) 187 | 188 | 189 | 190 | 191 | data_feature = 
data.loc[:,['id','step','session_id', 'timestamp','platform','country']].copy() 192 | 193 | # compute the time difference between each step 194 | data_feature['time_diff'] = data.groupby('session_id')['timestamp'].diff() 195 | 196 | # compute the difference of time difference between each step 197 | data_feature['time_diff_diff'] = data_feature.groupby('session_id')['time_diff'].diff() 198 | 199 | # compute the difference of the difference of time difference between each step 200 | data_feature['time_diff_diff_diff'] = data_feature.groupby('session_id')['time_diff_diff'].diff() 201 | 202 | # compute the time difference from 2 steps ahead 203 | data_feature['time_diff_2'] = data.groupby('session_id')['timestamp'].diff().shift(1) 204 | 205 | # compute the time difference from 3 steps ahead 206 | data_feature['time_diff_3'] = data.groupby('session_id')['timestamp'].diff().shift(2) 207 | 208 | data_feature['hour']= pd.to_datetime(data_feature.timestamp, unit='s').dt.hour//4 209 | 210 | # map platform to country 211 | data_feature['mapped_country'] = data_feature.platform.apply(platform2country) 212 | 213 | 214 | # load the precomputed country to utc offsets from geopy 215 | with open('../input/country2offsets_dict.p','rb') as f: 216 | platform_country2offsets_dict = pickle.load(f) 217 | data_feature['platform2country_utc_offsets'] = data_feature.mapped_country.map(platform_country2offsets_dict) 218 | 219 | 220 | # trasnform time difference with rank gauss 221 | data_feature['rg_time_diff'] = GaussRankScaler().fit_transform(data_feature['time_diff'].values) 222 | 223 | # compute the log of step 224 | data_feature['step_log'] = np.log1p(data_feature['step']) 225 | 226 | # drop the useless columns 227 | data_feature = data_feature.drop(['session_id','step','timestamp','hour','platform','country','mapped_country'], axis=1) 228 | 229 | 230 | 231 | 232 | # merge train, test with data_feature 233 | train = train.merge(data_feature, on='id', how='left') 234 | test = test.merge(data_feature, on='id', how='left') 235 | 236 | 237 | # compute the sequence of time difference in each session 238 | train_session_time_diff = dict(train.groupby('session_id')['time_diff'].apply(list)) 239 | test_session_time_diff = dict(test.groupby('session_id')['time_diff'].apply(list)) 240 | 241 | # encode the categorical feture 242 | cat_encoders = {} 243 | for col in all_cat_columns: 244 | cat_encoders[col] = CategoricalEncoder() 245 | 246 | 247 | all_items = [] 248 | for imp in data.loc[~data.impressions.isna()].impressions.tolist() + [data.item_id.apply(str).tolist()] : 249 | all_items += imp 250 | 251 | unique_items = OrderedSet(all_items) 252 | unique_actions = OrderedSet(data.action.values) 253 | 254 | cat_encoders['item_id'].fit(list(unique_items) + [DUMMY_ITEM]) 255 | cat_encoders['action'].fit( list(unique_actions) + [DUMMY_ACTION]) 256 | for col in ['city', 'platform', 'device','country','country_platform', 'device_platform']: 257 | 258 | cat_encoders[col].fit(data[col].tolist() ) 259 | 260 | 261 | # transform all the categorical columns to continuous integer 262 | for col in all_cat_columns: 263 | train[col] = cat_encoders[col].transform(train[col].values) 264 | test[col] = cat_encoders[col].transform(test[col].values) 265 | 266 | 267 | # get the encoded action 268 | transformed_clickout_action = cat_encoders['action'].transform(['clickout item'])[0] 269 | transformed_dummy_item = cat_encoders['item_id'].transform([DUMMY_ITEM])[0] 270 | transformed_dummy_action = 
cat_encoders['action'].transform([DUMMY_ACTION])[0] 271 | transformed_interaction_image = cat_encoders['action'].transform(['interaction item image'])[0] 272 | transformed_interaction_deals = cat_encoders['action'].transform(['interaction item deals'])[0] 273 | transformed_interaction_info = cat_encoders['action'].transform(['interaction item info'])[0] 274 | transformed_interaction_rating = cat_encoders['action'].transform(['interaction item rating'])[0] 275 | 276 | # transform session interactions and pad dummy in front of all of them 277 | for session_id, item_list in train_session_interactions.items(): 278 | train_session_interactions[session_id] = [transformed_dummy_item] * configuration.sess_length + cat_encoders['item_id'].transform(item_list) 279 | 280 | for session_id, item_list in test_session_interactions.items(): 281 | test_session_interactions[session_id] = [transformed_dummy_item] * configuration.sess_length + cat_encoders['item_id'].transform(item_list) 282 | 283 | for session_id, action_list in train_session_actions.items(): 284 | train_session_actions[session_id] = [transformed_dummy_action] * configuration.sess_length + cat_encoders['action'].transform(action_list) 285 | 286 | for session_id, action_list in test_session_actions.items(): 287 | test_session_actions[session_id] = [transformed_dummy_action] * configuration.sess_length + cat_encoders['action'].transform(action_list) 288 | 289 | 290 | ### compute co-occurence matrix 291 | implicit_train = train.loc[train.action != transformed_clickout_action, :] 292 | implicit_test = test.loc[test.action != transformed_clickout_action, :] 293 | 294 | # get all interacted items in a session 295 | implicit_all = pd.concat([implicit_train , implicit_test], axis=0) 296 | # a list of list containing items in the same session 297 | co_occ_items = implicit_all.groupby('session_id').item_id.apply(list).to_dict().values() 298 | co_occ_permutes = [list(itertools.permutations(set(items), 2)) for items in co_occ_items] 299 | 300 | #aggregate co-ocurrence across sessions 301 | co_occ_coordinates = [] 302 | for coordinates in co_occ_permutes: 303 | co_occ_coordinates += coordinates 304 | 305 | #construct csr 306 | row, col, values = zip(*((i,j,1) for i,j in co_occ_coordinates )) 307 | co_occ_matrix= csr_matrix((values, (row, col)), shape=(cat_encoders['item_id'].n_elements, cat_encoders['item_id'].n_elements), dtype=np.float32) 308 | 309 | co_occ_matrix_csc = co_occ_matrix.tocsc() 310 | 311 | print("max entry: ", co_occ_matrix.max()) 312 | 313 | 314 | ### compute co-occurence matrix for imp list 315 | 316 | # imp_co_occ_items = train.loc[~train.impressions.isna()].impressions.apply(lambda x: cat_encoders['item_id'].transform(x)).values.tolist() + test.loc[~test.impressions.isna()].impressions.apply(lambda x: cat_encoders['item_id'].transform(x)).values.tolist() 317 | # imp_co_occ_permutes = [list(itertools.permutations(set(items), 2)) for items in imp_co_occ_items] 318 | 319 | # #aggregate co-ocurrence across sessions 320 | # imp_co_occ_coordinates = [] 321 | # for coordinates in imp_co_occ_permutes: 322 | # imp_co_occ_coordinates += coordinates 323 | 324 | # #construct csr 325 | # row, col, values = zip(*((i,j,1) for i,j in imp_co_occ_coordinates )) 326 | # imp_co_occ_matrix= csr_matrix((values, (row, col)), shape=(cat_encoders['item_id'].n_elements, cat_encoders['item_id'].n_elements), dtype=np.float32) 327 | 328 | # imp_co_occ_matrix_csc = imp_co_occ_matrix.tocsc() 329 | 330 | # print("max entry: ", imp_co_occ_matrix.max()) 331 | 332 | # 
categorically encode last, second last and third item 333 | train['last_item'] = cat_encoders['item_id'].transform(train['last_item'].values) 334 | test['last_item'] = cat_encoders['item_id'].transform(test['last_item'].values) 335 | 336 | train['second_last_item'] = cat_encoders['item_id'].transform(train.second_last_item.values) 337 | test['second_last_item'] = cat_encoders['item_id'].transform(test.second_last_item.values) 338 | 339 | train['third_last_item'] = cat_encoders['item_id'].transform(train.third_last_item.values) 340 | test['third_last_item'] = cat_encoders['item_id'].transform(test.third_last_item.values) 341 | 342 | 343 | 344 | 345 | # genetate item properties features 346 | item_meta = item_meta.loc[item_meta.item_id.isin(unique_items),:] 347 | # item_meta multi-hot 348 | item_meta['item_id'] = cat_encoders['item_id'].transform(item_meta['item_id'].values) 349 | item_meta['star'] = np.nan 350 | item_meta.loc[item_meta.properties.apply(lambda x: '1 Star' in x), 'star'] = 1 351 | item_meta.loc[item_meta.properties.apply(lambda x: '2 Star' in x), 'star'] = 2 352 | item_meta.loc[item_meta.properties.apply(lambda x: '3 Star' in x), 'star'] = 3 353 | item_meta.loc[item_meta.properties.apply(lambda x: '4 Star' in x), 'star'] = 4 354 | item_meta.loc[item_meta.properties.apply(lambda x: '5 Star' in x), 'star'] = 5 355 | item_meta.loc[(item_meta.star.isna()) & (item_meta.properties.apply(lambda y: 'Excellent Rating' in y) ), 'star'] = 9 356 | item_meta.loc[(item_meta.star.isna()) & (item_meta.properties.apply(lambda y: 'Very Good Rating' in y) ), 'star'] = 8 357 | item_meta.loc[(item_meta.star.isna()) & (item_meta.properties.apply(lambda y: 'Good Rating' in y) ), 'star'] = 7 358 | item_meta.loc[(item_meta.star.isna()) & (item_meta.properties.apply(lambda y: 'Satisfactory Rating' in y) ), 'star'] = 6 359 | 360 | item_meta['rating'] = np.nan 361 | item_meta.loc[item_meta.properties.apply(lambda x: 'Satisfactory Rating' in x), 'rating'] = 7.0 362 | item_meta.loc[item_meta.properties.apply(lambda x: 'Good Rating' in x), 'rating'] = 7.5 363 | item_meta.loc[item_meta.properties.apply(lambda x: 'Very Good Rating' in x), 'rating'] = 8.0 364 | item_meta.loc[item_meta.properties.apply(lambda x: 'Excellent Rating' in x), 'rating'] = 8.5 365 | 366 | # get binary properties feature 367 | item_properties_df = pd.DataFrame() 368 | item_properties_df['item_id'] = item_meta.item_id 369 | item_properties_df['num_properties'] = item_meta.properties.apply(len) 370 | item_properties_df['star'] = item_meta.star 371 | item_properties_df['item_Beach'] = item_meta.properties.apply(lambda x: 'Beach' in x).astype(np.float16) 372 | item_properties_df['item_Bed & Breakfast'] = item_meta.properties.apply(lambda x: 'Bed & Breakfast' in x).astype(np.float16) 373 | item_properties_df['rating'] = item_meta['rating'] 374 | 375 | 376 | item_star_map = item_properties_df.loc[:,['item_id','star']].set_index('item_id').to_dict()['star'] 377 | item_rating_map = item_properties_df.loc[:,['item_id','rating']].set_index('item_id').to_dict()['rating'] 378 | 379 | 380 | 381 | del item_meta 382 | gc.collect() 383 | 384 | # ignore filter_df , not using, consume huge memory yet increase a little 385 | filter_df = data.loc[ ~data.current_filters.isna(), ['id', 'current_filters']] 386 | filter_df['current_filters'] = filter_df.current_filters.apply(lambda x:x.split('|')) 387 | 388 | # filter_df.loc[filter_df.current_filters.apply(lambda x: '3 Star' in x), 'nights'] = 3 389 | filter_df['nights']=np.nan 390 | 
filter_df.loc[filter_df.current_filters.apply(lambda x: '2 Nights' in x), 'nights'] = 1 391 | filter_df.loc[filter_df.current_filters.apply(lambda x: '3 Nights' in x), 'nights'] = 2 392 | 393 | filter_set = list(set(np.hstack(filter_df['current_filters'].to_list()))) 394 | 395 | cat_encoders['filters'] = CategoricalEncoder() 396 | cat_encoders['filters'].fit(filter_set) 397 | 398 | # get binary filter feature 399 | filters_df = pd.DataFrame() 400 | filters_df['id'] = filter_df.id 401 | filters_df['num_filters'] = filter_df.current_filters.apply(len) 402 | filters_df['breakfast_included'] = filter_df.current_filters.apply( lambda x: 'Breakfast Included' in x).astype(np.float16) 403 | filters_df['filters_Sort By Price'] = filter_df.current_filters.apply( lambda x: 'Sort by Price' in x).astype(np.float16) 404 | filters_df['filters_Sort By Popularity'] = filter_df.current_filters.apply( lambda x: 'Sort By Popularity' in x).astype(np.float16) 405 | 406 | 407 | 408 | # compute interaction image count for each item across train/ test 409 | interaction_image_item_ids = train.loc[train.action == transformed_interaction_image, :].drop_duplicates(subset=['session_id','item_id','action']).item_id.tolist() + test.loc[test.action == transformed_interaction_image, :].drop_duplicates(subset=['session_id','item_id','action']).item_id.tolist() 410 | unique_interaction_image_items, counts = np.unique(interaction_image_item_ids, return_counts=True) 411 | global_image_count_dict = dict(zip(unique_interaction_image_items, counts)) 412 | 413 | # compute interaction count for each item across train/ test 414 | interaction_item_ids = train.loc[train.action != transformed_clickout_action, :].drop_duplicates(subset=['session_id','item_id','action']).item_id.tolist() + test.loc[test.action != transformed_clickout_action, :].drop_duplicates(subset=['session_id','item_id','action']).item_id.tolist() 415 | unique_interaction_items, counts = np.unique(interaction_item_ids, return_counts=True) 416 | global_interaction_count_dict = dict(zip(unique_interaction_items, counts)) 417 | 418 | # compute interaction deals count for each item across train/ test 419 | interaction_deals_item_ids = train.loc[train.action == transformed_interaction_deals, :].drop_duplicates(subset=['session_id','item_id','action']).item_id.tolist() + test.loc[test.action == transformed_interaction_deals, :].drop_duplicates(subset=['session_id','item_id','action']).item_id.tolist() 420 | unique_interaction_deals_items, counts = np.unique(interaction_deals_item_ids, return_counts=True) 421 | global_deals_count_dict = dict(zip(unique_interaction_deals_items, counts)) 422 | 423 | 424 | # compute step rank to identify the last row in each session for train/ val split 425 | train = train.loc[train.action == transformed_clickout_action,:] 426 | test = test.loc[test.action == transformed_clickout_action,:] 427 | train['step_rank'] = train.groupby('session_id')['step'].rank(method='max', ascending=False) 428 | 429 | # compute the impression count for each item 430 | item_ids = np.hstack([np.hstack(train['impressions'].values), np.hstack(test.impressions.values)]) 431 | unique_items, counts = np.unique(item_ids, return_counts=True) 432 | impression_count_dict = dict(zip(unique_items, counts)) 433 | 434 | # compute the rank gauss transformed prices 435 | unique_prices = np.unique(np.hstack([np.hstack(train.prices.values), np.hstack(test.prices.values)]) ) 436 | rg_unique_prices = GaussRankScaler().fit_transform(unique_prices) 437 | price_rg_price_dict = 
dict(zip(unique_prices, rg_unique_prices)) 438 | 439 | 440 | #train/ val split 441 | if configuration.debug: 442 | val = train.loc[train.step_rank == 1,:].iloc[:5] 443 | else: 444 | val = train.loc[train.step_rank == 1,:].iloc[:50000] 445 | 446 | val_index = val.index 447 | train = train.loc[~train.index.isin(val_index),:] 448 | 449 | train = train.drop('step_rank', axis=1) 450 | val = val.drop('step_rank', axis=1) 451 | 452 | 453 | # get the encoded nan item 454 | transformed_nan_item = cat_encoders['item_id'].transform(['nan'])[0] 455 | 456 | 457 | 458 | 459 | from collections import defaultdict, Counter 460 | session_clickout_count_dict = {} 461 | past_interaction_dict = {} 462 | last_click_sess_dict = {} 463 | last_impressions_dict = {} 464 | sess_last_imp_idx_dict={} 465 | sess_last_price_dict = {} 466 | sess_time_diff_dict ={} 467 | sess_step_diff_dict = {} 468 | 469 | cumulative_click_dict = defaultdict(lambda : 0) 470 | 471 | 472 | 473 | 474 | def parse_impressions(df, session_interactions, session_actions, session_time_diff, training=True): 475 | # parse the data into a binary classification task, generate 1 example for each item in the impression list 476 | df_list = [] 477 | label_test_df_list = [] 478 | # parse impressions for train set 479 | past_interaction_rows = [] 480 | past_interaction_columns = [] 481 | for idx, row in enumerate(tqdm(df.itertuples())): 482 | 483 | if row.session_id not in session_clickout_count_dict: 484 | session_clickout_count_dict[row.session_id] = 0 485 | 486 | if row.user_id not in past_interaction_dict: 487 | past_interaction_dict[row.user_id] = [] 488 | 489 | 490 | sess_step = row.sess_step 491 | session_id = row.session_id 492 | 493 | # compute the categorically encoded impression list 494 | transformed_impressions = cat_encoders['item_id'].transform(row.impressions, to_np=True) 495 | 496 | current_rows = np.zeros([len(row.impressions), 66], dtype=object) 497 | 498 | # compute rank of price this clickout 499 | price_rank = compute_rank(row.prices) 500 | 501 | #compute the number of interactions associated with the last interacted item in this session 502 | equal_last_item_indices = np.array(session_interactions[session_id][:configuration.sess_length+ sess_step -1]) == row.last_item 503 | last_item_interaction = len(set(np.array(session_actions[session_id][:configuration.sess_length+ sess_step -1])[equal_last_item_indices])) 504 | 505 | #compute the local interaction count for each item id 506 | interaction_indices = np.array(session_actions[session_id][:configuration.sess_length+ sess_step -1]) != transformed_clickout_action 507 | interaction_item = np.array(session_interactions[session_id][:configuration.sess_length+ sess_step -1])[interaction_indices] 508 | sess_unique_items, counts = np.unique(interaction_item, return_counts=True) 509 | interaction_count_dict = dict(zip(sess_unique_items, counts)) 510 | 511 | #compute the local interaction image count for each item id 512 | interaction_image_indices = np.array(session_actions[session_id][:configuration.sess_length+ sess_step -1]) == transformed_interaction_image 513 | interaction_image_item = np.array(session_interactions[session_id][:configuration.sess_length+ sess_step -1])[interaction_image_indices] 514 | sess_unique_image_items, counts = np.unique(interaction_image_item, return_counts=True) 515 | interaction_image_count_dict = dict(zip(sess_unique_image_items, counts)) 516 | 517 | #compute the local interaction deals count for each item id 518 | interaction_deals_indices = 
np.array(session_actions[session_id][:configuration.sess_length+ sess_step -1]) == transformed_interaction_deals 519 | interaction_deals_item = np.array(session_interactions[session_id][:configuration.sess_length+ sess_step -1])[interaction_deals_indices] 520 | sess_unique_deals_items, counts = np.unique(interaction_deals_item, return_counts=True) 521 | interaction_deals_count_dict = dict(zip(sess_unique_deals_items, counts)) 522 | 523 | #compute the local clickout count for each item id 524 | interaction_clickout_indices = np.array(session_actions[session_id][:configuration.sess_length+ sess_step -1]) == transformed_clickout_action 525 | interaction_clickout_item = np.array(session_interactions[session_id][:configuration.sess_length+ sess_step -1])[interaction_clickout_indices] 526 | sess_unique_clickout_items, counts = np.unique(interaction_clickout_item, return_counts=True) 527 | interaction_clickout_count_dict = dict(zip(sess_unique_clickout_items, counts)) 528 | 529 | #compute the local interaction rating count for each item id 530 | interaction_rating_indices = np.array(session_actions[session_id][:configuration.sess_length+ sess_step -1]) == transformed_interaction_rating 531 | interaction_rating_item = np.array(session_interactions[session_id][:configuration.sess_length+ sess_step -1])[interaction_rating_indices] 532 | sess_unique_rating_items, counts = np.unique(interaction_rating_item, return_counts=True) 533 | interaction_rating_count_dict = dict(zip(sess_unique_rating_items, counts)) 534 | 535 | 536 | # get the time diffference array in this session for later computing the average of it 537 | finite_time_diff_indices = np.isfinite(session_time_diff[session_id][:sess_step -1]) 538 | finite_time_diff_array = np.array(session_time_diff[session_id][:sess_step -1])[finite_time_diff_indices] 539 | 540 | # unpad the interactions 541 | unpad_interactions = session_interactions[session_id][configuration.sess_length:configuration.sess_length+ sess_step -1] 542 | unique_interaction = pd.unique(session_interactions[session_id][:configuration.sess_length+ sess_step -1]) 543 | 544 | # time elapse of within two steps for each item before the clickout 545 | item_time_elapse_dict = {} 546 | for it, elapse in zip(unpad_interactions[:-1], session_time_diff[session_id][1:sess_step -1]): 547 | if it not in item_time_elapse_dict: 548 | item_time_elapse_dict[it] = [elapse] 549 | 550 | else: 551 | item_time_elapse_dict[it].append(elapse) 552 | 553 | # compute time_diff for each item in the session 554 | interact_diff = [unpad_interactions[::-1].index(imp) if imp in unpad_interactions else np.nan for imp in transformed_impressions] 555 | item_time_diff = np.array([ sum(session_time_diff[session_id][sess_step - diff -1 :sess_step]) if np.isfinite(diff) else np.nan for diff in interact_diff]) 556 | 557 | target_index = transformed_impressions.tolist().index(row.item_id) if training else np.nan 558 | 559 | #(imp len, num items) 560 | current_co_occ = co_occ_matrix[transformed_impressions,:] 561 | 562 | 563 | #(imp len, num unique items in the session b4 this clickout) 564 | current_co_occ = current_co_occ[:,sess_unique_items].toarray() 565 | 566 | # (1, num unique items in the session b4 this clickout) 567 | # print(current_co_occ.dtype) 568 | 569 | norm = (1 + co_occ_matrix_csc[:, sess_unique_items].sum(axis=0).reshape(-1)) 570 | 571 | # #(imp len, num items) 572 | # imp_current_co_occ = imp_co_occ_matrix[transformed_impressions,:] 573 | 574 | 575 | # #(imp len, num unique items in the session b4 this 
clickout) 576 | # imp_current_co_occ = imp_current_co_occ[:,sess_unique_items].toarray() 577 | 578 | # # (1, num unique items in the session before this clickout) 579 | # # print(current_co_occ.dtype) 580 | 581 | # imp_norm = (1 + imp_co_occ_matrix_csc[:, sess_unique_items].sum(axis=0).reshape(-1)) 582 | 583 | # norm_imp_current_co_occ = imp_current_co_occ / imp_norm 584 | 585 | # the position of the last interacted item in the current impression list 586 | if row.last_item in transformed_impressions: 587 | last_interact_index = transformed_impressions.tolist().index(row.last_item) 588 | else: 589 | last_interact_index = np.nan 590 | 591 | # the position of the second last interacted item in the current impression list 592 | if row.second_last_item in transformed_impressions: 593 | second_last_interact_index = transformed_impressions.tolist().index(row.second_last_item) 594 | else: 595 | second_last_interact_index = np.nan 596 | 597 | # the position of the third last interacted item in the current impression list 598 | if row.third_last_item in transformed_impressions: 599 | third_last_interact_index = transformed_impressions.tolist().index(row.third_last_item) 600 | else: 601 | third_last_interact_index = np.nan 602 | 603 | # initialize dictionaries 604 | if row.session_id not in last_click_sess_dict: 605 | last_click_sess_dict[row.session_id] = transformed_dummy_item 606 | 607 | if row.session_id not in last_impressions_dict: 608 | last_impressions_dict[row.session_id] = None 609 | 610 | if row.session_id not in sess_last_imp_idx_dict: 611 | sess_last_imp_idx_dict[row.session_id] = DUMMY_IMPRESSION_INDEX 612 | 613 | if row.session_id not in sess_last_price_dict: 614 | sess_last_price_dict[row.session_id] = None 615 | 616 | if row.session_id not in sess_time_diff_dict: 617 | sess_time_diff_dict[row.session_id] = None 618 | 619 | if row.session_id not in sess_step_diff_dict: 620 | sess_step_diff_dict[row.session_id] = None 621 | 622 | 623 | # item id 624 | current_rows[:, 0] = transformed_impressions 625 | 626 | # label 627 | current_rows[:, 1] = transformed_impressions == row.item_id 628 | current_rows[:, 2] = row.session_id 629 | 630 | # whether the current item id equals the last interacted item id 631 | current_rows[:, 3] = transformed_impressions == row.last_item 632 | current_rows[:, 4] = price_rank 633 | current_rows[:, 5] = row.platform 634 | current_rows[:, 6] = row.device 635 | current_rows[:, 7] = row.city 636 | current_rows[:, 8] = row.prices 637 | current_rows[:, 9] = row.country 638 | 639 | # impression index 640 | current_rows[:, 10] = np.arange(len(row.impressions)) 641 | current_rows[:, 11] = row.step 642 | current_rows[:, 12] = row.id 643 | 644 | # last_click_item: last clickout item id 645 | current_rows[:, 13] = last_click_sess_dict[row.session_id] 646 | 647 | # equal_last_impressions: current impression list is exactly the same as the last one that the user encountered 648 | current_rows[:, 14] = last_impressions_dict[row.session_id] == transformed_impressions.tolist() 649 | 650 | 651 | current_rows[:, 15] = sess_last_imp_idx_dict[row.session_id] 652 | # last_interact_index 653 | current_rows[:, 16] = last_interact_index 654 | 655 | # price_diff 656 | current_rows[:, 17] = row.prices - sess_last_price_dict[row.session_id] if sess_last_price_dict[row.session_id] else np.nan 657 | 658 | # last_price 659 | current_rows[:, 18] = sess_last_price_dict[row.session_id] if sess_last_price_dict[row.session_id] else np.nan 660 | 661 | # price_ratio 662 | current_rows[:, 19] = row.prices
/ sess_last_price_dict[row.session_id] if sess_last_price_dict[row.session_id] else np.nan 663 | 664 | # clickout_time_diff 665 | current_rows[:, 20] = row.timestamp - sess_time_diff_dict[row.session_id] if sess_time_diff_dict[row.session_id] else np.nan 666 | 667 | # country_platform 668 | current_rows[:, 21] = row.country_platform 669 | 670 | # impression_count 671 | current_rows[:, 22] = [impression_count_dict[imp] for imp in row.impressions] 672 | 673 | # is_interacted: whether that item has been interacted with in the current session 674 | current_rows[:, 23] = [imp in session_interactions[session_id][:configuration.sess_length+ sess_step -1] for imp in transformed_impressions] 675 | 676 | # local_interaction_image_count 677 | current_rows[:, 24] = [interaction_image_count_dict[imp] if imp in interaction_image_count_dict else 0 for imp in transformed_impressions] 678 | # local_interaction_deals_count 679 | current_rows[:, 25] = [interaction_deals_count_dict[imp] if imp in interaction_deals_count_dict else 0 for imp in transformed_impressions] 680 | 681 | # local_interaction_clickout_count 682 | current_rows[:, 26] = [interaction_clickout_count_dict[imp] if imp in interaction_clickout_count_dict else 0 for imp in transformed_impressions] 683 | 684 | # global_interaction_image_count 685 | current_rows[:, 27] = [global_image_count_dict[imp] if imp in global_image_count_dict else 0 for imp in transformed_impressions] 686 | 687 | # global_interaction_deals_count 688 | current_rows[:, 28] = [global_deals_count_dict[imp] if imp in global_deals_count_dict else 0 for imp in transformed_impressions] 689 | 690 | # is_clicked 691 | current_rows[:, 29] = [imp in past_interaction_dict[row.user_id] for imp in transformed_impressions] 692 | 693 | # click_diff 694 | current_rows[:, 30] = [past_interaction_dict[row.user_id][::-1].index(imp) if imp in past_interaction_dict[row.user_id] else np.nan for imp in transformed_impressions] 695 | 696 | # average of the previous features 697 | for i in range(31, 38): 698 | current_rows[:, i] = np.mean(current_rows[:, i-8]) 699 | 700 | # impression_avg_prices 701 | current_rows[:, 38] = np.mean(row.prices) 702 | current_rows[:, 39] = row.device_platform 703 | 704 | # equal_max_liic: equals the maximum of the local interaction image count 705 | current_rows[:, 40] = np.array(current_rows[:, 24]) == np.max(current_rows[:, 24]) if sum(current_rows[:, 24]) >0 else False 706 | 707 | # num_interacted_items 708 | current_rows[:, 41] = len(np.unique(session_interactions[session_id][:configuration.sess_length+ sess_step -1])) 709 | 710 | # equal_second_last_item 711 | current_rows[:, 42] = transformed_impressions == row.second_last_item 712 | 713 | # last_action 714 | current_rows[:, 43] = session_actions[session_id][configuration.sess_length+ sess_step -2] 715 | 716 | # last_second_last_imp_idx_diff 717 | current_rows[:, 44] = last_interact_index - second_last_interact_index 718 | 719 | # predicted_next_imp_idx (the idea is to extrapolate the user's eyeball movement: last_interact_index + (last_interact_index - second_last_interact_index)) 720 | current_rows[:, 45] = 2 * last_interact_index - second_last_interact_index 721 | 722 | # list_len 723 | current_rows[:, 46] = len(row.impressions) 724 | 725 | # imp_idx_velocity 726 | current_rows[:, 47] = last_interact_index - 2 * second_last_interact_index + third_last_interact_index 727 | 728 | # time_diff_sess_avg 729 | current_rows[:, 48] = np.mean(finite_time_diff_array) 730 | 731 | # max_time_elapse 732 | current_rows[:, 49] = [
max(item_time_elapse_dict[imp]) if imp in item_time_elapse_dict else np.nan for imp in transformed_impressions] 733 | 734 | # sum_time_elapse
 735 | current_rows[:, 50] = [ sum(item_time_elapse_dict[imp]) if imp in item_time_elapse_dict else np.nan for imp in transformed_impressions] 736 | 737 | # avg_time_elapse 738 | current_rows[:, 51] = [ np.mean(item_time_elapse_dict[imp]) if imp in item_time_elapse_dict else np.nan for imp in transformed_impressions] 739 | 740 | # item_time_diff 741 | current_rows[:, 52] = item_time_diff 742 | 743 | # global_interaction_count 744 | current_rows[:, 53] = [global_interaction_count_dict[imp] if imp in global_interaction_count_dict else 0 for imp in transformed_impressions] 745 | 746 | # average global_interaction_count 747 | current_rows[:, 54] = np.mean(current_rows[:, 53]) 748 | 749 | # std of global interaction image count 750 | current_rows[:, 55] = np.std(current_rows[:, 27]) 751 | 752 | # std of global interaction count 753 | current_rows[:, 56] = np.std(current_rows[:, 53]) 754 | 755 | # local_interaction_count 756 | current_rows[:, 57] = [interaction_count_dict[imp] if imp in interaction_count_dict else 0 for imp in transformed_impressions] 757 | current_rows[:, 58] = target_index 758 | 759 | # target price 760 | current_rows[:, 59] = row.prices[target_index] if not np.isnan(target_index) else np.nan 761 | 762 | # normalized co-occurrence statistics 763 | current_rows[:, 60] = np.mean(current_co_occ/ norm, axis=1).reshape(-1) 764 | current_rows[:, 61] = np.min(current_co_occ/ norm, axis=1).reshape(-1) 765 | current_rows[:, 62] = np.max(current_co_occ/norm, axis=1).reshape(-1) 766 | current_rows[:, 63] = np.median(current_co_occ/norm, axis=1).reshape(-1) 767 | 768 | # last_item_interaction 769 | current_rows[:, 64] = last_item_interaction 770 | 771 | # target price rank 772 | current_rows[:, 65] = price_rank[target_index] if not np.isnan(target_index) else np.nan 773 | # current_rows[:, 66] = np.mean(norm_imp_current_co_occ, axis=1).reshape(-1) 774 | # current_rows[:, 67] = np.min(norm_imp_current_co_occ, axis=1).reshape(-1) 775 | # current_rows[:, 68] = np.max(norm_imp_current_co_occ, axis=1).reshape(-1) 776 | # current_rows[:, 69] = np.median(norm_imp_current_co_occ, axis=1).reshape(-1) 777 | 778 | 779 | 780 | 781 | 782 | if training or row.item_id == transformed_nan_item: 783 | df_list.append(current_rows) 784 | else: 785 | label_test_df_list.append(current_rows) 786 | # cumulative_click_dict[row.item_id] += 1 787 | past_interaction_dict[row.user_id].append(row.item_id) 788 | last_click_sess_dict[row.session_id] = row.item_id 789 | last_impressions_dict[row.session_id] = transformed_impressions.tolist() 790 | sess_time_diff_dict[row.session_id] = row.timestamp 791 | sess_step_diff_dict[row.session_id] = row.step 792 | if row.item_id != transformed_nan_item: 793 | sess_last_imp_idx_dict[row.session_id] = (transformed_impressions == row.item_id).tolist().index(True) 794 | sess_last_price_dict[row.session_id] = np.array(row.prices)[ transformed_impressions == row.item_id ][0] 795 | # cumulative_click_dict[row.item_id] += 1 796 | data = np.vstack(df_list) 797 | df_columns = ['item_id', 'label', 'session_id', 'equal_last_item', 'price_rank', 'platform', 'device', 'city', 'price', 'country', 'impression_index','step', 'id','last_click_item','equal_last_impressions',
'last_click_impression','last_interact_index','price_diff','last_price','price_ratio','clickout_time_diff','country_platform','impression_count','is_interacted','local_interaction_image_count','local_interaction_deals_count','local_interaction_clickout_count','global_interaction_image_count','global_interaction_deals_count','is_clicked','click_diff', 'avg_is_interacted','avg_liic', 'avg_lidc','avg_licc','avg_giic','avg_gdc','avg_is_clicked','impression_avg_prices','device_platform','equal_max_liic','num_interacted_items','equal_second_last_item','last_action','last_second_last_imp_idx_diff','predicted_next_imp_idx', 'list_len','imp_idx_velocity','time_diff_sess_avg','max_time_elapse','sum_time_elapse','avg_time_elapse','item_time_diff','global_interaction_count','avg_gic','std_giic','std_gic','local_interaction_count','target_index','target_price','co_occ_mean_norm','co_occ_min_norm','co_occ_max_norm','co_occ_median_norm','last_item_interaction','target_price_rank'] 798 | dtype_dict = {"item_id":"int32", "label": "int8", "equal_last_item":"int8", "step":"int16", "price_rank": "int32","impression_index":"int32", "platform":"int32","device":"int32","city":"int32", "id":"int32", "country":"int32", "price":"int16", "last_click_item":"int32", "equal_last_impressions":"int8", 'last_click_impression':'int16', 'last_interact_index':'float32', 'price_diff':'float16','last_price':'float16','price_ratio':'float32','clickout_time_diff':'float16','country_platform':'int32','impression_count':'int32','is_interacted':'int8','local_interaction_image_count':'int32','local_interaction_deals_count':'int32','local_interaction_clickout_count':'int32','global_interaction_image_count':'int32','global_interaction_deals_count':'int32','is_clicked':'int8','click_diff':'float32'\ 799 | , 'avg_is_interacted':'float16' ,'avg_liic':'float16', 'avg_lidc':'float32','avg_licc':'float32','avg_giic':'float32','avg_gdc':'float32','avg_is_clicked':'float32','impression_avg_prices':'float32','device_platform':'int32','equal_max_liic':'int8','num_interacted_items':'int32','equal_second_last_item':'int8','last_action':'int32','last_second_last_imp_idx_diff':'float32', 'predicted_next_imp_idx': 'float32','list_len':'int16','imp_idx_velocity':'float32','time_diff_sess_avg':'float32','max_time_elapse':'float32','sum_time_elapse':'float32','avg_time_elapse':'float32','item_time_diff':'float32','global_interaction_count':'float32','avg_gic':'float32','std_giic':'float32','std_gic':'float32','local_interaction_count':'int32','target_index':'float32','target_price':'float32','co_occ_mean_norm':'float32','co_occ_min_norm':'float32','co_occ_max_norm':'float32','co_occ_median_norm':'float32','last_item_interaction':'int32','target_price_rank':'float32'} 800 | df = pd.DataFrame(data, columns=df_columns) 801 | df = df.astype(dtype=dtype_dict ) 802 | if training: 803 | return df 804 | else: 805 | label_test = np.vstack(label_test_df_list) 806 | label_test = pd.DataFrame(label_test, columns=df_columns) 807 | label_test = label_test.astype(dtype= dtype_dict) 808 | return df, label_test 809 | 810 | 811 | 812 | 813 | train.sort_values('timestamp',inplace=True) 814 | val.sort_values('timestamp',inplace=True) 815 | test.sort_values('timestamp',inplace=True) 816 | 817 | # print("sorted!!") 818 | train = parse_impressions(train, train_session_interactions, train_session_actions, train_session_time_diff) 819 | val = parse_impressions(val, train_session_interactions, train_session_actions, train_session_time_diff) 820 | test, label_test = 
parse_impressions(test, test_session_interactions, test_session_actions, test_session_time_diff, training=False) 821 | 822 | if configuration.use_test: 823 | train = pd.concat([train, label_test], axis=0) 824 | 825 | 826 | 827 | 828 | 829 | 830 | print("test before merge", test.shape) 831 | train = train.merge(item_properties_df, on="item_id", how="left") 832 | val = val.merge(item_properties_df, on="item_id", how="left") 833 | test = test.merge(item_properties_df, on="item_id", how="left") 834 | 835 | 836 | print("test ", test.shape) 837 | train = train.merge(filters_df, on='id', how="left") 838 | val = val.merge(filters_df, on='id', how="left") 839 | test = test.merge(filters_df, on='id', how="left") 840 | 841 | 842 | # print("test ", test.shape) 843 | # print("test before merge data_feature", test.shape) 844 | 845 | train = train.merge(data_feature, on='id', how="left") 846 | val = val.merge(data_feature, on='id', how="left") 847 | test = test.merge(data_feature, on='id', how="left") 848 | print("test ", test.shape) 849 | 850 | del filters_df, data_feature 851 | del data 852 | gc.collect() 853 | 854 | # target encoding 855 | agg_cols = [ 'price_rank', 'city', 'platform', 'device', 'country', 'impression_index','star'] 856 | for c in agg_cols: 857 | gp = train.groupby(c)['label'] 858 | mean = gp.mean() 859 | train[f'{c}_label_avg'] = train[c].map(mean) 860 | val[f'{c}_label_avg'] = val[c].map(mean) 861 | test[f'{c}_label_avg'] = test[c].map(mean) 862 | 863 | 864 | 865 | 866 | 867 | 868 | 869 | agg_cols = ['city','impression_index', 'platform'] 870 | for c in agg_cols: 871 | gp = train.groupby(c)['price'] 872 | mean = gp.mean() 873 | train[f'{c}_price_avg'] = train[c].map(mean) 874 | val[f'{c}_price_avg'] = val[c].map(mean) 875 | test[f'{c}_price_avg'] = test[c].map(mean) 876 | 877 | 878 | 879 | agg_cols = ['city'] 880 | for c in agg_cols: 881 | gp = train.groupby(c)['rg_time_diff'] 882 | mean = gp.mean() 883 | train[f'{c}_td_avg'] = train[c].map(mean) 884 | val[f'{c}_td_avg'] = val[c].map(mean) 885 | test[f'{c}_td_avg'] = test[c].map(mean) 886 | 887 | 888 | 889 | train['rg_price'] = train.price.map(price_rg_price_dict) 890 | val['rg_price'] = val.price.map(price_rg_price_dict) 891 | test['rg_price'] = test.price.map(price_rg_price_dict) 892 | 893 | 894 | 895 | #price cut within city 896 | 897 | data = pd.concat([train,val,test], axis=0).reset_index() 898 | data = data.loc[:,['city','price']].drop_duplicates(['city','price']) 899 | data['city_price_bin'] = data.groupby('city').price.apply(lambda x: qcut_safe(x, q = 40).astype(str)) 900 | data['city_price_bin'] = data.apply( lambda x: str(x.city) + x.city_price_bin,axis=1) 901 | data['city_price_bin'] = data['city_price_bin'].factorize()[0] 902 | 903 | 904 | train = train.merge(data, on=['city','price'], how='left') 905 | val = val.merge(data, on=['city','price'], how='left') 906 | test = test.merge(data, on=['city','price'], how='left') 907 | 908 | 909 | 910 | print("train", train.shape) 911 | print("val", val.shape) 912 | print("test", test.shape) 913 | # test = test.merge(item_properties_df, on="item_id", how="left") 914 | 915 | 916 | 917 | 918 | 919 | data_drop_columns= ['label', 'session_id', 'step', 'id'] 920 | data_drop_columns+= ['target_index','target_price','target_price_rank'] 921 | 922 | train_label = train.label 923 | val_label = val.label 924 | 925 | # build lgbm dataset 926 | d_train = lgb.Dataset(data=train.drop(data_drop_columns, axis=1), label=train_label, free_raw_data=True, silent=True) 927 | d_val = 
lgb.Dataset(data=val.drop(data_drop_columns, axis=1), label=val_label, free_raw_data=True, silent=True) 928 | 929 | 930 | 931 | 932 | 933 | del train 934 | gc.collect() 935 | 936 | # params = { 937 | # 'objective': 'binary', 938 | # 'boosting_type': 'gbdt', 939 | # 'nthread': multiprocessing.cpu_count() // 3 if configuration.sub_sample else 24, 940 | # 'num_leaves': 200, 941 | # 'max_depth':10, 942 | # 'learning_rate': 0.05 if configuration.sub_sample else 0.01 , 943 | # 'bagging_fraction': 0.8, 944 | # 'bagging_freq': 5, 945 | # 'feature_fraction':0.7, 946 | # 'seed': 0, 947 | # 'verbose': -1, 948 | 949 | # } 950 | params = {'objective': 'binary', 951 | 'boosting_type': 'gbdt', 952 | 'colsample_bytree': 0.76, 953 | 'learning_rate': 0.01, 954 | 'nthread': multiprocessing.cpu_count() -1, 955 | 'max_depth': 13, 956 | 'min_child_weight': 33, 957 | 'min_data_in_leaf': 94, 958 | 'num_leaves': 302, 959 | 'seed': 30, 960 | 'verbose': -1 961 | } 962 | 963 | 964 | 965 | clf = lgb.train( 966 | params=params, 967 | train_set=d_train, 968 | num_boost_round=50000, 969 | valid_sets=[d_train, d_val], 970 | early_stopping_rounds=200 if configuration.sub_sample else 500, 971 | verbose_eval=500, 972 | 973 | ) 974 | 975 | 976 | 977 | # evaluation 978 | def evaluate(val_df, clf): 979 | incorrect_session = {} 980 | val_df['scores'] = clf.predict(val_df.drop(data_drop_columns, axis=1)) 981 | 982 | loss = log_loss(val_df.label.values, val_df.scores.values) 983 | grouped_val = val_df.groupby('session_id') 984 | rss_group = {i:[] for i in range(1,26)} 985 | rss = [] 986 | for session_id, group in grouped_val: 987 | 988 | scores = group.scores 989 | sorted_arg = np.flip(np.argsort(scores)) 990 | rss.append( group['label'].values[sorted_arg]) 991 | rss_group[len(group)].append(group['label'].values[sorted_arg]) 992 | if group['label'].values[sorted_arg][0] != 1: 993 | incorrect_session[session_id] = (sorted_arg.values, group['label'].values[sorted_arg]) 994 | mrr = compute_mean_reciprocal_rank(rss) 995 | mrr_group = {i:(len(rss_group[i]), compute_mean_reciprocal_rank(rss_group[i])) for i in range(1,26)} 996 | print(mrr_group) 997 | if not configuration.debug: 998 | pickle.dump( incorrect_session, open(f'../output/{model_name}_val_incorrect_order.p','wb')) 999 | return mrr, mrr_group, loss 1000 | 1001 | 1002 | 1003 | mrr, mrr_group, val_log_loss = evaluate(val, clf) 1004 | 1005 | print("MRR score: ", mrr) 1006 | 1007 | 1008 | 1009 | imp = clf.feature_importance('gain') 1010 | fn =clf.feature_name() 1011 | imp_df = pd.DataFrame() 1012 | imp_df['importance'] = imp 1013 | imp_df['name'] = fn 1014 | imp_df.sort_values('importance', ascending=False, inplace=True) 1015 | 1016 | 1017 | print(imp_df.head(20)) 1018 | 1019 | 1020 | 1021 | del d_train, d_val 1022 | gc.collect() 1023 | 1024 | 1025 | if configuration.debug: 1026 | exit(0) 1027 | 1028 | predictions = [] 1029 | session_ids = [] 1030 | 1031 | test['score'] = clf.predict(test.drop(data_drop_columns, axis=1)) 1032 | save_test = test.copy() 1033 | save_test['item_id'] = cat_encoders['item_id'].reverse_transform(save_test.item_id.values) 1034 | with open(f'../output/{model_name}_test_score.p', 'wb') as f: 1035 | pickle.dump( save_test.loc[:,['score', 'session_id', 'item_id', 'step']],f, protocol=4) 1036 | 1037 | grouped_test = test.groupby('session_id') 1038 | for session_id, group in grouped_test: 1039 | scores = group['score'] 1040 | sorted_arg = np.flip(np.argsort(scores)) 1041 | sorted_item_ids = group['item_id'].values[sorted_arg] 1042 | sorted_item_ids = 
cat_encoders['item_id'].reverse_transform(sorted_item_ids) 1043 | sorted_item_string = ' '.join([str(i) for i in sorted_item_ids]) 1044 | predictions.append(sorted_item_string) 1045 | session_ids.append(session_id) 1046 | 1047 | prediction_df = pd.DataFrame() 1048 | prediction_df['session_id'] = session_ids 1049 | prediction_df['item_recommendations'] = predictions 1050 | 1051 | print("pred df shape", prediction_df.shape) 1052 | sub_df = pd.read_csv('../input/submission_popular.csv') 1053 | sub_df.drop('item_recommendations', axis=1, inplace=True) 1054 | sub_df = sub_df.merge(prediction_df, on="session_id") 1055 | # sub_df['item_recommendations'] = predictions 1056 | 1057 | sub_df.to_csv(f'../output/{model_name}.csv', index=None) 1058 | 1059 | 1060 | 1061 | --------------------------------------------------------------------------------
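Note: the `evaluate` function above sorts each validation session's impressions by predicted click probability and passes the resulting 0/1 label arrays to `compute_mean_reciprocal_rank`, which lives in `src/utils.py` and is not shown in this file. As a minimal illustrative sketch only (the helper's actual implementation in the repository may differ), such a function can be written as:

```
import numpy as np

def compute_mean_reciprocal_rank_sketch(relevance_lists):
    # each element is an array of 0/1 labels already sorted by descending model score,
    # exactly the arrays collected into `rss` inside evaluate()
    reciprocal_ranks = []
    for labels in relevance_lists:
        hits = np.flatnonzero(labels)  # positions where the clicked item appears
        # ranks are 1-based; a session with no hit contributes 0
        reciprocal_ranks.append(1.0 / (hits[0] + 1) if len(hits) else 0.0)
    return float(np.mean(reciprocal_ranks))

# two sessions: clicked item ranked 1st and 3rd -> MRR = (1 + 1/3) / 2 ≈ 0.667
print(compute_mean_reciprocal_rank_sketch([np.array([1, 0, 0]), np.array([0, 0, 1])]))
```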