├── setup.sh
├── src
│   ├── constant.py
│   ├── picklization.py
│   ├── country2utc.py
│   ├── config.py
│   ├── nn.py
│   ├── utils.py
│   ├── run_nn.py
│   ├── data.py
│   ├── run_xgb.py
│   └── run_lgb.py
├── README.md
└── LICENSE
/setup.sh:
--------------------------------------------------------------------------------
1 | mkdir input
2 | mkdir output
3 | mkdir weights
--------------------------------------------------------------------------------
/src/constant.py:
--------------------------------------------------------------------------------
1 | input_dir = '../input'
2 | output_dir = '../output'
3 | weights_dir = '../weights'
4 |
5 | DUMMY_ACTION = 'DUMMY_A'
6 |
7 | DUMMY_USER = -1
8 |
9 | DUMMY_ITEM = -1
10 |
11 | DUMMY_PRICE_RANK = 25
12 |
13 | DUMMY_IMPRESSION_INDEX = 25
--------------------------------------------------------------------------------
/src/picklization.py:
--------------------------------------------------------------------------------
1 | '''
2 | Transform files from csv to pickle for faster reading
3 | '''
4 | import pickle
5 | import pandas as pd
6 |
7 | df = pd.read_csv('../input/train.csv')
8 | with open('../input/train_v2.p','wb') as f:
9 | pickle.dump(df, f)
10 |
11 | df = pd.read_csv('../input/test.csv')
12 | with open('../input/test_v2.p','wb') as f:
13 | pickle.dump(df, f)
14 |
15 | df = pd.read_csv('../input/item_metadata.csv')
16 | with open('../input/item_metadata.p','wb') as f:
17 | pickle.dump(df, f)
--------------------------------------------------------------------------------
/src/country2utc.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import pickle
4 | from utils import *
5 | import pycountry
6 | from timezonefinder import TimezoneFinder
7 | from geopy.geocoders import Nominatim
8 | import datetime
9 | import pytz
10 |
11 | tzf = TimezoneFinder()
12 | geolocator = Nominatim(user_agent="recsys")
13 |
14 | with open('../input/train_v2.p', 'rb') as f:
15 | train = pickle.load(f)
16 |
17 |
18 | with open('../input/test_v2.p', 'rb') as f:
19 | test = pickle.load(f)
20 |
21 |
22 | def location2utc_offset(location):
23 | '''
24 | return the utc offset given the location
25 | '''
26 | location = geolocator.geocode(location)
27 |
28 | if location is None:
29 | return np.nan
30 |
31 | lat = location.latitude
32 | lon = location.longitude
33 | offset_sec = datetime.datetime.now(pytz.timezone(tzf.timezone_at(lng=lon, lat=lat)))
34 | return offset_sec.utcoffset().total_seconds()/60/60
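# For example, location2utc_offset('Germany') should return 1.0 (or 2.0 during daylight
# saving time), since the offset is evaluated at the current date.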
35 |
36 |
37 | all_countries = [platform2country(s) for s in set(train.platform.tolist() + test.platform.tolist())]
38 |
39 | offsets = [location2utc_offset(c) for c in all_countries]
40 |
41 | # map country to offsets
42 | country2offsets_dict = dict(set(zip(all_countries, offsets)))
43 | with open('../input/country2offsets_dict.p','wb') as f:
44 | pickle.dump(country2offsets_dict, f)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # The 4th Place Solution to the 2019 ACM RecSys Challenge
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 | ## Team Members
10 | _Kung-hsiang (Steeve), Huang_ __(Rosetta.ai)__; _Yi-fu, Fu_; _Yi-ting, Lee_; _Tzong-hann, Lee_; _Yao-chun, Chan_ __(National Taiwan University)__; _Yi-hui, Lee_ __(University of Texas, Dallas)__; _Shou-de, Lin_ __(National Taiwan University)__
11 |
12 | Contact: steeve@rosetta.ai
13 |
14 |
15 |
16 | ## Introduction
17 | This repository contains Rosetta.ai's approach to the 2019 ACM RecSys Challenge ([paper](https://dl.acm.org/citation.cfm?id=3359560), [writeup](https://medium.com/@huangkh19951228/the-5th-place-approach-to-the-2019-acm-recsys-challenge-by-team-rosettaai-eb3c4e6178c4)). Instead of treating the task as a ranking problem, we use __Binary Cross Entropy__ as our loss function: each impressed item is scored independently and items are ranked by their predicted click probability (see the sketch after the list below). Three different models were implemented:
18 | 1. Neural Networks (based on [DeepFM](https://arxiv.org/pdf/1804.04950.pdf) and this [Youtube paper](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45530.pdf))
19 | 2. LightGBM
20 | 3. XGBoost
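
In other words, each (session, impressed item) pair becomes one binary example, and a session's recommendation list is obtained by sorting its candidates by predicted click probability. Below is a minimal sketch of that re-ranking step, mirroring the post-processing at the end of ```run_nn.py``` (the function name here is ours, not part of the code base):

```
import numpy as np
import pandas as pd

def scores_to_recommendations(df):
    '''df: one row per (session_id, item_id) candidate, with the model score in `score`.'''
    rows = []
    for session_id, group in df.groupby('session_id'):
        order = np.flip(np.argsort(group['score'].values))  # highest score first
        ranked_items = group['item_id'].values[order]
        rows.append((session_id, ' '.join(str(i) for i in ranked_items)))
    return pd.DataFrame(rows, columns=['session_id', 'item_recommendations'])
```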
21 |
22 |
23 |
24 | ## Environment
25 | * Ubuntu 16.04
26 | * CUDA 9.0
27 | * Python==3.6.8
28 | * Numpy==1.16
29 | * Pandas==0.24.2
30 | * PyTorch==1.1.0
31 | * Sklearn==0.21.2
32 | * Scipy==1.3.0
33 | * LightGBM==2.2.4
34 | * XGBoost==0.9
35 | * timezonefinder==4.0.3
36 | * geopy==1.20.0
37 |
38 | ## Project Structure
39 |
40 | ```
41 | ├── input
42 | ├── output
43 | ├── src
44 | └── weights
45 | ```
46 |
47 | ## Setup
48 | Run the following command to create the directories that the project expects, then place the unzipped data into the ```input``` directory:
49 |
50 | ```. setup.sh```
51 |
52 |
53 |
54 | Run the two Python scripts below to pickle the input data and to compute the UTC offset for each country (both write their outputs to the ```input``` directory):
55 | ```
56 | cd src
57 | python picklization.py
58 | python country2utc.py
59 | ```
60 |
61 | To train the models on the full dataset, set ```debug``` and ```sub_sample``` to ```False``` in ```config.py```:
62 |
63 | ```
64 | class Configuration(object):
65 |
66 | def __init__(self):
67 | ...
68 | self.debug = False
69 | self.sub_sample = False
70 | ...
71 | ```
72 |
73 |
74 | ## Training & Submission
75 |
76 | The models are all trained in an end-to-end fashion. To train each of the three models and generate its predictions, run the corresponding script:
77 | ```
78 | python run_nn.py
79 | python run_lgb.py
80 | python run_xgb.py
81 | ```
82 | The submission files are stored in the ```output``` directory.
83 |
84 | The results generated from LightGBM alone would have placed us 5th on the public leaderboard. To ensemble the three models, change the output file name of each model in ```Merge.ipynb``` and run it; a sketch of this merge step is shown below.
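
```Merge.ipynb``` is not included in this snapshot, so the following is only a sketch of one plausible merge: average the per-candidate scores from the ```*_test_score.p``` files that each run script writes (the exact file names and the equal weights are assumptions, not the notebook's logic), then re-rank per session as in the single-model scripts.

```
import numpy as np
import pandas as pd

# Hypothetical score files; each holds ['score', 'session_id', 'step'] for the same
# candidate rows in the same order.
paths = ['../output/nn_test_score.p',
         '../output/lgb_test_score.p',
         '../output/xgb_test_score.p']
dfs = [pd.read_pickle(p) for p in paths]

blend = dfs[0].copy()
blend['score'] = np.mean([df['score'].values for df in dfs], axis=0)  # equal-weight average
```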
85 |
86 |
87 | ## Performance
88 |
89 | | Model | Local Validation MRR | Public Leaderboard MRR |
90 | | ------------- |-------------:| -----:|
91 | | LightGBM | 0.685787 | N/A |
92 | | XGBoost | 0.684521 | 0.681128 |
93 | | NN | 0.675206 | 0.672117 |
94 |
--------------------------------------------------------------------------------
/src/config.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from datetime import datetime
3 | import torch
4 | import inspect
5 |
6 |
7 | class Configuration(object):
8 |
9 | def __init__(self):
10 | self.alias = None
11 | self.num_epochs = 1
12 | self.batch_size = 1024
13 | self.optimizer = 'adam'
14 | self.use_cuda = True
15 | self.device_id = 0
16 | self.early_stopping = 1
17 | self.loss = torch.nn.BCELoss
18 | self.debug = True
19 | self.sub_sample = False
20 | self.slack = True
21 | self.use_test = not self.sub_sample
22 |
23 | def __getitem__(self, x):
24 | '''make configuration subscriptable'''
25 | return getattr(self, x)
26 |
27 | def __setitem__(self, x, v):
28 | '''make configuration subscriptable'''
29 | return setattr(self, x, v)
30 |
31 | def get_attributes(self):
32 | attributes = inspect.getmembers(self, lambda a: not (inspect.isroutine(a)))
33 |
34 | # keep only user-defined attributes (skip dunder __xx__ entries)
35 | attribute_tuple_list = [a for a in attributes if not (a[0].startswith('__') and a[0].endswith('__'))]
36 |
37 | attribute_dict = {}
38 | for tup in attribute_tuple_list:
39 | key = tup[0]
40 | value = tup[1]
41 | if key == 'loss':
42 | value = str(value)
43 | # convert numpy value to float
44 | if type(value) == np.float64:
45 | value = float(value)
46 | attribute_dict[key] = value
47 |
48 | return attribute_dict
49 |
50 | def set_model_dir(self):
51 | now = datetime.now()
52 |
53 | time_info = f'{now.year}{now.month:02d}{now.day:02d}{now.hour:02d}{now.minute:02d}'
54 | self.model_dir = f'model_weights/{self.alias}-{time_info}.model'
55 |
56 | def attribute_to_integer(self):
57 | '''Convert the attributes named in self.integer_attribute_list (expected to be set by a subclass or via set_config) to int'''
58 |
59 | for attribute in self.integer_attribute_list:
60 | self[attribute] = int(self[attribute])
61 |
62 | def set_config(self, config):
63 | for key in config:
64 | self[key] = config[key]
65 |
66 |
67 | class NNConfiguration(Configuration):
68 |
69 | def __init__(self):
70 | super(NNConfiguration, self).__init__()
71 | self.categorical_emb_dim = 128
72 |
73 | self.alias = 'NN'
74 | self.optimizer = 'adam'
75 | self.learning_rate = 0.001
76 | self.weight_decay = 0
77 | self.sequence_length = 10
78 | self.sess_length = 30
79 | self.num_embeddings = {}
80 | self.verbose = True
81 | self.hidden_dims = [256 , 128]
82 | self.dropout_rate = 0
83 | self.loss = torch.nn.BCELoss
84 |
85 |
86 | class LGBConfiguration(Configuration):
87 |
88 | def __init__(self):
89 | super(LGBConfiguration, self).__init__()
90 | self.categorical_emb_dim = 128
91 | self.alias = 'LGB'
92 | self.sequence_length = 10
93 | self.sess_length = 30
94 |
95 | class XGBConfiguration(Configuration):
96 |
97 | def __init__(self):
98 | super(XGBConfiguration, self).__init__()
99 | self.categorical_emb_dim = 128
100 | self.alias = 'XGB'
101 | self.sequence_length = 10
102 | self.sess_length = 30
103 |
104 |
--------------------------------------------------------------------------------
/src/nn.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from utils import *
3 | from data import *
4 | import torch.nn as nn
5 |
6 |
7 | class Net(torch.nn.Module):
8 | def __init__(self, config):
9 | super(Net, self).__init__()
10 | self.config = config
11 | self.all_cat_columns = self.config.all_cat_columns
12 | self.categorical_emb_dim = config.categorical_emb_dim
13 | self.hidden_dims = config.hidden_dims
14 | self.num_embeddings = config.num_embeddings
15 |
16 | # embedding part
17 | self.emb_dict = torch.nn.ModuleDict()
18 | for cat_col in self.config.all_cat_columns:
19 | if cat_col =='item_id':
20 |
21 | self.emb_dict[cat_col] = torch.nn.Embedding(num_embeddings=self.num_embeddings[cat_col],
22 | embedding_dim=self.categorical_emb_dim, padding_idx = self.config.transformed_dummy_item)
23 | else:
24 | self.emb_dict[cat_col] = torch.nn.Embedding(num_embeddings=self.num_embeddings[cat_col],
25 | embedding_dim=self.categorical_emb_dim)
26 | # gru for extracting session and user interest
27 | self.gru_sess = torch.nn.GRU(input_size = self.categorical_emb_dim *2, hidden_size = self.categorical_emb_dim//2, bidirectional=True , num_layers=2, batch_first=True)
28 | self.other_item_gru = torch.nn.GRU(input_size = self.categorical_emb_dim, hidden_size = self.categorical_emb_dim//2, bidirectional=True , num_layers=1, batch_first=True)
29 |
30 | # linear layer on top of continuous features
31 | self.cont_linear = torch.nn.Linear(config.continuous_size,self.categorical_emb_dim )
32 |
33 | # hidden layers
34 | self.hidden1 = torch.nn.Linear(self.categorical_emb_dim*17 , self.hidden_dims[0])
35 | self.hidden2 = torch.nn.Linear(self.hidden_dims[0] + config.continuous_size*2 + 3 + config.neighbor_size, self.hidden_dims[1] )
36 |
37 | # output layer
38 | self.output = torch.nn.Linear(self.hidden_dims[1] , 1)
39 |
40 | # batch normalization
41 | self.bn = torch.nn.BatchNorm1d(self.categorical_emb_dim*17)
42 | self.bn_hidden = torch.nn.BatchNorm1d(self.hidden_dims[0] + config.continuous_size*2+ 3 + config.neighbor_size )
43 |
44 | def forward(self, item_id, past_interactions, mask, price_rank, city, last_item, impression_index, cont_features, star, past_interactions_sess, past_actions_sess, last_click_item, last_click_impression, last_interact_index, neighbor_prices, other_item_ids, city_platform):
45 | embeddings = []
46 | user_embeddings = []
47 | batch_size = item_id.size(0)
48 |
49 | # embedding of all categorical features
50 | emb_item = self.emb_dict['item_id'](item_id)
51 | emb_past_interactions = self.emb_dict['item_id'](past_interactions)
52 | emb_price_rank = self.emb_dict['price_rank'](price_rank)
53 | emb_city = self.emb_dict['city'](city)
54 | emb_last_item = self.emb_dict['item_id'](last_item)
55 | emb_impression_index = self.emb_dict['impression_index'](impression_index)
56 | emb_star = self.emb_dict['star'](star)
57 | emb_past_interactions_sess = self.emb_dict['item_id'](past_interactions_sess)
58 | emb_past_actions_sess = self.emb_dict['action'](past_actions_sess)
59 | emb_last_click_item = self.emb_dict['item_id'](last_click_item)
60 | emb_last_click_impression = self.emb_dict['impression_index'](last_click_impression)
61 | emb_last_interact_index = self.emb_dict['impression_index'](last_interact_index)
62 | emb_city_platform = self.emb_dict['city_platform'](city_platform)
63 | emb_other_item_ids = self.emb_dict['item_id'](other_item_ids)
64 |
65 | # other items processed by gru
66 | emb_other_item_ids_gru, _ = self.other_item_gru(emb_other_item_ids)
67 | pooled_other_item_ids = F.max_pool1d(emb_other_item_ids_gru.permute(0,2,1), kernel_size=emb_other_item_ids_gru.size(1)).squeeze(2)
68 |
69 | # user's past clicked-out item
70 | emb_past_interactions = emb_past_interactions.permute(0,2,1)
71 | pooled_interaction = F.max_pool1d(emb_past_interactions, kernel_size=self.config.sequence_length).squeeze(2)
72 |
73 |
74 | # concatenate sequence of item ids and actions to model session dynamics
75 | emb_past_interactions_sess = torch.cat( [emb_past_interactions_sess, emb_past_actions_sess], dim=2)
76 | emb_past_interactions_sess , _ = self.gru_sess(emb_past_interactions_sess)
77 | emb_past_interactions_sess = emb_past_interactions_sess.permute(0,2,1)
78 | pooled_interaction_sess = F.max_pool1d(emb_past_interactions_sess, kernel_size=self.config.sess_length).squeeze(2)
79 |
80 |
81 | # categorical feature interactions
82 | item_interaction = emb_item * pooled_interaction
83 | item_last_item = emb_item * emb_last_item
84 | item_last_click_item = emb_item * emb_last_click_item
85 | imp_last_idx = emb_impression_index * emb_last_interact_index
86 |
87 |
88 |
89 | # efficiently compute the aggregation of feature interactions
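# Factorization-machine trick: sum_{i<j} x_i * x_j = 0.5 * ((sum_i x_i)^2 - sum_i x_i^2),
# applied over all coordinates of the concatenated embeddings, so the pairwise
# interaction term costs O(d) instead of O(d^2).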
90 | emb_list = [emb_item, pooled_interaction, emb_price_rank, emb_city, emb_last_item, emb_impression_index, emb_star]
91 | emb_concat = torch.cat(emb_list, dim=1)
92 | sum_squared = torch.pow( torch.sum( emb_concat, dim=1) , 2).unsqueeze(1)
93 | squared_sum = torch.sum( torch.pow( emb_concat, 2) , dim=1).unsqueeze(1)
94 | second_order = 0.5 * (sum_squared - squared_sum)
95 |
96 | # compute the square of continuous features
97 | squared_cont = torch.pow(cont_features, 2)
98 |
99 |
100 | # DNN part
101 | concat = torch.cat([emb_item, pooled_interaction, emb_price_rank, emb_city, emb_last_item, emb_impression_index, item_interaction, item_last_item, emb_star, pooled_interaction_sess, emb_last_click_item, emb_last_click_impression, emb_last_interact_index, item_last_click_item, imp_last_idx, pooled_other_item_ids, emb_city_platform] , dim=1)
102 | concat = self.bn(concat)
103 |
104 | hidden = torch.nn.ReLU()(self.hidden1(concat))
105 |
106 | hidden = torch.cat( [cont_features, hidden, sum_squared, squared_sum, second_order, squared_cont, neighbor_prices] , dim=1)
107 |
108 | hidden = self.bn_hidden(hidden)
109 | hidden = torch.nn.ReLU()(self.hidden2(hidden))
110 |
111 |
112 | output = torch.sigmoid(self.output(hidden)).squeeze()
113 |
114 |
115 | return output
116 |
--------------------------------------------------------------------------------
/src/utils.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | from constant import *
3 | import torch
4 | import os
5 | import random
6 | import time
7 | from contextlib import contextmanager
8 | import numpy as np
9 | import pandas as pd
10 | from sklearn.preprocessing import MinMaxScaler
11 | import torch.nn.functional as F
12 | from scipy.special import erfinv
13 | from ordered_set import OrderedSet
14 | import scipy
15 | from collections import Counter
16 | from timezonefinder import TimezoneFinder
17 | from geopy.geocoders import Nominatim
18 | import pycountry
19 | import datetime
20 | import pytz
21 |
22 | tzf = TimezoneFinder()
23 |
24 |
25 | activation_getter = {'iden': lambda x: x, 'relu': F.relu, 'tanh': torch.tanh, 'sigm': torch.sigmoid}
26 |
27 |
28 | def platform2country(platform):
29 | '''
30 | return country name given platform
31 | '''
32 |
33 | if pycountry.countries.get(alpha_2=platform) is not None:
34 | try:
35 | return pycountry.countries.get(alpha_2=platform).common_name
36 | except AttributeError:
37 | return pycountry.countries.get(alpha_2=platform).name
38 |
39 |
40 | else:
41 | return np.nan
42 |
43 |
44 | def location2utc_offset(location):
45 | '''
46 | return the utc offset given the location
47 | '''
48 | geolocator = Nominatim(user_agent=str(location))
49 | # print(location)
50 | location = geolocator.geocode(location)
51 |
52 | if location is None:
53 | return np.nan
54 | try:
55 | lat = location.latitude
56 | lon = location.longitude
57 | offset_sec = datetime.datetime.now(pytz.timezone(tzf.timezone_at(lng=lon, lat=lat)))
58 | return offset_sec.utcoffset().total_seconds()/60/60
59 | except:
60 | return np.nan
61 |
62 | def find_longest_repetitive_sequences(sequence):
63 | '''
64 | returns a Counter that maps each element to the length of its longest consecutive run in the list
65 | args:
66 | sequence: list
67 |
68 | '''
69 | counter = Counter()
70 | current_element = None
71 |
72 | # iterate the sequence
73 | for element in sequence:
74 |
75 | if current_element is None:
76 | current_element = element
77 | current_rep = 1
78 | elif element == current_element:
79 | current_rep += 1
80 | elif element != current_element:
81 | # update the element with the longest rep
82 | if counter[current_element] < current_rep:
83 | counter[current_element] = current_rep
84 | current_rep = 1
85 | current_element = element
86 | # update the element with the longest rep outside the loop
87 | if len(sequence) > 0 and counter[current_element] < current_rep:
88 | counter[current_element] = current_rep
89 |
90 | return counter
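# e.g. find_longest_repetitive_sequences(['a', 'a', 'b', 'a']) -> Counter({'a': 2, 'b': 1})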
91 |
92 |
93 |
94 |
95 | def qcut_safe(prices, q):
96 | nbins=min(q, len(prices))
97 | result = pd.qcut(prices, nbins, labels=np.arange(nbins) )
98 |
99 | return result
100 |
101 |
102 |
103 | class GaussRankScaler():
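'''
Rank-Gauss scaling: rank the values, rescale the ranks to (-1, 1), then apply the
inverse error function so that the output is approximately Gaussian distributed.
'''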
104 |
105 | def __init__( self ):
106 | self.epsilon = 1e-9
107 | self.lower = -1 + self.epsilon
108 | self.upper = 1 - self.epsilon
109 | self.range = self.upper - self.lower
110 |
111 | def fit_transform( self, X ):
112 |
113 | i = np.argsort( X, axis = 0 )
114 | j = np.argsort( i, axis = 0 )
115 |
116 | assert ( j.min() == 0 ).all()
117 | assert ( j.max() == len( j ) - 1 ).all()
118 |
119 | j_range = len( j ) - 1
120 | self.divider = j_range / self.range
121 |
122 | transformed = j / self.divider
123 | transformed = transformed - self.upper
124 | transformed = scipy.special.erfinv( transformed )
125 | ############
126 | # transformed = transformed - np.mean(transformed)
127 |
128 | return transformed
129 |
130 | def seed_everything(seed=42):
131 | random.seed(seed)
132 | torch.manual_seed(seed)
133 | torch.cuda.manual_seed_all(seed)
134 | np.random.seed(seed)
135 | os.environ['PYTHONHASHSEED'] = str(seed)
136 |
137 | def compute_rank(inp, to_np=False):
138 | sorted_inp = sorted(inp)
139 | out = [sorted_inp.index(i) for i in inp]
140 | if to_np:
141 | out = np.array(out)
142 | return out
143 |
144 | def set_seed(seed, cuda=False):
145 |
146 | np.random.seed(seed)
147 | random.seed(seed)
148 | if cuda:
149 | torch.cuda.manual_seed(seed)
150 | else:
151 | torch.manual_seed(seed)
152 |
153 |
154 | class CategoricalEncoder():
155 | '''
156 | A lightweight label encoder for large datasets, where sklearn's LabelEncoder may be too slow.
157 | It only supports 1-d arrays/lists; modify it if you need n-d support.
158 | '''
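# Example:
#   enc = CategoricalEncoder()
#   enc.fit(['a', 'b', 'a'])      # f_dict becomes {'a': 0, 'b': 1}
#   enc.transform(['b', 'a'])     # -> [1, 0]
#   enc.reverse_transform([1])    # -> ['b']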
159 | def __init__(self):
160 | self.f_dict = {}
161 | self.r_dict = {}
162 |
163 | def fit(self, array):
164 | '''
165 |
166 | :param array: list or np array
167 | :return: None
168 | '''
169 |
170 | unique_elements = OrderedSet(array)
171 | # unique_elements = sorted(unique_elements)
172 | # print(DUMMY_ITEM in unique_elements)
173 | # print('-1' in unique_elements)
174 | self.n_elements = 0
175 | self.f_dict = {}
176 | self.r_dict = {}
177 |
178 | for e in unique_elements:
179 | self.f_dict[e] = self.n_elements
180 | self.r_dict[self.n_elements] = e
181 | self.n_elements += 1
182 |
183 |
184 | def continue_fit(self, array):
185 | '''
186 | Do not refresh n_elements, count from the latest n_elements.
187 | :param array:
188 | :return: None
189 | '''
190 | unique_elements = set(array)
191 | for e in unique_elements:
192 | if e not in self.f_dict:
193 | self.f_dict[e] = self.n_elements
194 | self.r_dict[self.n_elements] = e
195 | self.n_elements += 1
196 |
197 |
198 | def reverse_transform(self, transformed_array, to_np=False):
199 | '''
200 |
201 | :param transformed_array: list or np array
202 | :return: array: np array with the same shape as input
203 | '''
204 |
205 |
206 | array = [self.r_dict[e] for e in transformed_array]
207 | if to_np:
208 | array = np.array(array)
209 | return array
210 |
211 |
212 | def transform(self, array, to_np=False):
213 | '''
214 |
215 | :param array: array list or np array
216 | :return: list or np array with the same shape as the input
217 | '''
218 | transformed_array = [self.f_dict[e] for e in array]
219 | if to_np:
220 | transformed_array = np.array(transformed_array)
221 | return transformed_array
222 |
223 | def fit_transform(self, array, to_np=False):
224 | '''
225 |
226 | :param array: array list or np array
227 | :return: list or np array with the same shape as the input
228 | '''
229 | self.fit(array)
230 | return self.transform(array, to_np)
231 |
232 | def str2bool(v):
233 | return v.lower() in ('true',)
234 |
235 | def use_optimizer(network, params):
236 | if params['optimizer'] == 'adam':
237 | optimizer = torch.optim.Adam(network.parameters(), lr=params['learning_rate'] , weight_decay=params['weight_decay'], eps=1e-07, amsgrad=True)
238 | elif params['optimizer'] == 'rmsprop':
239 | optimizer = torch.optim.RMSprop(network.parameters(),
240 | lr=params['learning_rate'],)
241 | elif params['optimizer'] == 'sgd':
242 | optimizer = torch.optim.SGD(network.parameters(), lr=params['learning_rate'], weight_decay=params['weight_decay'])
243 | return optimizer
244 |
245 | def get_attn_key_pad_mask(seq_k, seq_q, transformed_dummy_value):
246 | ''' For masking out the padding part of key sequence. '''
247 |
248 | # Expand to fit the shape of key query attention matrix.
249 | len_q = seq_q.size(1)
250 | padding_mask = seq_k.eq(transformed_dummy_value)
251 | padding_mask = padding_mask.unsqueeze(1).expand(-1, len_q, -1) # b x lq x lk
252 |
253 | return padding_mask
254 |
255 | def compute_mean_reciprocal_rank(rs):
256 | '''
257 | rs: 2d array
258 |
259 | >>> rs = [[0, 0, 1], [0, 1, 0], [1, 0, 0]]
260 | >>> compute_mean_reciprocal_rank(rs)
261 | 0.61111111111111105
262 | >>> rs = np.array([[0, 0, 0], [0, 1, 0], [1, 0, 0]])
263 | >>> compute_mean_reciprocal_rank(rs)
264 | 0.5
265 | >>> rs = [[0, 0, 0, 1], [1, 0, 0], [1, 0, 0]]
266 | >>> compute_mean_reciprocal_rank(rs)
267 | 0.75
268 | '''
269 |
270 | rs = (np.asarray(r).nonzero()[0] for r in rs)
271 | return np.mean([1. / (r[0] + 1) if r.size else 0. for r in rs])
272 |
273 | @contextmanager
274 | def timer(name):
275 | t0 = time.time()
276 | yield
277 | print('[{}] done in {:.5f} s'.format(name,(time.time() - t0)))
278 |
279 |
280 |
--------------------------------------------------------------------------------
/src/run_nn.py:
--------------------------------------------------------------------------------
1 | from config import *
2 | from data import *
3 | from utils import *
4 | from constant import *
5 | from nn import *
6 | from torch.autograd import Variable
7 | from tqdm import tqdm
8 | import numpy as np
9 | import os
10 | from datetime import datetime
11 | import pytz
12 |
13 |
14 |
15 |
16 |
17 |
18 | model_name = 'nn_xnn_time_diff_v2'
19 |
20 |
21 | torch.backends.cudnn.deterministic = True
22 | seed_everything(42)
23 |
24 | configuration = NNConfiguration()
25 |
26 |
27 | os.environ["CUDA_VISIBLE_DEVICES"] = str(configuration.device_id)
28 | print("CUDA_VISIBLE_DEVICES: ", os.environ["CUDA_VISIBLE_DEVICES"])
29 |
30 | if configuration.sub_sample:
31 | model_name += '_140k'
32 | else:
33 | model_name += '_all'
34 |
35 | if configuration.use_test:
36 | model_name += '_ut'
37 |
38 | if configuration.debug:
39 | model_name += '_db'
40 |
41 | model_name += f'_{configuration.device_id}'
42 |
43 |
44 | weight_path = f"../weights/{model_name}.model"
45 |
46 |
47 |
48 |
49 | print(configuration.get_attributes())
50 |
51 |
52 | data_gen = NNDataGenerator(configuration)
53 |
54 |
55 |
56 | print(configuration.get_attributes())
57 |
58 |
59 |
60 | valid_data = data_gen.val_data
61 | train_data= data_gen.train_data
62 |
63 |
64 |
65 | if configuration.use_cuda:
66 | net = Net(configuration).cuda()
67 | else:
68 | net = Net(configuration)
69 |
70 | optim = use_optimizer(net, configuration)
71 | scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optim, 'min',min_lr=0.0005, factor=0.7, verbose=True)
72 | print(net)
73 |
74 | def get_prediction(loader, net):
75 | net.eval()
76 | all_scores = []
77 | validation_loss = []
78 | for batch_id, data in enumerate(loader):
79 | with torch.no_grad():
80 | item_ids = Variable(data[0]).to(device=device_type)
81 | targets = Variable(data[1]).to(device=device_type)
82 | past_interactions = Variable(data[2]).to(device=device_type)
83 |
84 | past_interaction_masks = (data[3])
85 |
86 | price_rank = Variable(data[4]).to(device=device_type)
87 | city = Variable(data[5]).to(device=device_type)
88 | last_item = Variable(data[6]).to(device=device_type)
89 | impression_index = Variable(data[7]).to(device=device_type)
90 | continuous_features = Variable(data[8]).to(device=device_type)
91 |
92 | star = Variable(data[9]).to(device=device_type)
93 |
94 | past_interactions_sess = Variable(data[10]).to(device=device_type)
95 | past_actions_sess = Variable(data[11]).to(device=device_type)
96 |
97 |
98 | last_click_item = Variable(data[12]).to(device=device_type)
99 | last_click_impression = Variable(data[13]).to(device=device_type)
100 | last_interact_index = Variable(data[14]).to(device=device_type)
101 | neighbor_prices = Variable(data[15]).to(device=device_type)
102 | other_item_ids = Variable(data[16]).to(device=device_type)
103 | city_platform = Variable(data[17]).to(device=device_type)
104 |
105 | prediction = net(item_ids, past_interactions, past_interaction_masks, price_rank, city, last_item, impression_index, continuous_features, star, past_interactions_sess, past_actions_sess, last_click_item, last_click_impression, last_interact_index, neighbor_prices, other_item_ids, city_platform)
106 | loss = crit(prediction,targets).item()
107 | prediction = prediction.detach().cpu().numpy().tolist()
108 | all_scores += prediction
109 | validation_loss.append(loss)
110 | validation_loss = np.mean(validation_loss)
111 | return all_scores, validation_loss
112 |
113 | def evaluate_valid(val_loader, val_df, net ):
114 |
115 |
116 | val_df['score'], val_loss = get_prediction(val_loader, net)
117 |
118 |
119 | grouped_val = val_df.groupby('session_id')
120 | rss = []
121 | rss_group = {i:[] for i in range(1,26)}
122 | incorrect_session = {}
123 | for session_id, group in grouped_val:
124 |
125 | scores = group['score']
126 | sorted_arg = np.flip(np.argsort(scores))
127 |
128 | if group['label'].values[sorted_arg][0] != 1:
129 | incorrect_session[session_id] = (sorted_arg.values, group['label'].values[sorted_arg])
130 |
131 | rss.append( group['label'].values[sorted_arg])
132 | rss_group[len(group)].append(group['label'].values[sorted_arg])
133 |
134 | mrr = compute_mean_reciprocal_rank(rss)
135 | mrr_group = {i:(len(rss_group[i]), compute_mean_reciprocal_rank(rss_group[i])) for i in range(1,26)}
136 | # print(mrr_group)
137 | pickle.dump( incorrect_session, open(f'../output/{model_name}_val_incorrect_order.p','wb'))
138 |
139 | return mrr, mrr_group, val_loss
140 |
141 |
142 |
143 | device_type = 'cuda' if configuration.use_cuda else 'cpu'
144 |
145 |
146 |
147 | crit = configuration.loss()
148 |
149 |
150 | best_mrr = 0
151 | early_stopping = configuration.early_stopping
152 | not_improve_round = 0
153 | val_loader = data_gen.evaluate_data_valid()
154 | test_loader =data_gen.instance_a_test_loader()
155 | train_loader = data_gen.instance_a_train_loader()
156 | n_iter = 0
157 | stopped = False
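# Train for up to num_epochs epochs. After each epoch, compute the validation MRR,
# checkpoint the weights whenever it improves, and stop early once it has failed to
# improve for `early_stopping` consecutive epochs.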
158 | for i in range(configuration.num_epochs):
159 |
160 |
161 | net.train()
162 | for batch_id, data in enumerate(tqdm(train_loader)):
163 | optim.zero_grad()
164 | n_iter += 1
165 |
166 | item_ids = Variable(data[0]).to(device=device_type)
167 | targets = Variable(data[1]).to(device=device_type)
168 | past_interactions = Variable(data[2]).to(device=device_type)
169 |
170 | past_interaction_masks = (data[3])
171 |
172 | price_rank = Variable(data[4]).to(device=device_type)
173 | city = Variable(data[5]).to(device=device_type)
174 | last_item = Variable(data[6]).to(device=device_type)
175 | impression_index = Variable(data[7]).to(device=device_type)
176 | continuous_features = Variable(data[8]).to(device=device_type)
177 | star = Variable(data[9]).to(device=device_type)
178 |
179 | past_interactions_sess = Variable(data[10]).to(device=device_type)
180 | past_actions_sess = Variable(data[11]).to(device=device_type)
181 |
182 | # other_item_impressions = Variable(data[13]).to(device=device_type)
183 | last_click_item = Variable(data[12]).to(device=device_type)
184 | last_click_impression = Variable(data[13]).to(device=device_type)
185 | last_interact_index = Variable(data[14]).to(device=device_type)
186 | neighbor_prices = Variable(data[15]).to(device=device_type)
187 | other_item_ids = Variable(data[16]).to(device=device_type)
188 | city_platform = Variable(data[17]).to(device=device_type)
189 | prediction = net(item_ids, past_interactions, past_interaction_masks, price_rank, city, last_item, impression_index, continuous_features, star, past_interactions_sess, past_actions_sess, last_click_item, last_click_impression, last_interact_index, neighbor_prices, other_item_ids, city_platform)
190 |
191 | loss = crit(prediction,targets)
192 | loss.backward()
193 | optim.step()
194 |
195 | mrr, mrr_group, val_loss = evaluate_valid(val_loader, valid_data, net)
196 | if mrr > best_mrr:
197 | print(f"improve from {best_mrr} to {mrr}")
198 | best_mrr = mrr
199 | not_improve_round = 0
200 | torch.save(net.state_dict(), weight_path)
201 | else:
202 | print(f"didn't improve from {best_mrr} to {mrr}")
203 | not_improve_round += 1
204 | if not_improve_round >= early_stopping:
205 | break
206 |
207 |
208 | net.load_state_dict(torch.load(weight_path))
209 |
210 |
211 | print("BEST mrr", best_mrr)
212 |
213 |
214 |
215 | if configuration.debug:
216 | exit(0)
217 |
218 |
219 |
220 |
221 | test_df = data_gen.test_data
222 | test_df['score'], _ = get_prediction(test_loader, net)
223 |
224 |
225 |
226 | with open(f'../output/{model_name}_test_score.p', 'wb') as f:
227 | pickle.dump( test_df.loc[:,['score', 'session_id', 'step']],f, protocol=4)
228 |
229 | grouped_test = test_df.groupby('session_id')
230 | predictions = []
231 | session_ids = []
232 | for session_id, group in grouped_test:
233 |
234 | scores = group['score']
235 | sorted_arg = np.flip(np.argsort(scores))
236 | sorted_item_ids = group['item_id'].values[sorted_arg]
237 | sorted_item_ids = data_gen.cat_encoders['item_id'].reverse_transform(sorted_item_ids)
238 | sorted_item_string = ' '.join([str(i) for i in sorted_item_ids])
239 | predictions.append(sorted_item_string)
240 | session_ids.append(session_id)
241 |
242 | prediction_df = pd.DataFrame()
243 | prediction_df['session_id'] = session_ids
244 | prediction_df['item_recommendations'] = predictions
245 |
246 | print("pred df shape", prediction_df.shape)
247 | sub_df = pd.read_csv('../input/submission_popular.csv')
248 | sub_df.drop('item_recommendations', axis=1, inplace=True)
249 | sub_df = sub_df.merge(prediction_df, on="session_id")
250 | # sub_df['item_recommendations'] = predictions
251 |
252 | sub_df.to_csv(f'../output/{model_name}.csv', index=None)
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright 2019 Rosetta.ai
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/src/data.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | import pandas as pd
4 | import pickle
5 | import gc
6 | from constant import *
7 | from utils import *
8 | from config import *
9 | import torch
10 | from torch.utils.data import DataLoader, Dataset
11 | # from sklearn.preprocessing import LabelEncoder
12 | from joblib import Parallel, delayed
13 | from tqdm import tqdm
14 | from collections import defaultdict
15 | from ordered_set import OrderedSet
16 | from sklearn.decomposition import TruncatedSVD
17 | from sklearn.preprocessing import MinMaxScaler, StandardScaler
18 | from sklearn.feature_extraction.text import TfidfVectorizer
19 |
20 |
21 | class NNDataLoader():
22 | def __init__(self, data, config, shuffle=True, batch_size=128, continuous_features=None):
23 | self.item_id = torch.LongTensor(data.item_id.values)
24 | self.config = config
25 | self.label = torch.FloatTensor(data.label.values)
26 | self.past_interactions = torch.LongTensor(np.vstack(data.past_interactions.values))
27 | self.batch_size = batch_size
28 | self.shuffle = shuffle
29 | self.indices = np.arange(len(self.item_id))
30 | self.past_interaction_masks = self.past_interactions != self.config.transformed_dummy_item
31 | self.price_rank = torch.LongTensor(data.price_rank.values)
32 | self.city = torch.LongTensor(data.city.values)
33 | self.last_item = torch.LongTensor(data.last_item.values)
34 | self.impression_index = torch.LongTensor(data.impression_index)
35 |
36 | self.continuous_features = torch.FloatTensor(data.loc[:,continuous_features].values)
37 |
38 | self.neighbor_prices = torch.FloatTensor(np.vstack(data.neighbor_prices))
39 |
40 |
41 |
42 | self.star = torch.LongTensor(data.star)
43 |
44 | self.past_interactions_sess = torch.LongTensor(np.vstack(data.past_interactions_sess.values))
45 | self.past_actions_sess = torch.LongTensor(np.vstack(data.past_actions_sess.values))
46 | self.last_click_item = torch.LongTensor(data.last_click_item.values)
47 | self.last_click_impression = torch.LongTensor(data.last_click_impression.values)
48 | self.last_interact_index = torch.LongTensor(data.last_interact_index.values)
49 | self.other_item_ids = torch.LongTensor(np.vstack(data.other_item_ids.values))
50 | self.city_platform = torch.LongTensor(data.city_platform.values)
51 |
52 |
53 | assert len(self.item_id) == len(self.past_interactions)
54 | assert len(self.past_interactions) == len(self.label)
55 | def __len__(self):
56 | return len(self.item_id) // self.batch_size
57 |
58 | def __iter__(self):
59 | self.batch_id = 0
60 | if self.shuffle:
61 | np.random.shuffle(self.indices)
62 | return self
63 |
64 | def __next__(self):
65 | if self.batch_id * self.batch_size <= len(self.indices):
66 | current_indices = self.indices[self.batch_id * self.batch_size: (self.batch_id + 1) * self.batch_size]
67 | result = [self.item_id[current_indices], self.label[current_indices], self.past_interactions[current_indices]\
68 | , self.past_interaction_masks[current_indices], self.price_rank[current_indices], self.city[current_indices]\
69 | , self.last_item[current_indices], self.impression_index[current_indices], self.continuous_features[current_indices]\
70 | , self.star[current_indices], self.past_interactions_sess[current_indices], self.past_actions_sess[current_indices]\
71 | , self.last_click_item[current_indices], self.last_click_impression[current_indices], self.last_interact_index[current_indices]\
72 | , self.neighbor_prices[current_indices], self.other_item_ids[current_indices], self.city_platform[current_indices]]
73 | self.batch_id += 1
74 | return result
75 | else:
76 | raise StopIteration
77 |
78 |
79 |
80 | class NNDataGenerator():
81 | """Construct dataset for NN"""
82 | def __init__(self, config):
83 | """
84 | args:
85 | config: a Configuration object. The target action (the action predicted at the next timestep) is fixed to 'clickout item' below, and the categorical columns to encode are listed in all_cat_columns.
87 | """
88 |
89 | self.config = config
90 |
91 | self.target_action = self.config.target_action = 'clickout item'
92 | # self.config.keep_columns = self.keep_columns = ['session_id', 'user_id','item_id', 'impressions','prices', 'city', 'step', 'last_item']
93 | self.config.all_cat_columns = self.all_cat_columns = ['user_id', 'item_id', 'city','action', 'city_platform']
94 |
95 | with open( f'{input_dir}/train_v2.p', 'rb') as f:
96 | train = pickle.load(f)
97 | train['id']= np.arange(len(train))
98 |
99 | with open(f'{input_dir}/test_v2.p', 'rb') as f:
100 | test = pickle.load(f)
101 | test['id'] = np.arange( len(train), len(train)+ len(test))
102 |
103 | with open('../input/item_metadata.p', 'rb') as f:
104 | item_meta = pickle.load(f)
105 | item_meta['properties'] = item_meta.properties.apply(lambda x: x.split('|'))
106 | item_meta['item_id'] = item_meta['item_id'].apply(str)
107 |
108 |
109 |
110 | if config.sub_sample:
111 | with open('../input/selected_users_140k.p', 'rb') as f:
112 | selected_users = pickle.load(f)
113 |
114 | train = train.loc[train.user_id.isin(selected_users),:]
115 |
116 | if config.debug:
117 | train = train.sample(1000)
118 | test = test.sample(1000)
119 |
120 | train.rename(columns={'reference': 'item_id', 'action_type':'action'}, inplace=True)
121 | test.rename(columns={'reference': 'item_id', 'action_type':'action'}, inplace=True)
122 |
123 |
124 |
125 |
126 | # for 'change of sort order' and 'filter selection' events, fold the reference value into the action name (their item_id is set to DUMMY_ITEM further below)
127 | train.loc[train.action=='change of sort order','action'] = train.loc[train.action=='change of sort order'].apply(lambda row: row.action + str(row.item_id), axis=1)
128 | test.loc[test.action=='change of sort order','action'] = test.loc[test.action=='change of sort order'].apply(lambda row: row.action + str(row.item_id), axis=1)
129 |
130 |
131 | train.loc[train.action=='filter selection','action'] = train.loc[train.action=='filter selection'].apply(lambda row: row.action + str(row.item_id), axis=1)
132 | test.loc[test.action=='filter selection','action'] = test.loc[test.action=='filter selection'].apply(lambda row: row.action + str(row.item_id), axis=1)
133 |
134 |
135 |
136 |
137 |
138 |
139 | train.loc[train.action.str.contains('change of sort order'), 'item_id'] = DUMMY_ITEM
140 | test.loc[test.action.str.contains('change of sort order'), 'item_id'] = DUMMY_ITEM
141 |
142 | train.loc[train.action.str.contains('search for poi'), 'item_id'] = DUMMY_ITEM
143 | test.loc[test.action.str.contains('search for poi'), 'item_id'] = DUMMY_ITEM
144 |
145 | train.loc[train.action.str.contains('filter selection'), 'item_id'] = DUMMY_ITEM
146 | test.loc[test.action.str.contains('filter selection'), 'item_id'] = DUMMY_ITEM
147 |
148 | train.loc[train.action.str.contains('search for destination'), 'item_id'] = DUMMY_ITEM
149 | test.loc[test.action.str.contains('search for destination'), 'item_id'] = DUMMY_ITEM
150 |
151 |
152 |
153 |
154 |
155 | # filter out rows where the reference is not present in the impressions list
156 | train['in_impressions'] = True
157 | train.loc[~train.impressions.isna(), 'in_impressions'] = train.loc[~train.impressions.isna()].apply(lambda row:row.item_id in row.impressions.split('|'), axis=1)
158 | train = train.loc[train.in_impressions].drop('in_impressions', axis=1).reset_index(drop=True)
159 |
160 | test['in_impressions'] = True
161 | test.loc[(~test.impressions.isna()) & (~test.item_id.isna()), 'in_impressions'] = test.loc[(~test.impressions.isna())& (~test.item_id.isna())].apply(lambda row:row.item_id in row.impressions.split('|'), axis=1)
162 | test = test.loc[test.in_impressions].drop('in_impressions', axis=1).reset_index(drop=True)
163 |
164 |
165 | train['item_id'] = train['item_id'].apply(str)
166 | train.loc[~train.impressions.isna(),'impressions'] = train.loc[~train.impressions.isna()].impressions.apply(lambda x: x.split('|'))
167 | train.loc[~train.prices.isna(), 'prices'] = train.loc[~train.prices.isna()].prices.apply(lambda x: x.split('|')).apply(lambda x: [int(p) for p in x])
168 |
169 |
170 |
171 | test['item_id'] = test['item_id'].apply(str)
172 | test.loc[~test.impressions.isna(),'impressions'] = test.loc[~test.impressions.isna()].impressions.apply(lambda x: x.split('|'))
173 | test.loc[~test.prices.isna(),'prices'] = test.loc[~test.prices.isna()].prices.apply(lambda x: x.split('|')).apply(lambda x: [int(p) for p in x])
174 |
175 |
176 |
177 | data = pd.concat([train, test], axis=0)
178 | data = data.reset_index(drop=True)
179 | all_items = []
180 |
181 | for imp in data.loc[~data.impressions.isna()].impressions.tolist() + [data.item_id.apply(str).tolist()]:
182 | all_items += imp
183 |
184 | unique_items = OrderedSet(all_items)
185 | unique_actions = OrderedSet(data.action.values)
186 |
187 | train_session_interactions = dict(train.groupby('session_id')['item_id'].apply(list))
188 | test_session_interactions = dict(test.groupby('session_id')['item_id'].apply(list))
189 |
190 |
191 | train_session_actions = dict(train.groupby('session_id')['action'].apply(list))
192 | test_session_actions = dict(test.groupby('session_id')['action'].apply(list))
193 |
194 |
195 | train['sess_step'] = train.groupby('session_id')['timestamp'].rank(method='max').apply(int)
196 | test['sess_step'] = test.groupby('session_id')['timestamp'].rank(method='max').apply(int)
197 |
198 |
199 | train['city_platform'] = train.apply(lambda x: x['city'] + x['platform'], axis=1)
200 | test['city_platform'] = test.apply(lambda x: x['city'] + x['platform'], axis=1)
201 | # get last item
202 | train['last_item'] = np.nan
203 | test['last_item'] = np.nan
204 |
205 | train_shifted_item_id = [DUMMY_ITEM] + train.item_id.values[:-1].tolist()
206 | test_shifted_item_id = [DUMMY_ITEM] + test.item_id.values[:-1].tolist()
207 | train['last_item'] = train_shifted_item_id
208 | test['last_item'] = test_shifted_item_id
209 |
210 |
211 | train_shifted_item_id = [DUMMY_ITEM] *2 + train.item_id.values[:-2].tolist()
212 | test_shifted_item_id = [DUMMY_ITEM] *2 + test.item_id.values[:-2].tolist()
213 |
214 | train['second_last_item'] = train_shifted_item_id
215 | test['second_last_item'] = test_shifted_item_id
216 |
217 |
218 | train['step_rank'] = train.groupby('session_id')['timestamp'].rank(method='max', ascending=True)
219 | test['step_rank'] = test.groupby('session_id')['timestamp'].rank(method='max', ascending=True)
220 |
221 |
222 | train.loc[(train.step_rank == 1) , 'last_item'] = DUMMY_ITEM
223 | test.loc[(test.step_rank == 1) , 'last_item'] = DUMMY_ITEM
224 |
225 |
226 | train.loc[(train.step_rank == 2) , 'second_last_item'] = DUMMY_ITEM
227 | test.loc[(test.step_rank == 2) , 'second_last_item'] = DUMMY_ITEM
228 |
229 |
230 |
231 | data = pd.concat([train, test], axis=0)
232 | data = data.reset_index(drop=True)
233 |
234 | data_feature = data.loc[:,['id','session_id','timestamp', 'step']].copy()
235 | data_feature['time_diff'] = data_feature.groupby('session_id')['timestamp'].diff()
236 | data_feature['time_diff_diff'] = data_feature.groupby('session_id')['time_diff'].diff()
237 | data_feature['time_diff'] = GaussRankScaler().fit_transform(data_feature['time_diff'].values)
238 | data_feature['time_diff_diff'] = GaussRankScaler().fit_transform(data_feature['time_diff_diff'].values)
239 | data_feature['mm_step'] = GaussRankScaler().fit_transform(data_feature['step'].values)
240 | data_feature['day'] = MinMaxScaler().fit_transform(pd.to_datetime(data.timestamp, unit='s').dt.day.values.reshape(-1,1) )
241 | data_feature['rg_timestamp'] = GaussRankScaler().fit_transform(data_feature['timestamp'].values)
242 |
243 |
244 | data_feature = data_feature.drop( ['session_id','timestamp','step'],axis=1)
245 |
246 |
247 | # get time diff
248 | train = train.merge(data_feature, on='id', how='left')
249 | test = test.merge(data_feature, on='id', how='left')
250 |
251 | train_session_time_diff = dict(train.groupby('session_id')['time_diff'].apply(list))
252 | test_session_time_diff = dict(test.groupby('session_id')['time_diff'].apply(list))
253 |
254 | self.cat_encoders = {}
255 |
256 | for col in self.all_cat_columns:
257 | self.cat_encoders[col] = CategoricalEncoder()
258 |
259 | self.cat_encoders['item_id'].fit(list(unique_items) + [DUMMY_ITEM] )
260 | self.cat_encoders['city'].fit(data.city.values)
261 | self.cat_encoders['city_platform'].fit(data.city_platform.values)
262 | self.cat_encoders['action'].fit( list(unique_actions) + [DUMMY_ACTION])
263 | self.cat_encoders['user_id'].fit(data.user_id.values)
264 | # with open('../input/user_encoder.p','rb') as f:
265 | # self.cat_encoders['user_id'] = pickle.load(f)
266 | # self.cat_encoders['user_id'].fit(data.user_id.tolist() )
267 |
268 |
269 | for col in self.all_cat_columns:
270 |
271 | train[col] = self.cat_encoders[col].transform(train[col].values)
272 | test[col] = self.cat_encoders[col].transform(test[col].values)
273 | self.config.num_embeddings[col] = self.cat_encoders[col].n_elements
274 |
275 |
276 | #this is an integer
277 | self.config.transformed_clickout_action = self.transformed_clickout_action = self.cat_encoders['action'].transform(['clickout item'])[0]
278 | self.config.transformed_dummy_action = self.transformed_dummy_action = self.cat_encoders['action'].transform([DUMMY_ACTION])[0]
279 | self.transformed_interaction_image = self.cat_encoders['action'].transform(['interaction item image'])[0]
280 | self.transformed_interaction_deals = self.cat_encoders['action'].transform(['interaction item deals'])[0]
281 | self.transformed_interaction_info = self.cat_encoders['action'].transform(['interaction item info'])[0]
282 | self.transformed_interaction_rating = self.cat_encoders['action'].transform(['interaction item rating'])[0]
283 |
284 | self.config.transformed_dummy_item = self.transformed_dummy_item = self.cat_encoders['item_id'].transform([DUMMY_ITEM])[0]
285 | self.config.transformed_nan_item = self.transformed_nan_item = self.cat_encoders['item_id'].transform(['nan'])[0]
286 |
287 |
288 | # transform last item
289 | train['last_item'] = self.cat_encoders['item_id'].transform(train['last_item'].values)
290 | test['last_item'] = self.cat_encoders['item_id'].transform(test['last_item'].values)
291 |
292 | train['second_last_item'] = self.cat_encoders['item_id'].transform(train.second_last_item.values)
293 | test['second_last_item'] = self.cat_encoders['item_id'].transform(test.second_last_item.values)
294 |
295 | # transform session interactions and pad dummy in front of all of them
296 | for session_id, item_list in train_session_interactions.items():
297 | train_session_interactions[session_id] = [self.transformed_dummy_item] * self.config.sess_length + self.cat_encoders['item_id'].transform(item_list)
298 |
299 | for session_id, item_list in test_session_interactions.items():
300 | test_session_interactions[session_id] = [self.transformed_dummy_item] * self.config.sess_length + self.cat_encoders['item_id'].transform(item_list)
301 |
302 | for session_id, action_list in train_session_actions.items():
303 | train_session_actions[session_id] = [self.transformed_dummy_action] * self.config.sess_length + self.cat_encoders['action'].transform(action_list)
304 |
305 | for session_id, action_list in test_session_actions.items():
306 | test_session_actions[session_id] = [self.transformed_dummy_action] * self.config.sess_length + self.cat_encoders['action'].transform(action_list)
307 |
308 |
309 | implicit_train = train.loc[train.action != self.transformed_clickout_action, :]
310 | implicit_test = test.loc[test.action != self.transformed_clickout_action, :]
311 |
312 |
313 |
314 |
315 |
316 |         # get interaction count for all items
317 | interaction_item_ids = implicit_train.drop_duplicates(subset=['session_id','item_id','action']).item_id.tolist() + implicit_test.drop_duplicates(subset=['session_id','item_id','action']).item_id.tolist()
318 | unique_interaction_items, counts = np.unique(interaction_item_ids, return_counts=True)
319 | self.interaction_count_dict = dict(zip(unique_interaction_items, counts))
320 |
321 |         # get interaction image count for all items
322 | interaction_image_item_ids = train.loc[train.action == self.transformed_interaction_image, :].drop_duplicates(subset=['session_id','item_id','action']).item_id.tolist() + test.loc[test.action == self.transformed_interaction_image, :].drop_duplicates(subset=['session_id','item_id','action']).item_id.tolist()
323 | unique_interaction_image_items, counts = np.unique(interaction_image_item_ids, return_counts=True)
324 | self.image_count_dict = dict(zip(unique_interaction_image_items, counts))
325 |
326 |
327 |         # keep only the clickout rows
328 | train = train.loc[train.action ==self.transformed_clickout_action,:]
329 | test = test.loc[test.action == self.transformed_clickout_action,:]
330 |
331 |
332 | train['step_rank'] = train.groupby('session_id')['step'].rank(method='max', ascending=False)
333 |
334 |
335 |
336 |         # compute global item popularity from all impression lists
337 | # prices = np.hstack([np.hstack(train['prices'].values), np.hstack(test.prices.values)])
338 | item_ids = np.hstack([np.hstack(train['impressions'].values), np.hstack(test.impressions.values)])
339 |
340 | unique_items, counts = np.unique(item_ids, return_counts=True)
341 | self.item_popularity_dict = dict(zip(unique_items, counts))
342 |
343 | clickout_item_ids = train.drop_duplicates(subset=['session_id','item_id','action']).item_id.tolist() + test.drop_duplicates(subset=['session_id','item_id','action']).item_id.tolist()
344 | unique_clickout_items, counts = np.unique(clickout_item_ids, return_counts=True)
345 |
346 | self.clickout_count_dict = dict(zip(unique_clickout_items, counts))
347 |
348 | self.platform_clickout_count = pd.concat([train, test], axis=0).groupby(['platform','item_id']).size()
349 |
350 |
351 |
352 |
353 |
354 | if config.debug:
355 | val = train.loc[train.step_rank == 1,:].iloc[:5]
356 | else:
357 | val = train.loc[train.step_rank == 1,:].iloc[:50000]
358 |
359 | val_index = val.index
360 | train = train.loc[~train.index.isin(val_index),:]
361 |
362 |
363 |
364 |         # maps each user_id to that user's past clicked item ids, e.g. {'user_id': [11, 2, 5, 9]}
365 | self.past_interaction_dict = {}
366 | self.past_interaction_dict_sess = {}
367 | self.last_click_sess_dict = {}
368 | self.last_impressions_dict = {}
369 | self.sess_impressions_dict = {}
370 | self.sess_last_step_dict = {}
371 | self.sess_last_imp_idx_dict = {}
372 | self.sess_last_price_dict = {}
373 | self.sess_time_diff_dict = {}
374 |
375 |
376 | # split the interaction df into train/ val and construct training sequences
377 | self.train_data = self.build_user_item_interactions(train, train_session_interactions, train_session_actions, train_session_time_diff)
378 | self.val_data = self.build_user_item_interactions(val, train_session_interactions, train_session_actions, train_session_time_diff)
379 | self.test_data, labeled_test = self.build_user_item_interactions(test, test_session_interactions, test_session_actions, test_session_time_diff, training=False)
380 |
381 |         # scale the price-derived features
382 |
383 | price_sc = StandardScaler()
384 |
385 |
386 | self.train_data['price_diff'] = price_sc.fit_transform(self.train_data.price_diff.values.reshape(-1,1))
387 | self.val_data['price_diff'] = price_sc.transform(self.val_data.price_diff.values.reshape(-1,1))
388 | self.test_data['price_diff'] = price_sc.transform(self.test_data.price_diff.values.reshape(-1,1))
389 |
390 |
391 |
392 | price_mm = MinMaxScaler()
393 | self.train_data['price_ratio'] = price_mm.fit_transform(self.train_data.price_ratio.values.reshape(-1,1))
394 | self.val_data['price_ratio'] = price_mm.transform(self.val_data.price_ratio.values.reshape(-1,1))
395 | self.test_data['price_ratio'] = price_mm.transform(self.test_data.price_ratio.values.reshape(-1,1))
396 |
397 |
398 |
399 |
400 |
401 | price_mm.fit(np.hstack([np.hstack(self.train_data.neighbor_prices.values), np.hstack(self.val_data.neighbor_prices.values),\
402 | np.hstack(self.test_data.neighbor_prices.values)]).reshape(-1,1) )
403 | # print(self.train_data['neighbor_prices'].head(5))
404 | self.train_data['neighbor_prices'] = self.train_data['neighbor_prices'].apply(lambda x: price_mm.transform(np.array(x).reshape(-1,1)).reshape(-1))
405 | self.val_data['neighbor_prices'] = self.val_data['neighbor_prices'].apply(lambda x: price_mm.transform(np.array(x).reshape(-1,1)).reshape(-1))
406 | self.test_data['neighbor_prices'] = self.test_data['neighbor_prices'].apply(lambda x: price_mm.transform(np.array(x).reshape(-1,1)).reshape(-1))
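        # (note) the MinMaxScaler is fit once on neighbour prices pooled from train/val/test and then
        # applied element-wise to each row's window, so every 5-price window shares the same [0, 1] scale.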
407 |
408 |
409 |
410 | if config.use_test:
411 | self.train_data = pd.concat([self.train_data, labeled_test], axis=0)
412 |
413 | sampled_test_session = self.test_data.session_id.sample(frac=0.3)
414 |
415 | # self.train_data = pd.concat([self.train_data, self.test_data.loc[self.test_data.session_id.isin(sampled_test_session)]], axis=0)
416 | # item_meta multi-hot
417 | item_meta = item_meta.loc[item_meta.item_id.isin(unique_items),:]
418 | item_meta['item_id'] = self.cat_encoders['item_id'].transform(item_meta['item_id'].values)
419 | item_meta['star'] = 0
420 | item_meta.loc[item_meta.properties.apply(lambda x: '1 Star' in x), 'star'] = 1
421 | item_meta.loc[item_meta.properties.apply(lambda x: '2 Star' in x), 'star'] = 2
422 | item_meta.loc[item_meta.properties.apply(lambda x: '3 Star' in x), 'star'] = 3
423 | item_meta.loc[item_meta.properties.apply(lambda x: '4 Star' in x), 'star'] = 4
424 | item_meta.loc[item_meta.properties.apply(lambda x: '5 Star' in x), 'star'] = 5
425 |
426 | unique_property = list(OrderedSet(np.hstack(item_meta.properties.tolist())))
427 | self.unique_property = unique_property
428 |
429 | self.cat_encoders['item_property'] = CategoricalEncoder()
430 | self.cat_encoders['item_property'].fit(unique_property)
431 | item_properties_array = []
432 | for row in item_meta.itertuples():
433 | current_row = np.zeros(len(unique_property) + 2)
434 | one_indices = self.cat_encoders['item_property'].transform(row.properties)
435 | current_row[one_indices] = 1
436 | current_row[-1] = row.item_id
437 | current_row[-2] = row.star
438 | item_properties_array.append(current_row)
439 |
440 | item_properties_array = np.vstack(item_properties_array)
441 | item_properties_df = pd.DataFrame(item_properties_array, columns=unique_property + ['star', 'item_id'])
442 |
443 |
444 | item_properties_item_id = item_properties_df.item_id.values
445 | item_properties_star = item_properties_df.star.values
446 |
447 | tsvd = TruncatedSVD(n_components=30, n_iter=10, random_state=None)
448 | svd_matrix = tsvd.fit_transform(item_properties_df.drop( ['star', 'item_id'],axis=1).values)
449 | print("explained ratio", tsvd.explained_variance_ratio_.sum())
450 | svd_ip_columns = [ f'svd_ip_{i}' for i in np.arange(30)]
451 | item_properties_df = pd.DataFrame(svd_matrix, columns=svd_ip_columns)
452 | item_properties_df['item_id'] = item_properties_item_id
453 | item_properties_df['star'] = item_properties_star
454 | item_properties_df['pet_friendly'] = item_meta.properties.apply(lambda x: 'Pet Friendly' in x)
455 | item_properties_df['parking'] = item_meta.properties.apply(lambda x: 'Car Park' in x)
456 | item_properties_df = item_properties_df.astype(dtype= {"item_id":"int32","pet_friendly":"float32", "parking":"float32"})
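        # (note) TruncatedSVD compresses the sparse multi-hot property matrix into 30 dense
        # svd_ip_* columns; star, pet_friendly and parking are kept as explicit features on top
        # of the SVD components.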
457 |
458 |
459 | filter_df = data.loc[ ~data.current_filters.isna(), ['id', 'current_filters']]
460 | filter_df['current_filters'] = filter_df.current_filters.apply(lambda x:x.split('|'))
461 | filter_set = list(OrderedSet(np.hstack(filter_df['current_filters'].to_list())))
462 |
463 | self.cat_encoders['filters'] = CategoricalEncoder()
464 | self.cat_encoders['filters'].fit(filter_set)
465 | all_filter_array = []
466 |
467 | for row in filter_df.itertuples():
468 | current_row = np.zeros(len(filter_set) + 1, dtype=object)
469 | current_filters = row.current_filters
470 | one_indices = self.cat_encoders['filters'].transform(row.current_filters)
471 | current_row[one_indices] = 1
472 | current_row[-1] = row.id
473 | all_filter_array.append(current_row)
474 |
475 |
476 | all_filter_array = np.vstack(all_filter_array)
477 | filters_df = pd.DataFrame(all_filter_array, columns= [f'ft_{f}' for f in filter_set] + ['id'])
478 | dtype_dict = {"id":"int32"}
479 | for f in filter_set:
480 | dtype_dict[f'ft_{f}'] = "int32"
481 | filters_df = filters_df.astype(dtype= dtype_dict)
482 |
483 | filters_id = filters_df.id.values
484 |
485 |
486 | tsvd = TruncatedSVD(n_components=10, n_iter=10, random_state=None)
487 | svd_matrix = tsvd.fit_transform(filters_df.drop( ['id'],axis=1).values)
488 | print("explained ratio", tsvd.explained_variance_ratio_.sum())
489 | svd_ft_columns = [ f'svd_ft_{i}' for i in np.arange(10)]
490 | filters_df = pd.DataFrame(svd_matrix, columns=svd_ft_columns)
491 | for c in svd_ft_columns:
492 | filters_df[c] = MinMaxScaler().fit_transform(filters_df[c].values.reshape(-1,1))
493 | filters_df['id'] = filters_id
494 |
495 | del train, test, data
496 | gc.collect()
497 |
498 | self.train_data = self.train_data.merge(item_properties_df, on="item_id", how="left")
499 | self.val_data = self.val_data.merge(item_properties_df, on="item_id", how="left")
500 | self.test_data = self.test_data.merge(item_properties_df, on="item_id", how="left")
501 |
502 | self.train_data = self.train_data.merge(filters_df, on=['id'], how="left")
503 | self.val_data = self.val_data.merge(filters_df, on=['id'], how="left")
504 | self.test_data = self.test_data.merge(filters_df, on=['id'], how="left")
505 |
506 | self.train_data = self.train_data.merge(data_feature, on=['id'], how="left")
507 | self.val_data = self.val_data.merge(data_feature, on=['id'], how="left")
508 | self.test_data = self.test_data.merge(data_feature, on=['id'], how="left")
509 |
510 | self.train_data['interaction_image_count'] = self.train_data.item_id.map(self.image_count_dict)
511 | self.val_data['interaction_image_count'] = self.val_data.item_id.map(self.image_count_dict)
512 | self.test_data['interaction_image_count'] = self.test_data.item_id.map(self.image_count_dict)
513 |
514 | train_other_is_interacted = np.vstack(self.train_data.other_is_interacted.values).astype(np.float32)
515 | val_other_is_interacted = np.vstack(self.val_data.other_is_interacted.values).astype(np.float32)
516 | test_other_is_interacted = np.vstack(self.test_data.other_is_interacted.values).astype(np.float32)
517 |
518 | is_interacted_columns = []
519 | for i in range(train_other_is_interacted.shape[1]):
520 | col = f'is_int_{i}'
521 | is_interacted_columns.append(col)
522 | self.train_data[col] = train_other_is_interacted[:,i]
523 | self.val_data[col] = val_other_is_interacted[:,i]
524 | self.test_data[col] = test_other_is_interacted[:,i]
525 |
526 | self.train_data.drop('other_is_interacted',axis=1, inplace=True)
527 | self.val_data.drop('other_is_interacted',axis=1, inplace=True)
528 | self.test_data.drop('other_is_interacted',axis=1, inplace=True)
529 |
530 | train_other_is_clicked = np.vstack(self.train_data.other_is_clicked.values).astype(np.float32)
531 | val_other_is_clicked = np.vstack(self.val_data.other_is_clicked.values).astype(np.float32)
532 | test_other_is_clicked = np.vstack(self.test_data.other_is_clicked.values).astype(np.float32)
533 |
534 |
535 | is_clicked_columns = []
536 | for i in range(train_other_is_clicked.shape[1]):
537 | col = f'is_cl_{i}'
538 | is_clicked_columns.append(col)
539 | self.train_data[col] = train_other_is_clicked[:,i]
540 | self.val_data[col] = val_other_is_clicked[:,i]
541 | self.test_data[col] = test_other_is_clicked[:,i]
542 |
543 | self.train_data.drop('other_is_clicked',axis=1, inplace=True)
544 | self.val_data.drop('other_is_clicked',axis=1, inplace=True)
545 | self.test_data.drop('other_is_clicked',axis=1, inplace=True)
546 |
547 |         # record the train/val split sizes
548 | train_len = self.train_data.shape[0]
549 | val_len = self.val_data.shape[0]
550 |
551 |
552 |
553 | self.continuous_features = svd_ip_columns + svd_ft_columns + is_interacted_columns + is_clicked_columns + ['mm_step','time_diff', 'day', 'mm_price', 'equal_last_impressions', 'price_diff','price','last_price','price_ratio','is_clicked','is_interacted','item_popularity','is_interacted_image','is_interacted_deals','interaction_count','clickout_count','interaction_image_count','click_diff','rg_timestamp','equal_last_item','global_clickout_count_rank','rg_price','interaction_count_avg','avg_is_interacted_image','avg_is_interacted']
554 |
555 |
556 | # normalize num_impressions
557 |
558 |
559 | # target encoding
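        # (note) this is plain target encoding: each impression_index / price_rank value is replaced
        # by the mean click label computed on the training split only, and the same train-derived
        # means are mapped onto val/test so no target information leaks from the evaluation splits.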
560 | agg_cols = ['impression_index','price_rank']
561 | for c in agg_cols:
562 | gp = self.train_data.groupby(c)['label']
563 | mean = gp.mean()
564 | self.train_data[f'{c}_label_avg'] = self.train_data[c].map(mean)
565 | self.val_data[f'{c}_label_avg'] = self.val_data[c].map(mean)
566 | self.test_data[f'{c}_label_avg'] = self.test_data[c].map(mean)
567 |
568 | self.continuous_features.append(f'{c}_label_avg')
569 |
570 |
571 |
572 | agg_cols = ['city']
573 | for c in agg_cols:
574 | gp = self.train_data.groupby(c)['price']
575 | mean = gp.mean()
576 | self.train_data[f'{c}_price_avg'] = self.train_data[c].map(mean)
577 | self.val_data[f'{c}_price_avg'] = self.val_data[c].map(mean)
578 | self.test_data[f'{c}_price_avg'] = self.test_data[c].map(mean)
579 |
580 | self.continuous_features.append(f'{c}_price_avg')
581 |
582 | agg_cols = ['city']
583 | for c in agg_cols:
584 | gp = self.train_data.groupby(c)['price']
585 |             std = gp.std()
586 |             self.train_data[f'{c}_price_std'] = self.train_data[c].map(std)
587 |             self.val_data[f'{c}_price_std'] = self.val_data[c].map(std)
588 |             self.test_data[f'{c}_price_std'] = self.test_data[c].map(std)
589 |
590 | self.continuous_features.append(f'{c}_price_std')
591 |
592 |         # normalize the rank by the maximum impression list length (25)
593 | self.train_data['global_clickout_count_rank'] /= 25
594 | self.val_data['global_clickout_count_rank'] /= 25
595 | self.test_data['global_clickout_count_rank'] /= 25
596 |
597 |
598 |
599 |
600 |
601 |         # fill missing values with zero
602 | for col in ['star','time_diff']:
603 |
604 | self.train_data.loc[:,col].fillna(0, inplace=True)
605 | self.val_data.loc[:,col].fillna(0, inplace=True)
606 | self.test_data.loc[:,col].fillna(0, inplace=True)
607 |
608 |
609 |
610 | for up in self.continuous_features :
611 | mean_value = self.train_data.loc[ ~self.train_data[up].isna() , up].mean()
612 | self.train_data.loc[:,up].fillna(mean_value, inplace=True)
613 | self.val_data.loc[:,up].fillna(mean_value, inplace=True)
614 | self.test_data.loc[:,up].fillna(mean_value, inplace=True)
615 |
616 |
617 | for c in self.continuous_features:
618 | if self.train_data[c].isna().sum() >0 or self.val_data[c].isna().sum() >0 or self.test_data[c].isna().sum() >0:
619 | print("is null!!", c)
620 |
621 | self.config.num_embeddings['price_rank'] = 25
622 | self.config.num_embeddings['impression_index'] = 26
623 |
624 | # self.config.num_embeddings['day_of_week'] = 7
625 | self.config.num_embeddings['star'] = 6
626 |
627 | self.config.all_cat_columns+= ['price_rank', 'impression_index', 'star']
628 |
629 | self.config.continuous_size = len(self.continuous_features)
630 | self.config.neighbor_size = 5
631 |
632 | self.all_cat_columns = self.config.all_cat_columns
633 |
634 | if self.config.verbose:
635 | print(f"Number of training data: {self.train_data.shape}")
636 | print(f"Number of validation data: {self.val_data.shape}")
637 | print(f"Number of test data: {self.test_data.shape}")
638 |
639 | def get_features(self):
640 | return ', '.join([c for c in self.continuous_features if 'svd' not in c])
641 |
642 | def build_user_item_interactions(self, df, session_interactions, session_actions, session_time_diff, training=True):
643 | df_list = []
644 | label_test_df_list = []
645 | # parse impressions for train set
646 | for idx, row in enumerate(tqdm(df.itertuples())):
647 | if row.user_id not in self.past_interaction_dict:
648 | self.past_interaction_dict[row.user_id] = [self.transformed_dummy_item] * self.config.sequence_length
649 | # if row.session_id not in self.past_interaction_dict_sess:
650 | # self.past_interaction_dict_sess[row.session_id] = [self.transformed_dummy_item] * self.config.sess_length
651 | if row.session_id not in self.last_click_sess_dict:
652 | self.last_click_sess_dict[row.session_id] = self.transformed_dummy_item
653 |
654 | if row.session_id not in self.last_impressions_dict:
655 | self.last_impressions_dict[row.session_id] = None
656 |
657 | if row.session_id not in self.sess_last_imp_idx_dict:
658 | self.sess_last_imp_idx_dict[row.session_id] = DUMMY_IMPRESSION_INDEX
659 |
660 | if row.session_id not in self.sess_last_price_dict:
661 | self.sess_last_price_dict[row.session_id] = None
662 |
663 | if row.session_id not in self.sess_time_diff_dict:
664 | self.sess_time_diff_dict[row.session_id] = None
665 |
666 | transformed_impressions = self.cat_encoders['item_id'].transform(row.impressions, to_np=True)
667 |
668 | # compute session_interaction
669 | sess_step = row.sess_step
670 | session_id = row.session_id
671 |
672 | current_session_interactions = session_interactions[session_id][:self.config.sess_length+ sess_step -1] # -1 for excluding the current row
673 | current_session_interactions = current_session_interactions[-self.config.sess_length:]
674 |
675 | current_session_actions = session_actions[session_id][:self.config.sess_length+ sess_step -1]
676 | current_session_actions = current_session_actions[-self.config.sess_length:]
677 |
678 | assert len(current_session_interactions) == self.config.sess_length
679 |
680 | if row.last_item in transformed_impressions:
681 | last_interact_index = transformed_impressions.tolist().index(row.last_item)
682 | else:
683 | last_interact_index = DUMMY_IMPRESSION_INDEX
684 |
685 | if row.second_last_item in transformed_impressions:
686 | second_last_interact_index = transformed_impressions.tolist().index(row.second_last_item)
687 | else:
688 | second_last_interact_index = DUMMY_IMPRESSION_INDEX
689 |
690 | # if row.item_id != self.transformed_nan_item:
691 | # training
692 | label = transformed_impressions == row.item_id
693 | # else:
694 |
695 | # last3_impression_idices = [ transformed_impressions.index(imp) for imp in session_interactions[session_id][self.config.sess_length+ sess_step -4:self.config.sess_length+ sess_step -1] if imp in transformed_impressions else DUMMY_IMPRESSION_INDEX]
696 | # # # test
697 | # label = row.pseudo_label
698 | # if len(transformed_impressions) < 25:
699 | # padded_transformed_impressions = np.array(transformed_impressions.tolist() + [self.transformed_dummy_item] * (25 - len(transformed_impressions)))
700 | # else:
701 | # padded_transformed_impressions = transformed_impressions.copy()
702 | interaction_image_indices = np.array(session_actions[session_id][:self.config.sess_length+ sess_step -1]) == self.transformed_interaction_image
703 | interaction_image_item = np.array(session_interactions[session_id][:self.config.sess_length+ sess_step -1])[interaction_image_indices]
704 | sess_unique_items, counts = np.unique(interaction_image_item, return_counts=True)
705 | interaction_image_count_dict = dict(zip(sess_unique_items, counts))
706 |
707 |
708 | interaction_deals_indices = np.array(session_actions[session_id][:self.config.sess_length+ sess_step -1]) == self.transformed_interaction_deals
709 | interaction_deals_item = np.array(session_interactions[session_id][:self.config.sess_length+ sess_step -1])[interaction_deals_indices]
710 | sess_unique_deals_items, counts = np.unique(interaction_deals_item, return_counts=True)
711 | interaction_deals_count_dict = dict(zip(sess_unique_deals_items, counts))
712 |
713 |
714 | interaction_clickout_indices = np.array(session_actions[session_id][:self.config.sess_length+ sess_step -1]) == self.transformed_clickout_action
715 | interaction_clickout_item = np.array(session_interactions[session_id][:self.config.sess_length+ sess_step -1])[interaction_clickout_indices]
716 | sess_unique_clickout_items, counts = np.unique(interaction_clickout_item, return_counts=True)
717 | interaction_clickout_count_dict = dict(zip(sess_unique_clickout_items, counts))
718 |
719 | finite_time_diff_indices = np.isfinite(session_time_diff[session_id][:sess_step -1])
720 | finite_time_diff_array = np.array(session_time_diff[session_id][:sess_step -1])[finite_time_diff_indices]
721 |
722 | # don't leak the current clickout info
723 | unleaked_clickout_count = [self.clickout_count_dict[imp] if imp in self.clickout_count_dict else 0 for imp in transformed_impressions]
724 | unleaked_clickout_count = [unleaked_clickout_count[idx] -1 if imp == row.item_id else unleaked_clickout_count[idx] for idx, imp in enumerate(transformed_impressions)]
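            # (note) the count of the item actually clicked in this row is reduced by one, so the
            # feature never includes the very clickout it is trying to predict.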
725 |
726 | # unleaked_platform_clickout_count = [self.platform_clickout_count[row.platform, imp] if (row.platform, imp) in self.platform_clickout_count else 0 for imp in transformed_impressions]
727 | # unleaked_platform_clickout_count = [unleaked_platform_clickout_count[idx] -1 if imp == row.item_id else unleaked_platform_clickout_count[idx] for idx, imp in enumerate(transformed_impressions)]
728 |
729 | other_is_interacted = [imp in session_interactions[session_id][:self.config.sess_length+ sess_step -1] for imp in transformed_impressions]
730 | padded_other_is_interacted = other_is_interacted + [False] * (25 - len(other_is_interacted))
731 |
732 | other_is_clicked = [imp in self.past_interaction_dict[row.user_id] for imp in transformed_impressions]
733 | padded_other_is_clicked = other_is_clicked + [False] * (25 - len(other_is_clicked))
734 |
735 |
736 | unpad_interactions = session_interactions[session_id][self.config.sess_length:self.config.sess_length+ sess_step -1]
737 |
738 |
739 | unique_interaction = pd.unique(session_interactions[session_id][:self.config.sess_length+ sess_step -1])
740 |
741 |             # time elapsed between consecutive steps for each item before the clickout
742 | item_time_elapse_dict = {}
743 |
744 | for it, elapse in zip(unpad_interactions[:-1], session_time_diff[session_id][1:sess_step -1]):
745 | if it not in item_time_elapse_dict: #or elapse > item_time_elapse_dict[it]:
746 |
747 | item_time_elapse_dict[it] = elapse
748 | else:
749 | item_time_elapse_dict[it] += elapse
750 |
751 |
752 | if len(transformed_impressions) < 25:
753 | padded_transformed_impressions = np.array(transformed_impressions.tolist() + [self.transformed_dummy_item] * (25 - len(transformed_impressions)))
754 | else:
755 | padded_transformed_impressions = transformed_impressions.copy()
756 | # padded_transformed_impressions = np.array([transformed_impressions[0]] * 2 + transformed_impressions.tolist() + [transformed_impressions[-1]] * 2)
757 | padded_prices = [ row.prices[0]] * 2 + row.prices + [row.prices[-1]]*2
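            # (note) repeating the first and last price twice lets a centered window of 5 neighbouring
            # prices (padded_prices[i:i+5]) be sliced for every impression position below.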
758 | price_rank = compute_rank(row.prices)
759 | current_rows = np.zeros([len(row.impressions), 41], dtype=object)
760 | current_rows[:, 0] = row.user_id
761 | current_rows[:, 1] = transformed_impressions
762 | current_rows[:, 2] = label
763 | current_rows[:, 3] = row.session_id
764 | current_rows[:, 4] = [np.array(self.past_interaction_dict[row.user_id])] * len(row.impressions)
765 | current_rows[:, 5] = price_rank
766 | current_rows[:, 6] = row.city
767 | current_rows[:, 7] = row.last_item
768 |
769 | # impression index
770 | current_rows[:, 8] = np.arange(len(transformed_impressions))
771 | current_rows[:, 9] = row.step
772 | current_rows[:, 10] = row.id
773 |
774 | current_rows[:, 11] = [np.array(current_session_interactions)] * len(row.impressions)
775 | current_rows[:, 12] = [np.array(current_session_actions)] * len(row.impressions)
776 | current_rows[:, 13] = MinMaxScaler().fit_transform(np.array(row.prices).reshape(-1,1)).reshape(-1)
777 | current_rows[:, 14] = row.prices
778 |
779 | # last click item id
780 | current_rows[:, 15] = self.last_click_sess_dict[row.session_id]
781 |
782 | # equal_last_impressions
783 | current_rows[:, 16] = self.last_impressions_dict[row.session_id] == transformed_impressions.tolist()
784 |
785 | # impression index of last clicked item
786 | current_rows[:, 17] = self.sess_last_imp_idx_dict[row.session_id]
787 |
788 | #impression index of last interacted item
789 | current_rows[:, 18] = last_interact_index
790 |
791 | # price difference with last interacted item
792 | current_rows[:, 19] = row.prices - self.sess_last_price_dict[row.session_id] if self.sess_last_price_dict[row.session_id] else 0
793 |
794 |
795 | current_rows[:, 20] = self.sess_last_price_dict[row.session_id] if self.sess_last_price_dict[row.session_id] else 0
796 | current_rows[:, 21] = row.prices / self.sess_last_price_dict[row.session_id] if self.sess_last_price_dict[row.session_id] else 0
797 | current_rows[:, 22] = [ padded_prices[i:i+5] for i in range(len(row.impressions))]
798 |
799 | current_rows[:, 23] = [np.concatenate([padded_transformed_impressions[:i], padded_transformed_impressions[i+1:]]) for i in range(len(row.impressions))]
800 | current_rows[:, 24] = row.city_platform
801 |
802 | # if that item has been clicked by the current user
803 | current_rows[:, 25] = [imp in self.past_interaction_dict[row.user_id] for imp in transformed_impressions]
804 |
805 |             # if that item has been interacted with in the current session
806 | current_rows[:, 26] = [imp in session_interactions[session_id][:self.config.sess_length+ sess_step -1] for imp in transformed_impressions]
807 |
808 |             # note that the impressions here were not transformed
809 | current_rows[:, 27] = [self.item_popularity_dict[imp] for imp in row.impressions]
810 |
811 | current_rows[:, 28] = [1 if imp in interaction_image_count_dict else 0 for imp in transformed_impressions]
812 | current_rows[:, 29] = [1 if imp in interaction_deals_count_dict else 0 for imp in transformed_impressions]
813 |
814 | current_rows[:, 30] = [self.interaction_count_dict[imp] if imp in self.interaction_count_dict else 0 for imp in transformed_impressions]
815 | current_rows[:, 31] = unleaked_clickout_count
816 | current_rows[:, 32] = [self.past_interaction_dict[row.user_id][::-1].index(imp) if imp in self.past_interaction_dict[row.user_id] else 0 for imp in transformed_impressions]
817 | current_rows[:, 33] = [np.array(padded_other_is_interacted)] * len(row.impressions)
818 | current_rows[:, 34] = [np.array(padded_other_is_clicked)] * len(row.impressions)
819 | current_rows[:, 35] = transformed_impressions == row.last_item
820 | current_rows[:, 36] = np.argsort(np.argsort(unleaked_clickout_count))
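            # (note) np.argsort(np.argsort(x)) yields the rank of each value (0 = smallest), so
            # column 36 ranks the impressions by their leakage-corrected global clickout count.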
821 | current_rows[:, 37] = GaussRankScaler().fit_transform(row.prices)
822 | current_rows[:, 38] = np.mean(current_rows[:, 30])
823 | current_rows[:, 39] = np.mean(current_rows[:, 28])
824 | current_rows[:, 40] = np.mean(current_rows[:, 26])
825 |
826 | # current_rows[:, 41] = np.mean(finite_time_diff_array)
827 | # current_rows[:, 41] = np.std(current_rows[:, 30])
828 | # current_rows[:, 41] = 2 * last_interact_index - second_last_interact_index
829 |
830 |
831 | # current_rows[:, 41] = second_last_interact_index
832 |
833 | #TODO: Rank of statistics
834 |
835 | # print(unleaked_platform_clickout_count)
836 |
837 | # current_rows[:, 35] = [session_interactions[session_id][:self.config.sess_length+ sess_step -1][::-1].index(imp) if imp in session_interactions[session_id][:self.config.sess_length+ sess_step -1] else 0 for imp in transformed_impressions]
838 | # for i in range(35, 42):
839 | # current_rows[:, i] = np.mean(current_rows[:, i-10])
840 |
841 | # current_rows[:, 29] = [interaction_clickout_count_dict[imp] if imp in interaction_clickout_count_dict else 0 for imp in transformed_impressions]
842 |
843 |
844 | # neighboring item
845 | # current_rows[:, 23] = [np.concatenate([padded_transformed_impressions[:i], padded_transformed_impressions[i+1:]]) for i in range(len(row.impressions))]
846 |
847 |
848 | # current_rows[:, 20] = row.prices - np.concatenate([row.prices[1:], [row.prices[-1]]], axis=0)
849 | # current_rows[:, 21] = row.prices - np.concatenate([[row.prices[0]], row.prices[:-1]], axis=0)
850 |
851 | # current_rows[:, 17] = row.step - self.sess_last_step_dict[row.session_id] if self.sess_last_step_dict[row.session_id] else 0
852 |
853 | # back pad transformed impressions
854 |
855 |
856 |
857 | # current_rows[:, 16] = [np.delete(np.arange(25), i) for i in range(len(row.impressions))]
858 | # print(self.last_click_sess_dict[row.session_id], self.last_impressions_dict[row.session_id] == transformed_impressions.tolist())
859 |
860 | if training or row.item_id == self.transformed_nan_item:
861 |
862 | df_list.append(current_rows)
863 | else:
864 | label_test_df_list.append(current_rows)
865 |             # labelled test clickouts are kept separately so they can later be added to the training data
866 |
867 |
868 |
869 |             # append the current item_id to the user's fixed-length past-interaction history
870 | self.past_interaction_dict[row.user_id] = self.past_interaction_dict[row.user_id][1:]
871 | self.past_interaction_dict[row.user_id].append(row.item_id)
872 |
873 |
874 | self.last_click_sess_dict[row.session_id] = row.item_id
875 | self.last_impressions_dict[row.session_id] = transformed_impressions.tolist()
876 | self.sess_last_step_dict[row.session_id] = row.step
877 | self.sess_time_diff_dict[row.session_id] = row.timestamp
878 |
879 |
880 |
881 | # update last impression index
882 | if row.item_id != self.transformed_nan_item:
883 | self.sess_last_imp_idx_dict[row.session_id] = (transformed_impressions == row.item_id).tolist().index(True)
884 | self.sess_last_price_dict[row.session_id] = np.array(row.prices)[ transformed_impressions == row.item_id ][0]
885 |
886 |
887 | data = np.vstack(df_list)
888 | dtype_dict = {"city":"int32", "last_item":"int32", 'impression_index':'int32', "step":"int32","id":"int32", "user_id":"int32",
889 | "item_id":"int32", "label": "int32", "price_rank":"int32", "mm_price":"float32", 'price':'float32', "last_click_item":"int32", "equal_last_impressions":"int8", 'last_click_impression':'int16', 'last_interact_index':'int16', 'price_diff':'float32','last_price':'float32','price_ratio':'float32','city_platform':'int32', 'is_clicked':'int8', 'is_interacted':'int8','item_popularity':'int32', 'is_interacted_image':'int8','is_interacted_deals':'int8','interaction_count':'int32','clickout_count':'int32','click_diff':'float32','equal_last_item':'int8','global_clickout_count_rank':'int8','rg_price':'float32','interaction_count_avg':'float32','avg_is_interacted_image':'float32','avg_is_interacted':'float32'}
890 | df_columns= ['user_id', 'item_id', 'label', 'session_id', 'past_interactions', 'price_rank', 'city', 'last_item', 'impression_index', 'step', 'id', 'past_interactions_sess', 'past_actions_sess', 'mm_price','price','last_click_item','equal_last_impressions', 'last_click_impression','last_interact_index','price_diff','last_price','price_ratio','neighbor_prices','other_item_ids','city_platform', 'is_clicked', 'is_interacted', 'item_popularity','is_interacted_image','is_interacted_deals','interaction_count','clickout_count','click_diff','other_is_interacted','other_is_clicked','equal_last_item','global_clickout_count_rank','rg_price','interaction_count_avg','avg_is_interacted_image', 'avg_is_interacted']
891 | df = pd.DataFrame(data, columns=df_columns)
892 | df = df.astype(dtype= dtype_dict)
893 | if training:
894 | return df
895 | else:
896 | label_test = np.vstack(label_test_df_list)
897 | label_test = pd.DataFrame(label_test, columns=df_columns)
898 | label_test = label_test.astype(dtype= dtype_dict)
899 | return df, label_test
900 | def instance_a_train_loader(self):
901 |
902 |
903 | train_data = self.train_data
904 |
905 | return NNDataLoader(train_data, self.config, shuffle=True, batch_size=self.config.batch_size, continuous_features=self.continuous_features)
906 | def evaluate_data_valid(self):
907 | val_data = self.val_data
908 | return NNDataLoader(val_data, self.config, shuffle=False, batch_size=self.config.batch_size, continuous_features=self.continuous_features)
909 |
910 | def instance_a_test_loader(self):
911 | test_data = self.test_data
912 | return NNDataLoader(test_data, self.config, shuffle=False, batch_size=self.config.batch_size,continuous_features=self.continuous_features)
913 |
914 |
915 | if __name__ =='__main__':
916 | conf = NNConfiguration()
917 | data_gen = NNDataGenerator(conf)
918 | with timer("gen"):
919 |         for result in data_gen.instance_a_train_loader():
920 | print(result[-1])
921 | print(torch.LongTensor(result[-1]))
922 |
923 |     for result in data_gen.instance_a_train_loader():
924 | print(result[-1])
925 | print(torch.LongTensor(result[-1]))
926 |
--------------------------------------------------------------------------------
/src/run_xgb.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | import pandas as pd
4 | import pickle
5 | import gc
6 | from constant import *
7 | from utils import *
8 | from config import *
9 | import torch
10 | from joblib import Parallel, delayed
11 | from tqdm import tqdm
12 | import xgboost as xgb
13 | import scipy
14 | from sklearn.decomposition import TruncatedSVD
15 | import multiprocessing
16 | from ordered_set import OrderedSet
17 | import os
18 | import itertools
19 | from scipy.sparse import csr_matrix
20 |
21 | # os.environ['CUDA_VISIBLE_DEVICES'] = '2'
22 | configuration = XGBConfiguration()
23 |
24 | model_name='xgb_gic_lic_wosh_lf350_lr002_v2'
25 |
26 | if configuration.sub_sample:
27 | model_name += '_140k'
28 | else:
29 | model_name += '_all'
30 |
31 | if configuration.use_test:
32 | model_name += '_ut'
33 |
34 | seed_everything(42)
35 |
36 | ########################################################### Load data ######################################################################
37 | with open( f'{input_dir}/train_v2.p', 'rb') as f:
38 | train = pickle.load(f)
39 | train['id']= np.arange(len(train))
40 |
41 | with open(f'{input_dir}/test_v2.p', 'rb') as f:
42 | test = pickle.load(f)
43 | test['id'] = np.arange( len(train), len(train)+ len(test))
44 |
45 | with open('../input/item_metadata.p', 'rb') as f:
46 | item_meta = pickle.load(f)
47 | item_meta['properties'] = item_meta.properties.apply(lambda x: x.split('|'))
48 | item_meta['item_id'] = item_meta['item_id'].apply(str)
49 |
50 | # whether to use sub sample of the data to speed up the evaluation
51 | if configuration.sub_sample:
52 | with open('../input/selected_users_140k.p', 'rb') as f:
53 | selected_users = pickle.load(f)
54 |
55 | train = train.loc[train.user_id.isin(selected_users),:]
56 |
57 | # check if the code can run with debug mode
58 | if configuration.debug:
59 | train = train.sample(1000)
60 | test = test.sample(1000)
61 |
62 | with timer("preprocessing"):
63 |
64 |     # rename columns
65 | train.rename(columns={'reference': 'item_id', 'action_type': 'action'}, inplace=True)
66 | test.rename(columns={'reference': 'item_id', 'action_type': 'action'}, inplace=True)
67 |
68 |     # concatenate the action and reference as strings since these references are not actually item ids
69 | train.loc[train.action=='change of sort order','action'] = train.loc[train.action=='change of sort order'].apply(lambda row: row.action + str(row.item_id), axis=1)
70 | test.loc[test.action=='change of sort order','action'] = test.loc[test.action=='change of sort order'].apply(lambda row: row.action + str(row.item_id), axis=1)
71 |
72 |
73 | train.loc[train.action=='filter selection','action'] = train.loc[train.action=='filter selection'].apply(lambda row: row.action + str(row.item_id), axis=1)
74 | test.loc[test.action=='filter selection','action'] = test.loc[test.action=='filter selection'].apply(lambda row: row.action + str(row.item_id), axis=1)
75 |
76 |
77 | # wipe out the item id associated with these actions, reason same as the above
78 | train.loc[train.action.str.contains('change of sort order'), 'item_id'] = DUMMY_ITEM
79 | test.loc[test.action.str.contains('change of sort order'), 'item_id'] = DUMMY_ITEM
80 |
81 | train.loc[train.action.str.contains('search for poi'), 'item_id'] = DUMMY_ITEM
82 | test.loc[test.action.str.contains('search for poi'), 'item_id'] = DUMMY_ITEM
83 |
84 | train.loc[train.action.str.contains('filter selection'), 'item_id'] = DUMMY_ITEM
85 | test.loc[test.action.str.contains('filter selection'), 'item_id'] = DUMMY_ITEM
86 |
87 | train.loc[train.action.str.contains('search for destination'), 'item_id'] = DUMMY_ITEM
88 | test.loc[test.action.str.contains('search for destination'), 'item_id'] = DUMMY_ITEM
89 |
90 |     # remove training examples where the clicked item is not in the impressions
91 | train['in_impressions'] = True
92 | train.loc[~train.impressions.isna(), 'in_impressions'] = train.loc[~train.impressions.isna()].apply(lambda row:row.item_id in row.impressions.split('|'), axis=1)
93 | train = train.loc[train.in_impressions].drop('in_impressions', axis=1).reset_index(drop=True)
94 |
95 | test['in_impressions'] = True
96 | test.loc[(~test.impressions.isna()) & (~test.item_id.isna()), 'in_impressions'] = test.loc[(~test.impressions.isna())& (~test.item_id.isna())].apply(lambda row:row.item_id in row.impressions.split('|'), axis=1)
97 | test = test.loc[test.in_impressions].drop('in_impressions', axis=1).reset_index(drop=True)
98 |
99 | # parse impressions and prices list from string to list
100 | train['item_id'] = train['item_id'].apply(str)
101 | train.loc[~train.impressions.isna(),'impressions'] = train.loc[~train.impressions.isna()].impressions.apply(lambda x: x.split('|'))
102 | train.loc[~train.prices.isna(), 'prices'] = train.loc[~train.prices.isna()].prices.apply(lambda x: x.split('|')).apply(lambda x: [int(p) for p in x])
103 |
104 | test['item_id'] = test['item_id'].apply(str)
105 | test.loc[~test.impressions.isna(),'impressions'] = test.loc[~test.impressions.isna()].impressions.apply(lambda x: x.split('|'))
106 | test.loc[~test.prices.isna(),'prices'] = test.loc[~test.prices.isna()].prices.apply(lambda x: x.split('|')).apply(lambda x: [int(p) for p in x])
107 |
108 |     # compute the last interacted item by shifting the item_id by 1 position
109 | train['last_item'] = np.nan
110 | test['last_item'] = np.nan
111 |
112 | train_shifted_item_id = [DUMMY_ITEM] + train.item_id.values[:-1].tolist()
113 | test_shifted_item_id = [DUMMY_ITEM] + test.item_id.values[:-1].tolist()
114 |
115 |     # compute the second last interacted item by shifting the item_id by 2 positions
116 | train['last_item'] = train_shifted_item_id
117 | test['last_item'] = test_shifted_item_id
118 |
119 | train_shifted_item_id = [DUMMY_ITEM] *2 + train.item_id.values[:-2].tolist()
120 | test_shifted_item_id = [DUMMY_ITEM] *2 + test.item_id.values[:-2].tolist()
121 |
122 |     # compute the third last interacted item by shifting the item_id by 3 positions
123 | train['second_last_item'] = train_shifted_item_id
124 | test['second_last_item'] = test_shifted_item_id
125 |
126 | train_shifted_item_id = [DUMMY_ITEM] *3 + train.item_id.values[:-3].tolist()
127 | test_shifted_item_id = [DUMMY_ITEM] *3 + test.item_id.values[:-3].tolist()
128 |
129 | train['third_last_item'] = train_shifted_item_id
130 | test['third_last_item'] = test_shifted_item_id
131 |
132 | # mask out the last interacted item if that interaction comes first in its session
133 | train['step_rank'] = train.groupby('session_id')['step'].rank(method='max', ascending=True)
134 | test['step_rank'] = test.groupby('session_id')['step'].rank(method='max', ascending=True)
135 |
136 |     # fill the invalid shifted last-n items with a dummy constant
137 | train.loc[(train.step_rank == 1) & (train.action == 'clickout item'), 'last_item'] = DUMMY_ITEM
138 | test.loc[(test.step_rank == 1) & (test.action == 'clickout item'), 'last_item'] = DUMMY_ITEM
139 |
140 | train.loc[(train.step_rank == 2) & (train.action == 'clickout item'), 'second_last_item'] = DUMMY_ITEM
141 | test.loc[(test.step_rank == 2) & (test.action == 'clickout item'), 'second_last_item'] = DUMMY_ITEM
142 |
143 | train.loc[(train.step_rank == 3) & (train.action == 'clickout item'), 'third_last_item'] = DUMMY_ITEM
144 | test.loc[(test.step_rank == 3) & (test.action == 'clickout item'), 'third_last_item'] = DUMMY_ITEM
145 |
146 |
147 | # ignore this
148 | keep_columns = ['session_id', 'user_id','item_id', 'impressions','prices', 'city', 'step', 'last_item']
149 | all_cat_columns = ['item_id', 'city', 'platform', 'device','country','country_platform','action','device_platform']
150 |
151 |
152 | # generate country from city
153 | train['country'] = train.city.apply(lambda x:x.split(',')[-1])
154 | test['country'] = test.city.apply(lambda x:x.split(',')[-1])
155 |
156 |     # concatenate country and platform as strings to form a new feature
157 | train['country_platform'] = train.apply(lambda row: row.country + row.platform, axis=1)
158 | test['country_platform'] = test.apply(lambda row: row.country + row.platform, axis=1)
159 |
160 | train['device_platform'] = train.apply(lambda row: row.device + row.platform, axis=1)
161 | test['device_platform'] = test.apply(lambda row: row.device + row.platform, axis=1)
162 |     # filter out rows where the reference is not present in the impressions
163 | # train = train.loc[train.apply(lambda row:row.item_id in row.impressions, axis=1),:]
164 |
165 | print("train shape",train.shape)
166 |
167 |
168 | # concat train and test
169 | data = pd.concat([train, test], axis=0)
170 | data = data.reset_index(drop=True)
171 |
172 |     # compute a dictionary that maps session id to the sequence of item ids in that session
173 | train_session_interactions = dict(train.groupby('session_id')['item_id'].apply(list))
174 | test_session_interactions = dict(test.groupby('session_id')['item_id'].apply(list))
175 |
176 |
177 |     # compute a dictionary that maps session id to the sequence of actions in that session
178 | train_session_actions = dict(train.groupby('session_id')['action'].apply(list))
179 | test_session_actions = dict(test.groupby('session_id')['action'].apply(list))
180 |
181 |     # compute the per-session step since the "step" column in some sessions is not correctly ordered
182 | train['sess_step'] = train.groupby('session_id')['timestamp'].rank(method='max').apply(int)
183 | test['sess_step'] = test.groupby('session_id')['timestamp'].rank(method='max').apply(int)
184 |
185 |
186 |
187 |
188 | data_feature = data.loc[:,['id','step','session_id', 'timestamp','platform','country']].copy()
189 |
190 | # compute the time difference between each step
191 | data_feature['time_diff'] = data.groupby('session_id')['timestamp'].diff()
192 |
193 | # compute the difference of time difference between each step
194 | data_feature['time_diff_diff'] = data_feature.groupby('session_id')['time_diff'].diff()
195 |
196 | # compute the difference of the difference of time difference between each step
197 | data_feature['time_diff_diff_diff'] = data_feature.groupby('session_id')['time_diff_diff'].diff()
198 |
199 | # compute the time difference from 2 steps ahead
200 | data_feature['time_diff_2'] = data.groupby('session_id')['timestamp'].diff().shift(1)
201 |
202 | # compute the time difference from 3 steps ahead
203 | data_feature['time_diff_3'] = data.groupby('session_id')['timestamp'].diff().shift(2)
204 |
205 | data_feature['hour']= pd.to_datetime(data_feature.timestamp, unit='s').dt.hour//4
206 |
207 | # map platform to country
208 | data_feature['mapped_country'] = data_feature.platform.apply(platform2country)
209 |
210 |
211 | # load the precomputed country to utc offsets from geopy
212 | with open('../input/country2offsets_dict.p','rb') as f:
213 | platform_country2offsets_dict = pickle.load(f)
214 | data_feature['platform2country_utc_offsets'] = data_feature.mapped_country.map(platform_country2offsets_dict)
215 |
216 |
217 |     # transform time difference with rank gauss
218 | data_feature['rg_time_diff'] = GaussRankScaler().fit_transform(data_feature['time_diff'].values)
219 |
220 | # compute the log of step
221 | data_feature['step_log'] = np.log1p(data_feature['step'])
222 |
223 | # drop the useless columns
224 | data_feature = data_feature.drop(['session_id','step','timestamp','hour','platform','country','mapped_country'], axis=1)
225 |
226 |
227 |
228 |
229 | # merge train, test with data_feature
230 | train = train.merge(data_feature, on='id', how='left')
231 | test = test.merge(data_feature, on='id', how='left')
232 |
233 |
234 | # compute the sequence of time difference in each session
235 | train_session_time_diff = dict(train.groupby('session_id')['time_diff'].apply(list))
236 | test_session_time_diff = dict(test.groupby('session_id')['time_diff'].apply(list))
237 |
238 |     # encode the categorical features
239 | cat_encoders = {}
240 | for col in all_cat_columns:
241 | cat_encoders[col] = CategoricalEncoder()
242 |
243 |
244 | all_items = []
245 | for imp in data.loc[~data.impressions.isna()].impressions.tolist() + [data.item_id.apply(str).tolist()] :
246 | all_items += imp
247 |
248 | unique_items = OrderedSet(all_items)
249 | unique_actions = OrderedSet(data.action.values)
250 |
251 | cat_encoders['item_id'].fit(list(unique_items) + [DUMMY_ITEM])
252 | cat_encoders['action'].fit( list(unique_actions) + [DUMMY_ACTION])
253 | for col in ['city', 'platform', 'device','country','country_platform', 'device_platform']:
254 |
255 | cat_encoders[col].fit(data[col].tolist() )
256 |
257 |
258 |     # transform all the categorical columns to integer ids
259 | for col in all_cat_columns:
260 | train[col] = cat_encoders[col].transform(train[col].values)
261 | test[col] = cat_encoders[col].transform(test[col].values)
262 |
263 |
264 | # get the encoded action
265 | transformed_clickout_action = cat_encoders['action'].transform(['clickout item'])[0]
266 | transformed_dummy_item = cat_encoders['item_id'].transform([DUMMY_ITEM])[0]
267 | transformed_dummy_action = cat_encoders['action'].transform([DUMMY_ACTION])[0]
268 | transformed_interaction_image = cat_encoders['action'].transform(['interaction item image'])[0]
269 | transformed_interaction_deals = cat_encoders['action'].transform(['interaction item deals'])[0]
270 | transformed_interaction_info = cat_encoders['action'].transform(['interaction item info'])[0]
271 | transformed_interaction_rating = cat_encoders['action'].transform(['interaction item rating'])[0]
272 |
273 | # transform session interactions and pad dummy in front of all of them
274 | for session_id, item_list in train_session_interactions.items():
275 | train_session_interactions[session_id] = [transformed_dummy_item] * configuration.sess_length + cat_encoders['item_id'].transform(item_list)
276 |
277 | for session_id, item_list in test_session_interactions.items():
278 | test_session_interactions[session_id] = [transformed_dummy_item] * configuration.sess_length + cat_encoders['item_id'].transform(item_list)
279 |
280 | for session_id, action_list in train_session_actions.items():
281 | train_session_actions[session_id] = [transformed_dummy_action] * configuration.sess_length + cat_encoders['action'].transform(action_list)
282 |
283 | for session_id, action_list in test_session_actions.items():
284 | test_session_actions[session_id] = [transformed_dummy_action] * configuration.sess_length + cat_encoders['action'].transform(action_list)
285 |
286 |
287 |     ### compute co-occurrence matrix
288 | implicit_train = train.loc[train.action != transformed_clickout_action, :]
289 | implicit_test = test.loc[test.action != transformed_clickout_action, :]
290 |
291 | # get all interacted items in a session
292 | implicit_all = pd.concat([implicit_train , implicit_test], axis=0)
293 |     # a list of lists, each containing the items interacted with in one session
294 | co_occ_items = implicit_all.groupby('session_id').item_id.apply(list).to_dict().values()
295 | co_occ_permutes = [list(itertools.permutations(set(items), 2)) for items in co_occ_items]
296 |
297 |     # aggregate co-occurrence across sessions
298 | co_occ_coordinates = []
299 | for coordinates in co_occ_permutes:
300 | co_occ_coordinates += coordinates
301 |
302 |     # construct the sparse co-occurrence matrix in CSR format
303 | row, col, values = zip(*((i,j,1) for i,j in co_occ_coordinates ))
304 | co_occ_matrix= csr_matrix((values, (row, col)), shape=(cat_encoders['item_id'].n_elements, cat_encoders['item_id'].n_elements), dtype=np.float32)
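    # (note) csr_matrix sums duplicate (row, col) coordinates, so entry (i, j) ends up counting the
    # number of sessions in which items i and j were both interacted with.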
305 |
306 | co_occ_matrix_csc = co_occ_matrix.tocsc()
307 |
308 | print("max entry: ", co_occ_matrix.max())
309 |
310 |
311 |     ### compute co-occurrence matrix for imp list
312 |
313 | # imp_co_occ_items = train.loc[~train.impressions.isna()].impressions.apply(lambda x: cat_encoders['item_id'].transform(x)).values.tolist() + test.loc[~test.impressions.isna()].impressions.apply(lambda x: cat_encoders['item_id'].transform(x)).values.tolist()
314 | # imp_co_occ_permutes = [list(itertools.permutations(set(items), 2)) for items in imp_co_occ_items]
315 |
316 | # #aggregate co-ocurrence across sessions
317 | # imp_co_occ_coordinates = []
318 | # for coordinates in imp_co_occ_permutes:
319 | # imp_co_occ_coordinates += coordinates
320 |
321 | # #construct csr
322 | # row, col, values = zip(*((i,j,1) for i,j in imp_co_occ_coordinates ))
323 | # imp_co_occ_matrix= csr_matrix((values, (row, col)), shape=(cat_encoders['item_id'].n_elements, cat_encoders['item_id'].n_elements), dtype=np.float32)
324 |
325 | # imp_co_occ_matrix_csc = imp_co_occ_matrix.tocsc()
326 |
327 | # print("max entry: ", imp_co_occ_matrix.max())
328 |
329 | # categorically encode last, second last and third item
330 | train['last_item'] = cat_encoders['item_id'].transform(train['last_item'].values)
331 | test['last_item'] = cat_encoders['item_id'].transform(test['last_item'].values)
332 |
333 | train['second_last_item'] = cat_encoders['item_id'].transform(train.second_last_item.values)
334 | test['second_last_item'] = cat_encoders['item_id'].transform(test.second_last_item.values)
335 |
336 | train['third_last_item'] = cat_encoders['item_id'].transform(train.third_last_item.values)
337 | test['third_last_item'] = cat_encoders['item_id'].transform(test.third_last_item.values)
338 |
339 |
340 |
341 |
342 |     # generate item properties features
343 | item_meta = item_meta.loc[item_meta.item_id.isin(unique_items),:]
344 | # item_meta multi-hot
345 | item_meta['item_id'] = cat_encoders['item_id'].transform(item_meta['item_id'].values)
346 | item_meta['star'] = np.nan
347 | item_meta.loc[item_meta.properties.apply(lambda x: '1 Star' in x), 'star'] = 1
348 | item_meta.loc[item_meta.properties.apply(lambda x: '2 Star' in x), 'star'] = 2
349 | item_meta.loc[item_meta.properties.apply(lambda x: '3 Star' in x), 'star'] = 3
350 | item_meta.loc[item_meta.properties.apply(lambda x: '4 Star' in x), 'star'] = 4
351 | item_meta.loc[item_meta.properties.apply(lambda x: '5 Star' in x), 'star'] = 5
352 | item_meta.loc[(item_meta.star.isna()) & (item_meta.properties.apply(lambda y: 'Excellent Rating' in y) ), 'star'] = 9
353 | item_meta.loc[(item_meta.star.isna()) & (item_meta.properties.apply(lambda y: 'Very Good Rating' in y) ), 'star'] = 8
354 | item_meta.loc[(item_meta.star.isna()) & (item_meta.properties.apply(lambda y: 'Good Rating' in y) ), 'star'] = 7
355 | item_meta.loc[(item_meta.star.isna()) & (item_meta.properties.apply(lambda y: 'Satisfactory Rating' in y) ), 'star'] = 6
356 |
357 | item_meta['rating'] = np.nan
358 | item_meta.loc[item_meta.properties.apply(lambda x: 'Satisfactory Rating' in x), 'rating'] = 7.0
359 | item_meta.loc[item_meta.properties.apply(lambda x: 'Good Rating' in x), 'rating'] = 7.5
360 | item_meta.loc[item_meta.properties.apply(lambda x: 'Very Good Rating' in x), 'rating'] = 8.0
361 | item_meta.loc[item_meta.properties.apply(lambda x: 'Excellent Rating' in x), 'rating'] = 8.5
362 |
363 | # get binary properties feature
364 | item_properties_df = pd.DataFrame()
365 | item_properties_df['item_id'] = item_meta.item_id
366 | item_properties_df['num_properties'] = item_meta.properties.apply(len)
367 | item_properties_df['star'] = item_meta.star
368 | item_properties_df['item_Beach'] = item_meta.properties.apply(lambda x: 'Beach' in x).astype(np.float16)
369 | item_properties_df['item_Bed & Breakfast'] = item_meta.properties.apply(lambda x: 'Bed & Breakfast' in x).astype(np.float16)
370 | item_properties_df['rating'] = item_meta['rating']
371 |
372 |
373 | item_star_map = item_properties_df.loc[:,['item_id','star']].set_index('item_id').to_dict()['star']
374 | item_rating_map = item_properties_df.loc[:,['item_id','rating']].set_index('item_id').to_dict()['rating']
375 |
376 |
377 |
378 | del item_meta
379 | gc.collect()
380 |
381 |     # note: filter_df features are largely unused; they consume a lot of memory for only a small gain
382 | filter_df = data.loc[ ~data.current_filters.isna(), ['id', 'current_filters']]
383 | filter_df['current_filters'] = filter_df.current_filters.apply(lambda x:x.split('|'))
384 |
385 | # filter_df.loc[filter_df.current_filters.apply(lambda x: '3 Star' in x), 'nights'] = 3
386 | filter_df['nights']=np.nan
387 | filter_df.loc[filter_df.current_filters.apply(lambda x: '2 Nights' in x), 'nights'] = 1
388 | filter_df.loc[filter_df.current_filters.apply(lambda x: '3 Nights' in x), 'nights'] = 2
389 |
390 | filter_set = list(set(np.hstack(filter_df['current_filters'].to_list())))
391 |
392 | cat_encoders['filters'] = CategoricalEncoder()
393 | cat_encoders['filters'].fit(filter_set)
394 |
395 | # get binary filter feature
396 | filters_df = pd.DataFrame()
397 | filters_df['id'] = filter_df.id
398 | filters_df['num_filters'] = filter_df.current_filters.apply(len)
399 | filters_df['breakfast_included'] = filter_df.current_filters.apply( lambda x: 'Breakfast Included' in x).astype(np.float16)
400 | filters_df['filters_Sort By Price'] = filter_df.current_filters.apply( lambda x: 'Sort by Price' in x).astype(np.float16)
401 | filters_df['filters_Sort By Popularity'] = filter_df.current_filters.apply( lambda x: 'Sort By Popularity' in x).astype(np.float16)
402 |
403 |
404 |
405 | # compute interaction image count for each item across train/ test
406 | interaction_image_item_ids = train.loc[train.action == transformed_interaction_image, :].drop_duplicates(subset=['session_id','item_id','action']).item_id.tolist() + test.loc[test.action == transformed_interaction_image, :].drop_duplicates(subset=['session_id','item_id','action']).item_id.tolist()
407 | unique_interaction_image_items, counts = np.unique(interaction_image_item_ids, return_counts=True)
408 | global_image_count_dict = dict(zip(unique_interaction_image_items, counts))
409 |
410 | # compute interaction count for each item across train/ test
411 | interaction_item_ids = train.loc[train.action != transformed_clickout_action, :].drop_duplicates(subset=['session_id','item_id','action']).item_id.tolist() + test.loc[test.action != transformed_clickout_action, :].drop_duplicates(subset=['session_id','item_id','action']).item_id.tolist()
412 | unique_interaction_items, counts = np.unique(interaction_item_ids, return_counts=True)
413 | global_interaction_count_dict = dict(zip(unique_interaction_items, counts))
414 |
415 | # compute interaction deals count for each item across train/ test
416 | interaction_deals_item_ids = train.loc[train.action == transformed_interaction_deals, :].drop_duplicates(subset=['session_id','item_id','action']).item_id.tolist() + test.loc[test.action == transformed_interaction_deals, :].drop_duplicates(subset=['session_id','item_id','action']).item_id.tolist()
417 | unique_interaction_deals_items, counts = np.unique(interaction_deals_item_ids, return_counts=True)
418 | global_deals_count_dict = dict(zip(unique_interaction_deals_items, counts))
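# note: the three global count dictionaries above are built on rows deduplicated by
# (session_id, item_id, action), so each session contributes at most once per item/action pair.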
419 |
420 |
421 | # compute step rank to identify the last row in each session for train/ val split
422 | train = train.loc[train.action == transformed_clickout_action,:]
423 | test = test.loc[test.action == transformed_clickout_action,:]
424 | train['step_rank'] = train.groupby('session_id')['step'].rank(method='max', ascending=False)
425 |
426 | # compute the impression count for each item
427 | item_ids = np.hstack([np.hstack(train['impressions'].values), np.hstack(test.impressions.values)])
428 | unique_items, counts = np.unique(item_ids, return_counts=True)
429 | impression_count_dict = dict(zip(unique_items, counts))
430 |
431 | # compute the rank gauss transformed prices
432 | unique_prices = np.unique(np.hstack([np.hstack(train.prices.values), np.hstack(test.prices.values)]) )
433 | rg_unique_prices = GaussRankScaler().fit_transform(unique_prices)
434 | price_rg_price_dict = dict(zip(unique_prices, rg_unique_prices))
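# GaussRankScaler (from utils.py) is assumed to map the ranked unique prices onto a
# roughly Gaussian distribution; price_rg_price_dict then lets the same transformed
# value be looked up for any raw price in train, val, or test.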
435 |
436 |
437 | #train/ val split
438 | if configuration.debug:
439 | val = train.loc[train.step_rank == 1,:].iloc[:5]
440 | else:
441 | val = train.loc[train.step_rank == 1,:].iloc[:50000]
442 |
443 | val_index = val.index
444 | train = train.loc[~train.index.isin(val_index),:]
445 |
446 | train = train.drop('step_rank', axis=1)
447 | val = val.drop('step_rank', axis=1)
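# validation set: the last clickout (step_rank == 1) of each session, capped at the
# first 50,000 such rows (5 in debug mode); those rows are removed from the training set.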
448 |
449 |
450 | # get the encoded nan item
451 | transformed_nan_item = cat_encoders['item_id'].transform(['nan'])[0]
452 |
453 |
454 |
455 |
456 | from collections import defaultdict, Counter
457 | session_clickout_count_dict = {}
458 | past_interaction_dict = {}
459 | last_click_sess_dict = {}
460 | last_impressions_dict = {}
461 | sess_last_imp_idx_dict={}
462 | sess_last_price_dict = {}
463 | sess_time_diff_dict ={}
464 | sess_step_diff_dict = {}
465 |
466 | cumulative_click_dict = defaultdict(lambda : 0)
467 |
468 |
469 |
470 |
471 | def parse_impressions(df, session_interactions, session_actions, session_time_diff, training=True):
472 | # parse the data into a binary classification task, generate 1 example for each item in the impression list
473 | df_list = []
474 | label_test_df_list = []
475 | # parse impressions for train set
476 | past_interaction_rows = []
477 | past_interaction_columns = []
478 | for idx, row in enumerate(tqdm(df.itertuples())):
479 |
480 | if row.session_id not in session_clickout_count_dict:
481 | session_clickout_count_dict[row.session_id] = 0
482 |
483 | if row.user_id not in past_interaction_dict:
484 | past_interaction_dict[row.user_id] = []
485 |
486 |
487 | sess_step = row.sess_step
488 | session_id = row.session_id
489 |
490 | # compute the categorically encoded impression list
491 | transformed_impressions = cat_encoders['item_id'].transform(row.impressions, to_np=True)
492 |
493 | current_rows = np.zeros([len(row.impressions), 66], dtype=object)
494 |
495 | # compute the price rank within this clickout's impression list
496 | price_rank = compute_rank(row.prices)
497 |
498 | #compute the number of interactions associated with the last interacted item in this session
499 | equal_last_item_indices = np.array(session_interactions[session_id][:configuration.sess_length+ sess_step -1]) == row.last_item
500 | last_item_interaction = len(set(np.array(session_actions[session_id][:configuration.sess_length+ sess_step -1])[equal_last_item_indices]))
501 |
502 | #compute the local interaction count for each item id
503 | interaction_indices = np.array(session_actions[session_id][:configuration.sess_length+ sess_step -1]) != transformed_clickout_action
504 | interaction_item = np.array(session_interactions[session_id][:configuration.sess_length+ sess_step -1])[interaction_indices]
505 | sess_unique_items, counts = np.unique(interaction_item, return_counts=True)
506 | interaction_count_dict = dict(zip(sess_unique_items, counts))
507 |
508 | #compute the local interaction image count for each item id
509 | interaction_image_indices = np.array(session_actions[session_id][:configuration.sess_length+ sess_step -1]) == transformed_interaction_image
510 | interaction_image_item = np.array(session_interactions[session_id][:configuration.sess_length+ sess_step -1])[interaction_image_indices]
511 | sess_unique_image_items, counts = np.unique(interaction_image_item, return_counts=True)
512 | interaction_image_count_dict = dict(zip(sess_unique_image_items, counts))
513 |
514 | #compute the local interaction deals count for each item id
515 | interaction_deals_indices = np.array(session_actions[session_id][:configuration.sess_length+ sess_step -1]) == transformed_interaction_deals
516 | interaction_deals_item = np.array(session_interactions[session_id][:configuration.sess_length+ sess_step -1])[interaction_deals_indices]
517 | sess_unique_deals_items, counts = np.unique(interaction_deals_item, return_counts=True)
518 | interaction_deals_count_dict = dict(zip(sess_unique_deals_items, counts))
519 |
520 | #compute the local clickout count for each item id
521 | interaction_clickout_indices = np.array(session_actions[session_id][:configuration.sess_length+ sess_step -1]) == transformed_clickout_action
522 | interaction_clickout_item = np.array(session_interactions[session_id][:configuration.sess_length+ sess_step -1])[interaction_clickout_indices]
523 | sess_unique_clickout_items, counts = np.unique(interaction_clickout_item, return_counts=True)
524 | interaction_clickout_count_dict = dict(zip(sess_unique_clickout_items, counts))
525 |
526 | #compute the local interaction rating count for each item id
527 | interaction_rating_indices = np.array(session_actions[session_id][:configuration.sess_length+ sess_step -1]) == transformed_interaction_rating
528 | interaction_rating_item = np.array(session_interactions[session_id][:configuration.sess_length+ sess_step -1])[interaction_rating_indices]
529 | sess_unique_rating_items, counts = np.unique(interaction_rating_item, return_counts=True)
530 | interaction_rating_count_dict = dict(zip(sess_unique_rating_items, counts))
531 |
532 |
533 | # get the time difference array for this session so its average can be computed later
534 | finite_time_diff_indices = np.isfinite(session_time_diff[session_id][:sess_step -1])
535 | finite_time_diff_array = np.array(session_time_diff[session_id][:sess_step -1])[finite_time_diff_indices]
536 |
537 | # unpad the interactions
538 | unpad_interactions = session_interactions[session_id][configuration.sess_length:configuration.sess_length+ sess_step -1]
539 | unique_interaction = pd.unique(session_interactions[session_id][:configuration.sess_length+ sess_step -1])
540 |
541 | # time elapsed between consecutive steps for each item interacted with before the clickout
542 | item_time_elapse_dict = {}
543 | for it, elapse in zip(unpad_interactions[:-1], session_time_diff[session_id][1:sess_step -1]):
544 | if it not in item_time_elapse_dict:
545 | item_time_elapse_dict[it] = [elapse]
546 |
547 | else:
548 | item_time_elapse_dict[it].append(elapse)
549 |
550 | # compute time_diff for each item in the session
551 | interact_diff = [unpad_interactions[::-1].index(imp) if imp in unpad_interactions else np.nan for imp in transformed_impressions]
552 | item_time_diff = np.array([ sum(session_time_diff[session_id][sess_step - diff -1 :sess_step]) if np.isfinite(diff) else np.nan for diff in interact_diff])
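# interact_diff is the number of steps since the item was last interacted with in this
# session (0 = the step right before the clickout), and item_time_diff sums the step
# time differences over that window, i.e. the time elapsed since that last interaction.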
553 |
554 | target_index = transformed_impressions.tolist().index(row.item_id) if training else np.nan
555 |
556 | #(imp len, num items)
557 | current_co_occ = co_occ_matrix[transformed_impressions,:]
558 |
559 |
560 | #(imp len, num unique items in the session b4 this clickout)
561 | current_co_occ = current_co_occ[:,sess_unique_items].toarray()
562 |
563 | # (1, num unique items in the session b4 this clickout)
564 | # print(current_co_occ.dtype)
565 |
566 | norm = (1 + co_occ_matrix_csc[:, sess_unique_items].sum(axis=0).reshape(-1))
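# current_co_occ holds co-occurrence counts between each impressed item and every item
# interacted with earlier in this session; norm is 1 plus the column sums of the
# co-occurrence matrix for those session items, so the statistics stored in columns
# 60-63 below are normalized co-occurrence frequencies.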
567 |
568 | # #(imp len, num items)
569 | # imp_current_co_occ = imp_co_occ_matrix[transformed_impressions,:]
570 |
571 |
572 | # #(imp len, num unique items in the session b4 this clickout)
573 | # imp_current_co_occ = imp_current_co_occ[:,sess_unique_items].toarray()
574 |
575 | # # (1, num unique items in the session b4 this clickout)
576 | # # print(current_co_occ.dtype)
577 |
578 | # imp_norm = (1 + imp_co_occ_matrix_csc[:, sess_unique_items].sum(axis=0).reshape(-1))
579 |
580 | # norm_imp_current_co_occ = imp_current_co_occ / imp_norm
581 |
582 | # the position of the last interacted item in the current impression list
583 | if row.last_item in transformed_impressions:
584 | last_interact_index = transformed_impressions.tolist().index(row.last_item)
585 | else:
586 | last_interact_index = np.nan
587 |
588 | # the position of the second last interacted item in the current impression list
589 | if row.second_last_item in transformed_impressions:
590 | second_last_interact_index = transformed_impressions.tolist().index(row.second_last_item)
591 | else:
592 | second_last_interact_index = np.nan
593 |
594 | # the position of the third last interacted item in the current impression list
595 | if row.third_last_item in transformed_impressions:
596 | third_last_interact_index = transformed_impressions.tolist().index(row.third_last_item)
597 | else:
598 | third_last_interact_index = np.nan
599 |
600 | # initialize dictionaries
601 | if row.session_id not in last_click_sess_dict:
602 | last_click_sess_dict[row.session_id] = transformed_dummy_item
603 |
604 | if row.session_id not in last_impressions_dict:
605 | last_impressions_dict[row.session_id] = None
606 |
607 | if row.session_id not in sess_last_imp_idx_dict:
608 | sess_last_imp_idx_dict[row.session_id] = DUMMY_IMPRESSION_INDEX
609 |
610 | if row.session_id not in sess_last_price_dict:
611 | sess_last_price_dict[row.session_id] = None
612 |
613 | if row.session_id not in sess_time_diff_dict:
614 | sess_time_diff_dict[row.session_id] = None
615 |
616 | if row.session_id not in sess_step_diff_dict:
617 | sess_step_diff_dict[row.session_id] = None
618 |
619 |
620 | # item id
621 | current_rows[:, 0] = transformed_impressions
622 |
623 | # label
624 | current_rows[:, 1] = transformed_impressions == row.item_id
625 | current_rows[:, 2] = row.session_id
626 |
627 | # whether current item id equal to the last interacted item id
628 | current_rows[:, 3] = transformed_impressions == row.last_item
629 | current_rows[:, 4] = price_rank
630 | current_rows[:, 5] = row.platform
631 | current_rows[:, 6] = row.device
632 | current_rows[:, 7] = row.city
633 | current_rows[:, 8] = row.prices
634 | current_rows[:, 9] = row.country
635 |
636 | # impression index
637 | current_rows[:, 10] = np.arange(len(row.impressions))
638 | current_rows[:, 11] = row.step
639 | current_rows[:, 12] = row.id
640 |
641 | # last_click_item: last clickout item id
642 | current_rows[:, 13] = last_click_sess_dict[row.session_id]
643 |
644 | # equal_last_impressions: the current impression list is exactly the same as the last one the user encountered
645 | current_rows[:, 14] = last_impressions_dict[row.session_id] == transformed_impressions.tolist()
646 |
647 |
648 | current_rows[:, 15] = sess_last_imp_idx_dict[row.session_id]
649 | # last_interact_index
650 | current_rows[:, 16] = last_interact_index
651 |
652 | # price_diff
653 | current_rows[:, 17] = row.prices - sess_last_price_dict[row.session_id] if sess_last_price_dict[row.session_id] else np.nan
654 |
655 | # last_price
656 | current_rows[:, 18] = sess_last_price_dict[row.session_id] if sess_last_price_dict[row.session_id] else np.nan
657 |
658 | # price_ratio
659 | current_rows[:, 19] = row.prices / sess_last_price_dict[row.session_id] if sess_last_price_dict[row.session_id] else np.nan
660 |
661 | # clickout_time_diff
662 | current_rows[:, 20] = row.timestamp - sess_time_diff_dict[row.session_id] if sess_time_diff_dict[row.session_id] else np.nan
663 |
664 | # country_platform
665 | current_rows[:, 21] = row.country_platform
666 |
667 | # impression_count
668 | current_rows[:, 22] = [impression_count_dict[imp] for imp in row.impressions]
669 |
670 | # is_interacted: whether the item has been interacted with in the current session
671 | current_rows[:, 23] = [imp in session_interactions[session_id][:configuration.sess_length+ sess_step -1] for imp in transformed_impressions]
672 |
673 | # local_interaction_image_count
674 | current_rows[:, 24] = [interaction_image_count_dict[imp] if imp in interaction_image_count_dict else 0 for imp in transformed_impressions]
675 | # local_interaction_deals_count
676 | current_rows[:, 25] = [interaction_deals_count_dict[imp] if imp in interaction_deals_count_dict else 0 for imp in transformed_impressions]
677 |
678 | # local_interaction_clickout_count
679 | current_rows[:, 26] = [interaction_clickout_count_dict[imp] if imp in interaction_clickout_count_dict else 0 for imp in transformed_impressions]
680 |
681 | # global_interaction_image_count
682 | current_rows[:, 27] = [global_image_count_dict[imp] if imp in global_image_count_dict else 0 for imp in transformed_impressions]
683 |
684 | # global_interaction_deals_count
685 | current_rows[:, 28] = [global_deals_count_dict[imp] if imp in global_deals_count_dict else 0 for imp in transformed_impressions]
686 |
687 | # is_clicked
688 | current_rows[:, 29] = [imp in past_interaction_dict[row.user_id] for imp in transformed_impressions]
689 |
690 | # click_diff
691 | current_rows[:, 30] = [past_interaction_dict[row.user_id][::-1].index(imp) if imp in past_interaction_dict[row.user_id] else np.nan for imp in transformed_impressions]
692 |
693 | # average of the count/flag features in columns 23-29 across the impression list
694 | for i in range(31, 38):
695 | current_rows[:, i] = np.mean(current_rows[:, i-8])
696 |
697 | # impression_avg_prices
698 | current_rows[:, 38] = np.mean(row.prices)
699 | current_rows[:, 39] = row.device_platform
700 |
701 | # equal_max_liic: equals the maximum of the local interaction image count
702 | current_rows[:, 40] = np.array(current_rows[:, 24]) == np.max(current_rows[:, 24]) if sum(current_rows[:, 24]) >0 else False
703 |
704 | # num_interacted_items
705 | current_rows[:, 41] = len(np.unique(session_interactions[session_id][:configuration.sess_length+ sess_step -1]))
706 |
707 | # equal_second_last_item
708 | current_rows[:, 42] = transformed_impressions == row.second_last_item
709 |
710 | # last_action
711 | current_rows[:, 43] = session_actions[session_id][configuration.sess_length+ sess_step -2]
712 |
713 | # last_second_last_imp_idx_diff
714 | current_rows[:, 44] = last_interact_index - second_last_interact_index
715 |
716 | # predicted_next_imp_idx (the idea is to extrapolate the user's eye movement: last_interact_index + (last_interact_index - second_last_interact_index))
717 | current_rows[:, 45] = 2 * last_interact_index - second_last_interact_index
718 |
719 | # list_len
720 | current_rows[:, 46] = len(row.impressions)
721 |
722 | # imp_idx_velocity
723 | current_rows[:, 47] = last_interact_index - 2 * second_last_interact_index + third_last_interact_index
724 |
725 | # time_diff_sess_avg
726 | current_rows[:, 48] = np.mean(finite_time_diff_array)
727 |
728 | # max_time_elapse
729 | current_rows[:, 49] = [ max(item_time_elapse_dict[imp]) if imp in item_time_elapse_dict else np.nan for imp in transformed_impressions]
730 |
731 | # sum_time_elapse
732 | current_rows[:, 50] = [ sum(item_time_elapse_dict[imp]) if imp in item_time_elapse_dict else np.nan for imp in transformed_impressions]
733 |
734 | # avg_time_elapse
735 | current_rows[:, 51] = [ np.mean(item_time_elapse_dict[imp]) if imp in item_time_elapse_dict else np.nan for imp in transformed_impressions]
736 |
737 | # item_time_diff
738 | current_rows[:, 52] = item_time_diff
739 |
740 | # global_interaction_count
741 | current_rows[:, 53] = [global_interaction_count_dict[imp] if imp in global_interaction_count_dict else 0 for imp in transformed_impressions]
742 |
743 | # average global_interaction_count
744 | current_rows[:, 54] = np.mean(current_rows[:, 53])
745 |
746 | # std of global interaction image count
747 | current_rows[:, 55] = np.std(current_rows[:, 27])
748 |
749 | # std of global interaction count
750 | current_rows[:, 56] = np.std(current_rows[:, 53])
751 |
752 | # local_interaction_count
753 | current_rows[:, 57] = [interaction_count_dict[imp] if imp in interaction_count_dict else 0 for imp in transformed_impressions]
754 | current_rows[:, 58] = target_index
755 |
756 | # target price
757 | current_rows[:, 59] = row.prices[target_index] if not np.isnan(target_index) else np.nan
758 |
759 | # normalized co-occurrence statistics
760 | current_rows[:, 60] = np.mean(current_co_occ/ norm, axis=1).reshape(-1)
761 | current_rows[:, 61] = np.min(current_co_occ/ norm, axis=1).reshape(-1)
762 | current_rows[:, 62] = np.max(current_co_occ/norm, axis=1).reshape(-1)
763 | current_rows[:, 63] = np.median(current_co_occ/norm, axis=1).reshape(-1)
764 |
765 | # last_item_interaction
766 | current_rows[:, 64] = last_item_interaction
767 |
768 | # target price rank
769 | current_rows[:, 65] = price_rank[target_index] if not np.isnan(target_index) else np.nan
770 | # current_rows[:, 66] = np.mean(norm_imp_current_co_occ, axis=1).reshape(-1)
771 | # current_rows[:, 67] = np.min(norm_imp_current_co_occ, axis=1).reshape(-1)
772 | # current_rows[:, 68] = np.max(norm_imp_current_co_occ, axis=1).reshape(-1)
773 | # current_rows[:, 69] = np.median(norm_imp_current_co_occ, axis=1).reshape(-1)
774 |
775 |
776 |
777 |
778 |
779 | if training or row.item_id == transformed_nan_item:
780 | df_list.append(current_rows)
781 | else:
782 | label_test_df_list.append(current_rows)
783 | # cumulative_click_dict[row.item_id] += 1
784 | past_interaction_dict[row.user_id].append(row.item_id)
785 | last_click_sess_dict[row.session_id] = row.item_id
786 | last_impressions_dict[row.session_id] = transformed_impressions.tolist()
787 | sess_time_diff_dict[row.session_id] = row.timestamp
788 | sess_step_diff_dict[row.session_id] = row.step
789 | if row.item_id != transformed_nan_item:
790 | sess_last_imp_idx_dict[row.session_id] = (transformed_impressions == row.item_id).tolist().index(True)
791 | sess_last_price_dict[row.session_id] = np.array(row.prices)[ transformed_impressions == row.item_id ][0]
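# the dictionaries above carry per-user/per-session state across rows; they are only
# updated after the current row's features have been built, so a clickout's features
# reflect previously processed clickouts only.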
792 | # cumulative_click_dict[row.item_id] += 1
793 | data = np.vstack(df_list)
794 | df_columns = ['item_id', 'label', 'session_id', 'equal_last_item', 'price_rank', 'platform', 'device', 'city', 'price', 'country', 'impression_index','step', 'id','last_click_item','equal_last_impressions', 'last_click_impression','last_interact_index','price_diff','last_price','price_ratio','clickout_time_diff','country_platform','impression_count','is_interacted','local_interaction_image_count','local_interaction_deals_count','local_interaction_clickout_count','global_interaction_image_count','global_interaction_deals_count','is_clicked','click_diff', 'avg_is_interacted','avg_liic', 'avg_lidc','avg_licc','avg_giic','avg_gdc','avg_is_clicked','impression_avg_prices','device_platform','equal_max_liic','num_interacted_items','equal_second_last_item','last_action','last_second_last_imp_idx_diff','predicted_next_imp_idx', 'list_len','imp_idx_velocity','time_diff_sess_avg','max_time_elapse','sum_time_elapse','avg_time_elapse','item_time_diff','global_interaction_count','avg_gic','std_giic','std_gic','local_interaction_count','target_index','target_price','co_occ_mean_norm','co_occ_min_norm','co_occ_max_norm','co_occ_median_norm','last_item_interaction','target_price_rank']
795 | dtype_dict = {"item_id":"int32", "label": "int8", "equal_last_item":"int8", "step":"int16", "price_rank": "int32","impression_index":"int32", "platform":"int32","device":"int32","city":"int32", "id":"int32", "country":"int32", "price":"int16", "last_click_item":"int32", "equal_last_impressions":"int8", 'last_click_impression':'int16', 'last_interact_index':'float32', 'price_diff':'float16','last_price':'float16','price_ratio':'float32','clickout_time_diff':'float16','country_platform':'int32','impression_count':'int32','is_interacted':'int8','local_interaction_image_count':'int32','local_interaction_deals_count':'int32','local_interaction_clickout_count':'int32','global_interaction_image_count':'int32','global_interaction_deals_count':'int32','is_clicked':'int8','click_diff':'float32'\
796 | , 'avg_is_interacted':'float16' ,'avg_liic':'float16', 'avg_lidc':'float32','avg_licc':'float32','avg_giic':'float32','avg_gdc':'float32','avg_is_clicked':'float32','impression_avg_prices':'float32','device_platform':'int32','equal_max_liic':'int8','num_interacted_items':'int32','equal_second_last_item':'int8','last_action':'int32','last_second_last_imp_idx_diff':'float32', 'predicted_next_imp_idx': 'float32','list_len':'int16','imp_idx_velocity':'float32','time_diff_sess_avg':'float32','max_time_elapse':'float32','sum_time_elapse':'float32','avg_time_elapse':'float32','item_time_diff':'float32','global_interaction_count':'float32','avg_gic':'float32','std_giic':'float32','std_gic':'float32','local_interaction_count':'int32','target_index':'float32','target_price':'float32','co_occ_mean_norm':'float32','co_occ_min_norm':'float32','co_occ_max_norm':'float32','co_occ_median_norm':'float32','last_item_interaction':'int32','target_price_rank':'float32'}
797 | df = pd.DataFrame(data, columns=df_columns)
798 | df = df.astype(dtype=dtype_dict )
799 | if training:
800 | return df
801 | else:
802 | label_test = np.vstack(label_test_df_list)
803 | label_test = pd.DataFrame(label_test, columns=df_columns)
804 | label_test = label_test.astype(dtype= dtype_dict)
805 | return df, label_test
806 |
807 |
808 |
809 |
810 | train.sort_values('timestamp',inplace=True)
811 | val.sort_values('timestamp',inplace=True)
812 | test.sort_values('timestamp',inplace=True)
813 |
814 | # print("sorted!!")
815 | train = parse_impressions(train, train_session_interactions, train_session_actions, train_session_time_diff)
816 | val = parse_impressions(val, train_session_interactions, train_session_actions, train_session_time_diff)
817 | test, label_test = parse_impressions(test, test_session_interactions, test_session_actions, test_session_time_diff, training=False)
818 |
819 | if configuration.use_test:
820 | train = pd.concat([train, label_test], axis=0)
821 |
822 |
823 |
824 |
825 |
826 |
827 | print("test before merge", test.shape)
828 | train = train.merge(item_properties_df, on="item_id", how="left")
829 | val = val.merge(item_properties_df, on="item_id", how="left")
830 | test = test.merge(item_properties_df, on="item_id", how="left")
831 |
832 |
833 | print("test ", test.shape)
834 | train = train.merge(filters_df, on='id', how="left")
835 | val = val.merge(filters_df, on='id', how="left")
836 | test = test.merge(filters_df, on='id', how="left")
837 |
838 |
839 | # print("test ", test.shape)
840 | # print("test before merge data_feature", test.shape)
841 |
842 | train = train.merge(data_feature, on='id', how="left")
843 | val = val.merge(data_feature, on='id', how="left")
844 | test = test.merge(data_feature, on='id', how="left")
845 | print("test ", test.shape)
846 |
847 | del filters_df, data_feature
848 | del data
849 | gc.collect()
850 |
851 | # target encoding
852 | agg_cols = [ 'price_rank', 'city', 'platform', 'device', 'country', 'impression_index','star']
853 | for c in agg_cols:
854 | gp = train.groupby(c)['label']
855 | mean = gp.mean()
856 | train[f'{c}_label_avg'] = train[c].map(mean)
857 | val[f'{c}_label_avg'] = val[c].map(mean)
858 | test[f'{c}_label_avg'] = test[c].map(mean)
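# target encoding: for each categorical column, map the mean training label of each
# category onto train, val and test; the means are computed on the training set only.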
859 |
860 |
861 |
862 |
863 |
864 |
865 |
866 | agg_cols = ['city','impression_index', 'platform']
867 | for c in agg_cols:
868 | gp = train.groupby(c)['price']
869 | mean = gp.mean()
870 | train[f'{c}_price_avg'] = train[c].map(mean)
871 | val[f'{c}_price_avg'] = val[c].map(mean)
872 | test[f'{c}_price_avg'] = test[c].map(mean)
873 |
874 |
875 |
876 | agg_cols = ['city']
877 | for c in agg_cols:
878 | gp = train.groupby(c)['rg_time_diff']
879 | mean = gp.mean()
880 | train[f'{c}_td_avg'] = train[c].map(mean)
881 | val[f'{c}_td_avg'] = val[c].map(mean)
882 | test[f'{c}_td_avg'] = test[c].map(mean)
883 |
884 |
885 |
886 | train['rg_price'] = train.price.map(price_rg_price_dict)
887 | val['rg_price'] = val.price.map(price_rg_price_dict)
888 | test['rg_price'] = test.price.map(price_rg_price_dict)
889 |
890 |
891 |
892 | # price binning within each city
893 |
894 | data = pd.concat([train,val,test], axis=0).reset_index()
895 | data = data.loc[:,['city','price']].drop_duplicates(['city','price'])
896 | data['city_price_bin'] = data.groupby('city').price.apply(lambda x: qcut_safe(x, q = 40).astype(str))
897 | data['city_price_bin'] = data.apply( lambda x: str(x.city) + x.city_price_bin,axis=1)
898 | data['city_price_bin'] = data['city_price_bin'].factorize()[0]
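# city_price_bin: within each city, prices are cut into up to 40 quantile bins
# (qcut_safe, defined in utils.py, presumably handles cities with too few distinct
# prices); the city + bin string is then factorized into an integer id.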
899 |
900 |
901 | train = train.merge(data, on=['city','price'], how='left')
902 | val = val.merge(data, on=['city','price'], how='left')
903 | test = test.merge(data, on=['city','price'], how='left')
904 |
905 |
906 |
907 | print("train", train.shape)
908 | print("val", val.shape)
909 | print("test", test.shape)
910 | # test = test.merge(item_properties_df, on="item_id", how="left")
911 |
912 |
913 |
914 |
915 |
916 | data_drop_columns= ['label', 'session_id', 'step', 'id']
917 | data_drop_columns+= ['target_index','target_price','target_price_rank']
918 | # data_drop_columns+= ['avg_lidc','avg_licc']
919 |
920 | train_label = train.label
921 |
922 | val_label = val.label
923 |
924 |
925 | d_train = xgb.DMatrix(data=train.drop(data_drop_columns, axis=1), label=train_label.values, silent=True, nthread=-1, feature_names=train.drop(data_drop_columns, axis=1).columns.tolist())
926 | d_val = xgb.DMatrix(data=val.drop(data_drop_columns, axis=1), label=val_label.values, silent=True, nthread= -1, feature_names=train.drop(data_drop_columns, axis=1).columns.tolist())
927 | d_test = xgb.DMatrix(test.drop(data_drop_columns, axis=1), nthread=-1, feature_names=train.drop(data_drop_columns, axis=1).columns.tolist())
928 |
929 | cat_cols = [ 'item_id', "price_rank", 'city', 'platform', 'device', 'country', 'impression_index','star','last_click_impression','last_click_item','last_interact_index','country_platform']
930 |
931 | for col in cat_cols:
932 | if (train[col] < 0).sum() > 0:
933 | print("contains negative ", col)
934 |
935 | del train
936 | gc.collect()
937 |
938 | # params = {
939 | # 'objective': 'binary',
940 | # 'boosting_type': 'gbdt',
941 | # 'nthread': multiprocessing.cpu_count() //2,
942 | # 'num_leaves': 200,
943 | # 'max_depth':10,
944 | # 'learning_rate': 0.05,
945 | # 'bagging_fraction': 0.8,
946 | # 'bagging_freq': 5,
947 | # 'feature_fraction':0.7,
948 | # 'seed': 0,
949 | # 'verbose': -1,
950 |
951 | # }
952 |
953 | params={
954 | 'eta': 0.02, # 0.03,
955 | "booster": "gbtree",
956 | 'tree_method':'hist',
957 | 'max_leaves': 350,
958 | 'max_depth': 10, # 18
959 | "nthread": multiprocessing.cpu_count() -1,
960 | 'subsample': 0.9,
961 | 'colsample_bytree': 0.8,
962 | 'colsample_bylevel': 0.8,
963 | 'min_child_weight': 2,
964 | 'alpha': 1,
965 | 'objective': 'binary:logistic',
966 | 'eval_metric': 'logloss',
967 | 'random_state': 5478,
968 | 'verbosity': 0,
969 | }
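# XGBoost is trained with the histogram tree method (max 350 leaves, depth 10),
# row/column subsampling and L1 regularization, using logistic loss on the binary
# click/no-click label and logloss as the early-stopping metric on the validation set.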
970 |
971 |
972 | watchlist = [ (d_train, 'train'), (d_val, 'valid')]
973 | clf = xgb.train(
974 | params=params,
975 | dtrain=d_train,
976 | num_boost_round=50000, #11927
977 | evals= watchlist,
978 | early_stopping_rounds=500,
979 | verbose_eval=500,
980 | # categorical_feature= cat_cols
981 | )
982 |
983 |
984 | # clf.save_model('../weights/lgb-10000-200-01.model')
985 |
986 | def evaluate(val_df, clf):
987 | val_df['scores'] = clf.predict(d_val)
988 | grouped_val = val_df.groupby('session_id')
989 | rss = []
990 | for _, group in grouped_val:
991 |
992 | scores = group.scores
993 | sorted_arg = np.flip(np.argsort(scores))
994 | rss.append( group['label'].values[sorted_arg])
995 |
996 | mrr = compute_mean_reciprocal_rank(rss)
997 | return mrr
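# evaluation: within each validation session, items are sorted by descending score and
# the reordered 0/1 labels are passed to compute_mean_reciprocal_rank (utils.py).
# A minimal sketch of what that helper is assumed to compute:
#
#   def compute_mean_reciprocal_rank(rss):
#       # rss: list of 0/1 label arrays, each sorted by descending model score
#       return np.mean([1.0 / (rs.argmax() + 1) if rs.sum() > 0 else 0.0 for rs in rss])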
998 |
999 |
1000 |
1001 | mrr = evaluate(val, clf)
1002 |
1003 | print("MRR score: ", mrr)
1004 |
1005 |
1006 |
1007 | imp = clf.get_score( importance_type='gain')
1008 | imp_df = pd.DataFrame.from_dict(imp, orient='index').reset_index()
1009 |
1010 | imp_df.columns=['name','importance']
1011 | imp_df.sort_values('importance', ascending=False, inplace=True)
1012 |
1013 |
1014 |
1015 | print(imp_df.head(20))
1016 |
1017 |
1018 | # del d_train
1019 | # gc.collect()
1020 |
1021 | if configuration.debug:
1022 | exit(0)
1023 |
1024 | predictions = []
1025 | session_ids = []
1026 |
1027 | test['score'] = clf.predict(d_test)
1028 | save_test = test.copy()
1029 | save_test['item_id'] = cat_encoders['item_id'].reverse_transform(save_test.item_id.values)
1030 | with open(f'../output/{model_name}_test_score.p', 'wb') as f:
1031 | pickle.dump( save_test.loc[:,['score', 'session_id', 'item_id', 'step']],f, protocol=4)
1032 |
1033 | grouped_test = test.groupby('session_id')
1034 | for session_id, group in grouped_test:
1035 | scores = group['score']
1036 | sorted_arg = np.flip(np.argsort(scores))
1037 | sorted_item_ids = group['item_id'].values[sorted_arg]
1038 | sorted_item_ids = cat_encoders['item_id'].reverse_transform(sorted_item_ids)
1039 | sorted_item_string = ' '.join([str(i) for i in sorted_item_ids])
1040 | predictions.append(sorted_item_string)
1041 | session_ids.append(session_id)
1042 |
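# build the submission: for each test session, item ids are listed in descending score
# order and joined into a space-separated string for the item_recommendations column.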
1043 | prediction_df = pd.DataFrame()
1044 | prediction_df['session_id'] = session_ids
1045 | prediction_df['item_recommendations'] = predictions
1046 |
1047 | print("pred df shape", prediction_df.shape)
1048 | sub_df = pd.read_csv('../input/submission_popular.csv')
1049 | sub_df.drop('item_recommendations', axis=1, inplace=True)
1050 | sub_df = sub_df.merge(prediction_df, on="session_id")
1051 | # sub_df['item_recommendations'] = predictions
1052 |
1053 | sub_df.to_csv(f'../output/{model_name}.csv', index=None)
1054 |
1055 |
1056 |
1057 |
1058 |
1059 |
1060 |
--------------------------------------------------------------------------------
/src/run_lgb.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | import pandas as pd
4 | import pickle
5 | import gc
6 | from constant import *
7 | from utils import *
8 | from config import *
9 | import torch
10 | from joblib import Parallel, delayed
11 | from tqdm import tqdm
12 | import lightgbm as lgb
13 | import scipy
14 | from sklearn.decomposition import TruncatedSVD
15 | import multiprocessing
16 | from ordered_set import OrderedSet
17 | from sklearn.metrics.pairwise import cosine_similarity
18 | from sklearn.metrics import log_loss
19 | import pycountry
20 | from sklearn.feature_extraction.text import TfidfVectorizer
21 | from scipy.spatial.distance import cosine
22 | import itertools
23 | from scipy.sparse import csr_matrix
24 |
25 | configuration = LGBConfiguration()
26 |
27 | model_name='lgb_imp_cooc_v2'
28 |
29 | if configuration.sub_sample:
30 | model_name += '_140k'
31 | else:
32 | model_name += '_all'
33 |
34 | if configuration.use_test:
35 | model_name += '_ut'
36 |
37 | seed_everything(42)
38 |
39 | ########################################################### Load data ######################################################################
40 | with open( f'{input_dir}/train_v2.p', 'rb') as f:
41 | train = pickle.load(f)
42 | train['id']= np.arange(len(train))
43 |
44 | with open(f'{input_dir}/test_v2.p', 'rb') as f:
45 | test = pickle.load(f)
46 | test['id'] = np.arange( len(train), len(train)+ len(test))
47 |
48 | with open('../input/item_metadata.p', 'rb') as f:
49 | item_meta = pickle.load(f)
50 | item_meta['properties'] = item_meta.properties.apply(lambda x: x.split('|'))
51 | item_meta['item_id'] = item_meta['item_id'].apply(str)
52 |
53 | # whether to use a sub-sample of the data to speed up evaluation
54 | if configuration.sub_sample:
55 | with open('../input/selected_users_140k.p', 'rb') as f:
56 | selected_users = pickle.load(f)
57 |
58 | train = train.loc[train.user_id.isin(selected_users),:]
59 |
60 | # debug mode: run on a tiny sample to check that the code executes end to end
61 | if configuration.debug:
62 | train = train.sample(1000)
63 | test = test.sample(1000)
64 |
65 | with timer("preprocessing"):
66 |
67 | # rename columns
68 | train.rename(columns={'reference': 'item_id', 'action_type': 'action'}, inplace=True)
69 | test.rename(columns={'reference': 'item_id', 'action_type': 'action'}, inplace=True)
70 |
71 | # concatenate the action and reference as strings, since these references are not actually item ids
72 | train.loc[train.action=='change of sort order','action'] = train.loc[train.action=='change of sort order'].apply(lambda row: row.action + str(row.item_id), axis=1)
73 | test.loc[test.action=='change of sort order','action'] = test.loc[test.action=='change of sort order'].apply(lambda row: row.action + str(row.item_id), axis=1)
74 |
75 |
76 | train.loc[train.action=='filter selection','action'] = train.loc[train.action=='filter selection'].apply(lambda row: row.action + str(row.item_id), axis=1)
77 | test.loc[test.action=='filter selection','action'] = test.loc[test.action=='filter selection'].apply(lambda row: row.action + str(row.item_id), axis=1)
78 |
79 |
80 | # wipe out the item id associated with these actions, for the same reason as above
81 | train.loc[train.action.str.contains('change of sort order'), 'item_id'] = DUMMY_ITEM
82 | test.loc[test.action.str.contains('change of sort order'), 'item_id'] = DUMMY_ITEM
83 |
84 | train.loc[train.action.str.contains('search for poi'), 'item_id'] = DUMMY_ITEM
85 | test.loc[test.action.str.contains('search for poi'), 'item_id'] = DUMMY_ITEM
86 |
87 | train.loc[train.action.str.contains('filter selection'), 'item_id'] = DUMMY_ITEM
88 | test.loc[test.action.str.contains('filter selection'), 'item_id'] = DUMMY_ITEM
89 |
90 | train.loc[train.action.str.contains('search for destination'), 'item_id'] = DUMMY_ITEM
91 | test.loc[test.action.str.contains('search for destination'), 'item_id'] = DUMMY_ITEM
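# for these non-item actions ('change of sort order', 'search for poi', 'filter
# selection', 'search for destination') the reference column holds free text rather
# than an item id, so item_id is reset to the DUMMY_ITEM sentinel.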
92 |
93 | # remove training examples where the clicked item is not in the impressions
94 | train['in_impressions'] = True
95 | train.loc[~train.impressions.isna(), 'in_impressions'] = train.loc[~train.impressions.isna()].apply(lambda row:row.item_id in row.impressions.split('|'), axis=1)
96 | train = train.loc[train.in_impressions].drop('in_impressions', axis=1).reset_index(drop=True)
97 |
98 | test['in_impressions'] = True
99 | test.loc[(~test.impressions.isna()) & (~test.item_id.isna()), 'in_impressions'] = test.loc[(~test.impressions.isna())& (~test.item_id.isna())].apply(lambda row:row.item_id in row.impressions.split('|'), axis=1)
100 | test = test.loc[test.in_impressions].drop('in_impressions', axis=1).reset_index(drop=True)
101 |
102 | # parse the impressions and prices columns from pipe-separated strings into lists
103 | train['item_id'] = train['item_id'].apply(str)
104 | train.loc[~train.impressions.isna(),'impressions'] = train.loc[~train.impressions.isna()].impressions.apply(lambda x: x.split('|'))
105 | train.loc[~train.prices.isna(), 'prices'] = train.loc[~train.prices.isna()].prices.apply(lambda x: x.split('|')).apply(lambda x: [int(p) for p in x])
106 |
107 | test['item_id'] = test['item_id'].apply(str)
108 | test.loc[~test.impressions.isna(),'impressions'] = test.loc[~test.impressions.isna()].impressions.apply(lambda x: x.split('|'))
109 | test.loc[~test.prices.isna(),'prices'] = test.loc[~test.prices.isna()].prices.apply(lambda x: x.split('|')).apply(lambda x: [int(p) for p in x])
110 |
111 | # compute the last interacted item by shifting item_id by 1 position
112 | train['last_item'] = np.nan
113 | test['last_item'] = np.nan
114 |
115 | train_shifted_item_id = [DUMMY_ITEM] + train.item_id.values[:-1].tolist()
116 | test_shifted_item_id = [DUMMY_ITEM] + test.item_id.values[:-1].tolist()
117 |
118 | # assign the shift-by-1 result as the last item, then shift item_id by 2 positions for the second last item
119 | train['last_item'] = train_shifted_item_id
120 | test['last_item'] = test_shifted_item_id
121 |
122 | train_shifted_item_id = [DUMMY_ITEM] *2 + train.item_id.values[:-2].tolist()
123 | test_shifted_item_id = [DUMMY_ITEM] *2 + test.item_id.values[:-2].tolist()
124 |
125 | # assign the shift-by-2 result as the second last item, then shift item_id by 3 positions for the third last item
126 | train['second_last_item'] = train_shifted_item_id
127 | test['second_last_item'] = test_shifted_item_id
128 |
129 | train_shifted_item_id = [DUMMY_ITEM] *3 + train.item_id.values[:-3].tolist()
130 | test_shifted_item_id = [DUMMY_ITEM] *3 + test.item_id.values[:-3].tolist()
131 |
132 | train['third_last_item'] = train_shifted_item_id
133 | test['third_last_item'] = test_shifted_item_id
134 |
135 | # mask out the last interacted item if that interaction comes first in its session
136 | train['step_rank'] = train.groupby('session_id')['step'].rank(method='max', ascending=True)
137 | test['step_rank'] = test.groupby('session_id')['step'].rank(method='max', ascending=True)
138 |
139 | # fill the invalid shifted last-n items with a dummy constant
140 | train.loc[(train.step_rank == 1) & (train.action == 'clickout item'), 'last_item'] = DUMMY_ITEM
141 | test.loc[(test.step_rank == 1) & (test.action == 'clickout item'), 'last_item'] = DUMMY_ITEM
142 |
143 | train.loc[(train.step_rank == 2) & (train.action == 'clickout item'), 'second_last_item'] = DUMMY_ITEM
144 | test.loc[(test.step_rank == 2) & (test.action == 'clickout item'), 'second_last_item'] = DUMMY_ITEM
145 |
146 | train.loc[(train.step_rank == 3) & (train.action == 'clickout item'), 'third_last_item'] = DUMMY_ITEM
147 | test.loc[(test.step_rank == 3) & (test.action == 'clickout item'), 'third_last_item'] = DUMMY_ITEM
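# because last/second/third_last_item were produced by a global shift over the whole
# dataframe, clickout rows near the start of a session (step_rank 1-3) would otherwise
# inherit items shifted in from an unrelated earlier session; those rows get DUMMY_ITEM
# instead.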
148 |
149 |
150 | # ignore this
151 | keep_columns = ['session_id', 'user_id','item_id', 'impressions','prices', 'city', 'step', 'last_item']
152 | all_cat_columns = ['item_id', 'city', 'platform', 'device','country','country_platform','action','device_platform']
153 |
154 |
155 | # generate country from city
156 | train['country'] = train.city.apply(lambda x:x.split(',')[-1])
157 | test['country'] = test.city.apply(lambda x:x.split(',')[-1])
158 |
159 | # concatenate country and platform as a string to form a new feature
160 | train['country_platform'] = train.apply(lambda row: row.country + row.platform, axis=1)
161 | test['country_platform'] = test.apply(lambda row: row.country + row.platform, axis=1)
162 |
163 | train['device_platform'] = train.apply(lambda row: row.device + row.platform, axis=1)
164 | test['device_platform'] = test.apply(lambda row: row.device + row.platform, axis=1)
165 | # filter out rows where the reference is not present in the impressions
166 | # train = train.loc[train.apply(lambda row:row.item_id in row.impressions, axis=1),:]
167 |
168 | print("train shape",train.shape)
169 |
170 |
171 | # concat train and test
172 | data = pd.concat([train, test], axis=0)
173 | data = data.reset_index(drop=True)
174 |
175 | # compute a dictionary that maps session id to the sequence of item ids in that session
176 | train_session_interactions = dict(train.groupby('session_id')['item_id'].apply(list))
177 | test_session_interactions = dict(test.groupby('session_id')['item_id'].apply(list))
178 |
179 |
180 | # compute a dictionary that maps session id to the sequence of actions in that session
181 | train_session_actions = dict(train.groupby('session_id')['action'].apply(list))
182 | test_session_actions = dict(test.groupby('session_id')['action'].apply(list))
183 |
184 | # compute the session step, since the "step" column in some sessions is not correctly ordered
185 | train['sess_step'] = train.groupby('session_id')['timestamp'].rank(method='max').apply(int)
186 | test['sess_step'] = test.groupby('session_id')['timestamp'].rank(method='max').apply(int)
187 |
188 |
189 |
190 |
191 | data_feature = data.loc[:,['id','step','session_id', 'timestamp','platform','country']].copy()
192 |
193 | # compute the time difference between each step
194 | data_feature['time_diff'] = data.groupby('session_id')['timestamp'].diff()
195 |
196 | # compute the difference of time difference between each step
197 | data_feature['time_diff_diff'] = data_feature.groupby('session_id')['time_diff'].diff()
198 |
199 | # compute the difference of the difference of time difference between each step
200 | data_feature['time_diff_diff_diff'] = data_feature.groupby('session_id')['time_diff_diff'].diff()
201 |
202 | # the time difference observed one step earlier (between the two previous steps)
203 | data_feature['time_diff_2'] = data.groupby('session_id')['timestamp'].diff().shift(1)
204 |
205 | # the time difference observed two steps earlier
206 | data_feature['time_diff_3'] = data.groupby('session_id')['timestamp'].diff().shift(2)
207 |
208 | data_feature['hour']= pd.to_datetime(data_feature.timestamp, unit='s').dt.hour//4
209 |
210 | # map platform to country
211 | data_feature['mapped_country'] = data_feature.platform.apply(platform2country)
212 |
213 |
214 | # load the precomputed country-to-UTC-offset mapping (computed with geopy)
215 | with open('../input/country2offsets_dict.p','rb') as f:
216 | platform_country2offsets_dict = pickle.load(f)
217 | data_feature['platform2country_utc_offsets'] = data_feature.mapped_country.map(platform_country2offsets_dict)
218 |
219 |
220 | # transform the time difference with rank gauss
221 | data_feature['rg_time_diff'] = GaussRankScaler().fit_transform(data_feature['time_diff'].values)
222 |
223 | # compute the log of step
224 | data_feature['step_log'] = np.log1p(data_feature['step'])
225 |
226 | # drop the useless columns
227 | data_feature = data_feature.drop(['session_id','step','timestamp','hour','platform','country','mapped_country'], axis=1)
228 |
229 |
230 |
231 |
232 | # merge train, test with data_feature
233 | train = train.merge(data_feature, on='id', how='left')
234 | test = test.merge(data_feature, on='id', how='left')
235 |
236 |
237 | # compute the sequence of time difference in each session
238 | train_session_time_diff = dict(train.groupby('session_id')['time_diff'].apply(list))
239 | test_session_time_diff = dict(test.groupby('session_id')['time_diff'].apply(list))
240 |
241 | # encode the categorical features
242 | cat_encoders = {}
243 | for col in all_cat_columns:
244 | cat_encoders[col] = CategoricalEncoder()
245 |
246 |
247 | all_items = []
248 | for imp in data.loc[~data.impressions.isna()].impressions.tolist() + [data.item_id.apply(str).tolist()] :
249 | all_items += imp
250 |
251 | unique_items = OrderedSet(all_items)
252 | unique_actions = OrderedSet(data.action.values)
253 |
254 | cat_encoders['item_id'].fit(list(unique_items) + [DUMMY_ITEM])
255 | cat_encoders['action'].fit( list(unique_actions) + [DUMMY_ACTION])
256 | for col in ['city', 'platform', 'device','country','country_platform', 'device_platform']:
257 |
258 | cat_encoders[col].fit(data[col].tolist() )
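# CategoricalEncoder (from utils.py) is assumed to build a value -> integer id mapping
# on fit and apply it on transform; item_id and action are fitted with an extra DUMMY
# sentinel, while the remaining columns are fitted on the concatenated train+test data
# so the ids are consistent across splits.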
259 |
260 |
261 | # transform all the categorical columns to continuous integer
262 | for col in all_cat_columns:
263 | train[col] = cat_encoders[col].transform(train[col].values)
264 | test[col] = cat_encoders[col].transform(test[col].values)
265 |
266 |
267 | # get the encoded action
268 | transformed_clickout_action = cat_encoders['action'].transform(['clickout item'])[0]
269 | transformed_dummy_item = cat_encoders['item_id'].transform([DUMMY_ITEM])[0]
270 | transformed_dummy_action = cat_encoders['action'].transform([DUMMY_ACTION])[0]
271 | transformed_interaction_image = cat_encoders['action'].transform(['interaction item image'])[0]
272 | transformed_interaction_deals = cat_encoders['action'].transform(['interaction item deals'])[0]
273 | transformed_interaction_info = cat_encoders['action'].transform(['interaction item info'])[0]
274 | transformed_interaction_rating = cat_encoders['action'].transform(['interaction item rating'])[0]
275 |
276 | # transform session interactions and pad dummies in front of all of them
277 | for session_id, item_list in train_session_interactions.items():
278 | train_session_interactions[session_id] = [transformed_dummy_item] * configuration.sess_length + cat_encoders['item_id'].transform(item_list)
279 |
280 | for session_id, item_list in test_session_interactions.items():
281 | test_session_interactions[session_id] = [transformed_dummy_item] * configuration.sess_length + cat_encoders['item_id'].transform(item_list)
282 |
283 | for session_id, action_list in train_session_actions.items():
284 | train_session_actions[session_id] = [transformed_dummy_action] * configuration.sess_length + cat_encoders['action'].transform(action_list)
285 |
286 | for session_id, action_list in test_session_actions.items():
287 | test_session_actions[session_id] = [transformed_dummy_action] * configuration.sess_length + cat_encoders['action'].transform(action_list)
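# left-padding every session's item and action sequence with sess_length dummies keeps
# slices of the form [:sess_length + sess_step - 1], used later when parsing
# impressions, valid even at the first steps of a session.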
288 |
289 |
290 | ### compute co-occurrence matrix
291 | implicit_train = train.loc[train.action != transformed_clickout_action, :]
292 | implicit_test = test.loc[test.action != transformed_clickout_action, :]
293 |
294 | # get all interacted items in a session
295 | implicit_all = pd.concat([implicit_train , implicit_test], axis=0)
296 | # a list of lists, each containing the items in one session
297 | co_occ_items = implicit_all.groupby('session_id').item_id.apply(list).to_dict().values()
298 | co_occ_permutes = [list(itertools.permutations(set(items), 2)) for items in co_occ_items]
299 |
300 | # aggregate co-occurrence pairs across sessions
301 | co_occ_coordinates = []
302 | for coordinates in co_occ_permutes:
303 | co_occ_coordinates += coordinates
304 |
305 | # construct the CSR co-occurrence matrix
306 | row, col, values = zip(*((i,j,1) for i,j in co_occ_coordinates ))
307 | co_occ_matrix= csr_matrix((values, (row, col)), shape=(cat_encoders['item_id'].n_elements, cat_encoders['item_id'].n_elements), dtype=np.float32)
308 |
309 | co_occ_matrix_csc = co_occ_matrix.tocsc()
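# each session contributes a 1 for every ordered pair of distinct items it interacted
# with; csr_matrix sums duplicate (row, col) entries, so the matrix holds the number of
# sessions in which two items co-occur.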
310 |
311 | print("max entry: ", co_occ_matrix.max())
312 |
313 |
314 | ### compute co-occurrence matrix for the impression list (commented out)
315 |
316 | # imp_co_occ_items = train.loc[~train.impressions.isna()].impressions.apply(lambda x: cat_encoders['item_id'].transform(x)).values.tolist() + test.loc[~test.impressions.isna()].impressions.apply(lambda x: cat_encoders['item_id'].transform(x)).values.tolist()
317 | # imp_co_occ_permutes = [list(itertools.permutations(set(items), 2)) for items in imp_co_occ_items]
318 |
319 | # #aggregate co-ocurrence across sessions
320 | # imp_co_occ_coordinates = []
321 | # for coordinates in imp_co_occ_permutes:
322 | # imp_co_occ_coordinates += coordinates
323 |
324 | # #construct csr
325 | # row, col, values = zip(*((i,j,1) for i,j in imp_co_occ_coordinates ))
326 | # imp_co_occ_matrix= csr_matrix((values, (row, col)), shape=(cat_encoders['item_id'].n_elements, cat_encoders['item_id'].n_elements), dtype=np.float32)
327 |
328 | # imp_co_occ_matrix_csc = imp_co_occ_matrix.tocsc()
329 |
330 | # print("max entry: ", imp_co_occ_matrix.max())
331 |
332 | # categorically encode the last, second last and third last items
333 | train['last_item'] = cat_encoders['item_id'].transform(train['last_item'].values)
334 | test['last_item'] = cat_encoders['item_id'].transform(test['last_item'].values)
335 |
336 | train['second_last_item'] = cat_encoders['item_id'].transform(train.second_last_item.values)
337 | test['second_last_item'] = cat_encoders['item_id'].transform(test.second_last_item.values)
338 |
339 | train['third_last_item'] = cat_encoders['item_id'].transform(train.third_last_item.values)
340 | test['third_last_item'] = cat_encoders['item_id'].transform(test.third_last_item.values)
341 |
342 |
343 |
344 |
345 | # generate item properties features
346 | item_meta = item_meta.loc[item_meta.item_id.isin(unique_items),:]
347 | # item_meta multi-hot
348 | item_meta['item_id'] = cat_encoders['item_id'].transform(item_meta['item_id'].values)
349 | item_meta['star'] = np.nan
350 | item_meta.loc[item_meta.properties.apply(lambda x: '1 Star' in x), 'star'] = 1
351 | item_meta.loc[item_meta.properties.apply(lambda x: '2 Star' in x), 'star'] = 2
352 | item_meta.loc[item_meta.properties.apply(lambda x: '3 Star' in x), 'star'] = 3
353 | item_meta.loc[item_meta.properties.apply(lambda x: '4 Star' in x), 'star'] = 4
354 | item_meta.loc[item_meta.properties.apply(lambda x: '5 Star' in x), 'star'] = 5
355 | item_meta.loc[(item_meta.star.isna()) & (item_meta.properties.apply(lambda y: 'Excellent Rating' in y) ), 'star'] = 9
356 | item_meta.loc[(item_meta.star.isna()) & (item_meta.properties.apply(lambda y: 'Very Good Rating' in y) ), 'star'] = 8
357 | item_meta.loc[(item_meta.star.isna()) & (item_meta.properties.apply(lambda y: 'Good Rating' in y) ), 'star'] = 7
358 | item_meta.loc[(item_meta.star.isna()) & (item_meta.properties.apply(lambda y: 'Satisfactory Rating' in y) ), 'star'] = 6
359 |
360 | item_meta['rating'] = np.nan
361 | item_meta.loc[item_meta.properties.apply(lambda x: 'Satisfactory Rating' in x), 'rating'] = 7.0
362 | item_meta.loc[item_meta.properties.apply(lambda x: 'Good Rating' in x), 'rating'] = 7.5
363 | item_meta.loc[item_meta.properties.apply(lambda x: 'Very Good Rating' in x), 'rating'] = 8.0
364 | item_meta.loc[item_meta.properties.apply(lambda x: 'Excellent Rating' in x), 'rating'] = 8.5
365 |
366 | # get binary properties feature
367 | item_properties_df = pd.DataFrame()
368 | item_properties_df['item_id'] = item_meta.item_id
369 | item_properties_df['num_properties'] = item_meta.properties.apply(len)
370 | item_properties_df['star'] = item_meta.star
371 | item_properties_df['item_Beach'] = item_meta.properties.apply(lambda x: 'Beach' in x).astype(np.float16)
372 | item_properties_df['item_Bed & Breakfast'] = item_meta.properties.apply(lambda x: 'Bed & Breakfast' in x).astype(np.float16)
373 | item_properties_df['rating'] = item_meta['rating']
374 |
375 |
376 | item_star_map = item_properties_df.loc[:,['item_id','star']].set_index('item_id').to_dict()['star']
377 | item_rating_map = item_properties_df.loc[:,['item_id','rating']].set_index('item_id').to_dict()['rating']
378 |
379 |
380 |
381 | del item_meta
382 | gc.collect()
383 |
384 | # note: filter_df is mostly unused; it consumes a lot of memory for only a small gain
385 | filter_df = data.loc[ ~data.current_filters.isna(), ['id', 'current_filters']]
386 | filter_df['current_filters'] = filter_df.current_filters.apply(lambda x:x.split('|'))
387 |
388 | # filter_df.loc[filter_df.current_filters.apply(lambda x: '3 Star' in x), 'nights'] = 3
389 | filter_df['nights']=np.nan
390 | filter_df.loc[filter_df.current_filters.apply(lambda x: '2 Nights' in x), 'nights'] = 1
391 | filter_df.loc[filter_df.current_filters.apply(lambda x: '3 Nights' in x), 'nights'] = 2
392 |
393 | filter_set = list(set(np.hstack(filter_df['current_filters'].to_list())))
394 |
395 | cat_encoders['filters'] = CategoricalEncoder()
396 | cat_encoders['filters'].fit(filter_set)
397 |
398 | # get binary filter feature
399 | filters_df = pd.DataFrame()
400 | filters_df['id'] = filter_df.id
401 | filters_df['num_filters'] = filter_df.current_filters.apply(len)
402 | filters_df['breakfast_included'] = filter_df.current_filters.apply( lambda x: 'Breakfast Included' in x).astype(np.float16)
403 | filters_df['filters_Sort By Price'] = filter_df.current_filters.apply( lambda x: 'Sort by Price' in x).astype(np.float16)
404 | filters_df['filters_Sort By Popularity'] = filter_df.current_filters.apply( lambda x: 'Sort By Popularity' in x).astype(np.float16)
405 |
406 |
407 |
408 | # compute interaction image count for each item across train/ test
409 | interaction_image_item_ids = train.loc[train.action == transformed_interaction_image, :].drop_duplicates(subset=['session_id','item_id','action']).item_id.tolist() + test.loc[test.action == transformed_interaction_image, :].drop_duplicates(subset=['session_id','item_id','action']).item_id.tolist()
410 | unique_interaction_image_items, counts = np.unique(interaction_image_item_ids, return_counts=True)
411 | global_image_count_dict = dict(zip(unique_interaction_image_items, counts))
412 |
413 | # compute interaction count for each item across train/ test
414 | interaction_item_ids = train.loc[train.action != transformed_clickout_action, :].drop_duplicates(subset=['session_id','item_id','action']).item_id.tolist() + test.loc[test.action != transformed_clickout_action, :].drop_duplicates(subset=['session_id','item_id','action']).item_id.tolist()
415 | unique_interaction_items, counts = np.unique(interaction_item_ids, return_counts=True)
416 | global_interaction_count_dict = dict(zip(unique_interaction_items, counts))
417 |
418 | # compute interaction deals count for each item across train/ test
419 | interaction_deals_item_ids = train.loc[train.action == transformed_interaction_deals, :].drop_duplicates(subset=['session_id','item_id','action']).item_id.tolist() + test.loc[test.action == transformed_interaction_deals, :].drop_duplicates(subset=['session_id','item_id','action']).item_id.tolist()
420 | unique_interaction_deals_items, counts = np.unique(interaction_deals_item_ids, return_counts=True)
421 | global_deals_count_dict = dict(zip(unique_interaction_deals_items, counts))
422 |
423 |
424 | # compute step rank to identify the last row in each session for train/ val split
425 | train = train.loc[train.action == transformed_clickout_action,:]
426 | test = test.loc[test.action == transformed_clickout_action,:]
427 | train['step_rank'] = train.groupby('session_id')['step'].rank(method='max', ascending=False)
428 |
429 | # compute the impression count for each item
430 | item_ids = np.hstack([np.hstack(train['impressions'].values), np.hstack(test.impressions.values)])
431 | unique_items, counts = np.unique(item_ids, return_counts=True)
432 | impression_count_dict = dict(zip(unique_items, counts))
433 |
434 | # compute the rank gauss transformed prices
435 | unique_prices = np.unique(np.hstack([np.hstack(train.prices.values), np.hstack(test.prices.values)]) )
436 | rg_unique_prices = GaussRankScaler().fit_transform(unique_prices)
437 | price_rg_price_dict = dict(zip(unique_prices, rg_unique_prices))
438 |
439 |
440 | #train/ val split
441 | if configuration.debug:
442 | val = train.loc[train.step_rank == 1,:].iloc[:5]
443 | else:
444 | val = train.loc[train.step_rank == 1,:].iloc[:50000]
445 |
446 | val_index = val.index
447 | train = train.loc[~train.index.isin(val_index),:]
448 |
449 | train = train.drop('step_rank', axis=1)
450 | val = val.drop('step_rank', axis=1)
451 |
452 |
453 | # get the encoded nan item
454 | transformed_nan_item = cat_encoders['item_id'].transform(['nan'])[0]
455 |
456 |
457 |
458 |
459 | from collections import defaultdict, Counter
460 | session_clickout_count_dict = {}
461 | past_interaction_dict = {}
462 | last_click_sess_dict = {}
463 | last_impressions_dict = {}
464 | sess_last_imp_idx_dict={}
465 | sess_last_price_dict = {}
466 | sess_time_diff_dict ={}
467 | sess_step_diff_dict = {}
468 |
469 | cumulative_click_dict = defaultdict(lambda : 0)
470 |
471 |
472 |
473 |
474 | def parse_impressions(df, session_interactions, session_actions, session_time_diff, training=True):
475 | # parse the data into a binary classification task, generate 1 example for each item in the impression list
476 | df_list = []
477 | label_test_df_list = []
478 | # parse impressions for train set
479 | past_interaction_rows = []
480 | past_interaction_columns = []
481 | for idx, row in enumerate(tqdm(df.itertuples())):
482 |
483 | if row.session_id not in session_clickout_count_dict:
484 | session_clickout_count_dict[row.session_id] = 0
485 |
486 | if row.user_id not in past_interaction_dict:
487 | past_interaction_dict[row.user_id] = []
488 |
489 |
490 | sess_step = row.sess_step
491 | session_id = row.session_id
492 |
493 | # compute the categorically encoded impression list
494 | transformed_impressions = cat_encoders['item_id'].transform(row.impressions, to_np=True)
495 |
496 | current_rows = np.zeros([len(row.impressions), 66], dtype=object)
497 |
498 | # compute the rank of each price within this clickout's impression list
499 | price_rank = compute_rank(row.prices)
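# compute_rank is assumed (from utils) to return, for each price in this impression list,
# its rank among the list's prices (cheaper items getting smaller ranks).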
500 |
501 | # compute the number of distinct action types performed on the last interacted item in this session
502 | equal_last_item_indices = np.array(session_interactions[session_id][:configuration.sess_length+ sess_step -1]) == row.last_item
503 | last_item_interaction = len(set(np.array(session_actions[session_id][:configuration.sess_length+ sess_step -1])[equal_last_item_indices]))
504 |
505 | #compute the local interaction count for each item id
506 | interaction_indices = np.array(session_actions[session_id][:configuration.sess_length+ sess_step -1]) != transformed_clickout_action
507 | interaction_item = np.array(session_interactions[session_id][:configuration.sess_length+ sess_step -1])[interaction_indices]
508 | sess_unique_items, counts = np.unique(interaction_item, return_counts=True)
509 | interaction_count_dict = dict(zip(sess_unique_items, counts))
510 |
511 | #compute the local interaction image count for each item id
512 | interaction_image_indices = np.array(session_actions[session_id][:configuration.sess_length+ sess_step -1]) == transformed_interaction_image
513 | interaction_image_item = np.array(session_interactions[session_id][:configuration.sess_length+ sess_step -1])[interaction_image_indices]
514 | sess_unique_image_items, counts = np.unique(interaction_image_item, return_counts=True)
515 | interaction_image_count_dict = dict(zip(sess_unique_image_items, counts))
516 |
517 | #compute the local interaction deals count for each item id
518 | interaction_deals_indices = np.array(session_actions[session_id][:configuration.sess_length+ sess_step -1]) == transformed_interaction_deals
519 | interaction_deals_item = np.array(session_interactions[session_id][:configuration.sess_length+ sess_step -1])[interaction_deals_indices]
520 | sess_unique_deals_items, counts = np.unique(interaction_deals_item, return_counts=True)
521 | interaction_deals_count_dict = dict(zip(sess_unique_deals_items, counts))
522 |
523 | #compute the local clickout count for each item id
524 | interaction_clickout_indices = np.array(session_actions[session_id][:configuration.sess_length+ sess_step -1]) == transformed_clickout_action
525 | interaction_clickout_item = np.array(session_interactions[session_id][:configuration.sess_length+ sess_step -1])[interaction_clickout_indices]
526 | sess_unique_clickout_items, counts = np.unique(interaction_clickout_item, return_counts=True)
527 | interaction_clickout_count_dict = dict(zip(sess_unique_clickout_items, counts))
528 |
529 | #compute the local interaction rating count for each item id
530 | interaction_rating_indices = np.array(session_actions[session_id][:configuration.sess_length+ sess_step -1]) == transformed_interaction_rating
531 | interaction_rating_item = np.array(session_interactions[session_id][:configuration.sess_length+ sess_step -1])[interaction_rating_indices]
532 | sess_unique_rating_items, counts = np.unique(interaction_rating_item, return_counts=True)
533 | interaction_rating_count_dict = dict(zip(sess_unique_rating_items, counts))
534 |
535 |
536 | # get the finite time-difference array in this session, used below to compute its average
537 | finite_time_diff_indices = np.isfinite(session_time_diff[session_id][:sess_step -1])
538 | finite_time_diff_array = np.array(session_time_diff[session_id][:sess_step -1])[finite_time_diff_indices]
539 |
540 | # unpad the interactions
541 | unpad_interactions = session_interactions[session_id][configuration.sess_length:configuration.sess_length+ sess_step -1]
542 | unique_interaction = pd.unique(session_interactions[session_id][:configuration.sess_length+ sess_step -1])
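# session_interactions and session_actions are assumed to be front-padded to configuration.sess_length,
# so [:configuration.sess_length + sess_step - 1] covers the padding plus every event before this
# clickout, and [configuration.sess_length : configuration.sess_length + sess_step - 1] is the
# unpadded interaction history of the current session.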
543 |
544 | # time elapsed between consecutive steps for each item before the clickout
545 | item_time_elapse_dict = {}
546 | for it, elapse in zip(unpad_interactions[:-1], session_time_diff[session_id][1:sess_step -1]):
547 | if it not in item_time_elapse_dict:
548 | item_time_elapse_dict[it] = [elapse]
549 |
550 | else:
551 | item_time_elapse_dict[it].append(elapse)
552 |
553 | # compute time_diff for each item in the session
554 | interact_diff = [unpad_interactions[::-1].index(imp) if imp in unpad_interactions else np.nan for imp in transformed_impressions]
555 | item_time_diff = np.array([ sum(session_time_diff[session_id][sess_step - diff -1 :sess_step]) if np.isfinite(diff) else np.nan for diff in interact_diff])
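# interact_diff is the reversed position of each impression in the session's interaction
# history (0 = most recently interacted), so summing the last diff + 1 per-step time
# differences approximates the time elapsed since that item was last interacted with in
# this session; items never interacted with stay NaN.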
556 |
557 | target_index = transformed_impressions.tolist().index(row.item_id) if training else np.nan
558 |
559 | #(imp len, num items)
560 | current_co_occ = co_occ_matrix[transformed_impressions,:]
561 |
562 |
563 | # (imp len, num unique items in the session before this clickout)
564 | current_co_occ = current_co_occ[:,sess_unique_items].toarray()
565 |
566 | # (1, num unique items in the session before this clickout)
567 | # print(current_co_occ.dtype)
568 |
569 | norm = (1 + co_occ_matrix_csc[:, sess_unique_items].sum(axis=0).reshape(-1))
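# norm appears to hold, for each session item, 1 + its total co-occurrence count with all items,
# so current_co_occ / norm below measures how strongly each impression co-occurs with each
# previously interacted item relative to that item's overall popularity (the +1 avoids division by zero).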
570 |
571 | # #(imp len, num items)
572 | # imp_current_co_occ = imp_co_occ_matrix[transformed_impressions,:]
573 |
574 |
575 | # #(imp len, num unique items in the session b4 this clickout)
576 | # imp_current_co_occ = imp_current_co_occ[:,sess_unique_items].toarray()
577 |
578 | # # (1, num unique items in the session b4 this clickout)
579 | # # print(current_co_occ.dtype)
580 |
581 | # imp_norm = (1 + imp_co_occ_matrix_csc[:, sess_unique_items].sum(axis=0).reshape(-1))
582 |
583 | # norm_imp_current_co_occ = imp_current_co_occ / imp_norm
584 |
585 | # the position of the last interacted item in the current impression list
586 | if row.last_item in transformed_impressions:
587 | last_interact_index = transformed_impressions.tolist().index(row.last_item)
588 | else:
589 | last_interact_index = np.nan
590 |
591 | # the position of the second last interacted item in the current impression list
592 | if row.second_last_item in transformed_impressions:
593 | second_last_interact_index = transformed_impressions.tolist().index(row.second_last_item)
594 | else:
595 | second_last_interact_index = np.nan
596 |
597 | # the position of the third last interacted item in the current impression list
598 | if row.third_last_item in transformed_impressions:
599 | third_last_interact_index = transformed_impressions.tolist().index(row.third_last_item)
600 | else:
601 | third_last_interact_index = np.nan
602 |
603 | # initialize dictionaries
604 | if row.session_id not in last_click_sess_dict:
605 | last_click_sess_dict[row.session_id] = transformed_dummy_item
606 |
607 | if row.session_id not in last_impressions_dict:
608 | last_impressions_dict[row.session_id] = None
609 |
610 | if row.session_id not in sess_last_imp_idx_dict:
611 | sess_last_imp_idx_dict[row.session_id] = DUMMY_IMPRESSION_INDEX
612 |
613 | if row.session_id not in sess_last_price_dict:
614 | sess_last_price_dict[row.session_id] = None
615 |
616 | if row.session_id not in sess_time_diff_dict:
617 | sess_time_diff_dict[row.session_id] = None
618 |
619 | if row.session_id not in sess_step_diff_dict:
620 | sess_step_diff_dict[row.session_id] = None
621 |
622 |
623 | # item id
624 | current_rows[:, 0] = transformed_impressions
625 |
626 | # label
627 | current_rows[:, 1] = transformed_impressions == row.item_id
628 | current_rows[:, 2] = row.session_id
629 |
630 | # whether current item id equal to the last interacted item id
631 | current_rows[:, 3] = transformed_impressions == row.last_item
632 | current_rows[:, 4] = price_rank
633 | current_rows[:, 5] = row.platform
634 | current_rows[:, 6] = row.device
635 | current_rows[:, 7] = row.city
636 | current_rows[:, 8] = row.prices
637 | current_rows[:, 9] = row.country
638 |
639 | # impression index
640 | current_rows[:, 10] = np.arange(len(row.impressions))
641 | current_rows[:, 11] = row.step
642 | current_rows[:, 12] = row.id
643 |
644 | # last_click_item: last clickout item id
645 | current_rows[:, 13] = last_click_sess_dict[row.session_id]
646 |
647 | # equal_last_impressions: whether the current impression list is exactly the same as the last one the user encountered
648 | current_rows[:, 14] = last_impressions_dict[row.session_id] == transformed_impressions.tolist()
649 |
650 |
651 | current_rows[:, 15] = sess_last_imp_idx_dict[row.session_id]
652 | # last_interact_index
653 | current_rows[:, 16] = last_interact_index
654 |
655 | # price_diff
656 | current_rows[:, 17] = row.prices - sess_last_price_dict[row.session_id] if sess_last_price_dict[row.session_id] else np.nan
657 |
658 | # last_price
659 | current_rows[:, 18] = sess_last_price_dict[row.session_id] if sess_last_price_dict[row.session_id] else np.nan
660 |
661 | # price_ratio
662 | current_rows[:, 19] = row.prices / sess_last_price_dict[row.session_id] if sess_last_price_dict[row.session_id] else np.nan
663 |
664 | # clickout_time_diff
665 | current_rows[:, 20] = row.timestamp - sess_time_diff_dict[row.session_id] if sess_time_diff_dict[row.session_id] else np.nan
666 |
667 | # country_platform
668 | current_rows[:, 21] = row.country_platform
669 |
670 | # impression_count
671 | current_rows[:, 22] = [impression_count_dict[imp] for imp in row.impressions]
672 |
673 | # is_interacted: whether that item has been interacted with in the current session
674 | current_rows[:, 23] = [imp in session_interactions[session_id][:configuration.sess_length+ sess_step -1] for imp in transformed_impressions]
675 |
676 | # local_interaction_image_count
677 | current_rows[:, 24] = [interaction_image_count_dict[imp] if imp in interaction_image_count_dict else 0 for imp in transformed_impressions]
678 | # local_interaction_deals_count
679 | current_rows[:, 25] = [interaction_deals_count_dict[imp] if imp in interaction_deals_count_dict else 0 for imp in transformed_impressions]
680 |
681 | # local_interaction_clickout_count
682 | current_rows[:, 26] = [interaction_clickout_count_dict[imp] if imp in interaction_clickout_count_dict else 0 for imp in transformed_impressions]
683 |
684 | # global_interaction_image_count
685 | current_rows[:, 27] = [global_image_count_dict[imp] if imp in global_image_count_dict else 0 for imp in transformed_impressions]
686 |
687 | # global_interaction_deals_count
688 | current_rows[:, 28] = [global_deals_count_dict[imp] if imp in global_deals_count_dict else 0 for imp in transformed_impressions]
689 |
690 | # is_clicked
691 | current_rows[:, 29] = [imp in past_interaction_dict[row.user_id] for imp in transformed_impressions]
692 |
693 | # click_diff
694 | current_rows[:, 30] = [past_interaction_dict[row.user_id][::-1].index(imp) if imp in past_interaction_dict[row.user_id] else np.nan for imp in transformed_impressions]
695 |
696 | # columns 31-37: impression-list averages of the count/flag features in columns 23-29 (is_interacted through is_clicked)
697 | for i in range(31, 38):
698 | current_rows[:, i] = np.mean(current_rows[:, i-8])
699 |
700 | # impression_avg_prices
701 | current_rows[:, 38] = np.mean(row.prices)
702 | current_rows[:, 39] = row.device_platform
703 |
704 | # equal_max_liic: equals the maximum of the local interaction image count
705 | current_rows[:, 40] = np.array(current_rows[:, 24]) == np.max(current_rows[:, 24]) if sum(current_rows[:, 24]) >0 else False
706 |
707 | # num_interacted_items
708 | current_rows[:, 41] = len(np.unique(session_interactions[session_id][:configuration.sess_length+ sess_step -1]))
709 |
710 | # equal_second_last_item
711 | current_rows[:, 42] = transformed_impressions == row.second_last_item
712 |
713 | # last_action
714 | current_rows[:, 43] = session_actions[session_id][configuration.sess_length+ sess_step -2]
715 |
716 | # last_second_last_imp_idx_diff
717 | current_rows[:, 44] = last_interact_index - second_last_interact_index
718 |
719 | # predicted_next_imp_idx: extrapolate the user's eye movement down the impression list, last_interact_index + (last_interact_index - second_last_interact_index)
720 | current_rows[:, 45] = 2 * last_interact_index - second_last_interact_index
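# e.g. if the user interacted with position 3 and then position 5, the extrapolated next
# position is 2 * 5 - 3 = 7 (NaN whenever either index is unknown).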
721 |
722 | # list_len
723 | current_rows[:, 46] = len(row.impressions)
724 |
725 | # imp_idx_velocity
726 | current_rows[:, 47] = last_interact_index - 2 * second_last_interact_index + third_last_interact_index
727 |
728 | # time_diff_sess_avg
729 | current_rows[:, 48] = np.mean(finite_time_diff_array)
730 |
731 | # max_time_elapse
732 | current_rows[:, 49] = [ max(item_time_elapse_dict[imp]) if imp in item_time_elapse_dict else np.nan for imp in transformed_impressions]
733 |
734 | # sum_time_elapse
735 | current_rows[:, 50] = [ sum(item_time_elapse_dict[imp]) if imp in item_time_elapse_dict else np.nan for imp in transformed_impressions]
736 |
737 | # avg_time_elapse
738 | current_rows[:, 51] = [ np.mean(item_time_elapse_dict[imp]) if imp in item_time_elapse_dict else np.nan for imp in transformed_impressions]
739 |
740 | # item_time_diff
741 | current_rows[:, 52] = item_time_diff
742 |
743 | # global_interaction_count
744 | current_rows[:, 53] = [global_interaction_count_dict[imp] if imp in global_interaction_count_dict else 0 for imp in transformed_impressions]
745 |
746 | # average global_interaction_count
747 | current_rows[:, 54] = np.mean(current_rows[:, 53])
748 |
749 | # std of global interaction image count
750 | current_rows[:, 55] = np.std(current_rows[:, 27])
751 |
752 | # std of global interaction count
753 | current_rows[:, 56] = np.std(current_rows[:, 53])
754 |
755 | # local_interaction_count
756 | current_rows[:, 57] = [interaction_count_dict[imp] if imp in interaction_count_dict else 0 for imp in transformed_impressions]
757 | current_rows[:, 58] = target_index
758 |
759 | # target price
760 | current_rows[:, 59] = row.prices[target_index] if not np.isnan(target_index) else np.nan
761 |
762 | # normalized co-occurrence statistics
763 | current_rows[:, 60] = np.mean(current_co_occ/ norm, axis=1).reshape(-1)
764 | current_rows[:, 61] = np.min(current_co_occ/ norm, axis=1).reshape(-1)
765 | current_rows[:, 62] = np.max(current_co_occ/norm, axis=1).reshape(-1)
766 | current_rows[:, 63] = np.median(current_co_occ/norm, axis=1).reshape(-1)
767 |
768 | # last_item_interaction
769 | current_rows[:, 64] = last_item_interaction
770 |
771 | # target price rank
772 | current_rows[:, 65] = price_rank[target_index] if not np.isnan(target_index) else np.nan
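# target_index, target_price and target_price_rank describe the clicked item itself, so they
# are only meaningful on labelled rows; they are listed in data_drop_columns further down and
# are never fed to the model.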
773 | # current_rows[:, 66] = np.mean(norm_imp_current_co_occ, axis=1).reshape(-1)
774 | # current_rows[:, 67] = np.min(norm_imp_current_co_occ, axis=1).reshape(-1)
775 | # current_rows[:, 68] = np.max(norm_imp_current_co_occ, axis=1).reshape(-1)
776 | # current_rows[:, 69] = np.median(norm_imp_current_co_occ, axis=1).reshape(-1)
777 |
778 |
779 |
780 |
781 |
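# For test rows, a clickout whose item is the encoded 'nan' is one that needs a prediction
# (kept in df_list); test clickouts with a known clicked item go to label_test_df_list so
# they can optionally be reused as extra training data.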
782 | if training or row.item_id == transformed_nan_item:
783 | df_list.append(current_rows)
784 | else:
785 | label_test_df_list.append(current_rows)
786 | # cumulative_click_dict[row.item_id] += 1
787 | past_interaction_dict[row.user_id].append(row.item_id)
788 | last_click_sess_dict[row.session_id] = row.item_id
789 | last_impressions_dict[row.session_id] = transformed_impressions.tolist()
790 | sess_time_diff_dict[row.session_id] = row.timestamp
791 | sess_step_diff_dict[row.session_id] = row.step
792 | if row.item_id != transformed_nan_item:
793 | sess_last_imp_idx_dict[row.session_id] = (transformed_impressions == row.item_id).tolist().index(True)
794 | sess_last_price_dict[row.session_id] = np.array(row.prices)[ transformed_impressions == row.item_id ][0]
795 | # cumulative_click_dict[row.item_id] += 1
796 | data = np.vstack(df_list)
797 | df_columns = ['item_id', 'label', 'session_id', 'equal_last_item', 'price_rank', 'platform', 'device', 'city', 'price', 'country', 'impression_index','step', 'id','last_click_item','equal_last_impressions', 'last_click_impression','last_interact_index','price_diff','last_price','price_ratio','clickout_time_diff','country_platform','impression_count','is_interacted','local_interaction_image_count','local_interaction_deals_count','local_interaction_clickout_count','global_interaction_image_count','global_interaction_deals_count','is_clicked','click_diff', 'avg_is_interacted','avg_liic', 'avg_lidc','avg_licc','avg_giic','avg_gdc','avg_is_clicked','impression_avg_prices','device_platform','equal_max_liic','num_interacted_items','equal_second_last_item','last_action','last_second_last_imp_idx_diff','predicted_next_imp_idx', 'list_len','imp_idx_velocity','time_diff_sess_avg','max_time_elapse','sum_time_elapse','avg_time_elapse','item_time_diff','global_interaction_count','avg_gic','std_giic','std_gic','local_interaction_count','target_index','target_price','co_occ_mean_norm','co_occ_min_norm','co_occ_max_norm','co_occ_median_norm','last_item_interaction','target_price_rank']
798 | dtype_dict = {"item_id":"int32", "label": "int8", "equal_last_item":"int8", "step":"int16", "price_rank": "int32","impression_index":"int32", "platform":"int32","device":"int32","city":"int32", "id":"int32", "country":"int32", "price":"int16", "last_click_item":"int32", "equal_last_impressions":"int8", 'last_click_impression':'int16', 'last_interact_index':'float32', 'price_diff':'float16','last_price':'float16','price_ratio':'float32','clickout_time_diff':'float16','country_platform':'int32','impression_count':'int32','is_interacted':'int8','local_interaction_image_count':'int32','local_interaction_deals_count':'int32','local_interaction_clickout_count':'int32','global_interaction_image_count':'int32','global_interaction_deals_count':'int32','is_clicked':'int8','click_diff':'float32'\
799 | , 'avg_is_interacted':'float16' ,'avg_liic':'float16', 'avg_lidc':'float32','avg_licc':'float32','avg_giic':'float32','avg_gdc':'float32','avg_is_clicked':'float32','impression_avg_prices':'float32','device_platform':'int32','equal_max_liic':'int8','num_interacted_items':'int32','equal_second_last_item':'int8','last_action':'int32','last_second_last_imp_idx_diff':'float32', 'predicted_next_imp_idx': 'float32','list_len':'int16','imp_idx_velocity':'float32','time_diff_sess_avg':'float32','max_time_elapse':'float32','sum_time_elapse':'float32','avg_time_elapse':'float32','item_time_diff':'float32','global_interaction_count':'float32','avg_gic':'float32','std_giic':'float32','std_gic':'float32','local_interaction_count':'int32','target_index':'float32','target_price':'float32','co_occ_mean_norm':'float32','co_occ_min_norm':'float32','co_occ_max_norm':'float32','co_occ_median_norm':'float32','last_item_interaction':'int32','target_price_rank':'float32'}
800 | df = pd.DataFrame(data, columns=df_columns)
801 | df = df.astype(dtype=dtype_dict )
802 | if training:
803 | return df
804 | else:
805 | label_test = np.vstack(label_test_df_list)
806 | label_test = pd.DataFrame(label_test, columns=df_columns)
807 | label_test = label_test.astype(dtype= dtype_dict)
808 | return df, label_test
809 |
810 |
811 |
812 |
813 | train.sort_values('timestamp',inplace=True)
814 | val.sort_values('timestamp',inplace=True)
815 | test.sort_values('timestamp',inplace=True)
816 |
817 | # print("sorted!!")
818 | train = parse_impressions(train, train_session_interactions, train_session_actions, train_session_time_diff)
819 | val = parse_impressions(val, train_session_interactions, train_session_actions, train_session_time_diff)
820 | test, label_test = parse_impressions(test, test_session_interactions, test_session_actions, test_session_time_diff, training=False)
821 |
822 | if configuration.use_test:
823 | train = pd.concat([train, label_test], axis=0)
824 |
825 |
826 |
827 |
828 |
829 |
830 | print("test before merge", test.shape)
831 | train = train.merge(item_properties_df, on="item_id", how="left")
832 | val = val.merge(item_properties_df, on="item_id", how="left")
833 | test = test.merge(item_properties_df, on="item_id", how="left")
834 |
835 |
836 | print("test ", test.shape)
837 | train = train.merge(filters_df, on='id', how="left")
838 | val = val.merge(filters_df, on='id', how="left")
839 | test = test.merge(filters_df, on='id', how="left")
840 |
841 |
842 | # print("test ", test.shape)
843 | # print("test before merge data_feature", test.shape)
844 |
845 | train = train.merge(data_feature, on='id', how="left")
846 | val = val.merge(data_feature, on='id', how="left")
847 | test = test.merge(data_feature, on='id', how="left")
848 | print("test ", test.shape)
849 |
850 | del filters_df, data_feature
851 | del data
852 | gc.collect()
853 |
854 | # target encoding
855 | agg_cols = [ 'price_rank', 'city', 'platform', 'device', 'country', 'impression_index','star']
856 | for c in agg_cols:
857 | gp = train.groupby(c)['label']
858 | mean = gp.mean()
859 | train[f'{c}_label_avg'] = train[c].map(mean)
860 | val[f'{c}_label_avg'] = val[c].map(mean)
861 | test[f'{c}_label_avg'] = test[c].map(mean)
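# each categorical value is replaced by the mean click label observed for it in the training
# set; validation and test reuse the train-set means, so categories unseen in train become NaN.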
862 |
863 |
864 |
865 |
866 |
867 |
868 |
869 | agg_cols = ['city','impression_index', 'platform']
870 | for c in agg_cols:
871 | gp = train.groupby(c)['price']
872 | mean = gp.mean()
873 | train[f'{c}_price_avg'] = train[c].map(mean)
874 | val[f'{c}_price_avg'] = val[c].map(mean)
875 | test[f'{c}_price_avg'] = test[c].map(mean)
876 |
877 |
878 |
879 | agg_cols = ['city']
880 | for c in agg_cols:
881 | gp = train.groupby(c)['rg_time_diff']
882 | mean = gp.mean()
883 | train[f'{c}_td_avg'] = train[c].map(mean)
884 | val[f'{c}_td_avg'] = val[c].map(mean)
885 | test[f'{c}_td_avg'] = test[c].map(mean)
886 |
887 |
888 |
889 | train['rg_price'] = train.price.map(price_rg_price_dict)
890 | val['rg_price'] = val.price.map(price_rg_price_dict)
891 | test['rg_price'] = test.price.map(price_rg_price_dict)
892 |
893 |
894 |
895 | # price quantile cut within each city
896 |
897 | data = pd.concat([train,val,test], axis=0).reset_index()
898 | data = data.loc[:,['city','price']].drop_duplicates(['city','price'])
899 | data['city_price_bin'] = data.groupby('city').price.apply(lambda x: qcut_safe(x, q = 40).astype(str))
900 | data['city_price_bin'] = data.apply( lambda x: str(x.city) + x.city_price_bin,axis=1)
901 | data['city_price_bin'] = data['city_price_bin'].factorize()[0]
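# city_price_bin: prices are quantile-binned within each city (up to 40 bins via qcut_safe,
# presumably from utils), the bin label is prefixed with the city id, and the combined string
# is factorized into a single integer categorical feature.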
902 |
903 |
904 | train = train.merge(data, on=['city','price'], how='left')
905 | val = val.merge(data, on=['city','price'], how='left')
906 | test = test.merge(data, on=['city','price'], how='left')
907 |
908 |
909 |
910 | print("train", train.shape)
911 | print("val", val.shape)
912 | print("test", test.shape)
913 | # test = test.merge(item_properties_df, on="item_id", how="left")
914 |
915 |
916 |
917 |
918 |
919 | data_drop_columns= ['label', 'session_id', 'step', 'id']
920 | data_drop_columns+= ['target_index','target_price','target_price_rank']
921 |
922 | train_label = train.label
923 | val_label = val.label
924 |
925 | # build lgbm dataset
926 | d_train = lgb.Dataset(data=train.drop(data_drop_columns, axis=1), label=train_label, free_raw_data=True, silent=True)
927 | d_val = lgb.Dataset(data=val.drop(data_drop_columns, axis=1), label=val_label, free_raw_data=True, silent=True)
928 |
929 |
930 |
931 |
932 |
933 | del train
934 | gc.collect()
935 |
936 | # params = {
937 | # 'objective': 'binary',
938 | # 'boosting_type': 'gbdt',
939 | # 'nthread': multiprocessing.cpu_count() // 3 if configuration.sub_sample else 24,
940 | # 'num_leaves': 200,
941 | # 'max_depth':10,
942 | # 'learning_rate': 0.05 if configuration.sub_sample else 0.01 ,
943 | # 'bagging_fraction': 0.8,
944 | # 'bagging_freq': 5,
945 | # 'feature_fraction':0.7,
946 | # 'seed': 0,
947 | # 'verbose': -1,
948 |
949 | # }
950 | params = {'objective': 'binary',
951 | 'boosting_type': 'gbdt',
952 | 'colsample_bytree': 0.76,
953 | 'learning_rate': 0.01,
954 | 'nthread': multiprocessing.cpu_count() -1,
955 | 'max_depth': 13,
956 | 'min_child_weight': 33,
957 | 'min_data_in_leaf': 94,
958 | 'num_leaves': 302,
959 | 'seed': 30,
960 | 'verbose': -1
961 | }
962 |
963 |
964 |
965 | clf = lgb.train(
966 | params=params,
967 | train_set=d_train,
968 | num_boost_round=50000,
969 | valid_sets=[d_train, d_val],
970 | early_stopping_rounds=200 if configuration.sub_sample else 500,
971 | verbose_eval=500,
972 |
973 | )
974 |
975 |
976 |
977 | # evaluation
978 | def evaluate(val_df, clf):
979 | incorrect_session = {}
980 | val_df['scores'] = clf.predict(val_df.drop(data_drop_columns, axis=1))
981 |
982 | loss = log_loss(val_df.label.values, val_df.scores.values)
983 | grouped_val = val_df.groupby('session_id')
984 | rss_group = {i:[] for i in range(1,26)}
985 | rss = []
986 | for session_id, group in grouped_val:
987 |
988 | scores = group.scores
989 | sorted_arg = np.flip(np.argsort(scores))
990 | rss.append( group['label'].values[sorted_arg])
991 | rss_group[len(group)].append(group['label'].values[sorted_arg])
992 | if group['label'].values[sorted_arg][0] != 1:
993 | incorrect_session[session_id] = (sorted_arg.values, group['label'].values[sorted_arg])
994 | mrr = compute_mean_reciprocal_rank(rss)
995 | mrr_group = {i:(len(rss_group[i]), compute_mean_reciprocal_rank(rss_group[i])) for i in range(1,26)}
996 | print(mrr_group)
997 | if not configuration.debug:
998 | pickle.dump( incorrect_session, open(f'../output/{model_name}_val_incorrect_order.p','wb'))
999 | return mrr, mrr_group, loss
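# compute_mean_reciprocal_rank (from utils) is assumed to average the reciprocal rank of the
# clicked item over sessions, roughly:
#   mrr = np.mean([1.0 / (list(rs).index(1) + 1) for rs in rss])
# where each rs is a session's label vector sorted by descending score; mrr_group reports the
# same metric separately for each impression-list length (1-25).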
1000 |
1001 |
1002 |
1003 | mrr, mrr_group, val_log_loss = evaluate(val, clf)
1004 |
1005 | print("MRR score: ", mrr)
1006 |
1007 |
1008 |
1009 | imp = clf.feature_importance('gain')
1010 | fn =clf.feature_name()
1011 | imp_df = pd.DataFrame()
1012 | imp_df['importance'] = imp
1013 | imp_df['name'] = fn
1014 | imp_df.sort_values('importance', ascending=False, inplace=True)
1015 |
1016 |
1017 | print(imp_df.head(20))
1018 |
1019 |
1020 |
1021 | del d_train, d_val
1022 | gc.collect()
1023 |
1024 |
1025 | if configuration.debug:
1026 | exit(0)
1027 |
1028 | predictions = []
1029 | session_ids = []
1030 |
1031 | test['score'] = clf.predict(test.drop(data_drop_columns, axis=1))
1032 | save_test = test.copy()
1033 | save_test['item_id'] = cat_encoders['item_id'].reverse_transform(save_test.item_id.values)
1034 | with open(f'../output/{model_name}_test_score.p', 'wb') as f:
1035 | pickle.dump( save_test.loc[:,['score', 'session_id', 'item_id', 'step']],f, protocol=4)
1036 |
1037 | grouped_test = test.groupby('session_id')
1038 | for session_id, group in grouped_test:
1039 | scores = group['score']
1040 | sorted_arg = np.flip(np.argsort(scores))
1041 | sorted_item_ids = group['item_id'].values[sorted_arg]
1042 | sorted_item_ids = cat_encoders['item_id'].reverse_transform(sorted_item_ids)
1043 | sorted_item_string = ' '.join([str(i) for i in sorted_item_ids])
1044 | predictions.append(sorted_item_string)
1045 | session_ids.append(session_id)
1046 |
1047 | prediction_df = pd.DataFrame()
1048 | prediction_df['session_id'] = session_ids
1049 | prediction_df['item_recommendations'] = predictions
1050 |
1051 | print("pred df shape", prediction_df.shape)
1052 | sub_df = pd.read_csv('../input/submission_popular.csv')
1053 | sub_df.drop('item_recommendations', axis=1, inplace=True)
1054 | sub_df = sub_df.merge(prediction_df, on="session_id")
1055 | # sub_df['item_recommendations'] = predictions
1056 |
1057 | sub_df.to_csv(f'../output/{model_name}.csv', index=None)
1058 |
1059 |
1060 |
1061 |
--------------------------------------------------------------------------------