├── README.md
├── code
│   ├── SETTINGS.json
│   ├── bowl_db.py
│   ├── bowl_model.py
│   ├── bowl_utils.py
│   ├── predict.py
│   ├── prepare_data.ipynb
│   ├── prepare_data.py
│   ├── submission.csv
│   ├── train.py
│   ├── train.sh
│   └── validate.py
├── directory_structure.txt
├── models
│   ├── b-32_a-TRANSFORMER_e-100_h-500_d-0.2_l-2_hd-10_s-7_len-100_aug-0.5_da-bowl.pt_k-0.pt
│   ├── b-32_a-TRANSFORMER_e-100_h-500_d-0.2_l-2_hd-10_s-7_len-100_aug-0.5_da-bowl.pt_k-1.pt
│   ├── b-32_a-TRANSFORMER_e-100_h-500_d-0.2_l-2_hd-10_s-7_len-100_aug-0.5_da-bowl.pt_k-2.pt
│   ├── b-32_a-TRANSFORMER_e-100_h-500_d-0.2_l-2_hd-10_s-7_len-100_aug-0.5_da-bowl.pt_k-3.pt
│   └── b-32_a-TRANSFORMER_e-100_h-500_d-0.2_l-2_hd-10_s-7_len-100_aug-0.5_da-bowl.pt_k-4.pt
└── requirements.txt

/README.md:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | 
4 | Below you can find an outline of how to reproduce my solution for the 2019 Data Science Bowl competition.
5 | If you run into any trouble with the setup/code or have any questions, please contact me at kfaceapi@gmail.com
6 | 
7 | ## Archive contents
8 | - 3rd_solution.tgz : contains the original code, trained models, etc.
9 | ```
10 | 3rd_solution/
11 | ├── input/
12 | │   ├── processed/
13 | │   └── data-science-bowl-2019/
14 | ├── models/ #
15 | └── code/
16 | ```
17 | - `input/processed/` : will be created by the preprocessing step (by running `python prepare_data.py`)
18 | - `input/data-science-bowl-2019/` : raw data directory of the competition (should contain 'train.csv', 'test.csv', etc.)
19 | - `models/` : contains the trained models used to generate a 'submission.csv' file
20 | - `code/` : contains the code for training and prediction
21 | 
22 | ### Requirements
23 | - Ubuntu 18.04.4 LTS
24 | - Python 3.7.5
25 | - pytorch 1.3
26 | - transformers 2.3.0 (or pytorch-transformers 1.2.0)
27 | 
28 | You can run `pip install -r requirements.txt` to install the necessary packages.
29 | 
30 | #### Hardware
31 | - CPU: 3 x AMD® Ryzen 9 3900X (3 PCs)
32 | - GPU: 5 x NVIDIA RTX2080Ti 11G (2 GPUs in 1 PC)
33 | - RAM: 64G
34 | 
35 | The above is just my PC spec; in practice, a GTX 1080 is enough for training.
36 | 
37 | ## Prepare Data
38 | You can generate the `bowl.pt` and `bowl_info.pt` files by running `prepare_data.py`:
39 | ```
40 | .../3rd_solution/code$ python prepare_data.py
41 | ```
42 | 
43 | ## Train Model (GPU needed)
44 | You can reproduce the models already in the `models/` directory by running the `train.sh` script (the checkpoint filenames encode the training hyperparameters; see the sketch below).
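For reference, `predict.py` recovers the architecture and model hyperparameters directly from these checkpoint filenames by splitting the name on `_` and `-`; the trailing `k-0` ... `k-4` token is the fold index. The snippet below is a minimal standalone sketch of that parsing (it is not a file in the repository). The keys `a`, `len`, `e`, `h`, `l`, `hd`, and `s` are the ones `predict.py` actually reads back; my reading of the remaining tokens (`b` = batch size, `d` = dropout, `aug` = augmentation ratio, `da` = dataset file) is an assumption based on the naming.

```
# Sketch only: mirrors the filename parsing done in predict.py.
path = 'b-32_a-TRANSFORMER_e-100_h-500_d-0.2_l-2_hd-10_s-7_len-100_aug-0.5_da-bowl.pt_k-0.pt'
cfg_dict = dict(tok.split('-') for tok in path.replace('bowl_', '').split('_'))
print(cfg_dict['a'], int(cfg_dict['len']), int(cfg_dict['e']),
      int(cfg_dict['h']), int(cfg_dict['l']), int(cfg_dict['hd']), int(cfg_dict['s']))
# -> TRANSFORMER 100 100 500 2 10 7
```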
45 | The `train.sh` script reproduces the 5 models (5-fold).
46 | 
47 | ```
48 | .../3rd_solution/code$ bash train.sh
49 | ```
50 | 
51 | ## Predict
52 | You can reproduce `submissions/submission.csv` by running `predict.py`:
53 | 
54 | ```
55 | .../3rd_solution/code$ python predict.py
56 | ```
57 | 
58 | 
59 | ## Option - Validate (GPU needed)
60 | You can reproduce the local CV score and coefficients by running `validate.py`:
61 | - VALID KAPPA_SCORE:0.5684061832361282
62 | - coefficients=[0.53060865, 1.66266655, 2.31145611]
63 | 
--------------------------------------------------------------------------------
/code/SETTINGS.json:
--------------------------------------------------------------------------------
1 | {
2 |     "RAW_DATA_DIR": "../input/data-science-bowl-2019",
3 |     "CLEAN_DATA_DIR": "../input/processed",
4 |     "MODEL_DIR": "../models/",
5 |     "LOGS_DIR": "./logs",
6 |     "SUBMISSION_DIR": "../submissions/"
7 | }
--------------------------------------------------------------------------------
/code/bowl_db.py:
--------------------------------------------------------------------------------
1 | #import warnings
2 | #warnings.filterwarnings('ignore')
3 | 
4 | import sys
5 | import os
6 | import math
7 | import numpy as np
8 | import pandas as pd
9 | 
10 | import torch
11 | from torch.utils.data import Dataset
12 | import torch.nn.functional as F
13 | import random
14 | 
15 | 
16 | TARGET = ['accuracy_group', 'num_correct', 'num_incorrect']
17 | GAME_TARGET = ['accuracy_group_game', 'num_correct_game', 'num_incorrect_game']
18 | #TARGET = ['accuracy_group']
19 | 
20 | 
21 | class BowlDataset(Dataset):
22 |     def __init__(self, cfg, df, sample_indices, aug=0.0, aug_p=0.5):
23 |         self.cfg = cfg
24 |         self.df = df.copy()
25 |         self.sample_indices = sample_indices
26 |         self.seq_len = self.cfg.seq_len
27 |         self.aug = aug
28 |         self.aug_p = aug_p
29 | 
30 |         self.cate_cols = self.cfg.cate_cols
31 |         self.cont_cols = self.cfg.cont_cols
32 | 
33 |         self.cate_df = self.df[self.cate_cols]
34 |         self.cont_df = np.log1p(self.df[self.cont_cols])
35 |         if 'accuracy_group' in self.df:
36 |             self.df['num_incorrect'][self.df['num_incorrect']==1] = 0.5
37 |             self.df['num_incorrect'][self.df['num_incorrect']>1] = 1.0
38 |             self.df['num_correct'][self.df['num_correct']>1] = 1.0
39 |             self.target_df = self.df[TARGET]
40 |         else:
41 |             self.target_df = None
42 | 
43 |         if 'accuracy_group_game' in self.df:
44 |             self.df['num_incorrect_game'][self.df['num_incorrect_game']==1] = 0.5
45 |             self.df['num_incorrect_game'][self.df['num_incorrect_game']>1] = 1.0
46 |             self.df['num_correct_game'][self.df['num_correct_game']>1] = 1.0
47 |             self.target_game_df = self.df[GAME_TARGET]
48 |         else:
49 |             self.target_game_df = None
50 | 
51 |     def __getitem__(self, idx):
52 |         indices = self.sample_indices[idx]
53 | 
54 |         seq_len = min(self.seq_len, len(indices))
55 | 
56 |         if self.aug > 0:
57 |             if len(indices)>30:
58 |                 if np.random.binomial(1, self.aug_p) == 1:
59 |                     cut_ratio = np.random.rand()
60 |                     if cut_ratio > self.aug:
61 |                         cut_ratio = self.aug
62 |                     #cut_ratio = self.aug
63 |                     start_idx = max(int(len(indices)*cut_ratio), 30)
64 |                     indices = indices[start_idx:]
65 |                     seq_len = min(self.seq_len, len(indices))
66 | 
67 |         tmp_cate_x = torch.LongTensor(self.cate_df.iloc[indices].values)
68 |         cate_x = torch.LongTensor(self.seq_len, len(self.cate_cols)).zero_()
69 |         cate_x[-seq_len:] = tmp_cate_x[-seq_len:]
70 | 
71 |         tmp_cont_x = torch.FloatTensor(self.cont_df.iloc[indices].values)
72 |         tmp_cont_x[-1] = 0
73 |         cont_x = torch.FloatTensor(self.seq_len, len(self.cont_cols)).zero_()
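        # NOTE: the surrounding lines right-align each sampled history into
        # fixed-size tensors. Histories longer than seq_len keep only the most
        # recent seq_len sessions; shorter ones are zero-padded at the front
        # (index 0 is also the padding_idx of the categorical embedding), and
        # `mask` below marks which positions hold real data. The augmentation
        # block above randomly drops a prefix of the history, and
        # `tmp_cont_x[-1] = 0` clears the continuous features of the final row,
        # the assessment being predicted, presumably so the model cannot see
        # statistics of the very session it has to predict.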
74 | cont_x[-seq_len:] = tmp_cont_x[-seq_len:] 75 | 76 | mask = torch.ByteTensor(self.seq_len).zero_() 77 | mask[-seq_len:] = 1 78 | 79 | if self.target_df is not None: 80 | target = torch.FloatTensor(self.target_df.iloc[indices[-1]].values) 81 | if target.sum() == 0: 82 | target = torch.FloatTensor(self.target_game_df.iloc[indices[-1]].values) 83 | else: 84 | target = 0 85 | 86 | return cate_x, cont_x, mask, target 87 | 88 | def __len__(self): 89 | return len(self.sample_indices) 90 | 91 | DB_PATH='../../input/data-science-bowl-2019' 92 | def main(): 93 | (train_df, test_df, mappers_dict, cate_offset, cate_cols, 94 | cont_cols, extra_cont_cls, train_samples, train_groups, test_samples) = ( 95 | torch.load(os.path.join(DB_PATH, 'bowl_v28.pt'))) 96 | 97 | train_db = BowlDataset(train_df, train_samples, mappers_dict) 98 | 99 | for cate_x, cont_x, mask, target in train_db: 100 | a = 0 101 | 102 | 103 | 104 | if __name__ == '__main__': 105 | main() -------------------------------------------------------------------------------- /code/bowl_model.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import torch 3 | import torch.nn as nn 4 | import numpy as np 5 | import torch.nn.functional as F 6 | from transformers.modeling_bert import BertConfig, BertEncoder, BertModel 7 | from transformers.modeling_albert import AlbertConfig, AlbertModel 8 | from transformers.modeling_xlnet import XLNetConfig, XLNetModel 9 | from transformers.modeling_xlm import XLMConfig, XLMModel 10 | from transformers.modeling_gpt2 import GPT2Model, GPT2Config, GPT2PreTrainedModel, Block 11 | import bowl_db 12 | 13 | 14 | class TransfomerModel(nn.Module): 15 | def __init__(self, cfg): 16 | super(TransfomerModel, self).__init__() 17 | self.cfg = cfg 18 | cate_col_size = len(cfg.cate_cols) 19 | cont_col_size = len(cfg.cont_cols) 20 | self.cate_emb = nn.Embedding(cfg.total_cate_size, cfg.emb_size, padding_idx=0) 21 | self.cate_proj = nn.Sequential( 22 | nn.Linear(cfg.emb_size*cate_col_size, cfg.hidden_size//2), 23 | nn.LayerNorm(cfg.hidden_size//2), 24 | ) 25 | self.cont_emb = nn.Sequential( 26 | nn.Linear(cont_col_size, cfg.hidden_size//2), 27 | nn.LayerNorm(cfg.hidden_size//2), 28 | ) 29 | 30 | self.config = BertConfig( 31 | 3, # not used 32 | hidden_size=cfg.hidden_size, 33 | num_hidden_layers=cfg.nlayers, 34 | num_attention_heads=cfg.nheads, 35 | intermediate_size=cfg.hidden_size, 36 | hidden_dropout_prob=cfg.dropout, 37 | attention_probs_dropout_prob=cfg.dropout, 38 | ) 39 | self.encoder = BertEncoder(self.config) 40 | 41 | def get_reg(): 42 | return nn.Sequential( 43 | nn.Linear(cfg.hidden_size, cfg.hidden_size), 44 | nn.LayerNorm(cfg.hidden_size), 45 | nn.Dropout(cfg.dropout), 46 | nn.ReLU(), 47 | nn.Linear(cfg.hidden_size, cfg.hidden_size), 48 | nn.LayerNorm(cfg.hidden_size), 49 | nn.Dropout(cfg.dropout), 50 | nn.ReLU(), 51 | nn.Linear(cfg.hidden_size, cfg.target_size), 52 | ) 53 | self.reg_layer = get_reg() 54 | 55 | def forward(self, cate_x, cont_x, mask): 56 | batch_size = cate_x.size(0) 57 | 58 | cate_emb = self.cate_emb(cate_x).view(batch_size, self.cfg.seq_len, -1) 59 | cate_emb = self.cate_proj(cate_emb) 60 | cont_emb = self.cont_emb(cont_x) 61 | 62 | seq_emb = torch.cat([cate_emb, cont_emb], 2) 63 | #seq_length = self.cfg.seq_len 64 | #position_ids = torch.arange(seq_length, dtype=torch.long, device=cate_x.device) 65 | #position_ids = position_ids.unsqueeze(0).expand((batch_size, seq_length)) 66 | #position_emb = self.position_emb(position_ids) 67 | #seq_emb 
= (seq_emb + position_emb) 68 | #seq_emb = self.ln(seq_emb) 69 | 70 | extended_attention_mask = mask.unsqueeze(1).unsqueeze(2) 71 | extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility 72 | extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 73 | head_mask = [None] * self.config.num_hidden_layers 74 | 75 | encoded_layers = self.encoder(seq_emb, extended_attention_mask, head_mask=head_mask) 76 | sequence_output = encoded_layers[-1] 77 | sequence_output = sequence_output[:, -1] 78 | 79 | pred_y = self.reg_layer(sequence_output) 80 | return pred_y 81 | 82 | 83 | class DSB_BertModel(nn.Module): 84 | def __init__(self, cfg): 85 | super(DSB_BertModel, self).__init__() 86 | self.cfg = cfg 87 | cate_col_size = len(cfg.cate_cols) 88 | cont_col_size = len(cfg.cont_cols) 89 | self.cate_emb = nn.Embedding(cfg.total_cate_size, cfg.emb_size, padding_idx=0) 90 | self.cate_proj = nn.Sequential( 91 | nn.Linear(cfg.emb_size*cate_col_size, cfg.hidden_size//2), 92 | nn.LayerNorm(cfg.hidden_size//2), 93 | ) 94 | self.cont_emb = nn.Sequential( 95 | nn.Linear(cont_col_size, cfg.hidden_size//2), 96 | nn.LayerNorm(cfg.hidden_size//2), 97 | ) 98 | 99 | self.config = BertConfig( 100 | 3, # not used 101 | hidden_size=cfg.hidden_size, 102 | num_hidden_layers=cfg.nlayers, 103 | num_attention_heads=cfg.nheads, 104 | intermediate_size=cfg.hidden_size, 105 | hidden_dropout_prob=cfg.dropout, 106 | attention_probs_dropout_prob=cfg.dropout, 107 | ) 108 | self.encoder = BertModel(self.config) 109 | 110 | def get_reg(): 111 | return nn.Sequential( 112 | nn.Linear(cfg.hidden_size, cfg.hidden_size), 113 | nn.LayerNorm(cfg.hidden_size), 114 | nn.Dropout(cfg.dropout), 115 | nn.ReLU(), 116 | nn.Linear(cfg.hidden_size, cfg.target_size), 117 | ) 118 | self.reg_layer = get_reg() 119 | 120 | def forward(self, cate_x, cont_x, mask): 121 | batch_size = cate_x.size(0) 122 | 123 | cate_emb = self.cate_emb(cate_x).view(batch_size, self.cfg.seq_len, -1) 124 | cate_emb = self.cate_proj(cate_emb) 125 | cont_emb = self.cont_emb(cont_x) 126 | 127 | seq_emb = torch.cat([cate_emb, cont_emb], 2) 128 | 129 | encoded_layers = self.encoder(inputs_embeds=seq_emb, attention_mask=mask) 130 | sequence_output = encoded_layers[0] 131 | sequence_output = sequence_output[:, -1] 132 | 133 | pred_y = self.reg_layer(sequence_output) 134 | return pred_y 135 | 136 | 137 | class LSTMModel(nn.Module): 138 | def __init__(self, cfg): 139 | super(LSTMModel, self).__init__() 140 | self.cfg = cfg 141 | cate_col_size = len(cfg.cate_cols) 142 | cont_col_size = len(cfg.cont_cols) 143 | self.cate_emb = nn.Embedding(cfg.total_cate_size, cfg.emb_size, padding_idx=0) 144 | self.cate_proj = nn.Sequential( 145 | nn.Linear(cfg.emb_size*cate_col_size, cfg.hidden_size//2), 146 | nn.LayerNorm(cfg.hidden_size//2), 147 | ) 148 | self.cont_emb = nn.Sequential( 149 | nn.Linear(cont_col_size, cfg.hidden_size//2), 150 | nn.LayerNorm(cfg.hidden_size//2), 151 | ) 152 | 153 | self.encoder = nn.LSTM(cfg.hidden_size, 154 | cfg.hidden_size, cfg.nlayers, dropout=cfg.dropout, batch_first=True) 155 | 156 | def get_reg(): 157 | return nn.Sequential( 158 | nn.Linear(cfg.hidden_size, cfg.hidden_size), 159 | nn.LayerNorm(cfg.hidden_size), 160 | nn.Dropout(cfg.dropout), 161 | nn.ReLU(), 162 | nn.Linear(cfg.hidden_size, cfg.target_size), 163 | ) 164 | self.reg_layer = get_reg() 165 | 166 | def forward(self, cate_x, cont_x, mask): 167 | batch_size = cate_x.size(0) 168 | 169 | cate_emb = self.cate_emb(cate_x).view(batch_size, 
self.cfg.seq_len, -1) 170 | cate_emb = self.cate_proj(cate_emb) 171 | cont_emb = self.cont_emb(cont_x) 172 | 173 | seq_emb = torch.cat([cate_emb, cont_emb], 2) 174 | 175 | _, (h, c) = self.encoder(seq_emb) 176 | sequence_output = h[-1] 177 | 178 | pred_y = self.reg_layer(sequence_output) 179 | return pred_y 180 | 181 | 182 | class DSB_GPT2Model(nn.Module): 183 | def __init__(self, cfg): 184 | super(DSB_GPT2Model, self).__init__() 185 | self.cfg = cfg 186 | cate_col_size = len(cfg.cate_cols) 187 | cont_col_size = len(cfg.cont_cols) 188 | self.cate_emb = nn.Embedding(cfg.total_cate_size, cfg.emb_size, padding_idx=0) 189 | self.cate_proj = nn.Sequential( 190 | nn.Linear(cfg.emb_size*cate_col_size, cfg.hidden_size//2), 191 | nn.LayerNorm(cfg.hidden_size//2), 192 | ) 193 | self.cont_emb = nn.Sequential( 194 | nn.Linear(cont_col_size, cfg.hidden_size//2), 195 | nn.LayerNorm(cfg.hidden_size//2), 196 | ) 197 | self.config = GPT2Config( 198 | 3, # not used 199 | n_positions=cfg.seq_len, 200 | n_ctx=cfg.hidden_size, 201 | n_embd=cfg.hidden_size, 202 | n_layer=cfg.nlayers, 203 | n_head=cfg.nheads, 204 | #embd_pdrop=cfg.dropout, 205 | #attn_pdrop=cfg.dropout, 206 | ) 207 | self.encoder = GPT2Model(self.config) 208 | 209 | def get_reg(): 210 | return nn.Sequential( 211 | nn.Linear(cfg.hidden_size, cfg.hidden_size), 212 | nn.LayerNorm(cfg.hidden_size), 213 | nn.Dropout(cfg.dropout), 214 | nn.ReLU(), 215 | nn.Linear(cfg.hidden_size, cfg.target_size), 216 | ) 217 | self.reg_layer = get_reg() 218 | 219 | def forward(self, cate_x, cont_x, mask): 220 | batch_size = cate_x.size(0) 221 | 222 | cate_emb = self.cate_emb(cate_x).view(batch_size, self.cfg.seq_len, -1) 223 | cate_emb = self.cate_proj(cate_emb) 224 | cont_emb = self.cont_emb(cont_x) 225 | 226 | seq_emb = torch.cat([cate_emb, cont_emb], 2) 227 | 228 | encoded_layers = self.encoder(inputs_embeds=seq_emb, attention_mask=mask) 229 | sequence_output = encoded_layers[0] 230 | sequence_output = sequence_output[:, -1] 231 | 232 | pred_y = self.reg_layer(sequence_output) 233 | return pred_y 234 | 235 | 236 | class DSB_ALBERTModel(nn.Module): 237 | def __init__(self, cfg): 238 | super(DSB_ALBERTModel, self).__init__() 239 | self.cfg = cfg 240 | cate_col_size = len(cfg.cate_cols) 241 | cont_col_size = len(cfg.cont_cols) 242 | self.cate_emb = nn.Embedding(cfg.total_cate_size, cfg.emb_size, padding_idx=0) 243 | def get_cont_emb(): 244 | return nn.Sequential( 245 | nn.Linear(cont_col_size, cfg.hidden_size), 246 | nn.LayerNorm(cfg.hidden_size), 247 | nn.ReLU(), 248 | nn.Linear(cfg.hidden_size, cfg.hidden_size) 249 | ) 250 | self.cont_emb = get_cont_emb() 251 | self.config = AlbertConfig( 252 | 3, # not used 253 | embedding_size=cfg.emb_size*cate_col_size + cfg.hidden_size, 254 | hidden_size=cfg.emb_size*cate_col_size + cfg.hidden_size, 255 | num_hidden_layers=cfg.nlayers, 256 | #num_hidden_groups=1, 257 | num_attention_heads=cfg.nheads, 258 | intermediate_size=cfg.hidden_size, 259 | hidden_dropout_prob=cfg.dropout, 260 | attention_probs_dropout_prob=cfg.dropout, 261 | max_position_embeddings=cfg.seq_len, 262 | type_vocab_size=1, 263 | #initializer_range=0.02, 264 | #layer_norm_eps=1e-12, 265 | ) 266 | 267 | self.encoder = AlbertModel(self.config) 268 | 269 | def get_reg(): 270 | return nn.Sequential( 271 | nn.Linear(cfg.emb_size*cate_col_size + cfg.hidden_size, cfg.hidden_size), 272 | nn.LayerNorm(cfg.hidden_size), 273 | nn.Dropout(cfg.dropout), 274 | nn.ReLU(), 275 | nn.Linear(cfg.hidden_size, cfg.hidden_size), 276 | nn.LayerNorm(cfg.hidden_size), 277 | 
nn.Dropout(cfg.dropout), 278 | nn.ReLU(), 279 | nn.Linear(cfg.hidden_size, cfg.target_size), 280 | ) 281 | self.reg_layer = get_reg() 282 | 283 | def forward(self, cate_x, cont_x, mask): 284 | batch_size = cate_x.size(0) 285 | 286 | cate_emb = self.cate_emb(cate_x) 287 | cont_emb = self.cont_emb(cont_x) 288 | 289 | cate_emb = cate_emb.view(batch_size, self.cfg.seq_len, -1) 290 | cont_emb = cont_emb.view(batch_size, self.cfg.seq_len, -1) 291 | 292 | seq_emb = torch.cat([cate_emb, cont_emb], 2) 293 | 294 | encoded_layers = self.encoder(inputs_embeds=seq_emb, attention_mask=mask) 295 | sequence_output = encoded_layers[0] 296 | sequence_output = sequence_output[:, -1] 297 | 298 | pred_y = self.reg_layer(sequence_output) 299 | return pred_y 300 | 301 | 302 | class DSB_XLNetModel(nn.Module): 303 | def __init__(self, cfg): 304 | super(DSB_XLNetModel, self).__init__() 305 | self.cfg = cfg 306 | cate_col_size = len(cfg.cate_cols) 307 | cont_col_size = len(cfg.cont_cols) 308 | self.cate_emb = nn.Embedding(cfg.total_cate_size, cfg.emb_size, padding_idx=0) 309 | self.cate_proj = nn.Sequential( 310 | nn.Linear(cfg.emb_size*cate_col_size, cfg.hidden_size//2), 311 | nn.LayerNorm(cfg.hidden_size//2), 312 | ) 313 | self.cont_emb = nn.Sequential( 314 | nn.Linear(cont_col_size, cfg.hidden_size//2), 315 | nn.LayerNorm(cfg.hidden_size//2), 316 | ) 317 | self.config = XLNetConfig( 318 | 3, # not used 319 | d_model=cfg.hidden_size, 320 | n_layer=cfg.nlayers, 321 | n_head=cfg.nheads, 322 | d_inner=cfg.hidden_size, 323 | #ff_activation="gelu", 324 | #untie_r=True, 325 | #attn_type="bi", 326 | #initializer_range=0.02, 327 | #layer_norm_eps=1e-12, 328 | dropout=cfg.dropout, 329 | #mem_len=None, 330 | #reuse_len=None, 331 | #bi_data=False, 332 | #clamp_len=-1, 333 | #same_length=False, 334 | #summary_type="last", 335 | #summary_use_proj=True, 336 | #summary_activation="tanh", 337 | summary_last_dropout=cfg.dropout, 338 | #start_n_top=5, 339 | #end_n_top=5, 340 | ) 341 | 342 | self.encoder = XLNetModel(self.config) 343 | 344 | def get_reg(): 345 | return nn.Sequential( 346 | nn.Linear(cfg.hidden_size, cfg.hidden_size), 347 | nn.LayerNorm(cfg.hidden_size), 348 | nn.Dropout(cfg.dropout), 349 | nn.ReLU(), 350 | nn.Linear(cfg.hidden_size, cfg.target_size), 351 | ) 352 | self.reg_layer = get_reg() 353 | 354 | def forward(self, cate_x, cont_x, mask): 355 | batch_size = cate_x.size(0) 356 | 357 | cate_emb = self.cate_emb(cate_x) 358 | cont_emb = self.cont_emb(cont_x) 359 | 360 | cate_emb = cate_emb.view(batch_size, self.cfg.seq_len, -1) 361 | cont_emb = cont_emb.view(batch_size, self.cfg.seq_len, -1) 362 | 363 | seq_emb = torch.cat([cate_emb, cont_emb], 2) 364 | 365 | encoded_layers = self.encoder(inputs_embeds=seq_emb, attention_mask=mask) 366 | sequence_output = encoded_layers[0] 367 | sequence_output = sequence_output[:, -1] 368 | 369 | pred_y = self.reg_layer(sequence_output) 370 | return pred_y 371 | 372 | 373 | class DSB_XLMModel(nn.Module): 374 | def __init__(self, cfg): 375 | super(DSB_XLMModel, self).__init__() 376 | self.cfg = cfg 377 | cate_col_size = len(cfg.cate_cols) 378 | cont_col_size = len(cfg.cont_cols) 379 | self.cate_emb = nn.Embedding(cfg.total_cate_size, cfg.emb_size, padding_idx=0) 380 | self.cate_proj = nn.Sequential( 381 | nn.Linear(cfg.emb_size*cate_col_size, cfg.hidden_size//2), 382 | nn.LayerNorm(cfg.hidden_size//2), 383 | ) 384 | self.cont_emb = nn.Sequential( 385 | nn.Linear(cont_col_size, cfg.hidden_size//2), 386 | nn.LayerNorm(cfg.hidden_size//2), 387 | ) 388 | self.config = XLMConfig( 389 | 
3, # not used 390 | emb_dim=cfg.hidden_size, 391 | n_layers=cfg.nlayers, 392 | n_heads=cfg.nheads, 393 | dropout=cfg.dropout, 394 | attention_dropout=cfg.dropout, 395 | gelu_activation=True, 396 | sinusoidal_embeddings=False, 397 | causal=False, 398 | asm=False, 399 | n_langs=1, 400 | use_lang_emb=True, 401 | max_position_embeddings=cfg.seq_len, 402 | embed_init_std=(cfg.hidden_size) ** -0.5, 403 | layer_norm_eps=1e-12, 404 | init_std=0.02, 405 | bos_index=0, 406 | eos_index=1, 407 | pad_index=2, 408 | unk_index=3, 409 | mask_index=5, 410 | is_encoder=True, 411 | summary_type="first", 412 | summary_use_proj=True, 413 | summary_activation=None, 414 | summary_proj_to_labels=True, 415 | summary_first_dropout=cfg.dropout, 416 | start_n_top=5, 417 | end_n_top=5, 418 | mask_token_id=0, 419 | lang_id=0, 420 | ) 421 | 422 | self.encoder = XLMModel(self.config) 423 | 424 | def get_reg(): 425 | return nn.Sequential( 426 | nn.Linear(cfg.hidden_size, cfg.hidden_size), 427 | nn.LayerNorm(cfg.hidden_size), 428 | nn.Dropout(cfg.dropout), 429 | nn.ReLU(), 430 | nn.Linear(cfg.hidden_size, cfg.target_size), 431 | ) 432 | self.reg_layer = get_reg() 433 | 434 | def forward(self, cate_x, cont_x, mask): 435 | batch_size = cate_x.size(0) 436 | 437 | cate_emb = self.cate_emb(cate_x).view(batch_size, self.cfg.seq_len, -1) 438 | cate_emb = self.cate_proj(cate_emb) 439 | cont_emb = self.cont_emb(cont_x) 440 | 441 | seq_emb = torch.cat([cate_emb, cont_emb], 2) 442 | 443 | encoded_layers = self.encoder(inputs_embeds=seq_emb, attention_mask=mask) 444 | sequence_output = encoded_layers[0] 445 | sequence_output = sequence_output[:, -1] 446 | 447 | pred_y = self.reg_layer(sequence_output) 448 | return pred_y 449 | 450 | 451 | encoders = { 452 | 'LSTM':LSTMModel, 453 | 'TRANSFORMER':TransfomerModel, 454 | 'BERT':DSB_BertModel, 455 | 'GPT2':DSB_GPT2Model, 456 | 'ALBERT':DSB_ALBERTModel, 457 | 'XLNET':DSB_XLNetModel, 458 | 'XLM':DSB_XLMModel, 459 | 460 | } 461 | -------------------------------------------------------------------------------- /code/bowl_utils.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from numba import jit 4 | from functools import partial 5 | import scipy as sp 6 | import random 7 | from sklearn.model_selection import train_test_split, StratifiedKFold 8 | 9 | 10 | def get_train_valid_groups(train_groups, k, random_state, shuffle=True): 11 | train_groups_df = pd.DataFrame({'group':train_groups}) 12 | g, g_len = zip(*train_groups_df['group'].value_counts().items()) 13 | 14 | kfold = StratifiedKFold(n_splits=5, random_state=random_state, shuffle=shuffle) 15 | train_index, _ = list(kfold.split(g, g_len))[k] 16 | train_indices = np.array(g)[train_index] 17 | 18 | train_valid_bool = np.isin(train_groups, train_indices) 19 | 20 | return train_valid_bool 21 | 22 | 23 | def get_train_valid_rowids(train_samples, train_groups, k, random_state, random_state2=None, shuffle=True, choice=True): 24 | if random_state2 is None: 25 | random_state2 = random_state 26 | train_groups_df = pd.DataFrame({'group':train_groups}) 27 | g, g_len = zip(*train_groups_df['group'].value_counts().items()) 28 | 29 | kfold = StratifiedKFold(n_splits=5, random_state=random_state, shuffle=shuffle) 30 | train_index, _ = list(kfold.split(g, g_len))[k] 31 | 32 | train_indices = np.array(g)[train_index] 33 | new_train_samples = [rowid for rowid, group in enumerate(train_groups) if group in train_indices] 34 | new_valid_samples = [rowid for rowid, group in 
enumerate(train_groups) if group not in train_indices] 35 | 36 | random.seed(random_state2) 37 | if choice: 38 | group_dict = {} 39 | for rowid, group in enumerate(train_groups): 40 | if group not in train_indices: 41 | if group not in group_dict: 42 | group_dict[group] = [] 43 | group_dict[group].append(rowid) 44 | new_valid_samples = [random.choice(v) for v in group_dict.values()] 45 | 46 | return np.array(new_train_samples), np.array(new_valid_samples) 47 | 48 | 49 | def train_valid_split(train_samples, train_groups, k, random_state, random_state2=None, shuffle=True, choice=True): 50 | if random_state2 is None: 51 | random_state2 = random_state 52 | 53 | #print(f'random_state:{random_state}, random_state2:{random_state2}') 54 | train_groups_df = pd.DataFrame({'group':train_groups}) 55 | g, g_len = zip(*train_groups_df['group'].value_counts().items()) 56 | 57 | kfold = StratifiedKFold(n_splits=5, random_state=random_state, shuffle=shuffle) 58 | train_index, _ = list(kfold.split(g, g_len))[k] 59 | 60 | train_indices = np.array(g)[train_index] 61 | new_train_samples = [row_id for row_id, group in zip(train_samples, train_groups) if group in train_indices] 62 | new_valid_samples = [row_id for row_id, group in zip(train_samples, train_groups) if group not in train_indices] 63 | 64 | random.seed(random_state2) 65 | if choice: 66 | group_dict = {} 67 | for row_id, group in zip(train_samples, train_groups): 68 | if group not in train_indices: 69 | if group not in group_dict: 70 | group_dict[group] = [] 71 | group_dict[group].append(row_id) 72 | new_valid_samples = [random.choice(v) for v in group_dict.values()] 73 | 74 | return np.array(new_train_samples), np.array(new_valid_samples) 75 | 76 | 77 | @jit 78 | def qwk3(a1, a2, max_rat=3): 79 | assert(len(a1) == len(a2)) 80 | a1 = np.asarray(a1, dtype=int) 81 | a2 = np.asarray(a2, dtype=int) 82 | 83 | hist1 = np.zeros((max_rat + 1, )) 84 | hist2 = np.zeros((max_rat + 1, )) 85 | 86 | o = 0 87 | for k in range(a1.shape[0]): 88 | i, j = a1[k], a2[k] 89 | hist1[i] += 1 90 | hist2[j] += 1 91 | o += (i - j) * (i - j) 92 | 93 | e = 0 94 | for i in range(max_rat + 1): 95 | for j in range(max_rat + 1): 96 | e += hist1[i] * hist2[j] * (i - j) * (i - j) 97 | 98 | e = e / a1.shape[0] 99 | 100 | return 1 - o / (e+1e-08) 101 | 102 | 103 | 104 | class OptimizedRounder(object): 105 | """ 106 | An optimizer for rounding thresholds 107 | to maximize Quadratic Weighted Kappa (QWK) score 108 | # https://www.kaggle.com/naveenasaithambi/optimizedrounder-improved 109 | """ 110 | def __init__(self): 111 | self.coef_ = 0 112 | 113 | def _kappa_loss(self, coef, X, y): 114 | """ 115 | Get loss according to 116 | using current coefficients 117 | 118 | :param coef: A list of coefficients that will be used for rounding 119 | :param X: The raw predictions 120 | :param y: The ground truth labels 121 | """ 122 | X_p = pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3]) 123 | 124 | return -qwk3(y, X_p) 125 | 126 | def fit(self, X, y): 127 | """ 128 | Optimize rounding thresholds 129 | 130 | :param X: The raw predictions 131 | :param y: The ground truth labels 132 | """ 133 | loss_partial = partial(self._kappa_loss, X=X, y=y) 134 | initial_coef = [0.5, 1.5, 2.5] 135 | self.coef_ = sp.optimize.minimize(loss_partial, initial_coef, method='nelder-mead') 136 | 137 | def predict(self, X, coef): 138 | """ 139 | Make predictions with specified thresholds 140 | 141 | :param X: The raw predictions 142 | :param coef: A list of coefficients that will be used for 
rounding 143 | """ 144 | return pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3]) 145 | 146 | def coefficients(self): 147 | """ 148 | Return the optimized coefficients 149 | """ 150 | return self.coef_['x'] 151 | 152 | 153 | def get_optimized_kappa_score(predictions, groundtruth): 154 | optR = OptimizedRounder() 155 | optR.fit(predictions, groundtruth) 156 | coefficients = optR.coefficients() 157 | #print(coefficients) 158 | temp_predictions = predictions.copy() 159 | temp_predictions[temp_predictions < coefficients[0]] = 0 160 | temp_predictions[(coefficients[0]<=temp_predictions)&(temp_predictions< coefficients[1])] = 1 161 | temp_predictions[(coefficients[1]<=temp_predictions)&(temp_predictions< coefficients[2])] = 2 162 | temp_predictions[(coefficients[2]<=temp_predictions)] = 3 163 | 164 | kappa_score = qwk3(temp_predictions, groundtruth) 165 | return kappa_score 166 | -------------------------------------------------------------------------------- /code/predict.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import torch 4 | import json 5 | import gc 6 | import time 7 | import random 8 | import numpy as np 9 | import pandas as pd 10 | import torch.nn as nn 11 | from numba import jit 12 | from functools import partial 13 | from scipy import optimize 14 | from torch.utils.data import DataLoader 15 | from pytorch_transformers.modeling_bert import BertConfig, BertEncoder 16 | 17 | import warnings 18 | warnings.filterwarnings(action='ignore') 19 | 20 | 21 | TARGET = ['accuracy_group', 'num_correct', 'num_incorrect'] 22 | GAME_TARGET = ['accuracy_group_game', 'num_correct_game', 'num_incorrect_game'] 23 | #TARGET = ['accuracy_group'] 24 | 25 | from torch.utils.data import Dataset 26 | 27 | 28 | class BowlDataset(Dataset): 29 | def __init__(self, cfg, df, sample_indices, aug=0.0, aug_p=0.5, padding_front=True, use_tta=False): 30 | self.cfg = cfg 31 | self.df = df.copy() 32 | self.sample_indices = sample_indices 33 | self.seq_len = self.cfg.seq_len 34 | self.aug = aug 35 | self.aug_p = aug_p 36 | self.use_tta = use_tta 37 | self.padding_front = padding_front 38 | 39 | self.cate_cols = self.cfg.cate_cols 40 | self.cont_cols = self.cfg.cont_cols 41 | 42 | self.cate_df = self.df[self.cate_cols] 43 | self.cont_df = np.log1p(self.df[self.cont_cols]) 44 | if 'accuracy_group' in self.df: 45 | self.df['num_incorrect'][self.df['num_incorrect']==1] = 0.5 46 | self.df['num_incorrect'][self.df['num_incorrect']>1] = 1.0 47 | self.df['num_correct'][self.df['num_correct']>1] = 1.0 48 | self.target_df = self.df[TARGET] 49 | else: 50 | self.target_df = None 51 | 52 | if 'accuracy_group_game' in self.df: 53 | self.df['num_incorrect_game'][self.df['num_incorrect_game']==1] = 0.5 54 | self.df['num_incorrect_game'][self.df['num_incorrect_game']>1] = 1.0 55 | self.df['num_correct_game'][self.df['num_correct_game']>1] = 1.0 56 | self.target_game_df = self.df[GAME_TARGET] 57 | else: 58 | self.target_game_df = None 59 | 60 | def __getitem__(self, idx): 61 | indices = self.sample_indices[idx] 62 | 63 | seq_len = min(self.seq_len, len(indices)) 64 | 65 | if self.aug > 0: 66 | if len(indices)>30: 67 | if np.random.binomial(1, self.aug_p) == 1: 68 | cut_ratio = random.random() 69 | if cut_ratio > self.aug: 70 | cut_ratio = self.aug 71 | #cut_ratio = self.aug 72 | start_idx = max(int(len(indices)*cut_ratio), 30) 73 | indices = indices[start_idx:] 74 | seq_len = min(self.seq_len, len(indices)) 75 | 76 | tmp_cate_x = 
torch.LongTensor(self.cate_df.iloc[indices].values) 77 | cate_x = torch.LongTensor(self.seq_len, len(self.cate_cols)).zero_() 78 | if self.padding_front: 79 | cate_x[-seq_len:] = tmp_cate_x[-seq_len:] 80 | else: 81 | cate_x[:seq_len] = tmp_cate_x[-seq_len:] 82 | 83 | tmp_cont_x = torch.FloatTensor(self.cont_df.iloc[indices].values) 84 | tmp_cont_x[-1] = 0 85 | cont_x = torch.FloatTensor(self.seq_len, len(self.cont_cols)).zero_() 86 | if self.padding_front: 87 | cont_x[-seq_len:] = tmp_cont_x[-seq_len:] 88 | else: 89 | cont_x[:seq_len] = tmp_cont_x[-seq_len:] 90 | 91 | mask = torch.ByteTensor(self.seq_len).zero_() 92 | if self.padding_front: 93 | mask[-seq_len:] = 1 94 | else: 95 | mask[:seq_len] = 1 96 | 97 | if self.target_df is not None: 98 | target = torch.FloatTensor(self.target_df.iloc[indices[-1]].values) 99 | if target.sum() == 0: 100 | target = torch.FloatTensor(self.target_game_df.iloc[indices[-1]].values) 101 | else: 102 | target = 0 103 | 104 | return cate_x, cont_x, mask, target 105 | 106 | def __len__(self): 107 | return len(self.sample_indices) 108 | 109 | 110 | class TransfomerModel(nn.Module): 111 | def __init__(self, cfg): 112 | super(TransfomerModel, self).__init__() 113 | self.cfg = cfg 114 | cate_col_size = len(cfg.cate_cols) 115 | cont_col_size = len(cfg.cont_cols) 116 | self.cate_emb = nn.Embedding(cfg.total_cate_size, cfg.emb_size, padding_idx=0) 117 | self.cate_proj = nn.Sequential( 118 | nn.Linear(cfg.emb_size*cate_col_size, cfg.hidden_size//2), 119 | nn.LayerNorm(cfg.hidden_size//2), 120 | ) 121 | self.cont_emb = nn.Sequential( 122 | nn.Linear(cont_col_size, cfg.hidden_size//2), 123 | nn.LayerNorm(cfg.hidden_size//2), 124 | ) 125 | 126 | self.config = BertConfig( 127 | 3, # not used 128 | hidden_size=cfg.hidden_size, 129 | num_hidden_layers=cfg.nlayers, 130 | num_attention_heads=cfg.nheads, 131 | intermediate_size=cfg.hidden_size, 132 | hidden_dropout_prob=cfg.dropout, 133 | attention_probs_dropout_prob=cfg.dropout, 134 | ) 135 | self.encoder = BertEncoder(self.config) 136 | 137 | def get_reg(): 138 | return nn.Sequential( 139 | nn.Linear(cfg.hidden_size, cfg.hidden_size), 140 | nn.LayerNorm(cfg.hidden_size), 141 | nn.Dropout(cfg.dropout), 142 | nn.ReLU(), 143 | nn.Linear(cfg.hidden_size, cfg.hidden_size), 144 | nn.LayerNorm(cfg.hidden_size), 145 | nn.Dropout(cfg.dropout), 146 | nn.ReLU(), 147 | nn.Linear(cfg.hidden_size, cfg.target_size), 148 | ) 149 | self.reg_layer = get_reg() 150 | 151 | def forward(self, cate_x, cont_x, mask): 152 | batch_size = cate_x.size(0) 153 | 154 | cate_emb = self.cate_emb(cate_x).view(batch_size, self.cfg.seq_len, -1) 155 | cate_emb = self.cate_proj(cate_emb) 156 | cont_emb = self.cont_emb(cont_x) 157 | 158 | seq_emb = torch.cat([cate_emb, cont_emb], 2) 159 | 160 | extended_attention_mask = mask.unsqueeze(1).unsqueeze(2) 161 | extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility 162 | extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 163 | head_mask = [None] * self.config.num_hidden_layers 164 | 165 | encoded_layers = self.encoder(seq_emb, extended_attention_mask, head_mask=head_mask) 166 | sequence_output = encoded_layers[-1] 167 | sequence_output = sequence_output[:, -1] 168 | 169 | pred_y = self.reg_layer(sequence_output) 170 | return pred_y 171 | 172 | 173 | class LSTMATTNModel(nn.Module): 174 | def __init__(self, cfg): 175 | super(LSTMATTNModel, self).__init__() 176 | self.cfg = cfg 177 | cate_col_size = len(cfg.cate_cols) 178 | cont_col_size = 
len(cfg.cont_cols) 179 | self.cate_emb = nn.Embedding(cfg.total_cate_size, cfg.emb_size, padding_idx=0) 180 | self.cate_proj = nn.Sequential( 181 | nn.Linear(cfg.emb_size*cate_col_size, cfg.hidden_size//2), 182 | nn.LayerNorm(cfg.hidden_size//2), 183 | ) 184 | self.cont_emb = nn.Sequential( 185 | nn.Linear(cont_col_size, cfg.hidden_size//2), 186 | nn.LayerNorm(cfg.hidden_size//2), 187 | ) 188 | 189 | self.encoder = nn.LSTM(cfg.hidden_size, 190 | cfg.hidden_size, 1, dropout=cfg.dropout, batch_first=True) 191 | 192 | self.config = BertConfig( 193 | 3, # not used 194 | hidden_size=cfg.hidden_size, 195 | num_hidden_layers=1, 196 | num_attention_heads=cfg.nheads, 197 | intermediate_size=cfg.hidden_size, 198 | hidden_dropout_prob=cfg.dropout, 199 | attention_probs_dropout_prob=cfg.dropout, 200 | ) 201 | self.attn = BertEncoder(self.config) 202 | 203 | def get_reg(): 204 | return nn.Sequential( 205 | nn.Linear(cfg.hidden_size, cfg.hidden_size), 206 | nn.LayerNorm(cfg.hidden_size), 207 | nn.Dropout(cfg.dropout), 208 | nn.ReLU(), 209 | nn.Linear(cfg.hidden_size, cfg.hidden_size), 210 | nn.LayerNorm(cfg.hidden_size), 211 | nn.Dropout(cfg.dropout), 212 | nn.ReLU(), 213 | nn.Linear(cfg.hidden_size, cfg.target_size), 214 | ) 215 | self.reg_layer = get_reg() 216 | 217 | def forward(self, cate_x, cont_x, mask): 218 | batch_size = cate_x.size(0) 219 | 220 | cate_emb = self.cate_emb(cate_x).view(batch_size, self.cfg.seq_len, -1) 221 | cate_emb = self.cate_proj(cate_emb) 222 | cont_emb = self.cont_emb(cont_x) 223 | 224 | seq_emb = torch.cat([cate_emb, cont_emb], 2) 225 | 226 | output, _ = self.encoder(seq_emb) 227 | 228 | extended_attention_mask = mask.unsqueeze(1).unsqueeze(2) 229 | extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility 230 | extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 231 | head_mask = [None] * self.config.num_hidden_layers 232 | 233 | encoded_layers = self.attn(output, extended_attention_mask, head_mask=head_mask) 234 | sequence_output = encoded_layers[-1] 235 | sequence_output = sequence_output[:, -1] 236 | pred_y = self.reg_layer(sequence_output) 237 | return pred_y 238 | 239 | 240 | ENCODERS = { 241 | 'TRANSFORMER':TransfomerModel, 242 | 'LSTMATTN':LSTMATTNModel, 243 | } 244 | 245 | 246 | def replace_4110_4100(df): 247 | rep_code4110_bool = (df['title']=='Bird Measurer (Assessment)')&(df['event_code']==4110) 248 | rep_code4100_bool = (df['title']=='Bird Measurer (Assessment)')&(df['event_code']==4100) 249 | df['event_code'][rep_code4110_bool] = 4100 250 | df['event_code'][rep_code4100_bool] = 5110 251 | 252 | 253 | def get_agged_session(df): 254 | event_code = pd.crosstab(df['game_session'], df['event_code']) 255 | event_id = pd.crosstab(df['game_session'], df['event_id']) 256 | event_num_correct = pd.pivot_table(df[(~df['correct'].isna())], index='game_session', columns='event_code', values='num_correct', aggfunc='sum') 257 | event_num_incorrect = pd.pivot_table(df[(~df['correct'].isna())], index='game_session', columns='event_code', values='num_incorrect', aggfunc='sum') 258 | event_accuracy = event_num_correct/(event_num_correct+event_num_incorrect[event_num_correct.columns]) 259 | event_accuracy = event_accuracy.add_prefix('accuray_') 260 | del event_num_correct, event_num_incorrect 261 | 262 | event_round = pd.pivot_table(df[~df['correct'].isna()], index='game_session', columns='event_code', values='round', aggfunc='max') 263 | event_round = event_round.add_prefix('round_') 264 | 265 | 
print('max_game_time') 266 | df['elapsed_time'] = df[['game_session', 'game_time']].groupby('game_session')['game_time'].diff() 267 | game_time = df.groupby('game_session', as_index=False)['elapsed_time'].agg(['mean', 'max']).reset_index() 268 | game_time.columns = ['game_session', 'mean_game_time', 'max_game_time'] 269 | df = df.merge(game_time, on='game_session', how='left') 270 | event_max_game_time = pd.pivot_table(df, index='game_session', columns='event_code', values='elapsed_time', aggfunc='max') 271 | event_max_game_time = event_max_game_time.add_prefix('max_game_time_') 272 | del df['elapsed_time'] 273 | 274 | print('session_extra_df') 275 | session_extra_df = pd.concat([event_code, event_id, event_accuracy, event_round], 1) 276 | session_extra_df.index.name = 'game_session' 277 | session_extra_df.reset_index(inplace=True) 278 | del event_code, event_id, event_accuracy, event_round 279 | 280 | print('session_df') 281 | session_df = df.drop_duplicates('game_session', keep='last').reset_index(drop=True) 282 | session_df['row_id'] = session_df.index 283 | session_df = session_df.merge(session_extra_df, how='left', on='game_session') 284 | return session_df 285 | 286 | def gen_label(df): 287 | num_corrects = [] 288 | for inst_id, one_df in df.groupby('installation_id'): 289 | one_df = one_df[(one_df['type']=='Assessment')&(one_df['event_code']==4100)] 290 | for game_session, title_df in one_df.groupby('game_session'): 291 | num_correct = title_df['event_data'].str.contains('"correct":true').sum() 292 | num_incorrect = title_df['event_data'].str.contains('"correct":false').sum() 293 | num_corrects.append([inst_id, game_session, num_correct, num_incorrect]) 294 | label_df = pd.DataFrame(num_corrects, columns=['installation_id', 'game_session', 'num_correct', 'num_incorrect']) 295 | label_df['accuracy'] = label_df['num_correct'] / (label_df['num_correct']+label_df['num_incorrect']) 296 | label_df['accuracy_group'] = 3 297 | label_df['accuracy_group'][label_df['accuracy']==0.5] = 2 298 | label_df['accuracy_group'][label_df['accuracy']<0.5] = 1 299 | label_df['accuracy_group'][label_df['accuracy']==0] = 0 300 | return label_df 301 | 302 | 303 | def extract_data_from_event_code(df, columns=['correct', 'round']): 304 | for col in columns: 305 | col_bool = df['event_data'].str.contains(col) 306 | df[col] = np.nan 307 | df[col][col_bool] = df['event_data'][col_bool].apply(lambda x: json.loads(x).get(col)).astype(float) 308 | 309 | 310 | def get_train_sample_indices(df): 311 | sample_indices = [] 312 | inst_indiecs = [] 313 | df_groups = df.groupby('installation_id').groups 314 | for inst_idx, indices in enumerate(df_groups.values()): 315 | one_df = df.iloc[indices].reset_index(drop=True) 316 | assessment_start_indices = one_df[(one_df['type']=='Assessment')& 317 | (one_df['accuracy_group']>=0) 318 | ].index 319 | for num, start_index in enumerate(assessment_start_indices): 320 | sample_indices.append( one_df.iloc[:start_index+1]['row_id'].tolist() ) 321 | inst_indiecs.append(inst_idx) 322 | return sample_indices, inst_indiecs 323 | 324 | def choose_one(train_samples, train_groups, random_state): 325 | random.seed(random_state) 326 | group_dict = {} 327 | for row_id, group in zip(train_samples, train_groups): 328 | if group not in group_dict: 329 | group_dict[group] = [] 330 | group_dict[group].append(row_id) 331 | new_train_samples = [] 332 | for v in group_dict.values(): 333 | new_train_samples.append(random.choice(v)) 334 | 335 | return np.array(new_train_samples) 336 | 337 | def 
preprocessing(df, train_columns, mappers_dict, cate_offset, cate_cols, cont_cols, extra_cont_cls): 338 | print('preprocessing ... ') 339 | replace_4110_4100(df) 340 | 341 | print('generating label ...') 342 | label_df = gen_label(df) 343 | 344 | print('extract_data_from_event_code ...') 345 | extract_data_from_event_code(df) 346 | df['num_incorrect'] = np.where(df['correct']==0, 1, np.nan) 347 | df['num_correct'] = np.where(df['correct']==1, 1, np.nan) 348 | 349 | df['game_time'] = df['game_time'] // 1000 350 | 351 | df = get_agged_session(df) 352 | df = df.drop(['correct', 'round', 'num_correct', 'num_incorrect'], axis=1) 353 | 354 | df = df.merge(label_df, on=['game_session', 'installation_id'], how='left') 355 | 356 | samples, groups = get_train_sample_indices(df) 357 | 358 | df = df.append(pd.DataFrame(columns=train_columns))[train_columns] 359 | df = df.fillna(0) 360 | 361 | for col in cate_cols: 362 | df[col] = df[col].map(mappers_dict[col]).fillna(0).astype(int) 363 | 364 | print('preprocessing ... done') 365 | return df, samples, groups 366 | 367 | @jit 368 | def qwk3(a1, a2, max_rat=3): 369 | assert(len(a1) == len(a2)) 370 | a1 = np.asarray(a1, dtype=int) 371 | a2 = np.asarray(a2, dtype=int) 372 | 373 | hist1 = np.zeros((max_rat + 1, )) 374 | hist2 = np.zeros((max_rat + 1, )) 375 | 376 | o = 0 377 | for k in range(a1.shape[0]): 378 | i, j = a1[k], a2[k] 379 | hist1[i] += 1 380 | hist2[j] += 1 381 | o += (i - j) * (i - j) 382 | 383 | e = 0 384 | for i in range(max_rat + 1): 385 | for j in range(max_rat + 1): 386 | e += hist1[i] * hist2[j] * (i - j) * (i - j) 387 | 388 | e = e / a1.shape[0] 389 | 390 | return 1 - o / (e+1e-08) 391 | 392 | 393 | class OptimizedRounder(object): 394 | """ 395 | An optimizer for rounding thresholds 396 | to maximize Quadratic Weighted Kappa (QWK) score 397 | # https://www.kaggle.com/naveenasaithambi/optimizedrounder-improved 398 | """ 399 | def __init__(self): 400 | self.coef_ = 0 401 | 402 | def _kappa_loss(self, coef, X, y): 403 | """ 404 | Get loss according to 405 | using current coefficients 406 | 407 | :param coef: A list of coefficients that will be used for rounding 408 | :param X: The raw predictions 409 | :param y: The ground truth labels 410 | """ 411 | X_p = pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3]) 412 | 413 | return -qwk3(y, X_p) 414 | 415 | def fit(self, X, y): 416 | """ 417 | Optimize rounding thresholds 418 | 419 | :param X: The raw predictions 420 | :param y: The ground truth labels 421 | """ 422 | loss_partial = partial(self._kappa_loss, X=X, y=y) 423 | initial_coef = [0.5, 1.5, 2.5] 424 | self.coef_ = optimize.minimize(loss_partial, initial_coef, method='nelder-mead') 425 | 426 | def predict(self, X, coef): 427 | """ 428 | Make predictions with specified thresholds 429 | 430 | :param X: The raw predictions 431 | :param coef: A list of coefficients that will be used for rounding 432 | """ 433 | return pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3]) 434 | 435 | def coefficients(self): 436 | """ 437 | Return the optimized coefficients 438 | """ 439 | return self.coef_['x'] 440 | 441 | def get_optimized_kappa_score(predictions, groundtruth): 442 | optR = OptimizedRounder() 443 | optR.fit(predictions, groundtruth) 444 | coefficients = optR.coefficients() 445 | #print(coefficients) 446 | temp_predictions = predictions.copy() 447 | temp_predictions[temp_predictions < coefficients[0]] = 0 448 | temp_predictions[(coefficients[0]<=temp_predictions)&(temp_predictions< 
coefficients[1])] = 1 449 | temp_predictions[(coefficients[1]<=temp_predictions)&(temp_predictions< coefficients[2])] = 2 450 | temp_predictions[(coefficients[2]<=temp_predictions)] = 3 451 | 452 | kappa_score = qwk3(temp_predictions, groundtruth) 453 | return kappa_score, coefficients 454 | 455 | class CFG: 456 | learning_rate=1.0e-4 457 | batch_size=64 458 | num_workers=4 459 | print_freq=100 460 | test_freq=1 461 | start_epoch=0 462 | num_train_epochs=1 463 | warmup_steps=30 464 | max_grad_norm=1000 465 | gradient_accumulation_steps=1 466 | weight_decay=0.01 467 | dropout=0.2 468 | emb_size=100 469 | hidden_size=500 470 | nlayers=2 471 | nheads=8 472 | device='cpu' 473 | #device='cuda:0' 474 | seed=7 475 | ntta = [0, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6] # TEST KAPPA_SCORE:0.5990772768904306 476 | wtta = [0.8] 477 | CFG.wtta += [ (1-CFG.wtta[0])/(len(CFG.ntta)-1) for _ in range(len(CFG.ntta)-1)] 478 | 479 | 480 | def main(): 481 | os.environ['PYTHONHASHSEED'] = str(CFG.seed) 482 | random.seed(CFG.seed) 483 | np.random.seed(CFG.seed) 484 | torch.manual_seed(CFG.seed) 485 | torch.cuda.manual_seed(CFG.seed) 486 | torch.backends.cudnn.deterministic = True 487 | 488 | settings = json.load(open('SETTINGS.json')) 489 | 490 | test_df = pd.read_csv(os.path.join(settings['RAW_DATA_DIR'], 'test.csv')) 491 | 492 | [train_columns, mappers_dict, cate_offset, 493 | cate_cols, cont_cols, extra_cont_cls] = torch.load(os.path.join(settings['CLEAN_DATA_DIR'], 'bowl_info.pt')) 494 | test_df, test_samples, test_groups = preprocessing(test_df, train_columns, mappers_dict, cate_offset, 495 | cate_cols, cont_cols, extra_cont_cls) 496 | 497 | CFG.target_size = 3 498 | CFG.total_cate_size = cate_offset 499 | print(CFG.__dict__) 500 | CFG.cate_cols = cate_cols 501 | CFG.cont_cols = cont_cols+extra_cont_cls 502 | 503 | base_model_path_list = [ 504 | ['bowl.pt', [ 505 | [1.0, os.path.join(settings['MODEL_DIR'], 'b-32_a-TRANSFORMER_e-100_h-500_d-0.2_l-2_hd-10_s-7_len-100_aug-0.5_da-bowl.pt_k-0.pt')], 506 | ]], 507 | ] 508 | 509 | ################################################ 510 | # find the coefficients 511 | ################################################ 512 | rand_seed_list = [7, 77, 777, 1, 2] 513 | #rand_seed_list = [110798, 497274, 885651, 673327, 599183, 272713, 582394, 180043, 855725, 932850] 514 | sum_coefficients = 0 515 | sum_cnt = 0 516 | for _, base_model_paths in base_model_path_list: 517 | for model_w, base_model_path in base_model_paths: 518 | path = base_model_path.split('/')[-1] 519 | path = path.replace('bowl_', '') 520 | cfg_dict = dict([tok.split('-') for tok in path.split('_')]) 521 | CFG.encoder = cfg_dict['a'] 522 | CFG.seq_len = int(cfg_dict['len']) 523 | CFG.emb_size = int(cfg_dict['e']) 524 | CFG.hidden_size = int(cfg_dict['h']) 525 | CFG.nlayers = int(cfg_dict['l']) 526 | CFG.nheads = int(cfg_dict['hd']) 527 | CFG.seed = int(cfg_dict['s']) 528 | CFG.data_seed = int(cfg_dict['s']) 529 | 530 | for k in range(5): 531 | model = ENCODERS[CFG.encoder](CFG) 532 | model_path = base_model_path.replace('k-0', f'k-{k}') 533 | 534 | checkpoint = torch.load(model_path, map_location=CFG.device) 535 | model.load_state_dict(checkpoint['state_dict']) 536 | model.to(CFG.device) 537 | print("=> loaded checkpoint '{}' (epoch {})".format(model_path, checkpoint['epoch'])) 538 | 539 | for rand_seed in rand_seed_list: 540 | chosen_samples = choose_one(test_samples, test_groups, random_state=rand_seed) 541 | predictions = 0 542 | for w, tta in zip(CFG.wtta, CFG.ntta): 543 | 
padding_front = False if CFG.encoder=='LSTM' else True 544 | valid_db = BowlDataset(CFG, test_df, chosen_samples, aug=tta, aug_p=1.0, 545 | padding_front=padding_front, use_tta=True) 546 | valid_loader = DataLoader( 547 | valid_db, batch_size=CFG.batch_size, shuffle=False, 548 | num_workers=CFG.num_workers, pin_memory=True) 549 | prediction, groundtruths = validate(valid_loader, model) 550 | predictions += w*prediction 551 | try: 552 | valid_kappa, valid_coefficients = get_optimized_kappa_score(predictions, groundtruths) 553 | print(f'k[{k}]-s2[{rand_seed}]: valid_kappa:{valid_kappa} - {valid_coefficients}') 554 | sum_coefficients += np.array(valid_coefficients) 555 | sum_cnt += 1 556 | except Exception as e: 557 | print(e) 558 | print(f'k[{k}]-s2[{rand_seed}]: valid_kappa: Failed!') 559 | pass 560 | del model 561 | ################################################ 562 | test_samples = list(test_df.groupby(['installation_id']).groups.values()) 563 | 564 | coefficients = 0.2*sum_coefficients/sum_cnt + 0.8*np.array([0.53060865, 1.66266655, 2.31145611]) 565 | print('=======================================') 566 | print(f'coefficients - {coefficients}') 567 | print('=======================================') 568 | 569 | random.seed(CFG.seed) 570 | 571 | submission_df = test_df.groupby('installation_id').tail(1)[['installation_id']] 572 | submission_df['accuracy_group'] = 0 573 | 574 | for _, base_model_paths in base_model_path_list: 575 | for model_w, base_model_path in base_model_paths: 576 | path = base_model_path.split('/')[-1] 577 | path = path.replace('bowl_', '') 578 | cfg_dict = dict([tok.split('-') for tok in path.split('_')]) 579 | CFG.encoder = cfg_dict['a'] 580 | CFG.seq_len = int(cfg_dict['len']) 581 | CFG.emb_size = int(cfg_dict['e']) 582 | CFG.hidden_size = int(cfg_dict['h']) 583 | CFG.nlayers = int(cfg_dict['l']) 584 | CFG.nheads = int(cfg_dict['hd']) 585 | CFG.seed = int(cfg_dict['s']) 586 | CFG.data_seed = int(cfg_dict['s']) 587 | 588 | for k in range(5): 589 | model = ENCODERS[CFG.encoder](CFG) 590 | model_path = base_model_path.replace('k-0', f'k-{k}') 591 | 592 | checkpoint = torch.load(model_path, map_location=CFG.device) 593 | model.load_state_dict(checkpoint['state_dict']) 594 | model.to(CFG.device) 595 | print("=> loaded checkpoint '{}' (epoch {})".format(model_path, checkpoint['epoch'])) 596 | 597 | for w, tta in zip(CFG.wtta, CFG.ntta): 598 | padding_front = False if CFG.encoder=='LSTM' else True 599 | valid_db = BowlDataset(CFG, test_df, test_samples, aug=tta, aug_p=1.0, 600 | padding_front=padding_front, use_tta=True) 601 | valid_loader = DataLoader( 602 | valid_db, batch_size=CFG.batch_size, shuffle=False, 603 | num_workers=CFG.num_workers, pin_memory=True) 604 | predictions = test(valid_loader, model) 605 | submission_df['accuracy_group'] += w*predictions*model_w*(1/5) 606 | del model 607 | 608 | submission_df['accuracy_group'] /= len(base_model_path_list) 609 | compute_th_acc_gp(submission_df['accuracy_group'], coefficients) 610 | submission_df['accuracy_group'] = submission_df['accuracy_group'].astype(int) 611 | os.makedirs(settings['SUBMISSION_DIR'], exist_ok=True) 612 | submission_df.to_csv(os.path.join(settings['SUBMISSION_DIR'], 'submission.csv'), index=False) 613 | print('done') 614 | 615 | def compute_th_acc_gp(temp, coef): 616 | temp[temp < coef[0]] = 0 617 | temp[(coef[0]<=temp)&(temp< coef[1])] = 1 618 | temp[(coef[1]<=temp)&(temp< coef[2])] = 2 619 | temp[(coef[2]<=temp)] = 3 620 | 621 | def compute_acc_gp(pred): 622 | #batch_size = pred.size(0) 623 | 
pred = (3*pred[:, 0] - 2*pred[:, 1]) 624 | pred[pred < 0] = 0 625 | return pred 626 | 627 | 628 | def validate(valid_loader, model): 629 | model.eval() 630 | 631 | predictions = [] 632 | groundtruths = [] 633 | for step, (cate_x, cont_x, mask, y) in enumerate(valid_loader): 634 | 635 | cate_x, cont_x, mask = cate_x.to(CFG.device), cont_x.to(CFG.device), mask.to(CFG.device) 636 | 637 | k = 0.5 638 | with torch.no_grad(): 639 | pred = model(cate_x, cont_x, mask) 640 | 641 | # record accuracy 642 | pred_y = (1-k)*pred[:, 0] + (k)*compute_acc_gp(pred[:, 1:]) 643 | predictions.append(pred_y.detach().cpu()) 644 | groundtruths.append(y[:, 0]) 645 | 646 | predictions = torch.cat(predictions).numpy() 647 | groundtruths = torch.cat(groundtruths).numpy() 648 | 649 | return predictions, groundtruths 650 | 651 | 652 | def test(valid_loader, model): 653 | model.eval() 654 | 655 | predictions = [] 656 | for step, (cate_x, cont_x, mask, _) in enumerate(valid_loader): 657 | 658 | cate_x, cont_x, mask = cate_x.to(CFG.device), cont_x.to(CFG.device), mask.to(CFG.device) 659 | 660 | k = 0.5 661 | with torch.no_grad(): 662 | pred = model(cate_x, cont_x, mask) 663 | 664 | # record accuracy 665 | pred_y = (1-k)*pred[:, 0] + (k)*compute_acc_gp(pred[:, 1:]) 666 | predictions.append(pred_y.detach().cpu()) 667 | 668 | predictions = torch.cat(predictions).numpy() 669 | 670 | return predictions 671 | 672 | 673 | if __name__ == '__main__': 674 | main() -------------------------------------------------------------------------------- /code/prepare_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", 8 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5" 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "import torch\n", 13 | "import numpy as np\n", 14 | "import pandas as pd\n", 15 | "import os, gc, sys, warnings, random, math, psutil, pickle\n", 16 | "\n", 17 | "from tqdm.notebook import tqdm as tqdm_notebook\n", 18 | "import json\n", 19 | "pd.set_option('display.max_rows', 1000)\n", 20 | "pd.set_option('display.max_columns', 100)\n", 21 | "pd.set_option('display.max_colwidth', 500)\n", 22 | "pd.set_option('min_rows', 200)\n", 23 | "warnings.filterwarnings('ignore')" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "# DATA LOAD" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 2, 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "data": { 40 | "text/plain": [ 41 | "{'RAW_DATA_DIR': '../input/data-science-bowl-2019',\n", 42 | " 'TRAIN_DATA_DIR': '../input/processed',\n", 43 | " 'MODEL_CHECKPOINT_DIR': '../models/',\n", 44 | " 'LOGS_DIR': './logs',\n", 45 | " 'SUBMISSION_DIR': '../submissions/'}" 46 | ] 47 | }, 48 | "execution_count": 2, 49 | "metadata": {}, 50 | "output_type": "execute_result" 51 | } 52 | ], 53 | "source": [ 54 | "settings = json.load(open('SETTINGS.json'))\n", 55 | "settings" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 3, 61 | "metadata": { 62 | "scrolled": true 63 | }, 64 | "outputs": [ 65 | { 66 | "name": "stdout", 67 | "output_type": "stream", 68 | "text": [ 69 | "CPU times: user 36.3 s, sys: 1.94 s, total: 38.3 s\n", 70 | "Wall time: 33.7 s\n" 71 | ] 72 | } 73 | ], 74 | "source": [ 75 | "%%time\n", 76 | "train_df = pd.read_csv(os.path.join(settings['RAW_DATA_DIR'], 'train.csv'))\n", 77 | "test_df = 
pd.read_csv(os.path.join(settings['RAW_DATA_DIR'], 'test.csv'))\n", 78 | "train_label_df = pd.read_csv(os.path.join(settings['RAW_DATA_DIR'], 'train_labels.csv'))\n", 79 | "specs_df = pd.read_csv(os.path.join(settings['RAW_DATA_DIR'], 'specs.csv'))" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 4, 85 | "metadata": {}, 86 | "outputs": [ 87 | { 88 | "name": "stdout", 89 | "output_type": "stream", 90 | "text": [ 91 | "CPU times: user 5.06 s, sys: 180 ms, total: 5.24 s\n", 92 | "Wall time: 1.05 s\n" 93 | ] 94 | } 95 | ], 96 | "source": [ 97 | "%%time\n", 98 | "def replace_4110_4100(df):\n", 99 | " rep_code4110_bool = (df['title']=='Bird Measurer (Assessment)')&(df['event_code']==4110)\n", 100 | " rep_code4100_bool = (df['title']=='Bird Measurer (Assessment)')&(df['event_code']==4100)\n", 101 | " df['event_code'][rep_code4110_bool] = 4100\n", 102 | " df['event_code'][rep_code4100_bool] = 5110 # 다른 type의 코드와 겹치지 않도록\n", 103 | "replace_4110_4100(train_df)\n", 104 | "replace_4110_4100(test_df)" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "# Create additional columns from event_code" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 5, 117 | "metadata": { 118 | "scrolled": true 119 | }, 120 | "outputs": [ 121 | { 122 | "name": "stdout", 123 | "output_type": "stream", 124 | "text": [ 125 | "CPU times: user 24 µs, sys: 1 µs, total: 25 µs\n", 126 | "Wall time: 3.34 µs\n" 127 | ] 128 | } 129 | ], 130 | "source": [ 131 | "%%time\n", 132 | "def extract_data_from_event_code(df, columns=['correct', 'round']):\n", 133 | " for col in columns:\n", 134 | " col_bool = df['event_data'].str.contains(col)\n", 135 | " df[col] = np.nan\n", 136 | " df[col][col_bool] = df['event_data'][col_bool].apply(lambda x: json.loads(x).get(col)).astype(float)" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 6, 142 | "metadata": {}, 143 | "outputs": [ 144 | { 145 | "name": "stdout", 146 | "output_type": "stream", 147 | "text": [ 148 | "CPU times: user 1min 4s, sys: 1.93 s, total: 1min 6s\n", 149 | "Wall time: 47.4 s\n" 150 | ] 151 | } 152 | ], 153 | "source": [ 154 | "%%time\n", 155 | "extract_data_from_event_code(train_df)\n", 156 | "extract_data_from_event_code(test_df)\n", 157 | " \n", 158 | "train_df['num_incorrect'] = np.where(train_df['correct']==0, 1, np.nan)\n", 159 | "train_df['num_correct'] = np.where(train_df['correct']==1, 1, np.nan)\n", 160 | "test_df['num_incorrect'] = np.where(test_df['correct']==0, 1, np.nan)\n", 161 | "test_df['num_correct'] = np.where(test_df['correct']==1, 1, np.nan)" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "# Convert game_time to seconds" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 7, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "train_df['game_time'] = train_df['game_time'] // 1000\n", 178 | "test_df['game_time'] = test_df['game_time'] // 1000" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": {}, 184 | "source": [ 185 | "# Aggregation by game_session" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 9, 191 | "metadata": { 192 | "scrolled": true 193 | }, 194 | "outputs": [ 195 | { 196 | "name": "stdout", 197 | "output_type": "stream", 198 | "text": [ 199 | "CPU times: user 2min 29s, sys: 21.9 s, total: 2min 51s\n", 200 | "Wall time: 1min 21s\n" 201 | ] 202 | } 203 | ], 204 | "source": [ 
205 | "%%time\n", 206 | "def get_agged_session(df):\n", 207 | " event_code = pd.crosstab(df['game_session'], df['event_code'])\n", 208 | " event_id = pd.crosstab(df['game_session'], df['event_id'])\n", 209 | " \n", 210 | " event_num_correct = pd.pivot_table(df[(~df['correct'].isna())], index='game_session', columns='event_code', values='num_correct', aggfunc='sum')\n", 211 | " event_num_incorrect = pd.pivot_table(df[(~df['correct'].isna())], index='game_session', columns='event_code', values='num_incorrect', aggfunc='sum')\n", 212 | " event_accuracy = event_num_correct/(event_num_correct+event_num_incorrect[event_num_correct.columns])\n", 213 | " event_accuracy = event_accuracy.add_prefix('accuray_') \n", 214 | " \n", 215 | " event_round = pd.pivot_table(df[~df['correct'].isna()], index='game_session', columns='event_code', values='round', aggfunc='max')\n", 216 | " event_round = event_round.add_prefix('round_') \n", 217 | " \n", 218 | " df['elapsed_time'] = df[['game_session', 'game_time']].groupby('game_session')['game_time'].diff()\n", 219 | " game_time = df.groupby('game_session', as_index=False)['elapsed_time'].agg(['mean', 'max']).reset_index()\n", 220 | " game_time.columns = ['game_session', 'mean_game_time', 'max_game_time'] \n", 221 | " df = df.merge(game_time, on='game_session', how='left') \n", 222 | " del df['elapsed_time']\n", 223 | " \n", 224 | " session_extra_df = pd.concat([event_code, event_id, event_accuracy, event_round], 1)\n", 225 | " session_extra_df.index.name = 'game_session'\n", 226 | " session_extra_df.reset_index(inplace=True)\n", 227 | " \n", 228 | " session_df = df.drop_duplicates('game_session', keep='last').reset_index(drop=True)\n", 229 | " session_df['row_id'] = session_df.index\n", 230 | " session_df = session_df.merge(session_extra_df, how='left', on='game_session')\n", 231 | " return session_df\n", 232 | "agged_train_df = get_agged_session(train_df)\n", 233 | "agged_test_df = get_agged_session(test_df)\n", 234 | "\n", 235 | "agged_train_df = agged_train_df.drop(['correct', 'round', 'num_correct', 'num_incorrect'], axis=1)\n", 236 | "agged_test_df = agged_test_df.drop(['correct', 'round', 'num_correct', 'num_incorrect'], axis=1)\n", 237 | "\n", 238 | "agged_test_df = agged_test_df.append(pd.DataFrame(columns=agged_train_df.columns))" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": {}, 244 | "source": [ 245 | "# Additional training data generation" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 10, 251 | "metadata": {}, 252 | "outputs": [ 253 | { 254 | "data": { 255 | "application/vnd.jupyter.widget-view+json": { 256 | "model_id": "", 257 | "version_major": 2, 258 | "version_minor": 0 259 | }, 260 | "text/plain": [ 261 | "HBox(children=(FloatProgress(value=0.0, max=17000.0), HTML(value='')))" 262 | ] 263 | }, 264 | "metadata": {}, 265 | "output_type": "display_data" 266 | }, 267 | { 268 | "name": "stdout", 269 | "output_type": "stream", 270 | "text": [ 271 | "\r" 272 | ] 273 | }, 274 | { 275 | "data": { 276 | "application/vnd.jupyter.widget-view+json": { 277 | "model_id": "", 278 | "version_major": 2, 279 | "version_minor": 0 280 | }, 281 | "text/plain": [ 282 | "HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))" 283 | ] 284 | }, 285 | "metadata": {}, 286 | "output_type": "display_data" 287 | }, 288 | { 289 | "name": "stdout", 290 | "output_type": "stream", 291 | "text": [ 292 | "\r", 293 | "CPU times: user 44 s, sys: 549 ms, total: 44.6 s\n", 294 | "Wall time: 44.4 s\n" 295 | ] 
296 | } 297 | ], 298 | "source": [ 299 | "%%time\n", 300 | "def gen_game_label(df):\n", 301 | " num_corrects = []\n", 302 | " for inst_id, one_df in tqdm_notebook(df.groupby('installation_id'), leave=False):\n", 303 | " one_df = one_df[(one_df['type']=='Game')&(one_df['event_code'].isin([4020, 4025]) )]\n", 304 | " for game_session, title_df in one_df.groupby('game_session'): \n", 305 | " num_correct = title_df['event_data'].str.contains('\"correct\":true').sum()\n", 306 | " num_incorrect = title_df['event_data'].str.contains('\"correct\":false').sum() \n", 307 | " num_corrects.append([inst_id, game_session, num_correct, num_incorrect])\n", 308 | " label_df = pd.DataFrame(num_corrects, columns=['installation_id', 'game_session', 'num_correct', 'num_incorrect'])\n", 309 | " label_df['accuracy'] = label_df['num_correct'] / (label_df['num_correct']+label_df['num_incorrect'])\n", 310 | " label_df['accuracy_group'] = 3\n", 311 | " label_df['accuracy_group'][label_df['accuracy']==0.5] = 2\n", 312 | " label_df['accuracy_group'][label_df['accuracy']<0.5] = 1\n", 313 | " label_df['accuracy_group'][label_df['accuracy']==0] = 0\n", 314 | " return label_df\n", 315 | "train_game_label_df = gen_game_label(train_df)\n", 316 | "test_game_label_df = gen_game_label(test_df)" 317 | ] 318 | }, 319 | { 320 | "cell_type": "markdown", 321 | "metadata": {}, 322 | "source": [ 323 | "# Generate&Merge label" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": 11, 329 | "metadata": { 330 | "scrolled": true 331 | }, 332 | "outputs": [ 333 | { 334 | "data": { 335 | "application/vnd.jupyter.widget-view+json": { 336 | "model_id": "", 337 | "version_major": 2, 338 | "version_minor": 0 339 | }, 340 | "text/plain": [ 341 | "HBox(children=(FloatProgress(value=0.0, max=17000.0), HTML(value='')))" 342 | ] 343 | }, 344 | "metadata": {}, 345 | "output_type": "display_data" 346 | }, 347 | { 348 | "name": "stdout", 349 | "output_type": "stream", 350 | "text": [ 351 | "\r" 352 | ] 353 | }, 354 | { 355 | "data": { 356 | "application/vnd.jupyter.widget-view+json": { 357 | "model_id": "", 358 | "version_major": 2, 359 | "version_minor": 0 360 | }, 361 | "text/plain": [ 362 | "HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))" 363 | ] 364 | }, 365 | "metadata": {}, 366 | "output_type": "display_data" 367 | }, 368 | { 369 | "name": "stdout", 370 | "output_type": "stream", 371 | "text": [ 372 | "\r", 373 | "CPU times: user 30.5 s, sys: 572 ms, total: 31.1 s\n", 374 | "Wall time: 31 s\n" 375 | ] 376 | } 377 | ], 378 | "source": [ 379 | "%%time\n", 380 | "def gen_label(df):\n", 381 | " num_corrects = []\n", 382 | " for inst_id, one_df in tqdm_notebook(df.groupby('installation_id'), leave=False):\n", 383 | " one_df = one_df[(one_df['type']=='Assessment')&(one_df['event_code']==4100)]\n", 384 | " for game_session, title_df in one_df.groupby('game_session'): \n", 385 | " num_correct = title_df['event_data'].str.contains('\"correct\":true').sum()\n", 386 | " num_incorrect = title_df['event_data'].str.contains('\"correct\":false').sum() \n", 387 | " num_corrects.append([inst_id, game_session, num_correct, num_incorrect])\n", 388 | " label_df = pd.DataFrame(num_corrects, columns=['installation_id', 'game_session', 'num_correct', 'num_incorrect'])\n", 389 | " label_df['accuracy'] = label_df['num_correct'] / (label_df['num_correct']+label_df['num_incorrect'])\n", 390 | " label_df['accuracy_group'] = 3\n", 391 | " label_df['accuracy_group'][label_df['accuracy']==0.5] = 2 \n", 392 | " 
label_df['accuracy_group'][label_df['accuracy']<0.5] = 1\n", 393 | " label_df['accuracy_group'][label_df['accuracy']==0] = 0 \n", 394 | " return label_df\n", 395 | "train_label_df = gen_label(train_df)\n", 396 | "test_label_df = gen_label(test_df)" 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": 12, 402 | "metadata": {}, 403 | "outputs": [ 404 | { 405 | "name": "stdout", 406 | "output_type": "stream", 407 | "text": [ 408 | "(303319, 456)\n", 409 | "(28445, 456)\n", 410 | "CPU times: user 5.62 s, sys: 516 ms, total: 6.14 s\n", 411 | "Wall time: 1.79 s\n" 412 | ] 413 | } 414 | ], 415 | "source": [ 416 | "%%time\n", 417 | "agged_train_df = agged_train_df.merge(train_label_df, on=['game_session', 'installation_id'], how='left')\n", 418 | "agged_train_df = agged_train_df.merge(train_game_label_df, on=['game_session', 'installation_id'], how='left', suffixes=('', '_game'))\n", 419 | "agged_test_df = agged_test_df.merge(test_label_df, on=['game_session', 'installation_id'], how='left')\n", 420 | "agged_test_df = agged_test_df.merge(test_game_label_df, on=['game_session', 'installation_id'], how='left', suffixes=('', '_game'))\n", 421 | "agged_test_df = agged_test_df[agged_train_df.columns]\n", 422 | "print(agged_train_df.shape)\n", 423 | "print(agged_test_df.shape)" 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": 13, 429 | "metadata": {}, 430 | "outputs": [ 431 | { 432 | "data": { 433 | "text/plain": [ 434 | "(17690, 456)" 435 | ] 436 | }, 437 | "execution_count": 13, 438 | "metadata": {}, 439 | "output_type": "execute_result" 440 | } 441 | ], 442 | "source": [ 443 | "agged_train_df[(agged_train_df['accuracy_group'] >= 0)&(agged_train_df['type']=='Assessment')].shape" 444 | ] 445 | }, 446 | { 447 | "cell_type": "markdown", 448 | "metadata": {}, 449 | "source": [ 450 | "### Generate sample_indices" 451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": 14, 456 | "metadata": { 457 | "scrolled": true 458 | }, 459 | "outputs": [ 460 | { 461 | "data": { 462 | "application/vnd.jupyter.widget-view+json": { 463 | "model_id": "3a8c0157add041fabfd28cd7281ea685", 464 | "version_major": 2, 465 | "version_minor": 0 466 | }, 467 | "text/plain": [ 468 | "HBox(children=(FloatProgress(value=0.0, max=17000.0), HTML(value='')))" 469 | ] 470 | }, 471 | "metadata": {}, 472 | "output_type": "display_data" 473 | }, 474 | { 475 | "name": "stdout", 476 | "output_type": "stream", 477 | "text": [ 478 | "\n" 479 | ] 480 | }, 481 | { 482 | "data": { 483 | "application/vnd.jupyter.widget-view+json": { 484 | "model_id": "fe8266958d444be6b56443ad98aee75b", 485 | "version_major": 2, 486 | "version_minor": 0 487 | }, 488 | "text/plain": [ 489 | "HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))" 490 | ] 491 | }, 492 | "metadata": {}, 493 | "output_type": "display_data" 494 | }, 495 | { 496 | "name": "stdout", 497 | "output_type": "stream", 498 | "text": [ 499 | "\n", 500 | "17690 2018\n", 501 | "CPU times: user 41.5 s, sys: 512 ms, total: 42 s\n", 502 | "Wall time: 22.5 s\n" 503 | ] 504 | } 505 | ], 506 | "source": [ 507 | "%%time\n", 508 | "def get_train_sample_indices(df):\n", 509 | " sample_indices = []\n", 510 | " inst_indiecs = [] \n", 511 | " df_groups = df.groupby('installation_id').groups\n", 512 | " for inst_idx, indices in enumerate(tqdm_notebook(df_groups.values())):\n", 513 | " one_df = df.iloc[indices].reset_index(drop=True)\n", 514 | " assessment_start_indices = one_df[(one_df['type']=='Assessment')&\n", 515 | " 
(one_df['accuracy_group']>=0)\n", 516 | " ].index\n", 517 | " for num, start_index in enumerate(assessment_start_indices):\n", 518 | " sample_indices.append( one_df.iloc[:start_index+1]['row_id'].tolist() )\n", 519 | " inst_indiecs.append(inst_idx) \n", 520 | " return sample_indices, inst_indiecs\n", 521 | "\n", 522 | "train_samples, train_groups = get_train_sample_indices(agged_train_df)\n", 523 | "test_samples, test_groups = get_train_sample_indices(agged_test_df)\n", 524 | "print(len(train_samples), len(test_samples))" 525 | ] 526 | }, 527 | { 528 | "cell_type": "code", 529 | "execution_count": 15, 530 | "metadata": {}, 531 | "outputs": [ 532 | { 533 | "data": { 534 | "application/vnd.jupyter.widget-view+json": { 535 | "model_id": "f868669f4b9f4b57a267832b7711b0fc", 536 | "version_major": 2, 537 | "version_minor": 0 538 | }, 539 | "text/plain": [ 540 | "HBox(children=(FloatProgress(value=0.0, max=17000.0), HTML(value='')))" 541 | ] 542 | }, 543 | "metadata": {}, 544 | "output_type": "display_data" 545 | }, 546 | { 547 | "name": "stdout", 548 | "output_type": "stream", 549 | "text": [ 550 | "\n" 551 | ] 552 | }, 553 | { 554 | "data": { 555 | "application/vnd.jupyter.widget-view+json": { 556 | "model_id": "9ee19aa4b97b455a84dd90f9c6129861", 557 | "version_major": 2, 558 | "version_minor": 0 559 | }, 560 | "text/plain": [ 561 | "HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))" 562 | ] 563 | }, 564 | "metadata": {}, 565 | "output_type": "display_data" 566 | }, 567 | { 568 | "name": "stdout", 569 | "output_type": "stream", 570 | "text": [ 571 | "\n", 572 | "41194 4225\n", 573 | "CPU times: user 44.1 s, sys: 686 ms, total: 44.8 s\n", 574 | "Wall time: 25.3 s\n" 575 | ] 576 | } 577 | ], 578 | "source": [ 579 | "%%time\n", 580 | "def get_train_game_sample_indices(df):\n", 581 | " sample_indices = []\n", 582 | " inst_indiecs = [] \n", 583 | " df_groups = df.groupby('installation_id').groups\n", 584 | " for inst_idx, indices in enumerate(tqdm_notebook(df_groups.values())):\n", 585 | " one_df = df.iloc[indices].reset_index(drop=True)\n", 586 | " assessment_start_indices = one_df[(one_df['type']=='Game')&\n", 587 | " (one_df['accuracy_group_game']>=0)\n", 588 | " ].index\n", 589 | " for num, start_index in enumerate(assessment_start_indices):\n", 590 | " sample_indices.append( one_df.iloc[:start_index+1]['row_id'].tolist() )\n", 591 | " inst_indiecs.append(inst_idx) \n", 592 | " return sample_indices, inst_indiecs\n", 593 | "\n", 594 | "train_game_samples, train_game_groups = get_train_game_sample_indices(agged_train_df)\n", 595 | "test_game_samples, test_game_groups = get_train_game_sample_indices(agged_test_df)\n", 596 | "print(len(train_game_samples), len(test_game_samples))" 597 | ] 598 | }, 599 | { 600 | "cell_type": "code", 601 | "execution_count": 16, 602 | "metadata": { 603 | "scrolled": true 604 | }, 605 | "outputs": [], 606 | "source": [ 607 | "agged_train_df = agged_train_df.fillna(0)\n", 608 | "agged_test_df = agged_test_df.fillna(0)" 609 | ] 610 | }, 611 | { 612 | "cell_type": "markdown", 613 | "metadata": {}, 614 | "source": [ 615 | "# Convert categorical data to corresponding index" 616 | ] 617 | }, 618 | { 619 | "cell_type": "code", 620 | "execution_count": 17, 621 | "metadata": { 622 | "scrolled": true 623 | }, 624 | "outputs": [ 625 | { 626 | "data": { 627 | "application/vnd.jupyter.widget-view+json": { 628 | "model_id": "de294c897ebc4c58b5287986e67920d8", 629 | "version_major": 2, 630 | "version_minor": 0 631 | }, 632 | "text/plain": [ 633 | 
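As a side note on the two `get_*_sample_indices` helpers above: within each `installation_id`, every labelled Assessment (or Game) session becomes one training sample whose input is the full sequence of session rows up to and including that session. A self-contained toy sketch of the same idea (toy values, same column names):

```
import numpy as np
import pandas as pd

df = pd.DataFrame({
    'installation_id': ['a'] * 5,
    'row_id':          [0, 1, 2, 3, 4],
    'type':            ['Clip', 'Game', 'Assessment', 'Game', 'Assessment'],
    'accuracy_group':  [np.nan, np.nan, 3, np.nan, 1],
})

samples = []
for _, one_df in df.groupby('installation_id'):
    one_df = one_df.reset_index(drop=True)
    starts = one_df[(one_df['type'] == 'Assessment') & (one_df['accuracy_group'] >= 0)].index
    for start in starts:
        samples.append(one_df.iloc[:start + 1]['row_id'].tolist())

print(samples)  # [[0, 1, 2], [0, 1, 2, 3, 4]] ; each labelled assessment keeps its full history
```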
"HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))" 634 | ] 635 | }, 636 | "metadata": {}, 637 | "output_type": "display_data" 638 | }, 639 | { 640 | "name": "stdout", 641 | "output_type": "stream", 642 | "text": [ 643 | "\n", 644 | "CPU times: user 8.64 s, sys: 2.29 s, total: 10.9 s\n", 645 | "Wall time: 916 ms\n" 646 | ] 647 | } 648 | ], 649 | "source": [ 650 | "%%time\n", 651 | "all_df = pd.concat([agged_train_df, agged_test_df])\n", 652 | "cate_cols = ['title', 'type', 'world']\n", 653 | "cont_cols = ['event_count', 'game_time', 'max_game_time']\n", 654 | "extra_cont_cls = list(agged_train_df.columns[15:-4]) # except 2000\n", 655 | "mappers_dict = {}\n", 656 | "\n", 657 | "cate_offset = 1\n", 658 | "for col in tqdm_notebook(cate_cols): \n", 659 | " cate2idx = {}\n", 660 | " for v in all_df[col].unique():\n", 661 | " if (v != v) | (v == None): continue \n", 662 | " cate2idx[v] = len(cate2idx)+cate_offset\n", 663 | " mappers_dict[col] = cate2idx \n", 664 | " agged_train_df[col] = agged_train_df[col].map(cate2idx).fillna(0).astype(int)\n", 665 | " agged_test_df[col] = agged_test_df[col].map(cate2idx).fillna(0).astype(int)\n", 666 | " cate_offset += len(cate2idx)\n", 667 | "del all_df" 668 | ] 669 | }, 670 | { 671 | "cell_type": "code", 672 | "execution_count": 20, 673 | "metadata": {}, 674 | "outputs": [ 675 | { 676 | "name": "stdout", 677 | "output_type": "stream", 678 | "text": [ 679 | "CPU times: user 2.5 s, sys: 1.85 s, total: 4.35 s\n", 680 | "Wall time: 4.37 s\n" 681 | ] 682 | } 683 | ], 684 | "source": [ 685 | "%%time\n", 686 | "os.makedirs(settings['TRAIN_DATA_DIR'], exist_ok=True)\n", 687 | "torch.save([agged_train_df, agged_test_df, mappers_dict, cate_offset, cate_cols, cont_cols, extra_cont_cls, \n", 688 | " train_samples, train_groups, test_samples, train_game_samples, test_game_samples],\n", 689 | " os.path.join(settings['TRAIN_DATA_DIR'], 'bowl.pt'))" 690 | ] 691 | }, 692 | { 693 | "cell_type": "code", 694 | "execution_count": 21, 695 | "metadata": {}, 696 | "outputs": [], 697 | "source": [ 698 | "torch.save([agged_train_df.columns, mappers_dict, cate_offset, cate_cols, cont_cols, extra_cont_cls],\n", 699 | " os.path.join(settings['TRAIN_DATA_DIR'], 'bowl_info.pt'))" 700 | ] 701 | } 702 | ], 703 | "metadata": { 704 | "kernelspec": { 705 | "display_name": "Python 3", 706 | "language": "python", 707 | "name": "python3" 708 | }, 709 | "language_info": { 710 | "codemirror_mode": { 711 | "name": "ipython", 712 | "version": 3 713 | }, 714 | "file_extension": ".py", 715 | "mimetype": "text/x-python", 716 | "name": "python", 717 | "nbconvert_exporter": "python", 718 | "pygments_lexer": "ipython3", 719 | "version": "3.7.5" 720 | } 721 | }, 722 | "nbformat": 4, 723 | "nbformat_minor": 1 724 | } 725 | -------------------------------------------------------------------------------- /code/prepare_data.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import pandas as pd 4 | import os, gc, sys, warnings, random, math, psutil, pickle 5 | 6 | from tqdm import tqdm as tqdm_notebook 7 | import json 8 | 9 | warnings.filterwarnings('ignore') 10 | 11 | settings = json.load(open('SETTINGS.json')) 12 | 13 | # DATA LOAD 14 | print('loading data ...') 15 | train_df = pd.read_csv(os.path.join(settings['RAW_DATA_DIR'], 'train.csv')) 16 | test_df = pd.read_csv(os.path.join(settings['RAW_DATA_DIR'], 'test.csv')) 17 | train_label_df = pd.read_csv(os.path.join(settings['RAW_DATA_DIR'], 'train_labels.csv')) 18 | 
specs_df = pd.read_csv(os.path.join(settings['RAW_DATA_DIR'], 'specs.csv')) 19 | print('loading ... done') 20 | 21 | def replace_4110_4100(df): 22 | rep_code4110_bool = (df['title']=='Bird Measurer (Assessment)')&(df['event_code']==4110) 23 | rep_code4100_bool = (df['title']=='Bird Measurer (Assessment)')&(df['event_code']==4100) 24 | df['event_code'][rep_code4110_bool] = 4100 25 | df['event_code'][rep_code4100_bool] = 5110 26 | replace_4110_4100(train_df) 27 | replace_4110_4100(test_df) 28 | 29 | # Create additional columns from event_code 30 | def extract_data_from_event_code(df, columns=['correct', 'round']): 31 | for col in columns: 32 | col_bool = df['event_data'].str.contains(col) 33 | df[col] = np.nan 34 | df[col][col_bool] = df['event_data'][col_bool].apply(lambda x: json.loads(x).get(col)).astype(float) 35 | 36 | print('extract_data_from_event_code ...') 37 | extract_data_from_event_code(train_df) 38 | extract_data_from_event_code(test_df) 39 | 40 | train_df['num_incorrect'] = np.where(train_df['correct']==0, 1, np.nan) 41 | train_df['num_correct'] = np.where(train_df['correct']==1, 1, np.nan) 42 | test_df['num_incorrect'] = np.where(test_df['correct']==0, 1, np.nan) 43 | test_df['num_correct'] = np.where(test_df['correct']==1, 1, np.nan) 44 | 45 | # Convert game_time to seconds 46 | train_df['game_time'] = train_df['game_time'] // 1000 47 | test_df['game_time'] = test_df['game_time'] // 1000 48 | 49 | # Aggregation by game_session 50 | def get_agged_session(df): 51 | event_code = pd.crosstab(df['game_session'], df['event_code']) 52 | event_id = pd.crosstab(df['game_session'], df['event_id']) 53 | 54 | event_num_correct = pd.pivot_table(df[(~df['correct'].isna())], index='game_session', columns='event_code', values='num_correct', aggfunc='sum') 55 | event_num_incorrect = pd.pivot_table(df[(~df['correct'].isna())], index='game_session', columns='event_code', values='num_incorrect', aggfunc='sum') 56 | event_accuracy = event_num_correct/(event_num_correct+event_num_incorrect[event_num_correct.columns]) 57 | event_accuracy = event_accuracy.add_prefix('accuray_') 58 | 59 | event_round = pd.pivot_table(df[~df['correct'].isna()], index='game_session', columns='event_code', values='round', aggfunc='max') 60 | event_round = event_round.add_prefix('round_') 61 | 62 | df['elapsed_time'] = df[['game_session', 'game_time']].groupby('game_session')['game_time'].diff() 63 | game_time = df.groupby('game_session', as_index=False)['elapsed_time'].agg(['mean', 'max']).reset_index() 64 | game_time.columns = ['game_session', 'mean_game_time', 'max_game_time'] 65 | df = df.merge(game_time, on='game_session', how='left') 66 | del df['elapsed_time'] 67 | 68 | session_extra_df = pd.concat([event_code, event_id, event_accuracy, event_round], 1) 69 | session_extra_df.index.name = 'game_session' 70 | session_extra_df.reset_index(inplace=True) 71 | 72 | session_df = df.drop_duplicates('game_session', keep='last').reset_index(drop=True) 73 | session_df['row_id'] = session_df.index 74 | session_df = session_df.merge(session_extra_df, how='left', on='game_session') 75 | return session_df 76 | 77 | print('get_agged_session ...') 78 | agged_train_df = get_agged_session(train_df) 79 | agged_test_df = get_agged_session(test_df) 80 | 81 | agged_train_df = agged_train_df.drop(['correct', 'round', 'num_correct', 'num_incorrect'], axis=1) 82 | agged_test_df = agged_test_df.drop(['correct', 'round', 'num_correct', 'num_incorrect'], axis=1) 83 | 84 | agged_test_df = 
agged_test_df.append(pd.DataFrame(columns=agged_train_df.columns)) 85 | 86 | #Additional training data generation 87 | def gen_game_label(df): 88 | num_corrects = [] 89 | for inst_id, one_df in tqdm_notebook(df.groupby('installation_id'), leave=False): 90 | one_df = one_df[(one_df['type']=='Game')&(one_df['event_code'].isin([4020, 4025]) )] 91 | for game_session, title_df in one_df.groupby('game_session'): 92 | num_correct = title_df['event_data'].str.contains('"correct":true').sum() 93 | num_incorrect = title_df['event_data'].str.contains('"correct":false').sum() 94 | num_corrects.append([inst_id, game_session, num_correct, num_incorrect]) 95 | label_df = pd.DataFrame(num_corrects, columns=['installation_id', 'game_session', 'num_correct', 'num_incorrect']) 96 | label_df['accuracy'] = label_df['num_correct'] / (label_df['num_correct']+label_df['num_incorrect']) 97 | label_df['accuracy_group'] = 3 98 | label_df['accuracy_group'][label_df['accuracy']==0.5] = 2 99 | label_df['accuracy_group'][label_df['accuracy']<0.5] = 1 100 | label_df['accuracy_group'][label_df['accuracy']==0] = 0 101 | return label_df 102 | print('gen_game_label ...') 103 | train_game_label_df = gen_game_label(train_df) 104 | test_game_label_df = gen_game_label(test_df) 105 | 106 | # Generate&Merge label 107 | def gen_label(df): 108 | num_corrects = [] 109 | for inst_id, one_df in tqdm_notebook(df.groupby('installation_id'), leave=False): 110 | one_df = one_df[(one_df['type']=='Assessment')&(one_df['event_code']==4100)] 111 | for game_session, title_df in one_df.groupby('game_session'): 112 | num_correct = title_df['event_data'].str.contains('"correct":true').sum() 113 | num_incorrect = title_df['event_data'].str.contains('"correct":false').sum() 114 | num_corrects.append([inst_id, game_session, num_correct, num_incorrect]) 115 | label_df = pd.DataFrame(num_corrects, columns=['installation_id', 'game_session', 'num_correct', 'num_incorrect']) 116 | label_df['accuracy'] = label_df['num_correct'] / (label_df['num_correct']+label_df['num_incorrect']) 117 | label_df['accuracy_group'] = 3 118 | label_df['accuracy_group'][label_df['accuracy']==0.5] = 2 119 | label_df['accuracy_group'][label_df['accuracy']<0.5] = 1 120 | label_df['accuracy_group'][label_df['accuracy']==0] = 0 121 | return label_df 122 | print('gen_label ...') 123 | train_label_df = gen_label(train_df) 124 | test_label_df = gen_label(test_df) 125 | 126 | agged_train_df = agged_train_df.merge(train_label_df, on=['game_session', 'installation_id'], how='left') 127 | agged_train_df = agged_train_df.merge(train_game_label_df, on=['game_session', 'installation_id'], how='left', suffixes=('', '_game')) 128 | agged_test_df = agged_test_df.merge(test_label_df, on=['game_session', 'installation_id'], how='left') 129 | agged_test_df = agged_test_df.merge(test_game_label_df, on=['game_session', 'installation_id'], how='left', suffixes=('', '_game')) 130 | agged_test_df = agged_test_df[agged_train_df.columns] 131 | print(agged_train_df.shape) 132 | print(agged_test_df.shape) 133 | 134 | agged_train_df[(agged_train_df['accuracy_group'] >= 0)&(agged_train_df['type']=='Assessment')].shape 135 | 136 | # Generate sample_indices 137 | def get_train_sample_indices(df): 138 | sample_indices = [] 139 | inst_indiecs = [] 140 | df_groups = df.groupby('installation_id').groups 141 | for inst_idx, indices in enumerate(tqdm_notebook(df_groups.values())): 142 | one_df = df.iloc[indices].reset_index(drop=True) 143 | assessment_start_indices = one_df[(one_df['type']=='Assessment')& 144 | 
(one_df['accuracy_group']>=0) 145 | ].index 146 | for num, start_index in enumerate(assessment_start_indices): 147 | sample_indices.append( one_df.iloc[:start_index+1]['row_id'].tolist() ) 148 | inst_indiecs.append(inst_idx) 149 | return sample_indices, inst_indiecs 150 | 151 | train_samples, train_groups = get_train_sample_indices(agged_train_df) 152 | test_samples, test_groups = get_train_sample_indices(agged_test_df) 153 | print(len(train_samples), len(test_samples)) 154 | 155 | def get_train_game_sample_indices(df): 156 | sample_indices = [] 157 | inst_indiecs = [] 158 | df_groups = df.groupby('installation_id').groups 159 | for inst_idx, indices in enumerate(tqdm_notebook(df_groups.values())): 160 | one_df = df.iloc[indices].reset_index(drop=True) 161 | assessment_start_indices = one_df[(one_df['type']=='Game')& 162 | (one_df['accuracy_group_game']>=0) 163 | ].index 164 | for num, start_index in enumerate(assessment_start_indices): 165 | sample_indices.append( one_df.iloc[:start_index+1]['row_id'].tolist() ) 166 | inst_indiecs.append(inst_idx) 167 | return sample_indices, inst_indiecs 168 | 169 | print('get_train_game_sample_indices ...') 170 | train_game_samples, train_game_groups = get_train_game_sample_indices(agged_train_df) 171 | test_game_samples, test_game_groups = get_train_game_sample_indices(agged_test_df) 172 | print(len(train_game_samples), len(test_game_samples)) 173 | 174 | agged_train_df = agged_train_df.fillna(0) 175 | agged_test_df = agged_test_df.fillna(0) 176 | 177 | # Convert categorical data to corresponding index 178 | all_df = pd.concat([agged_train_df, agged_test_df]) 179 | cate_cols = ['title', 'type', 'world'] 180 | cont_cols = ['event_count', 'game_time', 'max_game_time'] 181 | extra_cont_cls = list(agged_train_df.columns[15:-4]) # except 2000 182 | mappers_dict = {} 183 | 184 | cate_offset = 1 185 | for col in tqdm_notebook(cate_cols): 186 | cate2idx = {} 187 | for v in all_df[col].unique(): 188 | if (v != v) | (v == None): continue 189 | cate2idx[v] = len(cate2idx)+cate_offset 190 | mappers_dict[col] = cate2idx 191 | agged_train_df[col] = agged_train_df[col].map(cate2idx).fillna(0).astype(int) 192 | agged_test_df[col] = agged_test_df[col].map(cate2idx).fillna(0).astype(int) 193 | cate_offset += len(cate2idx) 194 | del all_df 195 | 196 | os.makedirs(settings['CLEAN_DATA_DIR'], exist_ok=True) 197 | torch.save([agged_train_df, agged_test_df, mappers_dict, cate_offset, cate_cols, cont_cols, extra_cont_cls, 198 | train_samples, train_groups, test_samples, train_game_samples, test_game_samples], 199 | os.path.join(settings['CLEAN_DATA_DIR'], 'bowl.pt')) 200 | 201 | torch.save([agged_train_df.columns, mappers_dict, cate_offset, cate_cols, cont_cols, extra_cont_cls], 202 | os.path.join(settings['CLEAN_DATA_DIR'], 'bowl_info.pt')) 203 | -------------------------------------------------------------------------------- /code/submission.csv: -------------------------------------------------------------------------------- 1 | installation_id,accuracy_group 2 | 00abaee7,2 3 | 01242218,3 4 | 017c5718,3 5 | 01a44906,3 6 | 01bc6cb6,3 7 | 02256298,3 8 | 0267757a,2 9 | 027e7ce5,2 10 | 02a29f99,0 11 | 0300c576,1 12 | 03885368,3 13 | 03ac279b,2 14 | 03e33699,3 15 | 048e7427,2 16 | 04a7bc3f,1 17 | 04d31500,1 18 | 0500e23b,2 19 | 0512bf0e,1 20 | 0525589b,3 21 | 05488e26,2 22 | 05771bba,1 23 | 05b82cf5,1 24 | 05e17e19,3 25 | 0617500d,3 26 | 068ae11f,1 27 | 0754f13b,0 28 | 07749e99,3 29 | 08611cc8,1 30 | 08671ec7,2 31 | 0889b0ae,3 32 | 090fe325,0 33 | 0937340d,2 34 | 09aaaf83,0 35 
| 09aefe80,1 36 | 0a126293,3 37 | 0a2a77b2,2 38 | 0a4c0f78,0 39 | 0af94ba5,2 40 | 0b24b6ac,2 41 | 0b607c82,0 42 | 0d5735f2,2 43 | 0d735146,2 44 | 0d7752d3,0 45 | 0dd670e9,1 46 | 0de6863d,3 47 | 0e514571,3 48 | 0e718764,3 49 | 0ea27b66,1 50 | 0f584054,3 51 | 0f7116a6,3 52 | 101999d8,0 53 | 101d16f5,1 54 | 108044a0,3 55 | 109ad724,3 56 | 10acf963,3 57 | 1121f331,1 58 | 1181ce7c,0 59 | 11fa34d0,3 60 | 125a3d09,1 61 | 12771ee9,0 62 | 1294d68e,2 63 | 12bcbbce,2 64 | 13629687,1 65 | 138a2ecc,3 66 | 13a0754c,1 67 | 13bcaf23,3 68 | 13cf3fc0,3 69 | 13d608cb,3 70 | 140087ce,1 71 | 140ea7a3,0 72 | 1423dc8f,2 73 | 14cdc97f,3 74 | 153f087c,1 75 | 1594c19e,0 76 | 15d86999,3 77 | 15ec4544,3 78 | 15f9b137,2 79 | 16160dde,3 80 | 1619f838,2 81 | 16298f20,1 82 | 163ffbd7,2 83 | 167fe32c,1 84 | 16992352,3 85 | 16f8159f,3 86 | 17d1ea55,2 87 | 182fe06c,3 88 | 18569185,3 89 | 18a43ba3,0 90 | 18dd112c,3 91 | 1962067f,3 92 | 19b97a3f,3 93 | 1a153269,2 94 | 1a1977ce,2 95 | 1a20a79d,1 96 | 1b38b81a,0 97 | 1b3987c8,2 98 | 1b5d84df,3 99 | 1b6ecb54,1 100 | 1baa3c99,0 101 | 1bc2e77b,3 102 | 1c5a2a78,3 103 | 1d2eae73,0 104 | 1d9c883e,3 105 | 1dd31d7c,2 106 | 1df0bdd4,0 107 | 1e5124c5,3 108 | 1e554379,2 109 | 1e725305,2 110 | 1e8622da,3 111 | 1ed372f8,3 112 | 1ed78db3,1 113 | 1ef4452b,3 114 | 1f04f739,2 115 | 1f398b6b,2 116 | 1f3ae424,2 117 | 1f8bf570,2 118 | 1f8c8bbc,2 119 | 20244184,3 120 | 205aea02,3 121 | 20ade427,1 122 | 20d2e0ab,3 123 | 210c8bc0,3 124 | 225505b3,2 125 | 225e3787,3 126 | 229464d8,3 127 | 229918c1,0 128 | 23f2ef8f,2 129 | 240b4d74,2 130 | 24858760,3 131 | 253ac98f,3 132 | 25504d06,3 133 | 2569a283,1 134 | 25914d08,1 135 | 25faffc4,2 136 | 269ac751,3 137 | 26ae75a7,3 138 | 26b5e264,3 139 | 26e59e27,0 140 | 2721167a,1 141 | 274f2012,3 142 | 2777912c,2 143 | 27841c37,3 144 | 27e272e5,1 145 | 280f398c,0 146 | 285b65c8,3 147 | 28badbc7,3 148 | 293c26ac,0 149 | 29b40ab9,3 150 | 29d19d88,2 151 | 29feed5c,1 152 | 2a2a27e1,2 153 | 2a47e474,1 154 | 2a6b9553,2 155 | 2ab22ff2,0 156 | 2ae3169a,2 157 | 2ae57465,0 158 | 2af16b0d,3 159 | 2b05a2ce,0 160 | 2b14f163,2 161 | 2b1dfd68,3 162 | 2b2d47da,3 163 | 2b52c1de,0 164 | 2c6bf518,3 165 | 2c73ac9c,3 166 | 2c8f4ade,1 167 | 2cc6999b,2 168 | 2d303e19,3 169 | 2d4207d2,1 170 | 2da73a3a,3 171 | 2db60743,0 172 | 2db7ae36,2 173 | 2dbf3f3d,1 174 | 2e1eb9be,3 175 | 2e5371cc,3 176 | 2e6923d6,3 177 | 2e8174d0,3 178 | 2f14f89e,1 179 | 2f5ef5e1,3 180 | 2f9de8b5,0 181 | 2fd070b1,0 182 | 30c40fff,3 183 | 30d3ebf6,0 184 | 30fdddc8,0 185 | 3168a8b4,0 186 | 318e4ea8,2 187 | 31dd49ae,1 188 | 326575f9,3 189 | 3267eb2c,2 190 | 33018ca2,3 191 | 33206fdc,2 192 | 33333852,3 193 | 3382a49e,2 194 | 33ba4783,1 195 | 3412391a,1 196 | 3455fc19,1 197 | 34a8a031,3 198 | 350caf20,2 199 | 358017fd,1 200 | 36025ee6,1 201 | 364f0cb7,3 202 | 3668cc54,3 203 | 367f4a5c,3 204 | 369cd1cf,0 205 | 36c299c2,0 206 | 3713a3ae,3 207 | 374460a9,0 208 | 37a73d25,3 209 | 37b2ecaa,0 210 | 37c1b132,2 211 | 37ca2bf0,2 212 | 37db1bb3,3 213 | 37de20cb,1 214 | 382ba0a6,0 215 | 3843afa3,3 216 | 38f6cbf4,3 217 | 390d2e18,3 218 | 390f3909,3 219 | 394d4011,2 220 | 3a3900a7,0 221 | 3a5b7b96,0 222 | 3b47bb68,2 223 | 3bfe4ea5,3 224 | 3c0e0aa5,3 225 | 3c131ab0,0 226 | 3c16cc84,1 227 | 3c7f1ba5,2 228 | 3ca86805,3 229 | 3cb30295,0 230 | 3d214090,3 231 | 3d74fc27,1 232 | 3dddb6b4,0 233 | 3e7ccccd,3 234 | 3f3e32f7,3 235 | 40ee2242,3 236 | 4102c6d6,2 237 | 4108ffbd,3 238 | 4124318e,1 239 | 418fc138,1 240 | 41f8ed07,3 241 | 4203e3c8,1 242 | 428f4daa,3 243 | 42cb86bc,1 244 | 4360a0a2,3 245 | 43bd1ec6,2 246 | 43c57f97,3 247 | 43c76004,3 
248 | 43e459b7,3 249 | 448282ef,3 250 | 44b8eb05,0 251 | 44d7fd71,1 252 | 45757e3a,3 253 | 4577d16b,2 254 | 457d8c6f,3 255 | 460f9e89,1 256 | 46486950,3 257 | 46da14b7,2 258 | 47846823,1 259 | 4794785e,2 260 | 47c5cd9d,3 261 | 480bfe4f,3 262 | 48166507,1 263 | 482bcef5,3 264 | 48452e0c,3 265 | 485608be,1 266 | 4886dc84,2 267 | 48b488e9,2 268 | 48fcf9a7,2 269 | 4914dd68,2 270 | 49795b37,0 271 | 49968645,1 272 | 49cb38c4,0 273 | 4a2f4b3d,1 274 | 4a47b299,3 275 | 4afece2b,0 276 | 4b15ffde,1 277 | 4b83d461,2 278 | 4bd03fc5,0 279 | 4be715ec,1 280 | 4c3b1ab4,3 281 | 4c90f5cf,2 282 | 4cd1add0,3 283 | 4d3008d8,3 284 | 4da8008a,1 285 | 4dd9451a,2 286 | 4dea7fc7,1 287 | 4e4bd932,3 288 | 4e527a82,2 289 | 4ec94f2f,0 290 | 4ee75403,2 291 | 4f0b76d2,3 292 | 4f9a7001,3 293 | 4fc92163,2 294 | 4fdeb30a,1 295 | 4fe21d91,2 296 | 4ff2702e,2 297 | 501d9bb3,3 298 | 502f7923,2 299 | 505f3ed3,3 300 | 50797f53,2 301 | 5090c28a,3 302 | 50921014,0 303 | 50c15bd1,3 304 | 50c78eb8,1 305 | 50cac90b,1 306 | 50da9551,3 307 | 50fb714a,1 308 | 5157a015,3 309 | 518b4490,3 310 | 51936be2,2 311 | 51bc6b81,2 312 | 5241b7aa,3 313 | 52797079,2 314 | 52ac2e43,3 315 | 52bd3cc7,1 316 | 532002a8,1 317 | 555d58e9,3 318 | 557f5465,3 319 | 55854e2f,2 320 | 55a610f2,0 321 | 55bf8834,3 322 | 56034655,3 323 | 56a739ec,3 324 | 57447735,3 325 | 57669c7a,3 326 | 578d8afa,2 327 | 57b3f0b8,3 328 | 5822f351,3 329 | 583a519a,2 330 | 587b89b2,2 331 | 59384e23,2 332 | 596ecac3,1 333 | 59e14ec0,0 334 | 59e24114,0 335 | 5a4a2704,3 336 | 5a6f3de8,2 337 | 5aac2dfe,2 338 | 5ac4ddcf,3 339 | 5b2b9654,1 340 | 5b5cf755,1 341 | 5b827e23,0 342 | 5b952598,3 343 | 5bc238a1,1 344 | 5c762b02,2 345 | 5c7dd83d,1 346 | 5cdb3a18,0 347 | 5d21020f,3 348 | 5d3d2ce3,2 349 | 5d46e893,1 350 | 5d47aecb,1 351 | 5e75d818,3 352 | 5f52cd36,3 353 | 5f70abd1,0 354 | 5f8d5cd7,3 355 | 5f9c37c8,3 356 | 60152ce4,3 357 | 601a8b2d,2 358 | 606cf51e,3 359 | 6074418e,3 360 | 60d239c2,0 361 | 60d98685,3 362 | 626f7020,1 363 | 62796d2c,3 364 | 62ae8014,2 365 | 62b910a2,3 366 | 633b2936,2 367 | 63460bf5,3 368 | 63977a8d,3 369 | 64032a5c,3 370 | 6453e840,3 371 | 654eee8f,3 372 | 6586f9f0,2 373 | 65b92dd5,1 374 | 65bbd37f,1 375 | 65fd1600,3 376 | 66422987,2 377 | 66854b19,2 378 | 66e692ff,3 379 | 6701a905,2 380 | 6730aca1,3 381 | 67842689,3 382 | 67e9e0a3,3 383 | 6802fa8b,3 384 | 680e965a,2 385 | 686db428,0 386 | 68f9a1a3,2 387 | 68fcbd36,0 388 | 69164a28,2 389 | 695e3677,1 390 | 69861c96,3 391 | 69aa9d69,3 392 | 69e79a9d,2 393 | 6a30ec55,2 394 | 6a3880d1,1 395 | 6acb248d,3 396 | 6b159812,3 397 | 6bb35199,1 398 | 6c14c6ae,3 399 | 6c611192,2 400 | 6c8222a7,0 401 | 6cd8f2d8,3 402 | 6cdc30a1,3 403 | 6d0059c6,3 404 | 6d15f7d6,1 405 | 6d2b5fde,2 406 | 6d647776,2 407 | 6d819195,1 408 | 6eadd18e,1 409 | 6ed69306,3 410 | 6ed73a07,3 411 | 6f1c0c5b,1 412 | 6f5dc340,2 413 | 6f6269a6,2 414 | 6fc0fe6a,3 415 | 6fec35e8,2 416 | 6fec8226,3 417 | 6ff3c298,2 418 | 7014058f,0 419 | 701de923,3 420 | 703442eb,0 421 | 70381620,0 422 | 70f9963f,2 423 | 7103c70e,1 424 | 71a2c9f1,3 425 | 71da16a1,2 426 | 71e82b16,0 427 | 723cb47c,3 428 | 7262504d,3 429 | 727c6239,3 430 | 730ff2d5,2 431 | 735a0533,2 432 | 73a78f04,3 433 | 7405e887,0 434 | 7416c0e7,3 435 | 74202006,0 436 | 746670f0,3 437 | 75857694,2 438 | 763b2ac3,2 439 | 76562474,1 440 | 77238d3d,0 441 | 773d63e3,2 442 | 776e321e,2 443 | 779b71a3,0 444 | 77b2b854,3 445 | 77d5414c,3 446 | 781b4d97,0 447 | 783b8f77,0 448 | 784e8941,0 449 | 7851dce6,3 450 | 7973812a,3 451 | 7973d319,0 452 | 79a46c4b,1 453 | 79fa657d,3 454 | 7a1eba1a,3 455 | 7a31ed2b,3 456 | 7a33564c,3 
457 | 7ace042c,1 458 | 7b4612fe,3 459 | 7b4f19bd,2 460 | 7b728c89,1 461 | 7b9cc36a,2 462 | 7bcaf152,2 463 | 7bf58421,0 464 | 7c505151,3 465 | 7caa7f00,3 466 | 7d0601ca,3 467 | 7d261cf4,2 468 | 7d5815b6,1 469 | 7dd1c274,3 470 | 7dd852d8,1 471 | 7e3e2605,3 472 | 7eab19c4,3 473 | 7eff5909,3 474 | 7f20aa0d,3 475 | 7f521b1a,3 476 | 7fdd456b,1 477 | 7fe9ca96,3 478 | 7ff648d8,1 479 | 8017da91,3 480 | 8023deae,2 481 | 803be493,1 482 | 803ffba5,3 483 | 80b51e23,2 484 | 80c464d7,3 485 | 80d2682f,3 486 | 80d28878,1 487 | 80e0766c,1 488 | 80f0f3d2,2 489 | 81247ab3,3 490 | 8127c0f7,0 491 | 817fb400,2 492 | 819d08a4,0 493 | 83cfc0b2,3 494 | 8446fe1a,1 495 | 844a6e20,3 496 | 849ffc5e,2 497 | 84ad5637,3 498 | 84ccdedf,3 499 | 84fa3abb,3 500 | 854dfe3a,0 501 | 8551489a,2 502 | 85567705,3 503 | 85a36690,3 504 | 85d2f821,2 505 | 869ec6d3,3 506 | 86a08ba0,2 507 | 8712e11d,0 508 | 876d9d93,3 509 | 876e4c1c,3 510 | 879f6a58,2 511 | 87b899d9,2 512 | 87c15b6f,0 513 | 87f58bfa,2 514 | 88514bde,2 515 | 8854354f,3 516 | 886cebc2,1 517 | 88b74185,1 518 | 88f5c349,3 519 | 891c55ba,3 520 | 893772d5,3 521 | 89905528,3 522 | 89a1d680,3 523 | 89bda256,3 524 | 8a3aca1f,3 525 | 8ace5f29,2 526 | 8ae49f4c,0 527 | 8aed5ab4,1 528 | 8b008e24,0 529 | 8b29ddb4,3 530 | 8b6dfb4c,1 531 | 8b7b3eaa,0 532 | 8be4aedf,3 533 | 8c16d72a,1 534 | 8c539e8e,0 535 | 8c7d8d9b,3 536 | 8c9314db,3 537 | 8c9859b9,1 538 | 8cd5bc7c,1 539 | 8d03d0a5,3 540 | 8d517034,1 541 | 8d8f5d9f,0 542 | 8dc53df3,3 543 | 8ddb7ac1,1 544 | 8e333109,3 545 | 8eab953e,3 546 | 8eeba692,3 547 | 8f38fe78,3 548 | 8f4e22b2,1 549 | 8f71efea,2 550 | 8fdb5402,3 551 | 8fe03c35,1 552 | 9032b145,2 553 | 9046e327,2 554 | 904abba7,3 555 | 9069db29,0 556 | 90859667,2 557 | 909407b8,2 558 | 90c8034d,0 559 | 9131d5cf,3 560 | 915c809a,3 561 | 91796a02,2 562 | 91d4699e,3 563 | 92468e29,1 564 | 92d63f0f,3 565 | 92e5a83a,1 566 | 9323c154,3 567 | 933308f9,2 568 | 93938a28,3 569 | 940d51f8,3 570 | 94a7461d,2 571 | 94d43b7a,3 572 | 953ddba9,0 573 | 957219b8,2 574 | 959c3da3,2 575 | 95b591c1,3 576 | 95dfe687,3 577 | 95e63c0a,3 578 | 96182fa7,3 579 | 962183dc,2 580 | 9638675f,3 581 | 965c5adb,3 582 | 96a7f636,0 583 | 96f57ec6,3 584 | 9701ffb5,0 585 | 972c2d7f,2 586 | 97501794,1 587 | 97f14e50,0 588 | 982cab25,3 589 | 987a7222,2 590 | 9885ddd8,1 591 | 98c958c6,3 592 | 996a3149,2 593 | 99e2b46c,3 594 | 9a13c8a2,3 595 | 9a5f6d12,1 596 | 9a7f22db,3 597 | 9aa5dcea,2 598 | 9aba2ff5,3 599 | 9ae73003,1 600 | 9afdf962,1 601 | 9b001268,3 602 | 9b9eb930,3 603 | 9bb426a7,0 604 | 9bf6cb31,1 605 | 9c20c73b,1 606 | 9c217eb9,2 607 | 9c4cf176,2 608 | 9c791ccc,1 609 | 9ce70bef,3 610 | 9cee9fc9,2 611 | 9cfde2cc,1 612 | 9d43d142,2 613 | 9d7e8158,3 614 | 9dc9534c,3 615 | 9e0880ca,1 616 | 9e266d34,2 617 | 9e7e6cd8,3 618 | 9e9f5e38,3 619 | 9ea5577f,2 620 | 9ec31362,1 621 | 9f2a8f08,0 622 | 9f4b32f8,3 623 | 9f688c66,0 624 | 9f9119f4,3 625 | 9fbe5106,1 626 | a002f1ac,2 627 | a07f6fc5,3 628 | a0808b82,1 629 | a0cc50c3,1 630 | a11fe7b5,0 631 | a1491477,1 632 | a20e6921,3 633 | a21c8e70,3 634 | a22c1a5c,3 635 | a23567d2,0 636 | a2b6a4b8,3 637 | a344c900,1 638 | a3a93b63,1 639 | a41a15f2,3 640 | a447f081,3 641 | a4856de5,3 642 | a49120c7,0 643 | a53d1ba0,1 644 | a5ac9b55,3 645 | a5ba72f9,2 646 | a5ca7ea0,3 647 | a6742227,1 648 | a6f65253,2 649 | a702d1c5,0 650 | a71b99a6,3 651 | a723a382,3 652 | a788da19,1 653 | a83e60be,1 654 | a8668fb6,3 655 | a87db7ff,1 656 | a9190917,3 657 | a93d6f19,3 658 | a9566517,3 659 | aa3dfd63,0 660 | aa58cb91,2 661 | aa84d895,2 662 | aab22722,2 663 | aac5d998,0 664 | aadc1247,1 665 | aaf695d0,3 
666 | ab83ea80,0 667 | abc7eeda,2 668 | ac44ef65,2 669 | ac564de9,1 670 | ad0f90e1,2 671 | ad3c8e7b,3 672 | add06668,3 673 | ae10e514,3 674 | aee093c4,3 675 | af07fb5c,2 676 | af08cb8d,1 677 | af147c52,2 678 | af1c1bee,3 679 | af4572f6,0 680 | af47aac5,3 681 | af70bd34,1 682 | af82ea2d,0 683 | af83bf7c,0 684 | af908793,1 685 | afb1f807,2 686 | aff5e5ee,2 687 | b024bf05,3 688 | b04d8f0c,0 689 | b0efc6f4,1 690 | b1bb8dd0,3 691 | b1ce5ee9,1 692 | b20c5326,0 693 | b2235d5f,0 694 | b23db0ea,3 695 | b265d311,1 696 | b2e2ed7d,3 697 | b2e61027,2 698 | b2f94d1f,0 699 | b3451ce1,2 700 | b3523d81,3 701 | b3de6c53,3 702 | b43665c9,0 703 | b4738558,3 704 | b479a4e5,2 705 | b47d249b,3 706 | b4a52ce8,2 707 | b4ea7a14,3 708 | b50add36,3 709 | b51ba618,2 710 | b55b1bec,3 711 | b563ef9d,3 712 | b5bb257a,3 713 | b5be8f08,3 714 | b5e21f0b,1 715 | b68a3662,3 716 | b6a4854f,1 717 | b6b80f42,3 718 | b6c3e1ab,3 719 | b7adc30a,2 720 | b7ce52b8,3 721 | b7ffd685,3 722 | b84e696a,3 723 | b8ae746c,3 724 | b8b6e81e,2 725 | b905f26e,1 726 | b98668f4,3 727 | ba59e168,3 728 | ba67bd3d,3 729 | ba709e6a,3 730 | ba91cc7f,0 731 | bb6a19af,3 732 | bbd98f65,1 733 | bc1b756f,2 734 | bc696a1e,1 735 | bc81d58f,0 736 | bca32990,0 737 | bcc553ab,3 738 | bccf5379,1 739 | bd2e184f,3 740 | bd337a90,3 741 | bd4ed818,2 742 | bd544e63,3 743 | be0381e4,2 744 | be1333d2,0 745 | bec5a6ab,1 746 | becd49ac,3 747 | bed8b41c,3 748 | bf287639,2 749 | bf685281,3 750 | bf6e55c3,1 751 | bfddcc77,0 752 | bfe1e41f,3 753 | c1143024,2 754 | c1406fca,3 755 | c1617227,3 756 | c195e65e,1 757 | c1d8cab1,2 758 | c233f4f4,3 759 | c285f89d,3 760 | c2a5380a,1 761 | c2cfee57,2 762 | c31c4183,3 763 | c3cabf93,3 764 | c4271e38,1 765 | c42aeee6,3 766 | c44551c7,3 767 | c4957cd6,1 768 | c4d15da1,1 769 | c5609e08,2 770 | c571fbe4,2 771 | c59ae92a,2 772 | c64ad87e,0 773 | c664f245,3 774 | c683651f,2 775 | c6dcee3d,1 776 | c6dd3ee6,3 777 | c796f42b,1 778 | c798859f,1 779 | c83a956e,2 780 | c85bfc99,1 781 | c9057808,3 782 | c92e8b4c,0 783 | c9422f9c,0 784 | c96f2dd1,3 785 | c98f2490,3 786 | c9ce5a9e,2 787 | ca5f2610,0 788 | ca61bf3a,2 789 | cab2ca4a,3 790 | cbf3a6f3,3 791 | cc0963c8,3 792 | cc7d27b2,0 793 | cc8dd5f8,2 794 | ccad11ca,3 795 | ccb21848,1 796 | cd00d04e,3 797 | cd0e657d,2 798 | cd2f3fbe,3 799 | cd389d57,3 800 | cda0fd9b,0 801 | cdda168a,3 802 | ce08e98b,1 803 | cec7185c,2 804 | cf22f7b2,1 805 | cf434979,3 806 | cfa3af82,0 807 | cfd27471,2 808 | d02d21ec,2 809 | d096c7f6,0 810 | d09b7e2e,2 811 | d09cff3b,2 812 | d09ebf52,0 813 | d0ca8163,0 814 | d192a327,3 815 | d1ac59d7,2 816 | d1b7c089,3 817 | d1e3bd8c,2 818 | d1e82789,3 819 | d277cc27,1 820 | d3167e9d,2 821 | d32ee860,3 822 | d33c3eeb,2 823 | d373ded5,3 824 | d3d56480,2 825 | d3e70da6,2 826 | d40a6175,1 827 | d4201b5c,3 828 | d473029e,1 829 | d4b8e447,2 830 | d4cc2b9f,1 831 | d4d67a36,3 832 | d4f34b24,0 833 | d51199b6,2 834 | d5a527da,3 835 | d5c40330,2 836 | d5d620e2,2 837 | d5d66a77,3 838 | d6285c55,1 839 | d630ee07,2 840 | d6dc72c6,3 841 | d70f0551,2 842 | d719a0ac,3 843 | d747b73c,1 844 | d76c904c,2 845 | d771d065,0 846 | d7a365c9,1 847 | d85537e2,2 848 | d95dc7cb,3 849 | d9649ea4,3 850 | d9a34273,0 851 | db4220b8,3 852 | db4b6cbb,3 853 | db9ae6e9,2 854 | dbb1a1d3,3 855 | dc1082c4,3 856 | dc11ffb1,1 857 | dc786a89,3 858 | dcb34a93,2 859 | dd3caf14,3 860 | dd6867df,2 861 | dd9a2277,1 862 | ddb6f6f9,3 863 | ddc7bffa,1 864 | de032a3b,3 865 | de2d545b,3 866 | de3b703c,2 867 | deb1622e,1 868 | deb75085,3 869 | dee2e2d6,2 870 | df03c95d,3 871 | dfac1a41,3 872 | e0d45902,3 873 | e0d8c625,2 874 | e0e41475,3 
875 | e0f35499,0 876 | e1022480,0 877 | e1171430,1 878 | e12d2470,0 879 | e158441d,2 880 | e1a319f5,2 881 | e1a3b275,0 882 | e1c0ba22,0 883 | e1c50806,0 884 | e1c81090,2 885 | e2246114,3 886 | e25c4a14,3 887 | e27072f1,3 888 | e2843455,2 889 | e292f086,3 890 | e2e5af76,3 891 | e31a2e32,3 892 | e342853d,1 893 | e3953813,2 894 | e44548a4,0 895 | e47bebac,1 896 | e5662c84,1 897 | e566fc58,3 898 | e5766f90,3 899 | e658fb6a,3 900 | e6735cb5,1 901 | e6862711,1 902 | e6ac1b51,2 903 | e6ea5608,2 904 | e6eefaf1,3 905 | e7c0b097,3 906 | e80eb67d,1 907 | e814d20e,1 908 | e8ec2a8a,3 909 | e8ee5595,1 910 | e945f044,3 911 | e9cadf3b,3 912 | ea075c48,0 913 | ea0ad162,0 914 | ea10c101,3 915 | ea18fbd2,3 916 | ea245eca,2 917 | ea71286a,0 918 | ea7f3ceb,3 919 | eb296337,3 920 | eb4ec7dd,3 921 | eb7b5d02,3 922 | eb98a24a,2 923 | eb9dec2a,3 924 | ebbbc1aa,2 925 | ebc1278c,0 926 | ebd77787,0 927 | ec1f7c54,3 928 | ec290def,3 929 | ec307b05,3 930 | ec8e3f91,3 931 | ecae5976,0 932 | ed17207f,2 933 | edd5f3be,3 934 | ede81700,2 935 | ee222c36,2 936 | eea1b45c,3 937 | ef5de3c6,1 938 | efacf214,1 939 | efe2449e,0 940 | f031811a,1 941 | f0564da1,3 942 | f099ba2a,0 943 | f162b7a4,2 944 | f1757815,1 945 | f187f5f3,2 946 | f1c6d8ab,3 947 | f249834f,0 948 | f2a1b17d,3 949 | f2ce44bc,2 950 | f3049748,2 951 | f399b8a6,3 952 | f3a12be8,3 953 | f3a5f201,3 954 | f3ac859a,3 955 | f3c4b893,0 956 | f3f98ebe,0 957 | f452eef7,3 958 | f456a3fd,1 959 | f47ef997,0 960 | f4a37ec8,3 961 | f4ecc4cc,1 962 | f538a295,0 963 | f57d5a4a,2 964 | f5f6689c,3 965 | f5fa5578,1 966 | f61591cb,3 967 | f632a30c,1 968 | f6494040,3 969 | f694a537,2 970 | f6954829,1 971 | f69fc509,3 972 | f6fb106b,0 973 | f7c1b0f3,0 974 | f7ec4dd3,0 975 | f842b8b3,1 976 | f86a6ed4,0 977 | f8d9593e,2 978 | f8dacbde,0 979 | f99be0ba,3 980 | f9dd0fe3,3 981 | fa21f0e4,1 982 | fa537c22,3 983 | fa845dbf,2 984 | faa8c019,3 985 | faee065f,3 986 | fb3e85f5,0 987 | fbe1fea6,3 988 | fc0367c0,1 989 | fc5612b9,3 990 | fca866bc,1 991 | fcff43b4,3 992 | fdfed7eb,2 993 | fe50e0ea,2 994 | fe5f7da8,1 995 | fe8984b5,1 996 | feaa21ac,1 997 | fee254cf,3 998 | ff57e602,0 999 | ffc73fb2,3 1000 | ffe00ca8,1 1001 | ffe774cc,1 1002 | -------------------------------------------------------------------------------- /code/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import math 3 | import copy 4 | import torch 5 | import json 6 | import time 7 | import random 8 | import logging 9 | import argparse 10 | import collections 11 | import numpy as np 12 | import pandas as pd 13 | import bowl_db 14 | import bowl_model 15 | import bowl_utils 16 | import torch.nn.functional as F 17 | from torch.utils.data import DataLoader 18 | from transformers import AdamW, get_linear_schedule_with_warmup 19 | 20 | import warnings 21 | warnings.filterwarnings(action='ignore') 22 | 23 | settings = json.load(open('SETTINGS.json')) 24 | DB_PATH=settings['CLEAN_DATA_DIR'] 25 | FINETUNED_MODEL_PATH=settings['MODEL_DIR'] 26 | 27 | 28 | class CFG: 29 | learning_rate=1.0e-4 30 | batch_size=64 31 | num_workers=4 32 | print_freq=100 33 | test_freq=1 34 | start_epoch=0 35 | num_train_epochs=10 36 | warmup_steps=30 37 | max_grad_norm=1000 38 | gradient_accumulation_steps=1 39 | weight_decay=0.01 40 | dropout=0.2 41 | emb_size=100 42 | hidden_size=500 43 | nlayers=2 44 | nheads=10 45 | seq_len=100 46 | 47 | 48 | def main(): 49 | parser = argparse.ArgumentParser("") 50 | parser.add_argument("--model", type=str, default='') 51 | parser.add_argument("--data", type=str, 
default='bowl.pt') 52 | parser.add_argument("--resume", action='store_true') 53 | parser.add_argument("--eval", action='store_true') 54 | parser.add_argument("--use_test", action='store_true') 55 | parser.add_argument("--aug", type=float, default=0.0) 56 | parser.add_argument("--batch_size", type=int, default=CFG.batch_size) 57 | parser.add_argument("--grad_accums", type=int, default=CFG.gradient_accumulation_steps) 58 | parser.add_argument("--nepochs", type=int, default=CFG.num_train_epochs) 59 | parser.add_argument("--wsteps", type=int, default=CFG.warmup_steps) 60 | parser.add_argument("--seed", type=int, default=7) 61 | parser.add_argument("--data_seed", type=int, default=7) 62 | parser.add_argument("--seq_len", type=int, default=CFG.seq_len) 63 | parser.add_argument("--nlayers", type=int, default=CFG.nlayers) 64 | parser.add_argument("--nheads", type=int, default=CFG.nheads) 65 | parser.add_argument("--hidden_size", type=int, default=CFG.hidden_size) 66 | parser.add_argument("--k", type=int, default=0) 67 | parser.add_argument("--lr", type=float, default=CFG.learning_rate) 68 | parser.add_argument("--dropout", type=float, default=CFG.dropout) 69 | parser.add_argument("--encoder", type=str, default='TRANSFORMER') 70 | args = parser.parse_args() 71 | print(args) 72 | 73 | CFG.batch_size=args.batch_size 74 | CFG.gradient_accumulation_steps = args.grad_accums 75 | CFG.batch_size = CFG.batch_size // CFG.gradient_accumulation_steps 76 | CFG.num_train_epochs=args.nepochs 77 | CFG.warmup_steps=args.wsteps 78 | CFG.learning_rate=args.lr 79 | CFG.dropout=args.dropout 80 | CFG.seed = args.seed 81 | CFG.data_seed = args.data_seed 82 | CFG.seq_len = args.seq_len 83 | CFG.nlayers = args.nlayers 84 | CFG.nheads = args.nheads 85 | CFG.hidden_size = args.hidden_size 86 | CFG.res_dir=f'res_dir_{args.k}' 87 | CFG.target_size = 3 88 | CFG.encoder = args.encoder 89 | CFG.aug = args.aug 90 | print(CFG.__dict__) 91 | 92 | os.environ['PYTHONHASHSEED'] = str(CFG.seed) 93 | random.seed(CFG.seed) 94 | np.random.seed(CFG.seed) 95 | torch.manual_seed(CFG.seed) 96 | torch.cuda.manual_seed(CFG.seed) 97 | torch.backends.cudnn.deterministic = True 98 | 99 | data_path = os.path.join(DB_PATH, args.data) 100 | (train_df, test_df, mappers_dict, cate_offset, cate_cols, 101 | cont_cols, extra_cont_cls, train_samples, train_groups, test_samples, 102 | train_game_samples, test_game_samples) = ( 103 | torch.load(data_path)) 104 | print(data_path) 105 | print(cate_cols, cont_cols) 106 | 107 | CFG.total_cate_size = cate_offset 108 | CFG.cate_cols = cate_cols 109 | CFG.cont_cols = cont_cols + extra_cont_cls 110 | 111 | model = bowl_model.encoders[CFG.encoder](CFG) 112 | if args.model != "": 113 | print("=> loading checkpoint '{}'".format(args.model)) 114 | checkpoint = torch.load(args.model) 115 | 116 | state_dict = collections.OrderedDict([(k, v) for k, v in checkpoint['state_dict'].items() if 'reg.' 
not in k]) 117 | CFG.start_epoch = checkpoint['epoch'] 118 | model.load_state_dict(state_dict, strict=False) 119 | print("=> loaded checkpoint '{}' (epoch {})" 120 | .format(args.model, checkpoint['epoch'])) 121 | 122 | model.cuda() 123 | model._dropout = CFG.dropout 124 | print('model.dropout:', model._dropout) 125 | 126 | def count_parameters(model): 127 | return sum(p.numel() for p in model.parameters() if p.requires_grad) 128 | print('parameters: ', count_parameters(model)) 129 | 130 | #n_gpu = torch.cuda.device_count() 131 | #if n_gpu > 1: 132 | # model = torch.nn.DataParallel(model) 133 | 134 | train_samples, valid_samples = bowl_utils.train_valid_split(train_samples, train_groups, args.k, 135 | random_state=CFG.data_seed, random_state2=CFG.data_seed, choice=True) 136 | print(train_samples.shape, valid_samples.shape) 137 | if args.use_test: 138 | extra_samples = np.array([np.array(indices) + len(train_df) for indices in test_samples]) 139 | train_df = train_df.append(test_df).reset_index(drop=True) 140 | train_df['row_id'] = train_df.index 141 | train_samples = np.concatenate([train_samples, extra_samples]) 142 | print(train_samples.shape, valid_samples.shape) 143 | 144 | last_indices = [indices[-1] for indices in valid_samples] 145 | valid_installation_ids = train_df.iloc[last_indices]['installation_id'].unique() 146 | 147 | # remove samples for validation 148 | print(len(train_game_samples)) 149 | last_game_indices = [indices[-1] for indices in train_game_samples] 150 | train_game_samples = [indices for indices, inst_id in zip(train_game_samples, train_df.iloc[last_game_indices]['installation_id']) if inst_id not in valid_installation_ids] 151 | print(len(train_game_samples)) 152 | 153 | 154 | ext_train_db = bowl_db.BowlDataset(CFG, train_df, train_game_samples+list(train_samples), aug=CFG.aug) 155 | train_db = bowl_db.BowlDataset(CFG, train_df, train_samples, aug=CFG.aug) 156 | valid_db = bowl_db.BowlDataset(CFG, train_df, valid_samples) 157 | 158 | num_train_optimization_steps = int( 159 | len(ext_train_db) / CFG.batch_size / CFG.gradient_accumulation_steps) * (3) 160 | num_train_optimization_steps += int( 161 | len(train_db) / CFG.batch_size / CFG.gradient_accumulation_steps) * (7) 162 | print('num_train_optimization_steps', num_train_optimization_steps) 163 | 164 | ext_train_loader = DataLoader( 165 | ext_train_db, batch_size=CFG.batch_size, shuffle=True, 166 | num_workers=CFG.num_workers, pin_memory=True) 167 | train_loader = DataLoader( 168 | train_db, batch_size=CFG.batch_size, shuffle=True, 169 | num_workers=CFG.num_workers, pin_memory=True) 170 | valid_loader = DataLoader( 171 | valid_db, batch_size=CFG.batch_size, shuffle=False, 172 | num_workers=CFG.num_workers, pin_memory=True) 173 | 174 | # Prepare optimizer 175 | param_optimizer = list(model.named_parameters()) 176 | no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] 177 | optimizer_grouped_parameters = [ 178 | {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, 179 | {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} 180 | ] 181 | 182 | optimizer = AdamW(optimizer_grouped_parameters, 183 | lr=CFG.learning_rate, 184 | weight_decay=CFG.weight_decay, 185 | ) 186 | 187 | scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=CFG.warmup_steps, 188 | num_training_steps=num_train_optimization_steps 189 | ) 190 | 191 | print('use WarmupLinearSchedule ...') 192 | 193 | def get_lr(): 194 | return 
scheduler.get_lr()[0] 195 | 196 | if args.model != "": 197 | if args.resume: 198 | optimizer.load_state_dict(checkpoint['optimizer']) 199 | scheduler.load_state_dict(checkpoint['scheduler']) 200 | log_df = checkpoint['log'] 201 | del checkpoint 202 | else: 203 | log_df = pd.DataFrame(columns=(['EPOCH']+['LR']+['TRAIN_LOSS', 'TRAIN_KAPPA']+ 204 | ['VALID_LOSS', 'VALID_KAPPA']) ) 205 | os.makedirs('log', exist_ok=True) 206 | 207 | curr_lr = get_lr() 208 | 209 | print(f'initial learning rate:{curr_lr}') 210 | 211 | submission_df = train_df.iloc[[indices[-1] for indices in valid_samples]] 212 | print(submission_df.shape) 213 | 214 | best_kappa = 0 215 | best_model = None 216 | best_epoch = 0 217 | 218 | model_list = [] 219 | 220 | for epoch in range(CFG.start_epoch, CFG.num_train_epochs): 221 | # train for one epoch 222 | 223 | if epoch < 3: 224 | train_loss, train_kappa = train(ext_train_loader, model, optimizer, epoch, scheduler) 225 | else: 226 | train_loss, train_kappa = train(train_loader, model, optimizer, epoch, scheduler) 227 | 228 | valid_loss, valid_kappa, _, _, _ = validate(valid_loader, model) 229 | 230 | curr_lr = get_lr() 231 | print(f'set the learning_rate: {curr_lr}') 232 | 233 | model_list.append(copy.deepcopy(model)) 234 | if epoch % CFG.test_freq == 0 and epoch >= 0: 235 | log_row = {'EPOCH':epoch, 'LR':curr_lr, 236 | 'TRAIN_LOSS':train_loss, 'TRAIN_KAPPA':train_kappa, 237 | 'VALID_LOSS':valid_loss, 'VALID_KAPPA':valid_kappa, 238 | } 239 | 240 | log_df = log_df.append(pd.DataFrame(log_row, index=[0]), sort=False) 241 | print(log_df.tail(20)) 242 | 243 | batch_size = CFG.batch_size*CFG.gradient_accumulation_steps 244 | 245 | if (best_kappa < valid_kappa): 246 | best_model = copy.deepcopy(model) 247 | best_kappa = valid_kappa 248 | best_epoch = epoch 249 | 250 | model_list = model_list[6:] 251 | last_model = best_model 252 | last_params = dict(last_model.named_parameters()) 253 | for i in range(len(model_list)-1): 254 | curr_params = dict(model_list[i].named_parameters()) 255 | for name, param in last_params.items(): 256 | param.data += curr_params[name].data 257 | for name, param in last_params.items(): 258 | param.data /= len(model_list) 259 | model = last_model 260 | 261 | valid_loss, valid_acc, coefficients, _, _ = validate(valid_loader, model) 262 | print(valid_loss, valid_acc) 263 | model_to_save = model.module if hasattr(model, 'module') else model # Only save the cust_model it-self 264 | 265 | input_filename = args.data.split('/')[-1] 266 | curr_model_name = (f'b-{batch_size}_a-{CFG.encoder}_e-{CFG.emb_size}_h-{CFG.hidden_size}_' 267 | f'd-{CFG.dropout}_l-{CFG.nlayers}_hd-{CFG.nheads}_' 268 | f's-{CFG.seed}_len-{CFG.seq_len}_aug-{CFG.aug}_da-{input_filename}_k-{args.k}.pt') 269 | save_checkpoint({ 270 | 'epoch': best_epoch + 1, 271 | 'arch': 'transformer', 272 | 'state_dict': model_to_save.state_dict(), 273 | 'log': log_df, 274 | 'coefficients': coefficients, 275 | }, 276 | FINETUNED_MODEL_PATH, curr_model_name, 277 | ) 278 | print('done') 279 | 280 | 281 | def compute_acc_gp(pred): 282 | pred = (3*pred[:, 0] - 2*pred[:, 1]) 283 | pred[pred < 0] = 0 284 | return pred 285 | 286 | 287 | def train(train_loader, model, optimizer, epoch, scheduler): 288 | batch_time = AverageMeter() 289 | data_time = AverageMeter() 290 | losses = AverageMeter() 291 | accuracies = AverageMeter() 292 | 293 | sent_count = AverageMeter() 294 | #meter = bowl_utils.Meter() 295 | 296 | # switch to train mode 297 | model.train() 298 | 299 | start = end = time.time() 300 | global_step = 0 301 | 302 
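Near the end of `main()` above, the parameters of the best checkpoint are averaged with the per-epoch snapshots kept from the later epochs (`model_list = model_list[6:]`) before the final validation and save. A stripped-down sketch of that kind of checkpoint weight averaging, written against hypothetical tiny models rather than the actual transformer:

```
import copy
import torch
import torch.nn as nn

def average_models(best_model, snapshots):
    """Return a copy of best_model whose parameters are the mean of itself and the snapshots."""
    avg = copy.deepcopy(best_model)
    avg_params = dict(avg.named_parameters())
    with torch.no_grad():
        for snap in snapshots:
            for name, param in snap.named_parameters():
                avg_params[name].data += param.data
        for param in avg_params.values():
            param.data /= (len(snapshots) + 1)
    return avg

# hypothetical usage: tiny linear models standing in for the epoch snapshots
snapshots = [nn.Linear(4, 2) for _ in range(3)]
best = nn.Linear(4, 2)
averaged = average_models(best, snapshots)
```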
| for step, (cate_x, cont_x, mask, y) in enumerate(train_loader): 303 | # measure data loading time 304 | data_time.update(time.time() - end) 305 | 306 | cate_x, cont_x, mask, y = cate_x.cuda(), cont_x.cuda(), mask.cuda(), y.cuda() 307 | batch_size = cate_x.size(0) 308 | 309 | # compute loss 310 | k = 0.5 311 | pred = model(cate_x, cont_x, mask) 312 | loss = F.mse_loss(pred.view(-1), y.view(-1)) 313 | 314 | # record loss 315 | losses.update(loss.item(), batch_size) 316 | 317 | if CFG.gradient_accumulation_steps > 1: 318 | loss = loss / CFG.gradient_accumulation_steps 319 | 320 | loss.backward() 321 | grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm) 322 | 323 | if (step + 1) % CFG.gradient_accumulation_steps == 0: 324 | scheduler.step() 325 | optimizer.step() 326 | optimizer.zero_grad() 327 | global_step += 1 328 | 329 | # measure elapsed time 330 | batch_time.update(time.time() - end) 331 | end = time.time() 332 | 333 | sent_count.update(batch_size) 334 | 335 | if step % CFG.print_freq == 0 or step == (len(train_loader)-1): 336 | # record accuracy 337 | pred_y = (1-k)*pred[:, 0] + (k)*compute_acc_gp(pred[:, 1:]) 338 | pred_y = (pred_y+0.5).int() 339 | pred_y[pred_y > 3] = 3 340 | 341 | kappa_score = bowl_utils.qwk3(pred_y.detach().cpu().numpy(), y[:, 0].cpu().numpy()) 342 | accuracies.update( kappa_score, batch_size) 343 | 344 | print('Epoch: [{0}][{1}/{2}] ' 345 | 'Data {data_time.val:.3f} ({data_time.avg:.3f}) ' 346 | 'Elapsed {remain:s} ' 347 | 'Loss: {loss.val:.4f}({loss.avg:.4f}) ' 348 | 'Acc: {acc.val:.4f}({acc.avg:.4f}) ' 349 | 'Grad: {grad_norm:.4f} ' 350 | 'LR: {lr:.6f} ' 351 | 'sent/s {sent_s:.0f} ' 352 | .format( 353 | epoch, step, len(train_loader), batch_time=batch_time, 354 | data_time=data_time, loss=losses, 355 | acc=accuracies, 356 | remain=timeSince(start, float(step+1)/len(train_loader)), 357 | grad_norm=grad_norm, 358 | lr=scheduler.get_lr()[0], 359 | #lr=scheduler.optimizer.param_groups[0]['lr'], 360 | sent_s=sent_count.avg/batch_time.avg 361 | )) 362 | return losses.avg, accuracies.avg 363 | 364 | 365 | def validate(valid_loader, model): 366 | batch_time = AverageMeter() 367 | data_time = AverageMeter() 368 | losses = AverageMeter() 369 | accuracies = AverageMeter() 370 | 371 | sent_count = AverageMeter() 372 | #meter = bowl_utils.Meter() 373 | 374 | # switch to evaluation mode 375 | model.eval() 376 | 377 | start = end = time.time() 378 | 379 | predictions = [] 380 | groundtruth = [] 381 | for step, (cate_x, cont_x, mask, y) in enumerate(valid_loader): 382 | # measure data loading time 383 | data_time.update(time.time() - end) 384 | 385 | cate_x, cont_x, mask, y = cate_x.cuda(), cont_x.cuda(), mask.cuda(), y.cuda() 386 | batch_size = cate_x.size(0) 387 | 388 | # compute loss 389 | k = 0.5 390 | with torch.no_grad(): 391 | pred = model(cate_x, cont_x, mask) 392 | loss = F.mse_loss(pred.view(-1), y.view(-1)) 393 | 394 | # record loss 395 | losses.update(loss.item(), batch_size) 396 | 397 | # record accuracy 398 | pred_y = (1-k)*pred[:, 0] + (k)*compute_acc_gp(pred[:, 1:]) 399 | predictions.append(pred_y.detach().cpu()) 400 | pred_y = (pred_y+0.5).int() 401 | pred_y[pred_y > 3] = 3 402 | y = y[:, 0] 403 | 404 | pred_y = pred_y.detach().cpu() 405 | y = y.cpu() 406 | 407 | groundtruth.append(y) 408 | 409 | kappa_score = bowl_utils.qwk3(pred_y.numpy(), y.numpy()) 410 | accuracies.update( kappa_score, batch_size) 411 | 412 | if CFG.gradient_accumulation_steps > 1: 413 | loss = loss / CFG.gradient_accumulation_steps 414 | 415 | # measure 
elapsed time 416 | batch_time.update(time.time() - end) 417 | end = time.time() 418 | 419 | sent_count.update(batch_size) 420 | 421 | if step % CFG.print_freq == 0 or step == (len(valid_loader)-1): 422 | print('TEST: {0}/{1}] ' 423 | 'Data {data_time.val:.3f} ({data_time.avg:.3f}) ' 424 | 'Elapsed {remain:s} ' 425 | 'Loss: {loss.val:.4f}({loss.avg:.4f}) ' 426 | 'Acc: {acc.val:.4f}({acc.avg:.4f}) ' 427 | 'sent/s {sent_s:.0f} ' 428 | .format( 429 | step, len(valid_loader), batch_time=batch_time, 430 | data_time=data_time, loss=losses, 431 | acc=accuracies, 432 | remain=timeSince(start, float(step+1)/len(valid_loader)), 433 | sent_s=sent_count.avg/batch_time.avg 434 | )) 435 | predictions = torch.cat(predictions).numpy() 436 | groundtruth = torch.cat(groundtruth).numpy() 437 | 438 | try: 439 | optR = bowl_utils.OptimizedRounder() 440 | optR.fit(predictions, groundtruth) 441 | coefficients = optR.coefficients() 442 | print(coefficients) 443 | predictions[predictions < coefficients[0]] = 0 444 | predictions[(coefficients[0]<=predictions)&(predictions< coefficients[1])] = 1 445 | predictions[(coefficients[1]<=predictions)&(predictions< coefficients[2])] = 2 446 | predictions[(coefficients[2]<=predictions)] = 3 447 | 448 | kappa_score = bowl_utils.qwk3(predictions, groundtruth) 449 | except: 450 | kappa_score = 0 451 | coefficients = [0.5, 1.5, 2.5] 452 | 453 | 454 | return losses.avg, kappa_score, coefficients, predictions, groundtruth 455 | 456 | 457 | def get_logger(): 458 | FORMAT = '[%(levelname)s]%(asctime)s:%(name)s:%(message)s' 459 | logging.basicConfig(format=FORMAT, level=logging.INFO) 460 | logger = logging.getLogger('main') 461 | logger.setLevel(logging.DEBUG) 462 | return logger 463 | 464 | logger = get_logger() 465 | 466 | 467 | def save_checkpoint(state, model_path, model_filename, is_best=False): 468 | print('saving cust_model ...') 469 | if not os.path.exists(model_path): 470 | os.makedirs(model_path) 471 | torch.save(state, os.path.join(model_path, model_filename)) 472 | if is_best: 473 | torch.save(state, os.path.join(model_path, 'best_' + model_filename)) 474 | 475 | 476 | class AverageMeter(object): 477 | """Computes and stores the average and current value""" 478 | def __init__(self): 479 | self.reset() 480 | 481 | def reset(self): 482 | self.val = 0 483 | self.avg = 0 484 | self.sum = 0 485 | self.count = 0 486 | 487 | def update(self, val, n=1): 488 | self.val = val 489 | self.sum += val * n 490 | self.count += n 491 | self.avg = self.sum / self.count 492 | 493 | 494 | def asMinutes(s): 495 | m = math.floor(s / 60) 496 | s -= m * 60 497 | return '%dm %ds' % (m, s) 498 | 499 | 500 | def timeSince(since, percent): 501 | now = time.time() 502 | s = now - since 503 | es = s / (percent) 504 | rs = es - s 505 | return '%s (remain %s)' % (asMinutes(s), asMinutes(rs)) 506 | 507 | 508 | def adjust_learning_rate(optimizer, epoch): 509 | #lr = CFG.learning_rate 510 | lr = (CFG.lr_decay)**(epoch//10) * CFG.learning_rate 511 | for param_group in optimizer.param_groups: 512 | param_group['lr'] = lr 513 | return lr 514 | 515 | 516 | 517 | 518 | if __name__ == '__main__': 519 | main() 520 | -------------------------------------------------------------------------------- /code/train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | python train.py --batch_size 32 --nepochs 10 --lr 1e-04 --dropout 0.2 --encoder TRANSFORMER --aug 0.5 --seed 7 --data_seed 7 --seq_len 100 --k 0 5 | python train.py --batch_size 32 --nepochs 
10 --lr 1e-04 --dropout 0.2 --encoder TRANSFORMER --aug 0.5 --seed 7 --data_seed 7 --seq_len 100 --k 1 6 | python train.py --batch_size 32 --nepochs 10 --lr 1e-04 --dropout 0.2 --encoder TRANSFORMER --aug 0.5 --seed 7 --data_seed 7 --seq_len 100 --k 2 7 | python train.py --batch_size 32 --nepochs 10 --lr 1e-04 --dropout 0.2 --encoder TRANSFORMER --aug 0.5 --seed 7 --data_seed 7 --seq_len 100 --k 3 8 | python train.py --batch_size 32 --nepochs 10 --lr 1e-04 --dropout 0.2 --encoder TRANSFORMER --aug 0.5 --seed 7 --data_seed 7 --seq_len 100 --k 4 9 | -------------------------------------------------------------------------------- /code/validate.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | os.environ['OMP_NUM_THREADS'] = '16' 4 | os.environ['NUMEXPR_MAX_THREADS'] = '16' 5 | import os 6 | import math 7 | import copy 8 | import torch 9 | import bowl_db 10 | import bowl_model 11 | import bowl_utils 12 | import time 13 | import random 14 | import numpy as np 15 | import pandas as pd 16 | from tqdm import tqdm 17 | import torch.nn.functional as F 18 | from sklearn import metrics 19 | from torch.utils.data import DataLoader 20 | from sklearn.model_selection import train_test_split, StratifiedKFold 21 | from transformers import AdamW, get_linear_schedule_with_warmup 22 | 23 | import warnings 24 | warnings.filterwarnings(action='ignore') 25 | 26 | import argparse 27 | import logging 28 | import json 29 | import collections 30 | 31 | settings = json.load(open('SETTINGS.json')) 32 | 33 | DB_PATH=settings['CLEAN_DATA_DIR'] 34 | 35 | 36 | class CFG: 37 | learning_rate=1.0e-4 38 | batch_size=64 39 | num_workers=4 40 | print_freq=100 41 | test_freq=1 42 | start_epoch=0 43 | num_train_epochs=1 44 | warmup_steps=30 45 | max_grad_norm=1000 46 | gradient_accumulation_steps=1 47 | weight_decay=0.01 48 | dropout=0.2 49 | emb_size=100 50 | hidden_size=500 51 | nlayers=2 52 | nheads=8 53 | ntta = [0.0, 0.3, 0.5] 54 | wtta = [0.6, 0.2, 0.2] 55 | #ntta = [0.0] 56 | #wtta = [1.0] 57 | 58 | 59 | 60 | def main(): 61 | parser = argparse.ArgumentParser("") 62 | parser.add_argument("--batch_size", type=int, default=CFG.batch_size) 63 | parser.add_argument("--nepochs", type=int, default=CFG.num_train_epochs) 64 | parser.add_argument("--seed", type=int, default=7) 65 | parser.add_argument("--data_seed", type=int, default=7) 66 | parser.add_argument("--lr", type=float, default=CFG.learning_rate) 67 | parser.add_argument("--dropout", type=float, default=CFG.dropout) 68 | args = parser.parse_args() 69 | print(args) 70 | 71 | CFG.batch_size=args.batch_size 72 | CFG.seed = args.seed 73 | CFG.data_seed = args.data_seed 74 | CFG.target_size = 3 75 | print(CFG.__dict__) 76 | 77 | os.environ['PYTHONHASHSEED'] = str(CFG.seed) 78 | random.seed(CFG.seed) 79 | np.random.seed(CFG.seed) 80 | torch.manual_seed(CFG.seed) 81 | torch.cuda.manual_seed(CFG.seed) 82 | torch.backends.cudnn.deterministic = True 83 | 84 | 85 | base_model_path_list = [ 86 | ['bowl.pt', os.path.join(settings['MODEL_DIR'], 'b-32_a-TRANSFORMER_e-100_h-500_d-0.2_l-2_hd-10_s-7_len-100_aug-0.5_da-bowl.pt_k-0.pt')], 87 | ] 88 | 89 | 90 | rand_seed_list = [7, 77, 777, 1, 2] 91 | 92 | total_predictions = [] 93 | total_groundtruth = [] 94 | 95 | for k in range(5): 96 | mean_predictions = 0 97 | mean_groundtruth = 0 98 | prev_filename = '' 99 | for filename, base_model_path in base_model_path_list: 100 | if prev_filename != filename: 101 | (train_df, test_df, mappers_dict, cate_offset, cate_cols, 102 | 
cont_cols, extra_cont_cls, train_samples, train_groups, test_samples) = ( 103 | torch.load(os.path.join(DB_PATH, filename)))[:10] 104 | prev_filename = filename 105 | 106 | 107 | CFG.total_cate_size = cate_offset 108 | CFG.cate_cols = cate_cols 109 | CFG.cont_cols = cont_cols+extra_cont_cls 110 | 111 | path = base_model_path.split('/')[-1] 112 | path = path.replace('bowl_', '') 113 | cfg_dict = dict([tok.split('-') for tok in path.split('_')]) 114 | CFG.encoder = cfg_dict['a'] 115 | CFG.seq_len = int(cfg_dict['len']) 116 | CFG.emb_size = int(cfg_dict['e']) 117 | CFG.hidden_size = int(cfg_dict['h']) 118 | CFG.nlayers = int(cfg_dict['l']) 119 | CFG.nheads = int(cfg_dict['hd']) 120 | 121 | model_path = base_model_path.replace('k-0', f'k-{k}') 122 | model = bowl_model.encoders[CFG.encoder](CFG) 123 | checkpoint = torch.load(model_path) 124 | model.load_state_dict(checkpoint['state_dict']) 125 | print("=> loaded checkpoint '{}' (epoch {})".format(model_path, checkpoint['epoch'])) 126 | model.cuda() 127 | 128 | rand_predictions = [] 129 | rand_groundtruth = [] 130 | 131 | for rand_seed in rand_seed_list: 132 | _, valid_samples = bowl_utils.train_valid_split(train_samples, train_groups, k, 133 | random_state=CFG.data_seed, random_state2=rand_seed, choice=True) 134 | predictions = 0 135 | for w, tta in zip(CFG.wtta, CFG.ntta): 136 | valid_db = bowl_db.BowlDataset(CFG, train_df, valid_samples, aug=tta, aug_p=1.0) 137 | valid_loader = DataLoader( 138 | valid_db, batch_size=CFG.batch_size, shuffle=False, 139 | num_workers=CFG.num_workers, pin_memory=True) 140 | _, valid_kappa, _, prediction, groundtruth = validate(valid_loader, model) 141 | predictions += w*prediction 142 | 143 | rand_predictions.append(predictions) 144 | rand_groundtruth.append(groundtruth) 145 | 146 | valid_kappa = bowl_utils.get_optimized_kappa_score(predictions, groundtruth) 147 | print(f'k[{k}]-s2[{rand_seed}]: valid_kappa:{valid_kappa}') 148 | del model 149 | mean_predictions += np.concatenate(rand_predictions) 150 | mean_groundtruth += np.concatenate(rand_groundtruth) 151 | 152 | total_predictions.append(mean_predictions/len(base_model_path_list)) 153 | total_groundtruth.append(mean_groundtruth/len(base_model_path_list)) 154 | 155 | total_predictions = np.concatenate(total_predictions) 156 | total_groundtruth = np.concatenate(total_groundtruth) 157 | 158 | print(total_predictions.shape) 159 | 160 | optR = bowl_utils.OptimizedRounder() 161 | optR.fit(total_predictions, total_groundtruth) 162 | coefficients = optR.coefficients() 163 | #coefficients = [0.50755102, 1.64870448, 2.23524805] 164 | #coefficients = [0.49057894, 1.66282769, 2.26743377] 165 | #print('FIXED COEEFICIENT !!!!') 166 | 167 | total_predictions[total_predictions < coefficients[0]] = 0 168 | total_predictions[(coefficients[0]<=total_predictions)&(total_predictions< coefficients[1])] = 1 169 | total_predictions[(coefficients[1]<=total_predictions)&(total_predictions< coefficients[2])] = 2 170 | total_predictions[(coefficients[2]<=total_predictions)] = 3 171 | 172 | kappa_score = bowl_utils.qwk3(total_predictions, total_groundtruth) 173 | print('==============================') 174 | print(f'VALID KAPPA_SCORE:{kappa_score} - {coefficients}') 175 | print('==============================') 176 | 177 | 178 | if len(test_samples)>1000: 179 | predictions = 0 180 | accum_count = 0 181 | for filename, base_model_path in base_model_path_list: 182 | if prev_filename != filename: 183 | (train_df, test_df, mappers_dict, cate_offset, cate_cols, 184 | cont_cols, extra_cont_cls, 
train_samples, train_groups, test_samples) = ( 185 | torch.load(os.path.join(DB_PATH, filename)))[:10] 186 | prev_filename = filename 187 | 188 | CFG.total_cate_size = cate_offset 189 | CFG.cate_cols = cate_cols 190 | CFG.cont_cols = cont_cols+extra_cont_cls 191 | 192 | path = base_model_path.split('/')[-1] 193 | path = path.replace('bowl_', '') 194 | cfg_dict = dict([tok.split('-') for tok in path.split('_')]) 195 | CFG.encoder = cfg_dict['a'] 196 | CFG.seq_len = int(cfg_dict['len']) 197 | CFG.emb_size = int(cfg_dict['e']) 198 | CFG.hidden_size = int(cfg_dict['h']) 199 | CFG.nlayers = int(cfg_dict['l']) 200 | CFG.nheads = int(cfg_dict['hd']) 201 | 202 | for k in range(5): 203 | model = bowl_model.encoders[CFG.encoder](CFG) 204 | model_path = base_model_path.replace('k-0', f'k-{k}') 205 | checkpoint = torch.load(model_path) 206 | model.load_state_dict(checkpoint['state_dict']) 207 | print("=> loaded checkpoint '{}' (epoch {})".format(model_path, checkpoint['epoch'])) 208 | model.cuda() 209 | 210 | for w, tta in zip(CFG.wtta, CFG.ntta): 211 | valid_db = bowl_db.BowlDataset(CFG, test_df, test_samples, aug=tta, aug_p=1.0) 212 | valid_loader = DataLoader( 213 | valid_db, batch_size=CFG.batch_size, shuffle=False, 214 | num_workers=CFG.num_workers, pin_memory=True) 215 | _, valid_kappa, _, prediction, groundtruth = validate(valid_loader, model) 216 | predictions += w*prediction 217 | accum_count += 1 218 | del model 219 | kappa_score = bowl_utils.get_optimized_kappa_score(predictions/accum_count, groundtruth) 220 | print('==============================') 221 | print(f'TEST KAPPA_SCORE:{kappa_score}') 222 | 223 | #ass_bool = (test_df['event_code']!=2000)&(test_df['num_correct']==0)&(test_df['num_incorrect']==0)&(test_df['type']==mappers_dict['type']['Assessment']) 224 | #test_df = test_df[~ass_bool].reset_index(drop=True) 225 | 226 | submission_df = test_df.groupby('installation_id').tail(1)[['installation_id']] 227 | submission_df['accuracy_group'] = 0 228 | accum_count = 0 229 | 230 | for filename, base_model_path in base_model_path_list: 231 | if prev_filename != filename: 232 | (train_df, test_df, mappers_dict, cate_offset, cate_cols, 233 | cont_cols, extra_cont_cls, train_samples, train_groups, _) = ( 234 | torch.load(os.path.join(DB_PATH, filename)))[:10] 235 | prev_filename = filename 236 | 237 | test_samples = list(test_df.groupby(['installation_id']).groups.values()) 238 | 239 | CFG.total_cate_size = cate_offset 240 | CFG.cate_cols = cate_cols 241 | CFG.cont_cols = cont_cols+extra_cont_cls 242 | 243 | path = base_model_path.split('/')[-1] 244 | path = path.replace('bowl_', '') 245 | cfg_dict = dict([tok.split('-') for tok in path.split('_')]) 246 | CFG.encoder = cfg_dict['a'] 247 | CFG.seq_len = int(cfg_dict['len']) 248 | CFG.emb_size = int(cfg_dict['e']) 249 | CFG.hidden_size = int(cfg_dict['h']) 250 | CFG.nlayers = int(cfg_dict['l']) 251 | CFG.nheads = int(cfg_dict['hd']) 252 | 253 | for k in range(5): 254 | model = bowl_model.encoders[CFG.encoder](CFG) 255 | model_path = base_model_path.replace('k-0', f'k-{k}') 256 | 257 | checkpoint = torch.load(model_path) 258 | model.load_state_dict(checkpoint['state_dict']) 259 | print("=> loaded checkpoint '{}' (epoch {})".format(model_path, checkpoint['epoch'])) 260 | model.cuda() 261 | 262 | for w, tta in zip(CFG.wtta, CFG.ntta): 263 | valid_db = bowl_db.BowlDataset(CFG, test_df, test_samples, aug=tta, aug_p=1.0) 264 | valid_loader = DataLoader( 265 | valid_db, batch_size=CFG.batch_size, shuffle=False, 266 | num_workers=CFG.num_workers, 
pin_memory=True) 267 | predictions = test(valid_loader, model) 268 | submission_df['accuracy_group'] += w*predictions 269 | accum_count += 1 270 | 271 | 272 | del model 273 | 274 | submission_df['accuracy_group'] /= accum_count 275 | compute_th_acc_gp(submission_df['accuracy_group'], coefficients) 276 | submission_df['accuracy_group'] = submission_df['accuracy_group'].astype(int) 277 | submission_df.to_csv('submission.csv', index=False) 278 | print('done') 279 | 280 | def compute_th_acc_gp(temp, coef): 281 | temp[temp < coef[0]] = 0 282 | temp[(coef[0]<=temp)&(temp< coef[1])] = 1 283 | temp[(coef[1]<=temp)&(temp< coef[2])] = 2 284 | temp[(coef[2]<=temp)] = 3 285 | 286 | def compute_acc_gp(pred): 287 | #batch_size = pred.size(0) 288 | pred = (3*pred[:, 0] - 2*pred[:, 1]) 289 | pred[pred < 0] = 0 290 | return pred 291 | 292 | 293 | def test(test_loader, model): 294 | batch_time = AverageMeter() 295 | data_time = AverageMeter() 296 | losses = AverageMeter() 297 | accuracies = AverageMeter() 298 | 299 | sent_count = AverageMeter() 300 | #meter = bowl_utils.Meter() 301 | 302 | # switch to evaluation mode 303 | model.eval() 304 | 305 | start = end = time.time() 306 | 307 | predictions = [] 308 | for step, (cate_x, cont_x, mask, y) in enumerate(test_loader): 309 | # measure data loading time 310 | data_time.update(time.time() - end) 311 | 312 | cate_x, cont_x, mask = cate_x.cuda(), cont_x.cuda(), mask.cuda() 313 | batch_size = cate_x.size(0) 314 | 315 | # compute loss 316 | k = 0.5 317 | with torch.no_grad(): 318 | pred = model(cate_x, cont_x, mask) 319 | # record accuracy 320 | pred_y = (1-k)*pred[:, 0] + (k)*compute_acc_gp(pred[:, 1:]) 321 | predictions.append(pred_y.detach().cpu()) 322 | 323 | # measure elapsed time 324 | batch_time.update(time.time() - end) 325 | end = time.time() 326 | 327 | sent_count.update(batch_size) 328 | """ 329 | if step % CFG.print_freq == 0 or step == (len(test_loader)-1): 330 | print('TEST: {0}/{1}] ' 331 | 'Data {data_time.val:.3f} ({data_time.avg:.3f}) ' 332 | 'Elapsed {remain:s} ' 333 | 'sent/s {sent_s:.0f} ' 334 | .format( 335 | step, len(test_loader), batch_time=batch_time, 336 | data_time=data_time, 337 | remain=timeSince(start, float(step+1)/len(test_loader)), 338 | sent_s=sent_count.avg/batch_time.avg 339 | )) 340 | """ 341 | predictions = torch.cat(predictions).numpy() 342 | 343 | return predictions 344 | 345 | 346 | def validate(valid_loader, model): 347 | batch_time = AverageMeter() 348 | data_time = AverageMeter() 349 | losses = AverageMeter() 350 | accuracies = AverageMeter() 351 | 352 | sent_count = AverageMeter() 353 | #meter = bowl_utils.Meter() 354 | 355 | # switch to evaluation mode 356 | model.eval() 357 | 358 | start = end = time.time() 359 | 360 | predictions = [] 361 | groundtruth = [] 362 | for step, (cate_x, cont_x, mask, y) in enumerate(valid_loader): 363 | # measure data loading time 364 | data_time.update(time.time() - end) 365 | 366 | cate_x, cont_x, mask, y = cate_x.cuda(), cont_x.cuda(), mask.cuda(), y.cuda() 367 | batch_size = cate_x.size(0) 368 | 369 | # compute loss 370 | k = 0.5 371 | with torch.no_grad(): 372 | pred = model(cate_x, cont_x, mask) 373 | loss1 = F.mse_loss(pred[:, 0].contiguous().view(-1), y[:, 0].contiguous().view(-1)) 374 | loss2 = F.mse_loss(pred[:, 1].contiguous().view(-1), y[:, 1].contiguous().view(-1)) 375 | loss = (1-k)*loss1+k*loss2 376 | 377 | # record loss 378 | losses.update(loss.item(), batch_size) 379 | 380 | # record accuracy 381 | pred_y = (1-k)*pred[:, 0] + (k)*compute_acc_gp(pred[:, 1:]) 382 | 
predictions.append(pred_y.detach().cpu()) 383 | pred_y = (pred_y+0.5).int() 384 | pred_y[pred_y > 3] = 3 385 | y = y[:, 0] 386 | 387 | pred_y = pred_y.detach().cpu() 388 | y = y.cpu() 389 | 390 | groundtruth.append(y) 391 | 392 | kappa_score = bowl_utils.qwk3(pred_y.numpy(), y.numpy()) 393 | accuracies.update( kappa_score, batch_size) 394 | 395 | if CFG.gradient_accumulation_steps > 1: 396 | loss = loss / CFG.gradient_accumulation_steps 397 | 398 | # measure elapsed time 399 | batch_time.update(time.time() - end) 400 | end = time.time() 401 | 402 | sent_count.update(batch_size) 403 | 404 | """ 405 | if step % CFG.print_freq == 0 or step == (len(valid_loader)-1): 406 | print('TEST: {0}/{1}] ' 407 | 'Data {data_time.val:.3f} ({data_time.avg:.3f}) ' 408 | 'Elapsed {remain:s} ' 409 | 'Loss: {loss.val:.4f}({loss.avg:.4f}) ' 410 | 'Acc: {acc.val:.4f}({acc.avg:.4f}) ' 411 | 'sent/s {sent_s:.0f} ' 412 | .format( 413 | step, len(valid_loader), batch_time=batch_time, 414 | data_time=data_time, loss=losses, 415 | acc=accuracies, 416 | remain=timeSince(start, float(step+1)/len(valid_loader)), 417 | sent_s=sent_count.avg/batch_time.avg 418 | )) 419 | """ 420 | predictions = torch.cat(predictions).numpy() 421 | groundtruth = torch.cat(groundtruth).numpy() 422 | 423 | """ 424 | try: 425 | optR = bowl_utils.OptimizedRounder() 426 | optR.fit(predictions, groundtruth) 427 | coefficients = optR.coefficients() 428 | print(coefficients) 429 | temp_predictions = predictions.copy() 430 | temp_predictions[temp_predictions < coefficients[0]] = 0 431 | temp_predictions[(coefficients[0]<=temp_predictions)&(temp_predictions< coefficients[1])] = 1 432 | temp_predictions[(coefficients[1]<=temp_predictions)&(temp_predictions< coefficients[2])] = 2 433 | temp_predictions[(coefficients[2]<=temp_predictions)] = 3 434 | 435 | kappa_score = bowl_utils.qwk3(temp_predictions, groundtruth) 436 | except: 437 | kappa_score = 0 438 | coefficients = [0.5, 1.5, 2.5] 439 | """ 440 | coefficients = [0.5, 1.5, 2.5] 441 | 442 | return losses.avg, kappa_score, coefficients, predictions, groundtruth 443 | 444 | 445 | def get_logger(): 446 | FORMAT = '[%(levelname)s]%(asctime)s:%(name)s:%(message)s' 447 | logging.basicConfig(format=FORMAT, level=logging.INFO) 448 | logger = logging.getLogger('main') 449 | logger.setLevel(logging.DEBUG) 450 | return logger 451 | 452 | logger = get_logger() 453 | 454 | 455 | def save_checkpoint(state, model_path, model_filename, is_best=False): 456 | print('saving cust_model ...') 457 | if not os.path.exists(model_path): 458 | os.makedirs(model_path) 459 | torch.save(state, os.path.join(model_path, model_filename)) 460 | if is_best: 461 | torch.save(state, os.path.join(model_path, 'best_' + model_filename)) 462 | 463 | 464 | class AverageMeter(object): 465 | """Computes and stores the average and current value""" 466 | def __init__(self): 467 | self.reset() 468 | 469 | def reset(self): 470 | self.val = 0 471 | self.avg = 0 472 | self.sum = 0 473 | self.count = 0 474 | 475 | def update(self, val, n=1): 476 | self.val = val 477 | self.sum += val * n 478 | self.count += n 479 | self.avg = self.sum / self.count 480 | 481 | 482 | def asMinutes(s): 483 | m = math.floor(s / 60) 484 | s -= m * 60 485 | return '%dm %ds' % (m, s) 486 | 487 | 488 | def timeSince(since, percent): 489 | now = time.time() 490 | s = now - since 491 | es = s / (percent) 492 | rs = es - s 493 | return '%s (remain %s)' % (asMinutes(s), asMinutes(rs)) 494 | 495 | 496 | def adjust_learning_rate(optimizer, epoch): 497 | #lr = CFG.learning_rate 
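    # NOTE: step-decay schedule: the base learning rate is multiplied by CFG.lr_decay
    # once every 10 epochs. In validate.py this helper is never called and the CFG
    # class above does not define lr_decay, so it appears to be dead code here; the
    # evaluation path relies on the saved checkpoints and needs no optimizer at all.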
498 | lr = (CFG.lr_decay)**(epoch//10) * CFG.learning_rate 499 | for param_group in optimizer.param_groups: 500 | param_group['lr'] = lr 501 | return lr 502 | 503 | 504 | 505 | 506 | if __name__ == '__main__': 507 | main() 508 | -------------------------------------------------------------------------------- /directory_structure.txt: -------------------------------------------------------------------------------- 1 | . 2 | ./models 3 | ./code 4 | ./input 5 | ./input/data-science-bowl-2019 6 | -------------------------------------------------------------------------------- /models/b-32_a-TRANSFORMER_e-100_h-500_d-0.2_l-2_hd-10_s-7_len-100_aug-0.5_da-bowl.pt_k-0.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lime-robot/dsb2019/70446f1f56b778721fc6607154a3a9d59f12e792/models/b-32_a-TRANSFORMER_e-100_h-500_d-0.2_l-2_hd-10_s-7_len-100_aug-0.5_da-bowl.pt_k-0.pt -------------------------------------------------------------------------------- /models/b-32_a-TRANSFORMER_e-100_h-500_d-0.2_l-2_hd-10_s-7_len-100_aug-0.5_da-bowl.pt_k-1.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lime-robot/dsb2019/70446f1f56b778721fc6607154a3a9d59f12e792/models/b-32_a-TRANSFORMER_e-100_h-500_d-0.2_l-2_hd-10_s-7_len-100_aug-0.5_da-bowl.pt_k-1.pt -------------------------------------------------------------------------------- /models/b-32_a-TRANSFORMER_e-100_h-500_d-0.2_l-2_hd-10_s-7_len-100_aug-0.5_da-bowl.pt_k-2.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lime-robot/dsb2019/70446f1f56b778721fc6607154a3a9d59f12e792/models/b-32_a-TRANSFORMER_e-100_h-500_d-0.2_l-2_hd-10_s-7_len-100_aug-0.5_da-bowl.pt_k-2.pt -------------------------------------------------------------------------------- /models/b-32_a-TRANSFORMER_e-100_h-500_d-0.2_l-2_hd-10_s-7_len-100_aug-0.5_da-bowl.pt_k-3.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lime-robot/dsb2019/70446f1f56b778721fc6607154a3a9d59f12e792/models/b-32_a-TRANSFORMER_e-100_h-500_d-0.2_l-2_hd-10_s-7_len-100_aug-0.5_da-bowl.pt_k-3.pt -------------------------------------------------------------------------------- /models/b-32_a-TRANSFORMER_e-100_h-500_d-0.2_l-2_hd-10_s-7_len-100_aug-0.5_da-bowl.pt_k-4.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lime-robot/dsb2019/70446f1f56b778721fc6607154a3a9d59f12e792/models/b-32_a-TRANSFORMER_e-100_h-500_d-0.2_l-2_hd-10_s-7_len-100_aug-0.5_da-bowl.pt_k-4.pt -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # PyTorch 2 | torch==1.3.1 3 | tqdm 4 | transformers==2.3.0 5 | pytorch-transformers==1.2.0 6 | 7 | --------------------------------------------------------------------------------