├── README.md
├── scripts
│   └── run_gru_d.py
└── src
    ├── GRUD_layer.py
    └── GRUD_model.py

/README.md:
--------------------------------------------------------------------------------
1 | # PyTorch-GRU-D
2 | PyTorch implementation of GRU-D from "Recurrent Neural Networks for Multivariate Time Series with Missing Values" (https://arxiv.org/abs/1606.01865).
3 | 
4 | Code based on
5 | https://github.com/Han-JD/GRU-D
6 | 
7 | Adapted for batchwise training and GPU support; several bugs fixed.
8 | Tested with PyTorch 1.3.1.
9 | 
10 | The model takes input of shape (n_samples, 3, features, seq_length).
11 | Dimension 1 contains (input_matrix, masking_matrix, delta_t_matrix); input_matrix is 0 wherever a value is missing.
12 | 
--------------------------------------------------------------------------------
/scripts/run_gru_d.py:
--------------------------------------------------------------------------------
1 | from sacred import Experiment
2 | import torch
3 | import numpy as np
4 | import pandas as pd
5 | import os
6 | import math
7 | import warnings
8 | import itertools
9 | import numbers
10 | import torch.utils.data as utils
11 | #import matplotlib.pyplot as plt
12 | from sklearn.metrics import roc_curve, auc, roc_auc_score, average_precision_score
13 | 
14 | import sys
15 | sys.path.insert(0, '../src')
16 | from data_loader_oo import DataContainer
17 | from GRUD_model import grud_model
18 | 
19 | 
20 | 
21 | ex = Experiment("GRU-D-mean")
22 | use_cuda = torch.cuda.is_available()
23 | device = torch.device("cuda:0" if use_cuda else "cpu")
24 | 
25 | #TODO: data gets padded beforehand; check whether this matters when only the output at the last observed step is used
26 | 
27 | def fit(model, criterion, l2_penalty, learning_rate,\
28 | train_dataloader, dev_dataloader, test_dataloader,\
29 | learning_rate_decay=0, n_epochs=30, checkpoint_path = 'model_checkpoints', sacred_run=None):
30 | 
31 | test_freq = int(len(train_dataloader) / 4)
32 | use_cuda = torch.cuda.is_available()
33 | device = torch.device("cuda:0" if use_cuda else "cpu")
34 | 
35 | # keep a copy of the initial parameters to check later that they were updated
36 | old_state_dict = {}
37 | for key in model.state_dict():
38 | old_state_dict[key] = model.state_dict()[key].clone()
39 | 
40 | 
41 | AU_PRCs = list()
42 | n_batch = 0
43 | for epoch in range(n_epochs):
44 | print("starting Epoch: {}".format(epoch))
45 | if epoch == 0:
46 | optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay= l2_penalty) # create once, so it also exists when learning_rate_decay == 0
47 | # every [learning_rate_decay] epochs, halve the learning rate and rebuild the optimizer
48 | if learning_rate_decay != 0 and epoch > 0 and epoch % learning_rate_decay == 0:
49 | learning_rate = learning_rate/2
50 | optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay= l2_penalty)
51 | print('at epoch {} learning_rate is updated to {}'.format(epoch, learning_rate))
52 | 
53 | # train the model
54 | losses, acc = [], []
55 | label, pred = [], []
56 | y_pred_col= []
57 | 
58 | 
59 | #get a minibatch
60 | for train_data, train_label, train_num_obs in train_dataloader:
61 | model.train()
62 | #push the current minibatch to GPU or CPU
63 | train_data, train_label, train_num_obs = train_data.to(device), train_label.to(device), train_num_obs.to(device)
64 | 
65 | 
66 | # Zero the parameter gradients
67 | optimizer.zero_grad()
68 | 
69 | # Forward pass: compute predicted y by passing the training data to the model
70 | #y_pred = model(train_data)[:,-1,:] #last output
71 | 
72 | #with padding after the observed values:
73 | y_pred = model(train_data)
74 | #print(y_pred.size())
75 | y_pred = torch.squeeze(y_pred)
76 | #print(y_pred.size())
77 | y_pred = torch.gather(y_pred,1,train_num_obs.long())
78 | #print(y_pred.size())
79 | #print(train_num_obs.size())
80 | 
81 | 
82 | 
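# note: the model returns one prediction per timestep; because sequences are padded
# to a common length, torch.gather selects, for each sample, the prediction at its
# last observed timestep (train_num_obs holds num_obs_times - 1)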
#print(train_num_obs) 83 | 84 | 85 | # Compute loss 86 | loss = criterion(y_pred, train_label) 87 | #print(loss) 88 | acc.append( 89 | torch.eq( 90 | (torch.sigmoid(y_pred).data > 0.5).float(), 91 | train_label) 92 | ) 93 | losses.append(loss.item()) 94 | print(str(n_batch)+" loss: "+ str(loss.item()) ) 95 | #print(model.w_dg_x.weight) 96 | #print(model.w_dg_x.bias) 97 | 98 | # perform a backward pass, and update the weights. 99 | loss.backward() 100 | optimizer.step() 101 | 102 | #sacred_run.log_scalar('loss', loss, n_batch) 103 | #sacred_run.log_scalar('acc', acc, n_batch) 104 | n_batch = n_batch + 1 105 | 106 | if n_batch % test_freq == 0: 107 | print("Validate...") 108 | losses, acc = [], [] 109 | label, pred = np.array([]), np.array([]) 110 | model.eval() 111 | 112 | for dev_data, dev_label, dev_num_obs in dev_dataloader: 113 | 114 | dev_data, dev_label, dev_num_obs = dev_data.to(device), dev_label.to(device), dev_num_obs.to(device) 115 | 116 | print(dev_data.size()) 117 | print(dev_label.size()) 118 | print(dev_num_obs.size()) 119 | 120 | optimizer.zero_grad() 121 | # Forward pass : Compute predicted y by passing train data to the model 122 | 123 | #y_pred = model(dev_data)[:,-1,:] #last output 124 | y_pred = model(dev_data) 125 | y_pred = torch.squeeze(y_pred) 126 | #print(y_pred.size()) 127 | y_pred = torch.gather(y_pred,1,dev_num_obs.long()) 128 | 129 | # Compute loss 130 | loss = criterion(y_pred, dev_label) 131 | acc.append( 132 | torch.eq( 133 | (torch.sigmoid(y_pred).data > 0.5).float(), 134 | dev_label) 135 | ) 136 | losses.append(loss.item()) 137 | 138 | label = np.append(label, dev_label.detach().cpu().numpy()) 139 | pred = np.append(pred, y_pred.detach().cpu().numpy()) 140 | 141 | 142 | dev_acc = torch.mean(torch.cat(acc).float()) 143 | dev_loss = np.mean(losses) 144 | 145 | dev_pred_out = pred 146 | dev_label_out = label 147 | 148 | va_auc = roc_auc_score(label, pred) 149 | va_prc = average_precision_score(label, pred) 150 | print("validation auc:{}".format(va_auc)) 151 | print("validation prc:{}".format(va_prc)) 152 | 153 | sacred_run.log_scalar('va_auc', va_auc, n_batch) 154 | sacred_run.log_scalar('va_prc', va_prc, n_batch) 155 | sacred_run.log_scalar('loss', loss.item(), n_batch) #TODO untested 156 | 157 | filename = checkpoint_path+"/GRU_D_epoch_"+str(epoch)+"_step_"+str(n_batch)+".pth" 158 | print('trying to save file {}'.format(filename) ) 159 | torch.save(model.state_dict(),filename) 160 | 161 | AU_PRCs.append(va_prc) 162 | train_loss = np.mean(losses) 163 | # print("Epoch: {} Train: {:.4f}/{:.2f}%, Dev: {:.4f}/{:.2f}%, Test: {:.4f}/{:.2f}% AUC: {:.4f}".format( 164 | # epoch, train_loss, train_acc*100, dev_loss, dev_acc*100, test_loss, test_acc*100, auc_score)) 165 | print("Epoch: {} Train loss: {:.4f}, Dev loss: {:.4f}, Val PRC: {:.4f}".format( 166 | epoch, train_loss, dev_loss, va_prc)) 167 | 168 | 169 | train_acc = torch.mean(torch.cat(acc).float()) 170 | train_loss = np.mean(losses) 171 | 172 | train_pred_out = pred 173 | train_label_out = label 174 | 175 | # save new params 176 | new_state_dict= {} 177 | for key in model.state_dict(): 178 | new_state_dict[key] = model.state_dict()[key].clone() 179 | 180 | # compare params 181 | for key in old_state_dict: 182 | if (old_state_dict[key] == new_state_dict[key]).all(): 183 | print('Not updated in {}'.format(key)) 184 | 185 | """ 186 | # dev loss 187 | print("Validate...") 188 | losses, acc = [], [] 189 | label, pred = np.array([]), np.array([]) 190 | model.eval() 191 | for dev_data, dev_label, num_obs in 
dev_dataloader: 192 | # Squeeze the data [1, 33, 49], [1,5] to [33, 49], [5] 193 | #dev_data = torch.squeeze(dev_data) 194 | #dev_label = torch.squeeze(dev_label) 195 | dev_data, dev_label, = dev_data.to(device), dev_label.to(device) 196 | 197 | optimizer.zero_grad() 198 | # Forward pass : Compute predicted y by passing train data to the model 199 | #print(dev_data.size()) 200 | #print(model.w_dg_x.weight) 201 | #print(model.w_dg_x.bias) 202 | y_pred = model(dev_data)[:,-1,:] #last output 203 | #print(y_pred.size()) 204 | 205 | # Save predict and label 206 | #pred.append(y_pred.item()) 207 | #label.append(dev_label.item()) 208 | 209 | # Compute loss 210 | loss = criterion(y_pred, dev_label) 211 | acc.append( 212 | torch.eq( 213 | (torch.sigmoid(y_pred).data > 0.5).float(), 214 | dev_label) 215 | ) 216 | losses.append(loss.item()) 217 | 218 | # Save predict and label 219 | #print(dev_label.detach().numpy().shape) 220 | #print(y_pred.detach().numpy().shape) 221 | label = np.append(label, dev_label.detach().cpu().numpy()) 222 | pred = np.append(pred, y_pred.detach().cpu().numpy()) 223 | #print(label.shape) 224 | 225 | dev_acc = torch.mean(torch.cat(acc).float()) 226 | #print(dev_acc) 227 | #print(label.shape) 228 | dev_loss = np.mean(losses) 229 | 230 | dev_pred_out = pred 231 | dev_label_out = label 232 | 233 | va_auc = roc_auc_score(label, pred) 234 | va_prc = average_precision_score(label, pred) 235 | print("validation auc:{}".format(va_auc)) 236 | print("validation prc:{}".format(va_prc)) 237 | 238 | sacred_run.log_scalar('va_auc', va_auc, n_batch) 239 | sacred_run.log_scalar('va_prc', va_prc, n_batch) 240 | 241 | AU_PRCs[epoch] = va_prc 242 | # print("Epoch: {} Train: {:.4f}/{:.2f}%, Dev: {:.4f}/{:.2f}%, Test: {:.4f}/{:.2f}% AUC: {:.4f}".format( 243 | # epoch, train_loss, train_acc*100, dev_loss, dev_acc*100, test_loss, test_acc*100, auc_score)) 244 | print("Epoch: {} Train loss: {:.4f}, Dev loss: {:.4f}, Val PRC: {:.4f}".format( 245 | epoch, train_loss, dev_loss, va_prc)) 246 | """ 247 | 248 | # save the parameters 249 | train_log = [] 250 | train_log.append(model.state_dict()) 251 | #torch.save(model.state_dict(), './save/grud_mean_grud_para.pt') 252 | 253 | #print(train_log) 254 | 255 | return AU_PRCs 256 | 257 | #____________________________________________________________________________________________________________________________________________________ 258 | 259 | @ex.config 260 | def gru_d_config(): 261 | #data config 262 | file_path = "../mgp-rnn-datadump_labs_vitals_covs_na_thres_500_min_length_7_max_length_200_horizon_0_split_0.pkl" 263 | data_sources = ['labs','vitals','covs'] 264 | 265 | 266 | #model config 267 | hidden_size = 100 268 | n_layers = 1 269 | 270 | #training config 271 | n_epochs = 40 272 | batch_size = 100 273 | learning_rate = 0.001 274 | learning_rate_decay =7 275 | dropout = 0.1 276 | l2_penalty = 0.001 277 | 278 | def count_parameters(model: torch.nn.Module): 279 | return sum(p.numel() for p in model.parameters() if p.requires_grad) 280 | 281 | @ex.main 282 | def run_gru_d(_run, file_path, data_sources, batch_size, n_epochs, hidden_size, n_layers, learning_rate, learning_rate_decay, l2_penalty, dropout): 283 | 284 | 285 | #setup data 286 | data = DataContainer(file_path,data_sources) #problem: need same padding for train and va data 287 | va_data = data.va_data_gru_d() 288 | tr_data = data.tr_data_gru_d() 289 | x_mean = tr_data[:,0,:,:].mean(axis=0).mean(axis=1) 290 | 291 | train_dataset = 
torch.utils.data.TensorDataset(torch.Tensor(tr_data),torch.Tensor(np.expand_dims(data.labels_tr, axis =1)),torch.Tensor(np.expand_dims(data.num_obs_times_tr-1, axis =1))) 292 | val_dataset = torch.utils.data.TensorDataset(torch.Tensor(va_data), torch.Tensor(np.expand_dims(data.labels_va, axis =1)),torch.Tensor(np.expand_dims(data.num_obs_times_va-1, axis =1))) 293 | x_mean = torch.Tensor(x_mean) 294 | 295 | #needs int() bc hyperparameter script passes np.int64 and pytorch doesnt like that 296 | #TODO add num_obs to dataset, then dataloader 297 | 298 | train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=int(batch_size), shuffle=True) 299 | val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size= int(batch_size), shuffle=True) 300 | 301 | 302 | print(va_data.shape) 303 | print(tr_data.shape) 304 | print('Data loaded!') 305 | #print(tr_data[1]) 306 | 307 | print("Build model with params:") 308 | print("hidden_size: {}, dropout: {}, num_layers: {}".format(hidden_size,dropout,n_layers)) 309 | 310 | #fit the model 311 | input_size = 44 # num of variables 312 | #hidden_size = 100 # same as inputsize 313 | output_size = 1 314 | #num_layers = 3 # num of GRU layers (only first layer is GRU-D) 315 | seq_len = 1264 # max seq len based on data 316 | model = grud_model(input_size = input_size, hidden_size= int(hidden_size), output_size=output_size, dropout=dropout, dropout_type='mloss', x_mean=x_mean, num_layers=int(n_layers)) 317 | 318 | 319 | use_cuda = torch.cuda.is_available() 320 | if use_cuda : 321 | torch.backends.cudnn.benchmark = True 322 | model = model.cuda() 323 | print("Running on GPU") 324 | 325 | criterion = torch.nn.BCELoss() 326 | #untested 327 | #logits=preds, targets=minibatch.O_dupe_onehot, pos_weight=self.class_imb) 328 | #Get class imbalance (for weighted loss): 329 | case_prev = data.labels_tr.sum()/float(len(data.labels_tr)) #get prevalence of cases in train dataset 330 | class_imb = torch.tensor(1/case_prev) #class imbalance to use as class weight if losstype='weighted' 331 | #TODO then we do not need to apply sigmoid to the model output in the model 332 | criterion = torch.nn.BCEWithLogitsLoss(pos_weight=class_imb) 333 | 334 | 335 | 336 | if len(_run.observers) > 0: 337 | checkpoint_path = os.path.join(_run.observers[0].dir, 'model_checkpoints') 338 | else: 339 | checkpoint_path = 'model_checkpoints' 340 | 341 | if not os.path.exists(checkpoint_path): 342 | os.makedirs(checkpoint_path) 343 | print(model) 344 | 345 | 346 | print('Begin training!') 347 | AU_PRCs = fit(model, criterion, l2_penalty, learning_rate,\ 348 | train_dataloader, val_dataloader, val_dataloader,\ 349 | learning_rate_decay, n_epochs, checkpoint_path, _run) 350 | 351 | 352 | print(AU_PRCs) 353 | best_val = np.array(AU_PRCs).max() 354 | 355 | return {'Best Validation AUPRC': best_val} 356 | 357 | 358 | 359 | 360 | if __name__ == '__main__': 361 | ex.run_commandline() -------------------------------------------------------------------------------- /src/GRUD_layer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import pandas as pd 4 | import os 5 | import math 6 | import warnings 7 | import itertools 8 | import numbers 9 | import torch.utils.data as utils 10 | 11 | # 12 | #The convention for RNNs is that the feature dimension is last - adapt for that. also need to adapt input tensors in the run file. 13 | # 14 | 15 | class GRUD_cell(torch.nn.Module): 16 | """ 17 | Implementation of GRUD. 
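    Based on Che et al., "Recurrent Neural Networks for Multivariate Time Series with Missing Values" (arXiv:1606.01865); the #(4), #(5) and #(6) comments in forward() refer to the corresponding equations there.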
18 | Inputs: x_mean 19 | n_smp x 3 x n_channels x len_seq tensor (0: data, 1: mask, 2: deltat) 20 | """ 21 | def __init__(self, input_size, hidden_size, output_size, num_layers=1, x_mean=0,\ 22 | bias=True, batch_first=False, bidirectional=False, dropout_type='mloss', dropout=0, return_hidden = False): 23 | 24 | use_cuda = torch.cuda.is_available() 25 | device = torch.device("cuda:0" if use_cuda else "cpu") 26 | 27 | super(GRUD_cell, self).__init__() 28 | self.input_size = input_size 29 | self.hidden_size = hidden_size 30 | self.output_size = output_size 31 | self.num_layers = num_layers 32 | self.return_hidden = return_hidden #controls the output, True if another GRU-D layer follows 33 | 34 | 35 | x_mean = torch.tensor(x_mean, requires_grad = True) 36 | self.register_buffer('x_mean', x_mean) 37 | self.bias = bias 38 | self.batch_first = batch_first 39 | self.dropout_type = dropout_type 40 | self.dropout = dropout 41 | self.bidirectional = bidirectional 42 | num_directions = 2 if bidirectional else 1 43 | 44 | if not isinstance(dropout, numbers.Number) or not 0 <= dropout <= 1 or \ 45 | isinstance(dropout, bool): 46 | raise ValueError("dropout should be a number in range [0, 1] " 47 | "representing the probability of an element being " 48 | "zeroed") 49 | if dropout > 0 and num_layers == 1: 50 | warnings.warn("dropout option adds dropout after all but last " 51 | "recurrent layer, so non-zero dropout expects " 52 | "num_layers greater than 1, but got dropout={} and " 53 | "num_layers={}".format(dropout, num_layers)) 54 | 55 | 56 | 57 | #set up all the operations that are needed in the forward pass 58 | self.w_dg_x = torch.nn.Linear(input_size,input_size, bias=True) 59 | self.w_dg_h = torch.nn.Linear(input_size, hidden_size, bias = True) 60 | 61 | self.w_xz = torch.nn.Linear(input_size, hidden_size, bias=False) 62 | self.w_hz = torch.nn.Linear(hidden_size, hidden_size, bias=False) 63 | self.w_mz = torch.nn.Linear(input_size, hidden_size, bias=True) 64 | 65 | self.w_xr = torch.nn.Linear(input_size, hidden_size, bias=False) 66 | self.w_hr = torch.nn.Linear(hidden_size, hidden_size, bias=False) 67 | self.w_mr = torch.nn.Linear(input_size, hidden_size, bias=False) 68 | self.w_xh = torch.nn.Linear(input_size, hidden_size, bias=False) 69 | self.w_hh = torch.nn.Linear(hidden_size, hidden_size, bias=False) 70 | self.w_mh = torch.nn.Linear(input_size, hidden_size, bias=True) 71 | 72 | self.w_hy = torch.nn.Linear(hidden_size, output_size, bias=True) 73 | 74 | 75 | 76 | 77 | Hidden_State = torch.zeros(self.hidden_size, requires_grad = True) 78 | #we use buffers because pytorch will take care of pushing them to GPU for us 79 | self.register_buffer('Hidden_State', Hidden_State) 80 | self.register_buffer('X_last_obs', torch.zeros(input_size)) #torch.tensor(x_mean) #TODO: what to initialize last observed values with?, also check broadcasting behaviour 81 | 82 | 83 | #TODO: check usefulness of everything below here, just copied skeleton 84 | 85 | 86 | self.reset_parameters() 87 | 88 | 89 | 90 | 91 | def reset_parameters(self): 92 | stdv = 1.0 / math.sqrt(self.hidden_size) 93 | for weight in self.parameters(): 94 | torch.nn.init.uniform_(weight, -stdv, stdv) 95 | 96 | def check_forward_args(self, input, hidden, batch_sizes): 97 | is_input_packed = batch_sizes is not None 98 | expected_input_dim = 2 if is_input_packed else 3 99 | if input.dim() != expected_input_dim: 100 | raise RuntimeError( 101 | 'input must have {} dimensions, got {}'.format( 102 | expected_input_dim, input.dim())) 103 | if 
self.input_size != input.size(-1):
104 | raise RuntimeError(
105 | 'input.size(-1) must be equal to input_size. Expected {}, got {}'.format(
106 | self.input_size, input.size(-1)))
107 | 
108 | if is_input_packed:
109 | mini_batch = int(batch_sizes[0])
110 | else:
111 | mini_batch = input.size(0) if self.batch_first else input.size(1)
112 | 
113 | num_directions = 2 if self.bidirectional else 1
114 | expected_hidden_size = (self.num_layers * num_directions,
115 | mini_batch, self.hidden_size)
116 | 
117 | def check_hidden_size(hx, expected_hidden_size, msg='Expected hidden size {}, got {}'):
118 | if tuple(hx.size()) != expected_hidden_size:
119 | raise RuntimeError(msg.format(expected_hidden_size, tuple(hx.size())))
120 | 
121 | if getattr(self, 'mode', None) == 'LSTM': # GRUD_cell defines no self.mode; guard this check copied from torch.nn.RNNBase
122 | check_hidden_size(hidden[0], expected_hidden_size,
123 | 'Expected hidden[0] size {}, got {}')
124 | check_hidden_size(hidden[1], expected_hidden_size,
125 | 'Expected hidden[1] size {}, got {}')
126 | else:
127 | check_hidden_size(hidden, expected_hidden_size)
128 | 
129 | def extra_repr(self):
130 | s = '{input_size}, {hidden_size}'
131 | if self.num_layers != 1:
132 | s += ', num_layers={num_layers}'
133 | if self.bias is not True:
134 | s += ', bias={bias}'
135 | if self.batch_first is not False:
136 | s += ', batch_first={batch_first}'
137 | if self.dropout != 0:
138 | s += ', dropout={dropout}'
139 | if self.bidirectional is not False:
140 | s += ', bidirectional={bidirectional}'
141 | return s.format(**self.__dict__)
142 | 
143 | 
144 | 
145 | 
146 | 
147 | @property
148 | def _flat_weights(self):
149 | return list(self._parameters.values())
150 | 
151 | 
152 | def forward(self, input):
153 | # input: (batch, 3, features, seq_len); dimension 1 is (data, mask, delta_t)
154 | #X = torch.squeeze(input[0]) # .size = (33,49)
155 | #Mask = torch.squeeze(input[1]) # .size = (33,49)
156 | #Delta = torch.squeeze(input[2]) # .size = (33,49)
157 | X = input[:,0,:,:]
158 | Mask = input[:,1,:,:]
159 | Delta = input[:,2,:,:]
160 | 
161 | 
162 | step_size = X.size(2) # number of timesteps
163 | #print('step size : ', step_size)
164 | 
165 | output = None
166 | #h = Hidden_State
167 | h = getattr(self, 'Hidden_State')
168 | #felix - buffer system from newer pytorch version
169 | x_mean = getattr(self, 'x_mean')
170 | x_last_obsv = getattr(self, 'X_last_obs')
171 | 
172 | 
173 | device = next(self.parameters()).device
174 | output_tensor = torch.empty([X.size()[0], X.size()[2], self.output_size], dtype=X.dtype, device= device)
175 | hidden_tensor = torch.empty(X.size()[0], X.size()[2], self.hidden_size, dtype=X.dtype, device = device)
176 | 
177 | #iterate over the sequence
178 | for timestep in range(X.size()[2]):
179 | 
180 | #x = torch.squeeze(X[:,layer:layer+1])
181 | #m = torch.squeeze(Mask[:,layer:layer+1])
182 | #d = torch.squeeze(Delta[:,layer:layer+1])
183 | x = torch.squeeze(X[:,:,timestep])
184 | m = torch.squeeze(Mask[:,:,timestep])
185 | d = torch.squeeze(Delta[:,:,timestep])
186 | 
187 | 
188 | #(4)
189 | gamma_x = torch.exp(-1* torch.nn.functional.relu( self.w_dg_x(d) ))
190 | gamma_h = torch.exp(-1* torch.nn.functional.relu( self.w_dg_h(d) ))
191 | 
192 | 
193 | #(5)
194 | #elementwise multiplication broadcasts x_mean and x_last_obsv over the batch dimension
195 | 
196 | #update x_last_obsv with the values observed at this timestep
197 | #print(x.size())
198 | #print(x_last_obsv.size())
199 | x_last_obsv = torch.where(m>0,x,x_last_obsv)
200 | #print('after update')
201 | #print(x_last_obsv)
202 | # impute missing values by decaying from the last observation towards the empirical mean
203 | x = m * x + (1 - m) * (gamma_x * x_last_obsv + (1 - gamma_x) * x_mean)
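# gamma_x lies in (0, 1]: it is close to 1 right after an observation and decays towards 0 as
# the time since the last observation grows, so the imputed value slides from the last
# observed value towards the empirical mean, as in the #(5) block referenced above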
204 | 
205 | #(6)
206 | if self.dropout == 0:
207 | 
208 | h = gamma_h*h
209 | z = torch.sigmoid( self.w_xz(x) + self.w_hz(h) + self.w_mz(m))
210 | r = torch.sigmoid( self.w_xr(x) + self.w_hr(h) + self.w_mr(m))
211 | 
212 | h_tilde = torch.tanh( self.w_xh(x) + self.w_hh( r*h ) + self.w_mh(m))
213 | 
214 | 
215 | h = (1 - z) * h + z * h_tilde
216 | 
217 | #the dropout variants below use the same gate equations as above and differ only in where dropout is applied
218 | elif self.dropout_type == 'Moon':
219 | '''
220 | RNNDROP: a novel dropout for RNNs in ASR (2015)
221 | '''
222 | h = gamma_h * h
223 | 
224 | z = torch.sigmoid( self.w_xz(x) + self.w_hz(h) + self.w_mz(m))
225 | r = torch.sigmoid( self.w_xr(x) + self.w_hr(h) + self.w_mr(m))
226 | 
227 | h_tilde = torch.tanh( self.w_xh(x) + self.w_hh( r*h ) + self.w_mh(m))
228 | 
229 | h = (1 - z) * h + z * h_tilde
230 | dropout = torch.nn.Dropout(p=self.dropout)
231 | h = dropout(h)
232 | 
233 | elif self.dropout_type == 'Gal':
234 | '''
235 | A Theoretically Grounded Application of Dropout in Recurrent Neural Networks (2015)
236 | '''
237 | dropout = torch.nn.Dropout(p=self.dropout)
238 | h = dropout(h)
239 | 
240 | h = gamma_h * h
241 | 
242 | z = torch.sigmoid( self.w_xz(x) + self.w_hz(h) + self.w_mz(m))
243 | r = torch.sigmoid( self.w_xr(x) + self.w_hr(h) + self.w_mr(m))
244 | h_tilde = torch.tanh( self.w_xh(x) + self.w_hh( r*h ) + self.w_mh(m))
245 | 
246 | h = (1 - z) * h + z * h_tilde
247 | 
248 | elif self.dropout_type == 'mloss':
249 | '''
250 | recurrent dropout without memory loss, arXiv:1603.05118
251 | g = h_tilde, p = the probability to not drop a neuron
252 | '''
253 | h = gamma_h*h
254 | z = torch.sigmoid( self.w_xz(x) + self.w_hz(h) + self.w_mz(m))
255 | r = torch.sigmoid( self.w_xr(x) + self.w_hr(h) + self.w_mr(m))
256 | 
257 | 
258 | dropout = torch.nn.Dropout(p=self.dropout)
259 | h_tilde = dropout(torch.tanh( self.w_xh(x) + self.w_hh( r*h ) + self.w_mh(m)))
260 | 
261 | 
262 | h = (1 - z) * h + z * h_tilde
263 | #######
264 | 
265 | else:
266 | h = gamma_h * h
267 | 
268 | z = torch.sigmoid( self.w_xz(x) + self.w_hz(h) + self.w_mz(m))
269 | r = torch.sigmoid( self.w_xr(x) + self.w_hr(h) + self.w_mr(m))
270 | h_tilde = torch.tanh( self.w_xh(x) + self.w_hh( r*h ) + self.w_mh(m))
271 | 
272 | h = (1 - z) * h + z * h_tilde
273 | 
274 | 
275 | 
276 | step_output = self.w_hy(h)
277 | step_output = torch.sigmoid(step_output)
278 | output_tensor[:,timestep,:] = step_output
279 | hidden_tensor[:,timestep,:] = h
280 | 
281 | #if self.return_hidden:
282 | #to stack GRU-D layers, the (data, mask, delta) tensor would need to be reassembled here
283 | #output = torch.stack([hidden_tensor,Mask,Delta], dim=1)
284 | 
285 | output = output_tensor, hidden_tensor
286 | #else:
287 | # output = output_tensor
288 | return output
--------------------------------------------------------------------------------
/src/GRUD_model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | import pandas as pd
4 | import os
5 | import math
6 | import warnings
7 | import itertools
8 | import numbers
9 | import torch.utils.data as utils
10 | from GRUD_layer import GRUD_cell
11 | 
12 | 
13 | 
14 | def grud_model_old( input_size, hidden_size, output_size, num_layers=1, x_mean=0,\
15 | bias=True, batch_first=False, bidirectional=False, dropout_type='mloss', dropout=0):
16 | 
17 | layer_list =[]
18 | #intermediate layers return the input size as their output size
19 | for i in range(num_layers-1): #subtract 1 because the last layer is built with different parameters
20 | layer = GRUD_cell(input_size = input_size, hidden_size= hidden_size, output_size=input_size, dropout=dropout, dropout_type=dropout_type, x_mean=x_mean, num_layers=num_layers, return_hidden = True)
21 | layer_list.append(layer)
22 | 
23 | #last layer with the final output size
24 | layer = GRUD_cell(input_size = input_size, hidden_size=hidden_size, output_size=output_size, dropout=dropout, dropout_type=dropout_type, x_mean=x_mean, num_layers=num_layers, return_hidden = False)
25 | layer_list.append(layer)
26 | 
27 | model = torch.nn.Sequential(*layer_list)
28 | 
29 | return model
30 | 
31 | 
32 | #TODO: the error on GPU is probably caused by the hidden state of the GRU not being initialized on the GPU.
33 | class grud_model(torch.nn.Module):
34 | def __init__(self,input_size, hidden_size, output_size, num_layers = 1, x_mean = 0,\
35 | bias =True, batch_first = False, bidirectional = False, dropout_type ='mloss', dropout = 0):
36 | super(grud_model, self).__init__()
37 | 
38 | self.gru_d = GRUD_cell(input_size = input_size, hidden_size= hidden_size, output_size=output_size,
39 | dropout=dropout, dropout_type=dropout_type, x_mean=x_mean)
40 | self.hidden_to_output = torch.nn.Linear(hidden_size, output_size, bias=True)
41 | self.num_layers = num_layers
42 | self.hidden_size = hidden_size
43 | 
44 | if self.num_layers >1:
45 | #the stacked GRU layers operate on (batch, seq, feature)
46 | self.gru_layers = torch.nn.GRU(input_size = hidden_size, hidden_size = hidden_size, batch_first = True, num_layers = self.num_layers -1, dropout=dropout)
47 | 
48 | def initialize_hidden(self, batch_size):
49 | device = next(self.parameters()).device
50 | # the initial hidden state is all zeros
51 | return torch.zeros(self.num_layers-1, batch_size, self.hidden_size, device=device)
52 | 
53 | def forward(self,input):
54 | 
55 | #pass through GRU-D
56 | output, hidden = self.gru_d(input)
57 | #print(self.gru_d.return_hidden)
58 | #output = self.gru_d(input)
59 | #print(output.size())
60 | 
61 | # hidden: (batch_size, n_timesteps, hidden_size)
62 | 
63 | if self.num_layers >1:
64 | #TODO remove init_hidden, not necessary, auto init works fine
65 | init_hidden = self.initialize_hidden(hidden.size()[0])
66 | 
67 | 
68 | output, hidden = self.gru_layers(hidden)#, init_hidden)
69 | 
70 | 
71 | output = self.hidden_to_output(output)
72 | output = torch.sigmoid(output)
73 | 
74 | #print("final output size passed as model result")
75 | #print(output.size())
76 | return output
77 | 
78 | 
--------------------------------------------------------------------------------
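
Usage sketch (not part of the repository): a minimal example, assuming src/ is on the Python path, that fabricates random data in the (n_samples, 3, features, seq_length) layout described in the README and runs a single forward pass. All shapes and hyperparameters here are illustrative only.

import numpy as np
import torch
from GRUD_model import grud_model

# hypothetical toy data: values, observation mask, and time since the last observation (delta_t)
n_samples, n_features, seq_len = 8, 44, 50
mask = (np.random.rand(n_samples, n_features, seq_len) > 0.5).astype(np.float32)
values = np.random.randn(n_samples, n_features, seq_len).astype(np.float32) * mask  # 0 where missing
delta = np.random.rand(n_samples, n_features, seq_len).astype(np.float32)

# stack into the (n_samples, 3, features, seq_length) layout expected by the model
x = torch.tensor(np.stack([values, mask, delta], axis=1))

# per-feature mean, computed over batch and time as in scripts/run_gru_d.py
x_mean = x[:, 0].mean(dim=0).mean(dim=1)

model = grud_model(input_size=n_features, hidden_size=100, output_size=1,
                   dropout=0.1, dropout_type='mloss', x_mean=x_mean, num_layers=1)

with torch.no_grad():
    y = model(x)
print(y.shape)  # one sigmoid prediction per timestep, e.g. torch.Size([8, 50, 1])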