├── README.md
├── scripts
    └── run_gru_d.py
└── src
    ├── GRUD_layer.py
    └── GRUD_model.py


/README.md:
--------------------------------------------------------------------------------
 1 | # PyTorch-GRU-D
 2 | PyTorch Implementation of GRU-D from "Recurrent Neural Networks for Multivariate Time Series with Missing Values" https://arxiv.org/abs/1606.01865
 3 | 
 4 | Code based on
 5 | https://github.com/Han-JD/GRU-D
 6 | 
 7 | Adapted for batchwise training, GPU support and fixed bugs.
 8 | PyTorch Version 1.3.1
 9 | 
10 | Model takes input of shape ( n_samples, 3, features, seq_length ).
11 | Dimension 1 is (input_matrix, masking_matrix, delta_t_matrix). Input_matrix has 0 where values are missing.
12 | 


--------------------------------------------------------------------------------
/scripts/run_gru_d.py:
--------------------------------------------------------------------------------
  1 | from sacred import Experiment
  2 | import torch
  3 | import numpy as np
  4 | import pandas as pd
  5 | import os
  6 | import math
  7 | import warnings
  8 | import itertools
  9 | import numbers
 10 | import torch.utils.data as utils
 11 | #import matplotlib.pyplot as plt
 12 | from sklearn.metrics import roc_curve, auc, roc_auc_score, average_precision_score
 13 | 
 14 | import sys
 15 | sys.path.insert(0, '../src')
 16 | from data_loader_oo import DataContainer
 17 | from GRUD_model import grud_model
 18 | 
 19 | 
 20 | 
 21 | ex = Experiment("GRU-D-mean")
 22 | use_cuda = torch.cuda.is_available()
 23 | device = torch.device("cuda:0" if use_cuda else "cpu")
 24 | 
 25 | #TODO: data gets padded before, does this have any effect if i just take the last step output as output?
 26 | 
 27 | def fit(model, criterion, l2_penalty, learning_rate,\
 28 |         train_dataloader, dev_dataloader, test_dataloader,\
 29 |         learning_rate_decay=0, n_epochs=30, checkpoint_path = 'model_checkpoints', sacred_run=None):
 30 |     
 31 |     test_freq = int(len(train_dataloader) / 4)
 32 |     use_cuda = torch.cuda.is_available()
 33 |     device = torch.device("cuda:0" if use_cuda else "cpu")
 34 |     
 35 |     # to check the update 
 36 |     old_state_dict = {}
 37 |     for key in model.state_dict():
 38 |         old_state_dict[key] = model.state_dict()[key].clone()
 39 |     
 40 | 
 41 |     AU_PRCs = list()
 42 |     n_batch = 0
 43 |     for epoch in range(n_epochs):
 44 |         print("starting Epoch: {}".format(epoch))
 45 |         if learning_rate_decay != 0:
 46 | 
 47 |             # every [decay_step] epoch reduce the learning rate by half
 48 |             if  epoch % learning_rate_decay == 0:
 49 |                 learning_rate = learning_rate/2
 50 |                 optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay= l2_penalty)
 51 |                 print('at epoch {} learning_rate is updated to {}'.format(epoch, learning_rate))
 52 |         
 53 |         # train the model
 54 |         losses, acc = [], []
 55 |         label, pred = [], []
 56 |         y_pred_col= []
 57 |         
 58 |         
 59 |         #get a minibatch
 60 |         for train_data, train_label, train_num_obs in train_dataloader:
 61 |             model.train()
 62 |             #push current sample to GPU or CPU
 63 |             train_data, train_label, train_num_obs = train_data.to(device), train_label.to(device), train_num_obs.to(device)
 64 | 
 65 | 
 66 |             # Zero the parameter gradients
 67 |             optimizer.zero_grad()
 68 |             
 69 |             # Forward pass : Compute predicted y by passing train data to the model
 70 |             #y_pred = model(train_data)[:,-1,:] #last output
 71 | 
 72 |             #with pad after:
 73 |             y_pred = model(train_data)
 74 |             #print(y_pred.size())
 75 |             y_pred = torch.squeeze(y_pred)
 76 |             #print(y_pred.size())
 77 |             y_pred = torch.gather(y_pred,1,train_num_obs.long())
 78 |             #print(y_pred.size())
 79 |             #print(train_num_obs.size())
 80 | 
 81 |             
 82 |             #print(train_num_obs)
 83 | 
 84 | 
 85 |             # Compute loss
 86 |             loss = criterion(y_pred, train_label)
 87 |             #print(loss)
 88 |             acc.append(
 89 |                 torch.eq(
 90 |                     (torch.sigmoid(y_pred).data > 0.5).float(),
 91 |                     train_label)
 92 |             )
 93 |             losses.append(loss.item())
 94 |             print(str(n_batch)+" loss: "+ str(loss.item()) )
 95 |             #print(model.w_dg_x.weight)
 96 |             #print(model.w_dg_x.bias)
 97 | 
 98 |             # perform a backward pass, and update the weights.
 99 |             loss.backward()
100 |             optimizer.step()
101 | 
102 |             #sacred_run.log_scalar('loss', loss, n_batch)
103 |             #sacred_run.log_scalar('acc', acc, n_batch)
104 |             n_batch = n_batch + 1
105 | 
106 |             if n_batch % test_freq == 0:
107 |                 print("Validate...")
108 |                 losses, acc = [], []
109 |                 label, pred = np.array([]), np.array([])
110 |                 model.eval()
111 | 
112 |                 for dev_data, dev_label, dev_num_obs in dev_dataloader:
113 | 
114 |                     dev_data, dev_label, dev_num_obs = dev_data.to(device), dev_label.to(device), dev_num_obs.to(device)
115 | 
116 |                     print(dev_data.size())
117 |                     print(dev_label.size())
118 |                     print(dev_num_obs.size())
119 | 
120 |                     optimizer.zero_grad()
121 |                     # Forward pass : Compute predicted y by passing train data to the model
122 |     
123 |                     #y_pred = model(dev_data)[:,-1,:] #last output
124 |                     y_pred = model(dev_data)
125 |                     y_pred = torch.squeeze(y_pred)
126 |                     #print(y_pred.size())
127 |                     y_pred = torch.gather(y_pred,1,dev_num_obs.long())
128 | 
129 |                     # Compute loss
130 |                     loss = criterion(y_pred, dev_label)
131 |                     acc.append(
132 |                         torch.eq(
133 |                             (torch.sigmoid(y_pred).data > 0.5).float(),
134 |                             dev_label)
135 |                     )
136 |                     losses.append(loss.item())
137 | 
138 |                     label = np.append(label, dev_label.detach().cpu().numpy())
139 |                     pred = np.append(pred, y_pred.detach().cpu().numpy())
140 | 
141 | 
142 |                 dev_acc = torch.mean(torch.cat(acc).float())
143 |                 dev_loss = np.mean(losses)
144 |                 
145 |                 dev_pred_out = pred
146 |                 dev_label_out = label
147 | 
148 |                 va_auc = roc_auc_score(label, pred)
149 |                 va_prc = average_precision_score(label, pred)   
150 |                 print("validation auc:{}".format(va_auc))
151 |                 print("validation prc:{}".format(va_prc))
152 | 
153 |                 sacred_run.log_scalar('va_auc', va_auc, n_batch)
154 |                 sacred_run.log_scalar('va_prc', va_prc, n_batch)
155 |                 sacred_run.log_scalar('loss', loss.item(), n_batch) #TODO untested
156 | 
157 |                 filename = checkpoint_path+"/GRU_D_epoch_"+str(epoch)+"_step_"+str(n_batch)+".pth"
158 |                 print('trying to save file {}'.format(filename) )
159 |                 torch.save(model.state_dict(),filename)
160 | 
161 |                 AU_PRCs.append(va_prc)        
162 |                 train_loss = np.mean(losses)
163 |                 # print("Epoch: {} Train: {:.4f}/{:.2f}%, Dev: {:.4f}/{:.2f}%, Test: {:.4f}/{:.2f}% AUC: {:.4f}".format(
164 |                 #     epoch, train_loss, train_acc*100, dev_loss, dev_acc*100, test_loss, test_acc*100, auc_score))
165 |                 print("Epoch: {} Train loss: {:.4f}, Dev loss: {:.4f}, Val PRC: {:.4f}".format(
166 |                     epoch, train_loss, dev_loss, va_prc))
167 | 
168 |         
169 |         train_acc = torch.mean(torch.cat(acc).float())
170 |         train_loss = np.mean(losses)
171 |         
172 |         train_pred_out = pred
173 |         train_label_out = label
174 |         
175 |         # save new params
176 |         new_state_dict= {}
177 |         for key in model.state_dict():
178 |             new_state_dict[key] = model.state_dict()[key].clone()
179 |             
180 |         # compare params
181 |         for key in old_state_dict:
182 |             if (old_state_dict[key] == new_state_dict[key]).all():
183 |                 print('Not updated in {}'.format(key))
184 |    
185 |         """
186 |         # dev loss
187 |         print("Validate...")
188 |         losses, acc = [], []
189 |         label, pred = np.array([]), np.array([])
190 |         model.eval()
191 |         for dev_data, dev_label, num_obs in dev_dataloader:
192 |             # Squeeze the data [1, 33, 49], [1,5] to [33, 49], [5]
193 |             #dev_data = torch.squeeze(dev_data)
194 |             #dev_label = torch.squeeze(dev_label)
195 |             dev_data, dev_label, = dev_data.to(device), dev_label.to(device)
196 | 
197 |             optimizer.zero_grad()
198 |             # Forward pass : Compute predicted y by passing train data to the model
199 |             #print(dev_data.size())
200 |             #print(model.w_dg_x.weight)
201 |             #print(model.w_dg_x.bias)
202 |             y_pred = model(dev_data)[:,-1,:] #last output
203 |             #print(y_pred.size())
204 |             
205 |             # Save predict and label
206 |             #pred.append(y_pred.item())
207 |             #label.append(dev_label.item())
208 | 
209 |             # Compute loss
210 |             loss = criterion(y_pred, dev_label)
211 |             acc.append(
212 |                 torch.eq(
213 |                     (torch.sigmoid(y_pred).data > 0.5).float(),
214 |                     dev_label)
215 |             )
216 |             losses.append(loss.item())
217 | 
218 |             # Save predict and label
219 |             #print(dev_label.detach().numpy().shape)
220 |             #print(y_pred.detach().numpy().shape)
221 |             label = np.append(label, dev_label.detach().cpu().numpy())
222 |             pred = np.append(pred, y_pred.detach().cpu().numpy())
223 |             #print(label.shape)
224 | 
225 |         dev_acc = torch.mean(torch.cat(acc).float())
226 |         #print(dev_acc)
227 |         #print(label.shape)
228 |         dev_loss = np.mean(losses)
229 |         
230 |         dev_pred_out = pred
231 |         dev_label_out = label
232 | 
233 |         va_auc = roc_auc_score(label, pred)
234 |         va_prc = average_precision_score(label, pred)   
235 |         print("validation auc:{}".format(va_auc))
236 |         print("validation prc:{}".format(va_prc))
237 | 
238 |         sacred_run.log_scalar('va_auc', va_auc, n_batch)
239 |         sacred_run.log_scalar('va_prc', va_prc, n_batch)
240 | 
241 |         AU_PRCs[epoch] = va_prc        
242 |         # print("Epoch: {} Train: {:.4f}/{:.2f}%, Dev: {:.4f}/{:.2f}%, Test: {:.4f}/{:.2f}% AUC: {:.4f}".format(
243 |         #     epoch, train_loss, train_acc*100, dev_loss, dev_acc*100, test_loss, test_acc*100, auc_score))
244 |         print("Epoch: {} Train loss: {:.4f}, Dev loss: {:.4f}, Val PRC: {:.4f}".format(
245 |             epoch, train_loss, dev_loss, va_prc))
246 |         """
247 |         
248 |         # save the parameters
249 |         train_log = []
250 |         train_log.append(model.state_dict())
251 |         #torch.save(model.state_dict(), './save/grud_mean_grud_para.pt')
252 |         
253 |         #print(train_log)
254 |     
255 |     return AU_PRCs        
256 | 
257 | #____________________________________________________________________________________________________________________________________________________
258 | 
259 | @ex.config
260 | def gru_d_config():
261 |     #data config
262 |     file_path = "../mgp-rnn-datadump_labs_vitals_covs_na_thres_500_min_length_7_max_length_200_horizon_0_split_0.pkl"
263 |     data_sources = ['labs','vitals','covs']
264 |     
265 | 
266 |     #model config
267 |     hidden_size = 100
268 |     n_layers = 1
269 | 
270 |     #training config
271 |     n_epochs = 40
272 |     batch_size = 100
273 |     learning_rate = 0.001   
274 |     learning_rate_decay =7
275 |     dropout = 0.1
276 |     l2_penalty = 0.001
277 | 
278 | def count_parameters(model: torch.nn.Module):
279 |     return sum(p.numel() for p in model.parameters() if p.requires_grad)
280 | 
281 | @ex.main
282 | def run_gru_d(_run, file_path, data_sources, batch_size, n_epochs, hidden_size, n_layers, learning_rate, learning_rate_decay, l2_penalty, dropout):
283 | 
284 | 
285 |     #setup data
286 |     data = DataContainer(file_path,data_sources) #problem: need same padding for train and va data
287 |     va_data = data.va_data_gru_d()
288 |     tr_data = data.tr_data_gru_d()
289 |     x_mean = tr_data[:,0,:,:].mean(axis=0).mean(axis=1)
290 | 
291 |     train_dataset = torch.utils.data.TensorDataset(torch.Tensor(tr_data),torch.Tensor(np.expand_dims(data.labels_tr, axis =1)),torch.Tensor(np.expand_dims(data.num_obs_times_tr-1, axis =1)))
292 |     val_dataset = torch.utils.data.TensorDataset(torch.Tensor(va_data), torch.Tensor(np.expand_dims(data.labels_va, axis =1)),torch.Tensor(np.expand_dims(data.num_obs_times_va-1, axis =1)))
293 |     x_mean = torch.Tensor(x_mean)
294 | 
295 |     #needs int() bc hyperparameter script passes np.int64 and pytorch doesnt like that
296 |     #TODO add num_obs to dataset, then dataloader 
297 | 
298 |     train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=int(batch_size), shuffle=True)
299 |     val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size= int(batch_size), shuffle=True)
300 | 
301 |     
302 |     print(va_data.shape)
303 |     print(tr_data.shape)
304 |     print('Data loaded!')
305 |     #print(tr_data[1])
306 | 
307 |     print("Build model with params:")
308 |     print("hidden_size: {}, dropout: {}, num_layers: {}".format(hidden_size,dropout,n_layers))
309 | 
310 |     #fit the model
311 |     input_size = 44 # num of variables
312 |     #hidden_size = 100 # same as inputsize
313 |     output_size = 1
314 |     #num_layers = 3 # num of GRU layers (only first layer is GRU-D)
315 |     seq_len = 1264 # max seq len based on data
316 |     model = grud_model(input_size = input_size, hidden_size= int(hidden_size), output_size=output_size, dropout=dropout, dropout_type='mloss', x_mean=x_mean, num_layers=int(n_layers))
317 |     
318 |   
319 |     use_cuda = torch.cuda.is_available()
320 |     if use_cuda :
321 |         torch.backends.cudnn.benchmark = True
322 |         model = model.cuda()
323 |         print("Running on GPU")
324 | 
325 |     criterion = torch.nn.BCELoss()
326 |     #untested
327 |     #logits=preds, targets=minibatch.O_dupe_onehot, pos_weight=self.class_imb)
328 |     #Get class imbalance (for weighted loss):
329 |     case_prev = data.labels_tr.sum()/float(len(data.labels_tr)) #get prevalence of cases in train dataset
330 |     class_imb = torch.tensor(1/case_prev) #class imbalance to use as class weight if losstype='weighted'
331 |     #TODO then we do not need to apply sigmoid to the model output in the model
332 |     criterion = torch.nn.BCEWithLogitsLoss(pos_weight=class_imb)
333 |      
334 |     
335 | 
336 |     if len(_run.observers) > 0:
337 |          checkpoint_path = os.path.join(_run.observers[0].dir, 'model_checkpoints')
338 |     else:
339 |         checkpoint_path = 'model_checkpoints'
340 | 
341 |     if not os.path.exists(checkpoint_path):
342 |         os.makedirs(checkpoint_path)
343 |     print(model) 
344 | 
345 | 
346 |     print('Begin training!')
347 |     AU_PRCs = fit(model, criterion, l2_penalty, learning_rate,\
348 |                    train_dataloader, val_dataloader, val_dataloader,\
349 |                    learning_rate_decay, n_epochs, checkpoint_path, _run)
350 | 
351 |     
352 |     print(AU_PRCs)
353 |     best_val = np.array(AU_PRCs).max()
354 | 
355 |     return {'Best Validation AUPRC': best_val}
356 | 
357 | 
358 | 
359 | 
360 | if __name__ == '__main__':
361 |     ex.run_commandline()


--------------------------------------------------------------------------------
/src/GRUD_layer.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import numpy as np
  3 | import pandas as pd
  4 | import os
  5 | import math
  6 | import warnings
  7 | import itertools
  8 | import numbers
  9 | import torch.utils.data as utils
 10 | 
 11 | #
 12 | #The convention for RNNs is that the feature dimension is last - adapt for that. also need to adapt input tensors in the run file.
 13 | #
 14 | 
 15 | class GRUD_cell(torch.nn.Module):
 16 |     """
 17 |     Implementation of GRUD.
 18 |     Inputs: x_mean
 19 |             n_smp x 3 x n_channels x len_seq tensor (0: data, 1: mask, 2: deltat)
 20 |     """
 21 |     def __init__(self, input_size, hidden_size, output_size, num_layers=1, x_mean=0,\
 22 |                  bias=True, batch_first=False, bidirectional=False, dropout_type='mloss', dropout=0, return_hidden = False):
 23 | 
 24 |         use_cuda = torch.cuda.is_available()
 25 |         device = torch.device("cuda:0" if use_cuda else "cpu")
 26 |         
 27 |         super(GRUD_cell, self).__init__()
 28 |         self.input_size = input_size
 29 |         self.hidden_size = hidden_size
 30 |         self.output_size = output_size
 31 |         self.num_layers = num_layers
 32 |         self.return_hidden = return_hidden #controls the output, True if another GRU-D layer follows
 33 | 
 34 | 
 35 |         x_mean = torch.tensor(x_mean, requires_grad = True)
 36 |         self.register_buffer('x_mean', x_mean)
 37 |         self.bias = bias
 38 |         self.batch_first = batch_first
 39 |         self.dropout_type = dropout_type
 40 |         self.dropout = dropout
 41 |         self.bidirectional = bidirectional
 42 |         num_directions = 2 if bidirectional else 1
 43 |         
 44 |         if not isinstance(dropout, numbers.Number) or not 0 <= dropout <= 1 or \
 45 |                 isinstance(dropout, bool):
 46 |             raise ValueError("dropout should be a number in range [0, 1] "
 47 |                              "representing the probability of an element being "
 48 |                              "zeroed")
 49 |         if dropout > 0 and num_layers == 1:
 50 |             warnings.warn("dropout option adds dropout after all but last "
 51 |                           "recurrent layer, so non-zero dropout expects "
 52 |                           "num_layers greater than 1, but got dropout={} and "
 53 |                           "num_layers={}".format(dropout, num_layers))
 54 |         
 55 |         
 56 | 
 57 |         #set up all the operations that are needed in the forward pass
 58 |         self.w_dg_x = torch.nn.Linear(input_size,input_size, bias=True)
 59 |         self.w_dg_h = torch.nn.Linear(input_size, hidden_size, bias = True)
 60 | 
 61 |         self.w_xz = torch.nn.Linear(input_size, hidden_size, bias=False)
 62 |         self.w_hz = torch.nn.Linear(hidden_size, hidden_size, bias=False)
 63 |         self.w_mz = torch.nn.Linear(input_size, hidden_size, bias=True)
 64 | 
 65 |         self.w_xr = torch.nn.Linear(input_size, hidden_size, bias=False)
 66 |         self.w_hr = torch.nn.Linear(hidden_size, hidden_size, bias=False)
 67 |         self.w_mr = torch.nn.Linear(input_size, hidden_size, bias=False)
 68 |         self.w_xh = torch.nn.Linear(input_size, hidden_size, bias=False)
 69 |         self.w_hh = torch.nn.Linear(hidden_size, hidden_size, bias=False)
 70 |         self.w_mh = torch.nn.Linear(input_size, hidden_size, bias=True)
 71 | 
 72 |         self.w_hy = torch.nn.Linear(hidden_size, output_size, bias=True)
 73 |         
 74 |     
 75 | 
 76 | 
 77 |         Hidden_State = torch.zeros(self.hidden_size, requires_grad = True)
 78 |         #we use buffers because pytorch will take care of pushing them to GPU for us
 79 |         self.register_buffer('Hidden_State', Hidden_State)
 80 |         self.register_buffer('X_last_obs', torch.zeros(input_size)) #torch.tensor(x_mean) #TODO: what to initialize last observed values with?, also check broadcasting behaviour
 81 | 
 82 |     
 83 |     #TODO: check usefulness of everything below here, just copied skeleton
 84 | 
 85 | 
 86 |         self.reset_parameters()
 87 |         
 88 |     
 89 | 
 90 | 
 91 |     def reset_parameters(self):
 92 |         stdv = 1.0 / math.sqrt(self.hidden_size)
 93 |         for weight in self.parameters():
 94 |             torch.nn.init.uniform_(weight, -stdv, stdv)
 95 | 
 96 |     def check_forward_args(self, input, hidden, batch_sizes):
 97 |         is_input_packed = batch_sizes is not None
 98 |         expected_input_dim = 2 if is_input_packed else 3
 99 |         if input.dim() != expected_input_dim:
100 |             raise RuntimeError(
101 |                 'input must have {} dimensions, got {}'.format(
102 |                     expected_input_dim, input.dim()))
103 |         if self.input_size != input.size(-1):
104 |             raise RuntimeError(
105 |                 'input.size(-1) must be equal to input_size. Expected {}, got {}'.format(
106 |                     self.input_size, input.size(-1)))
107 | 
108 |         if is_input_packed:
109 |             mini_batch = int(batch_sizes[0])
110 |         else:
111 |             mini_batch = input.size(0) if self.batch_first else input.size(1)
112 | 
113 |         num_directions = 2 if self.bidirectional else 1
114 |         expected_hidden_size = (self.num_layers * num_directions,
115 |                                 mini_batch, self.hidden_size)
116 |         
117 |         def check_hidden_size(hx, expected_hidden_size, msg='Expected hidden size {}, got {}'):
118 |             if tuple(hx.size()) != expected_hidden_size:
119 |                 raise RuntimeError(msg.format(expected_hidden_size, tuple(hx.size())))
120 | 
121 |         if self.mode == 'LSTM':
122 |             check_hidden_size(hidden[0], expected_hidden_size,
123 |                               'Expected hidden[0] size {}, got {}')
124 |             check_hidden_size(hidden[1], expected_hidden_size,
125 |                               'Expected hidden[1] size {}, got {}')
126 |         else:
127 |             check_hidden_size(hidden, expected_hidden_size)
128 |     
129 |     def extra_repr(self):
130 |         s = '{input_size}, {hidden_size}'
131 |         if self.num_layers != 1:
132 |             s += ', num_layers={num_layers}'
133 |         if self.bias is not True:
134 |             s += ', bias={bias}'
135 |         if self.batch_first is not False:
136 |             s += ', batch_first={batch_first}'
137 |         if self.dropout != 0:
138 |             s += ', dropout={dropout}'
139 |         if self.bidirectional is not False:
140 |             s += ', bidirectional={bidirectional}'
141 |         return s.format(**self.__dict__)
142 |     
143 |     
144 | 
145 | 
146 | 
147 |     @property
148 |     def _flat_weights(self):
149 |         return list(self._parameters.values())
150 | 
151 | 
152 |     def forward(self, input):
153 |         # input.size = (3, 33,49) : num_input or num_hidden, num_layer or step
154 |         #X = torch.squeeze(input[0]) # .size = (33,49)
155 |         #Mask = torch.squeeze(input[1]) # .size = (33,49)
156 |         #Delta = torch.squeeze(input[2]) # .size = (33,49)
157 |         X = input[:,0,:,:]
158 |         Mask = input[:,1,:,:]
159 |         Delta = input[:,2,:,:]
160 |         
161 | 
162 |         step_size = X.size(1) # 49
163 |         #print('step size : ', step_size)
164 |         
165 |         output = None
166 |         #h = Hidden_State
167 |         h = getattr(self, 'Hidden_State')
168 |         #felix - buffer system from newer pytorch version
169 |         x_mean = getattr(self, 'x_mean')
170 |         x_last_obsv = getattr(self, 'X_last_obs')
171 |         
172 | 
173 |         device = next(self.parameters()).device
174 |         output_tensor = torch.empty([X.size()[0], X.size()[2], self.output_size], dtype=X.dtype, device= device)
175 |         hidden_tensor = torch.empty(X.size()[0], X.size()[2], self.hidden_size, dtype=X.dtype, device = device)
176 | 
177 |         #iterate over seq
178 |         for timestep in range(X.size()[2]):
179 |             
180 |             #x = torch.squeeze(X[:,layer:layer+1])
181 |             #m = torch.squeeze(Mask[:,layer:layer+1])
182 |             #d = torch.squeeze(Delta[:,layer:layer+1])
183 |             x = torch.squeeze(X[:,:,timestep])
184 |             m = torch.squeeze(Mask[:,:,timestep])
185 |             d = torch.squeeze(Delta[:,:,timestep])
186 |             
187 | 
188 |             #(4)
189 |             gamma_x = torch.exp(-1* torch.nn.functional.relu( self.w_dg_x(d) ))
190 |             gamma_h = torch.exp(-1* torch.nn.functional.relu( self.w_dg_h(d) ))
191 | 
192 | 
193 |             #(5)
194 |             #standard mult handles case correctly, this should work - maybe broadcast x_mean, seems to be taking care of that anyway
195 |             
196 |             #update x_last_obsv
197 |             #print(x.size())
198 |             #print(x_last_obsv.size())
199 |             x_last_obsv = torch.where(m>0,x,x_last_obsv)
200 |             #print('after update')
201 |             #print(x_last_obsv)
202 |             x = m * x + (1 - m) * (gamma_x * x + (1 - gamma_x) * x_mean)
203 |             x = m * x + (1 - m) * (gamma_x * x_last_obsv + (1 - gamma_x) * x_mean)
204 | 
205 |             #(6)
206 |             if self.dropout == 0:
207 | 
208 |                 h = gamma_h*h
209 |                 z = torch.sigmoid( self.w_xz(x) + self.w_hz(h) + self.w_mz(m))
210 |                 r = torch.sigmoid( self.w_xr(x) + self.w_hr(h) + self.w_mr(m))
211 | 
212 |                 h_tilde = torch.tanh( self.w_xh(x) + self.w_hh( r*h ) + self.w_mh(m))
213 | 
214 | 
215 |                 h = (1 - z) * h + z * h_tilde
216 | 
217 |             #TODO: not adapted yet
218 |             elif self.dropout_type == 'Moon':
219 |                 '''
220 |                 RNNDROP: a novel dropout for rnn in asr(2015)
221 |                 '''
222 |                 h = gamma_h * h
223 | 
224 |                 z = torch.sigmoid((w_xz*x + w_hz*h + w_mz*m + b_z))
225 |                 r = torch.sigmoid((w_xr*x + w_hr*h + w_mr*m + b_r))
226 | 
227 |                 h_tilde = torch.tanh((w_xh*x + w_hh*(r * h) + w_mh*m + b_h))
228 | 
229 |                 h = (1 - z) * h + z * h_tilde
230 |                 dropout = torch.nn.Dropout(p=self.dropout)
231 |                 h = dropout(h)
232 | 
233 |             elif self.dropout_type == 'Gal':
234 |                 '''
235 |                 A Theoretically grounded application of dropout in recurrent neural networks(2015)
236 |                 '''
237 |                 dropout = torch.nn.Dropout(p=self.dropout)
238 |                 h = dropout(h)
239 | 
240 |                 h = gamma_h * h
241 | 
242 |                 z = torch.sigmoid((w_xz*x + w_hz*h + w_mz*m + b_z))
243 |                 r = torch.sigmoid((w_xr*x + w_hr*h + w_mr*m + b_r))
244 |                 h_tilde = torch.tanh((w_xh*x + w_hh*(r * h) + w_mh*m + b_h))
245 | 
246 |                 h = (1 - z) * h + z * h_tilde
247 | 
248 |             elif self.dropout_type == 'mloss':
249 |                 '''
250 |                 recurrent dropout without memory loss arXiv 1603.05118
251 |                 g = h_tilde, p = the probability to not drop a neuron
252 |                 '''
253 |                 h = gamma_h*h
254 |                 z = torch.sigmoid( self.w_xz(x) + self.w_hz(h) + self.w_mz(m))
255 |                 r = torch.sigmoid( self.w_xr(x) + self.w_hr(h) + self.w_mr(m))
256 | 
257 | 
258 |                 dropout = torch.nn.Dropout(p=self.dropout)
259 |                 h_tilde = torch.tanh( self.w_xh(x) + self.w_hh( r*h ) + self.w_mh(m))
260 | 
261 | 
262 |                 h = (1 - z) * h + z * h_tilde
263 |                 #######
264 | 
265 |             else:
266 |                 h = gamma_h * h
267 | 
268 |                 z = torch.sigmoid((w_xz*x + w_hz*h + w_mz*m + b_z))
269 |                 r = torch.sigmoid((w_xr*x + w_hr*h + w_mr*m + b_r))
270 |                 h_tilde = torch.tanh((w_xh*x + w_hh*(r * h) + w_mh*m + b_h))
271 | 
272 |                 h = (1 - z) * h + z * h_tilde
273 | 
274 |             
275 | 
276 |             step_output = self.w_hy(h)
277 |             step_output = torch.sigmoid(step_output)
278 |             output_tensor[:,timestep,:] = step_output
279 |             hidden_tensor[:,timestep,:] = h
280 |             
281 |         #if self.return_hidden:
282 |             #when i want to stack GRU-Ds, need to put the tensor back together
283 |             #output = torch.stack([hidden_tensor,Mask,Delta], dim=1)
284 |         
285 |         output = output_tensor, hidden_tensor
286 |         #else:
287 |         #    output = output_tensor
288 |         return output


--------------------------------------------------------------------------------
/src/GRUD_model.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import numpy as np
 3 | import pandas as pd
 4 | import os
 5 | import math
 6 | import warnings
 7 | import itertools
 8 | import numbers
 9 | import torch.utils.data as utils
10 | from GRUD_layer import GRUD_cell
11 | 
12 | 
13 | 
14 | def grud_model_old( input_size, hidden_size, output_size, num_layers=1, x_mean=0,\
15 |                 bias=True, batch_first=False, bidirectional=False, dropout_type='mloss', dropout=0):
16 | 
17 |     layer_list =[]
18 |     #intermediate layers return input size as output size
19 |     for i in range(num_layers-1): #subtract 1 bc last layer needs to be called with different params
20 |         layer = GRUD_cell(input_size = input_size, hidden_size= hidden_size, output_size=input_size, dropout=dropout, dropout_type=dropout_type, x_mean=x_mean, num_layers=num_layers, return_hidden = True)
21 |         layer_list.append(layer)
22 | 
23 |     #last layer with final output size
24 |     layer = GRUD_cell(input_size = input_size, hidden_size=hidden_size, output_size=output_size, dropout=dropout, dropout_type=dropout_type, x_mean=x_mean, num_layers=num_layers, return_hidden = False)
25 |     layer_list.append(layer)
26 | 
27 |     model = torch.nn.Sequential(*layer_list)
28 | 
29 |     return model
30 | 
31 | 
32 | #TODO: error on GPU is probably caused because hidden state of GRU is not initialized on GPU.
33 | class grud_model(torch.nn.Module):
34 |     def __init__(self,input_size, hidden_size, output_size, num_layers = 1, x_mean = 0,\
35 |      bias =True, batch_first = False, bidirectional = False, dropout_type ='mloss', dropout = 0):
36 |         super(grud_model, self).__init__()
37 | 
38 |         self.gru_d = GRUD_cell(input_size = input_size, hidden_size= hidden_size, output_size=output_size, 
39 |                 dropout=dropout, dropout_type=dropout_type, x_mean=x_mean)
40 |         self.hidden_to_output = torch.nn.Linear(hidden_size, output_size, bias=True)
41 |         self.num_layers = num_layers
42 |         self.hidden_size = hidden_size
43 | 
44 |         if self.num_layers >1:
45 |             #(batch, seq, feature)
46 |             self.gru_layers = torch.nn.GRU(input_size = hidden_size, hidden_size = hidden_size, batch_first = True, num_layers = self.num_layers -1, dropout=dropout)
47 | 
48 |     def initialize_hidden(self, batch_size):
49 |         device = next(self.parameters()).device
50 |         # The hidden state at the start are all zeros
51 |         return torch.zeros(self.num_layers-1, batch_size, self.hidden_size, device=device)
52 | 
53 |     def forward(self,input):
54 | 
55 |         #pass through GRU-D
56 |         output, hidden = self.gru_d(input)
57 |         #print(self.gru_d.return_hidden)
58 |         #output = self.gru_d(input)
59 |         #print(output.size())
60 | 
61 |         # batch_size, n_hidden, n_timesteps
62 | 
63 |         if self.num_layers >1:
64 |             #TODO remove init hidden, not necessary, auto init works fine
65 |             init_hidden = self.initialize_hidden(hidden.size()[0])
66 |             
67 | 
68 |             output, hidden = self.gru_layers(hidden)#, init_hidden)
69 |   
70 | 
71 |             output = self.hidden_to_output(output)
72 |             output = torch.sigmoid(output)
73 | 
74 |         #print("final output size passed as model result")
75 |         #print(output.size())
76 |         return output
77 | 
78 | 


--------------------------------------------------------------------------------