├── README.md ├── initial.py ├── load_data.py ├── main.py ├── models ├── CNN.py ├── LSTM.py ├── LSTM_Attn.py ├── RCNN.py ├── RNN.py ├── __pycache__ │ ├── LSTM.cpython-36.pyc │ └── LSTM.cpython-37.pyc └── selfAttention.py └── run history.txt /README.md: -------------------------------------------------------------------------------- 1 | # Text-Classification via RL Pytorch 2 | ## Description 3 | This repository contains the Pytorch implmentation of AAAI 2018 Paper "Learning Structured Representation for Text Classification via Reinforcement Learning". 4 | 5 | 6 | ## Requirements 7 | * Python==3.6.6 8 | * PyTorch==0.4.0 9 | * torchtext==0.2.3 10 | 11 | ## Downloads and Setup 12 | Once you clone this repo, run the main.py file to process the dataset and to train the model. 13 | ```shell 14 | $ python main.py 15 | ``` 16 | 17 | ## References 18 | * Learning Structured Representation for Text Classification via Reinforcement Learning : [Paper][1] 19 | 20 | 21 | [1]:https://www.microsoft.com/en-us/research/wp-content/uploads/2017/11/zhang.pdf 22 | 23 | -------------------------------------------------------------------------------- /initial.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import load_data 4 | import torch 5 | import torch.nn.functional as F 6 | from torch.autograd import Variable 7 | import torch.optim as optim 8 | import numpy as np 9 | from models.LSTM import LSTMClassifier 10 | 11 | TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter = load_data.load_dataset() 12 | 13 | def clip_gradient(model, clip_value): 14 | params = list(filter(lambda p: p.grad is not None, model.parameters())) 15 | for p in params: 16 | p.grad.data.clamp_(-clip_value, clip_value) 17 | 18 | def train_model(model, train_iter, epoch): 19 | total_epoch_loss = 0 20 | total_epoch_acc = 0 21 | model.cuda() 22 | optim = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters())) 23 | steps = 0 24 | model.train() 25 | for idx, batch in enumerate(train_iter): 26 | text = batch.text[0] 27 | target = batch.label 28 | target = torch.autograd.Variable(target).long() 29 | if torch.cuda.is_available(): 30 | text = text.cuda() 31 | target = target.cuda() 32 | if (text.size()[0] is not 32):# One of the batch returned by BucketIterator has length different than 32. 
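# NOTE (added comment): `is not 32` compares object identity, not value. It happens to work here
# because CPython interns small integers, but the intended check is `text.size(0) != 32` (skip the
# single partial batch that BucketIterator may return); Python 3.8+ even emits a SyntaxWarning for
# literal `is` comparisons like this one.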
33 | continue 34 | optim.zero_grad() 35 | prediction = model(text) 36 | loss = loss_fn(prediction, target) 37 | num_corrects = (torch.max(prediction, 1)[1].view(target.size()).data == target.data).float().sum() 38 | acc = 100.0 * num_corrects/len(batch) 39 | loss.backward() 40 | clip_gradient(model, 1e-1) 41 | optim.step() 42 | steps += 1 43 | 44 | if steps % 100 == 0: 45 | print (f'Epoch: {epoch+1}, Idx: {idx+1}, Training Loss: {loss.item():.4f}, Training Accuracy: {acc.item(): .2f}%') 46 | 47 | total_epoch_loss += loss.item() 48 | total_epoch_acc += acc.item() 49 | 50 | return total_epoch_loss/len(train_iter), total_epoch_acc/len(train_iter) 51 | 52 | def eval_model(model, val_iter): 53 | total_epoch_loss = 0 54 | total_epoch_acc = 0 55 | model.eval() 56 | with torch.no_grad(): 57 | for idx, batch in enumerate(val_iter): 58 | text = batch.text[0] 59 | if (text.size()[0] is not 32): 60 | continue 61 | target = batch.label 62 | target = torch.autograd.Variable(target).long() 63 | if torch.cuda.is_available(): 64 | text = text.cuda() 65 | target = target.cuda() 66 | prediction = model(text) 67 | loss = loss_fn(prediction, target) 68 | num_corrects = (torch.max(prediction, 1)[1].view(target.size()).data == target.data).sum() 69 | acc = 100.0 * num_corrects/len(batch) 70 | total_epoch_loss += loss.item() 71 | total_epoch_acc += acc.item() 72 | 73 | return total_epoch_loss/len(val_iter), total_epoch_acc/len(val_iter) 74 | 75 | 76 | learning_rate = 2e-5 77 | batch_size = 32 78 | output_size = 2 79 | hidden_size = 256 80 | embedding_length = 300 81 | 82 | model = LSTMClassifier(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings) 83 | loss_fn = F.cross_entropy 84 | 85 | for epoch in range(10): 86 | train_loss, train_acc = train_model(model, train_iter, epoch) 87 | val_loss, val_acc = eval_model(model, valid_iter) 88 | 89 | print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f}%, Val. Loss: {val_loss:3f}, Val. Acc: {val_acc:.2f}%') 90 | 91 | test_loss, test_acc = eval_model(model, test_iter) 92 | print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.2f}%') 93 | 94 | ''' Let us now predict the sentiment on a single sentence just for the testing purpose. ''' 95 | test_sen1 = "This is one of the best creation of Nolan. I can say, it's his magnum opus. Loved the soundtrack and especially those creative dialogues." 96 | test_sen2 = "Ohh, such a ridiculous movie. Not gonna recommend it to anyone. Complete waste of time and money." 
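# A minimal, reusable sketch of the single-sentence inference performed below.
# `predict_sentiment` is a hypothetical helper (not part of the original script); it assumes the
# `TEXT` field and trained `model` defined above and mirrors the same
# preprocess -> numericalize -> forward -> softmax steps.
def predict_sentiment(model, TEXT, sentence):
    model.eval()
    tokens = TEXT.preprocess(sentence)                    # tokenize and lowercase
    indices = [[TEXT.vocab.stoi[tok] for tok in tokens]]  # map tokens to vocabulary ids
    tensor = torch.LongTensor(indices).cuda()             # shape: (1, seq_len)
    with torch.no_grad():
        logits = model(tensor, 1)                         # forward pass with batch_size = 1
        probs = F.softmax(logits, dim=1)
    return "Positive" if torch.argmax(probs[0]) == 1 else "Negative"

# Hypothetical usage: print(predict_sentiment(model, TEXT, test_sen1))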
97 | 98 | test_sen1 = TEXT.preprocess(test_sen1) 99 | test_sen1 = [[TEXT.vocab.stoi[x] for x in test_sen1]] 100 | 101 | test_sen2 = TEXT.preprocess(test_sen2) 102 | test_sen2 = [[TEXT.vocab.stoi[x] for x in test_sen2]] 103 | 104 | test_sen = np.asarray(test_sen1) 105 | test_sen = torch.LongTensor(test_sen) 106 | test_tensor = Variable(test_sen, volatile=True) 107 | test_tensor = test_tensor.cuda() 108 | model.eval() 109 | output = model(test_tensor, 1) 110 | out = F.softmax(output, 1) 111 | if (torch.argmax(out[0]) == 1): 112 | print ("Sentiment: Positive") 113 | else: 114 | print ("Sentiment: Negative") 115 | -------------------------------------------------------------------------------- /load_data.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | import os 4 | import sys 5 | import torch 6 | from torch.nn import functional as F 7 | import numpy as np 8 | from torchtext import data 9 | from torchtext import datasets 10 | #from torchtext.vocab import Vectors, GloVe 11 | 12 | def load_dataset(test_sen=None, batch_size=32): 13 | 14 | """ 15 | tokenizer : Breaks sentences into a list of words. If sequential=False, no tokenization is applied 16 | Field : A class that stores information about the way of preprocessing 17 | fix_length : An important property of TorchText is that we can let the input to be variable length, and TorchText will 18 | dynamically pad each sequence to the longest sequence in that "batch". But here we are using fi_length which 19 | will pad each sequence to have a fix length of 200. 20 | 21 | build_vocab : It will first make a vocabulary or dictionary mapping all the unique words present in the train_data to an 22 | idx and then after it will use GloVe word embedding to map the index to the corresponding word embedding. 23 | 24 | vocab.vectors : This returns a torch tensor of shape (vocab_size x embedding_dim) containing the pre-trained word embeddings. 25 | BucketIterator : Defines an iterator that batches examples of similar lengths together to minimize the amount of padding needed. 
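include_lengths : Since include_lengths=True is passed to the TEXT Field, each batch exposes batch.text as a (padded_tensor, lengths) tuple rather than a bare tensor, which is why the training and evaluation code reads batch.text[0] for the token ids and batch.text[1] for the true sequence lengths.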
26 | 27 | """ 28 | 29 | tokenize = lambda x: x.split() 30 | TEXT = data.Field(sequential=True, tokenize=tokenize, lower=True, include_lengths=True, batch_first=True, fix_length=200) 31 | LABEL = data.LabelField() 32 | train_data, test_data = datasets.IMDB.splits(TEXT, LABEL) 33 | TEXT.build_vocab(train_data, vectors="glove.6B.300d") 34 | LABEL.build_vocab(train_data) 35 | 36 | word_embeddings = TEXT.vocab.vectors 37 | print ("Length of Text Vocabulary: " + str(len(TEXT.vocab))) 38 | print ("Vector size of Text Vocabulary: ", TEXT.vocab.vectors.size()) 39 | print ("Label Length: " + str(len(LABEL.vocab))) 40 | 41 | train_data, valid_data = train_data.split() # Further splitting of training_data to create new training_data & validation_data 42 | train_iter, valid_iter, test_iter = data.BucketIterator.splits((train_data, valid_data, test_data), batch_size=batch_size, sort_key=lambda x: len(x.text), repeat=False, shuffle=True) 43 | 44 | '''Alternatively we can also use the default configurations''' 45 | # train_iter, test_iter = datasets.IMDB.iters(batch_size=32) 46 | 47 | vocab_size = len(TEXT.vocab) 48 | 49 | return TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter 50 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import load_data 4 | import torch 5 | import torch.nn.functional as F 6 | from torch.autograd import Variable 7 | import torch.optim as optim 8 | import numpy as np 9 | from models.LSTM import LSTMClassifier 10 | import torch.nn as nn 11 | from copy import deepcopy 12 | import random 13 | from tqdm import tqdm 14 | 15 | learning_rate = 2e-5 16 | batch_size = 5 17 | global_batch_size = 5 18 | output_size = 2 19 | hidden_size = 300 20 | embedding_length = 300 21 | samplecnt = 5 22 | epsilon = 0.05 23 | maxlength = 200 24 | alpha = 0.1 25 | tau = 0.1 26 | delay_critic = True 27 | 28 | TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter = load_data.load_dataset(batch_size=batch_size) 29 | 30 | def clip_gradient(model, clip_value): 31 | params = list(filter(lambda p: p.grad is not None, model.parameters())) 32 | for p in params: 33 | p.grad.data.clamp_(-clip_value, clip_value) 34 | 35 | def Sampling_RL(actor, critic, inputs, vector, length, epsilon, Random = True): 36 | current_lower_state = torch.zeros(1,2*hidden_size).cuda() 37 | actions = [] 38 | states = [] 39 | for pos in range(length): 40 | predicted = actor.get_target_output(current_lower_state, vector[0][pos], scope = "target") 41 | states.append([current_lower_state, vector[0][pos]]) 42 | if Random: 43 | if random.random() > epsilon: 44 | action = (0 if random.random() < float(predicted[0].item()) else 1) 45 | else: 46 | action = (1 if random.random() < float(predicted[0].item()) else 0) 47 | else: 48 | action = np.argmax(predicted).item() 49 | actions.append(action) 50 | if action == 1: 51 | out_d, current_lower_state = critic.forward_lstm(current_lower_state, inputs[0][pos], scope = "target") 52 | Rinput = [] 53 | for (i, a) in enumerate(actions): 54 | if a == 1: 55 | Rinput.append(int(inputs[0][i].item())) #### 56 | Rlength = len(Rinput) 57 | #print("problem") 58 | if Rlength == 0: 59 | actions[length-2] = 1 60 | Rinput.append(inputs[0][length-2]) 61 | Rlength = 1 62 | Rinput += [1] * (maxlength - Rlength) 63 | 64 | Rinput = torch.tensor(Rinput).view(1,-1).cuda() 65 | 66 | return actions, states, Rinput, Rlength 67 | 68 | 69 | 
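# How Sampling_RL is used (descriptive note): for every position in the sentence the actor's policy
# net emits [P(drop), P(retain)] given the critic's current LSTM state and that word's embedding.
# With probability 1-epsilon the action is sampled from this distribution, with probability epsilon
# from the flipped one (exploration); Random=False instead takes the argmax (used at evaluation
# time). Retained tokens (action == 1) advance the critic's LSTM state, and the retained
# sub-sequence Rinput is padded with index 1 (the <pad> token under torchtext's default specials)
# up to maxlength before being fed to the critic as an ordinary classification input.
#
# Sketch of the call as it appears further below in train_model / eval_model_RL (hypothetical
# per-sentence tensors):
#   x = text[i].view(1, -1)               # (1, maxlength) token ids of one sentence
#   emb = criticModel.wordvector_find(x)  # (1, maxlength, embedding_length) embeddings
#   actions, states, Rinput, Rlength = Sampling_RL(
#       actorModel, criticModel, x, emb, int(lengths[i]), epsilon, Random=True)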
class policyNet(nn.Module): 70 | def __init__(self): 71 | super(policyNet, self).__init__() 72 | self.hidden = hidden_size 73 | self.W1 = nn.Parameter(torch.cuda.FloatTensor(2*self.hidden, 1).uniform_(-0.5, 0.5)) 74 | self.W2 = nn.Parameter(torch.cuda.FloatTensor(embedding_length, 1).uniform_(-0.5, 0.5)) 75 | self.b = nn.Parameter(torch.cuda.FloatTensor(1, 1).uniform_(-0.5, 0.5)) 76 | 77 | def forward(self, h, x): 78 | h_ = torch.matmul(h.view(1,-1), self.W1) # 1x1 79 | x_ = torch.matmul(x.view(1,-1), self.W2) # 1x1 80 | scaled_out = torch.sigmoid(h_ + x_ + self.b) # 1x1 81 | scaled_out = torch.clamp(scaled_out, min=1e-5, max=1 - 1e-5) 82 | scaled_out = torch.cat([1.0 - scaled_out, scaled_out],0) 83 | return scaled_out 84 | 85 | 86 | 87 | class critic(nn.Module): 88 | def __init__(self): 89 | super(critic, self).__init__() 90 | self.target_pred = LSTMClassifier(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings) 91 | self.active_pred = LSTMClassifier(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings) 92 | 93 | 94 | def forward(self, x, scope): 95 | if scope == "target": 96 | out = self.target_pred(x) 97 | if scope == "active": 98 | out = self.active_pred(x) 99 | return out 100 | 101 | def assign_target_network(self): 102 | params = [] 103 | for name, x in self.active_pred.named_parameters(): 104 | params.append(x) 105 | i=0 106 | for name, x in self.target_pred.named_parameters(): 107 | x.data = deepcopy(params[i].data) 108 | i+=1 109 | 110 | def update_target_network(self): 111 | params = [] 112 | for name, x in self.active_pred.named_parameters(): 113 | params.append(x) 114 | i=0 115 | for name, x in self.target_pred.named_parameters(): 116 | x.data = deepcopy(params[i].data * (tau) + x.data * (1-tau)) 117 | i+=1 118 | 119 | def assign_active_network(self): 120 | params = [] 121 | for name, x in self.target_pred.named_parameters(): 122 | params.append(x) 123 | i=0 124 | for name, x in self.active_pred.named_parameters(): 125 | x.data = deepcopy(params[i].data) 126 | i+=1 127 | 128 | def assign_active_network_gradients(self): 129 | params = [] 130 | for name, x in self.target_pred.named_parameters(): 131 | params.append(x) 132 | i=0 133 | for name, x in self.active_pred.named_parameters(): 134 | x.grad = deepcopy(params[i].grad) 135 | i+=1 136 | for name, x in self.target_pred.named_parameters(): 137 | x.grad = None 138 | 139 | def forward_lstm(self, hc, x, scope): 140 | if scope == "target": 141 | out, state = self.target_pred.getNextHiddenState(hc, x) 142 | if scope == "active": 143 | out, state = self.active_pred.getNextHiddenState(hc, x) 144 | return out, state 145 | 146 | def wordvector_find(self, x): 147 | return self.target_pred.wordvector_find(x) 148 | 149 | 150 | class actor(nn.Module): 151 | def __init__(self): 152 | super(actor, self).__init__() 153 | self.target_policy = policyNet() 154 | self.active_policy = policyNet() 155 | 156 | def get_target_logOutput(self, h, x): 157 | out = self.target_policy(h, x) 158 | logOut = torch.log(out) 159 | return logOut 160 | 161 | def get_target_output(self, h, x, scope): 162 | if scope == "target": 163 | out = self.target_policy(h, x) 164 | if scope == "active": 165 | out = self.active_policy(h, x) 166 | return out 167 | 168 | def get_gradient(self, h, x, reward, scope): 169 | if scope == "target": 170 | out = self.target_policy(h, x) 171 | logout = torch.log(out).view(-1) 172 | index = reward.index(0) 173 | index = (index + 1) % 2 174 | #print(out, reward, index, 
logout[index].view(-1), logout) 175 | #print(logout[index].view(-1)) 176 | grad = torch.autograd.grad(logout[index].view(-1), self.target_policy.parameters()) # torch.cuda.FloatTensor(reward[index]) 177 | #print(grad[0].size(), grad[1].size(), grad[2].size()) 178 | #print(grad[0], grad[1], grad[2]) 179 | grad[0].data = grad[0].data * reward[index] 180 | grad[1].data = grad[1].data * reward[index] 181 | grad[2].data = grad[2].data * reward[index] 182 | #print(grad[0], grad[1], grad[2]) 183 | return grad 184 | if scope == "active": 185 | out = self.active_policy(h, x) 186 | return out 187 | def assign_active_network_gradients(self, grad1, grad2, grad3): 188 | params = [grad1, grad2, grad3] 189 | i=0 190 | for name, x in self.active_policy.named_parameters(): 191 | x.grad = deepcopy(params[i]) 192 | i+=1 193 | 194 | def update_target_network(self): 195 | params = [] 196 | for name, x in self.active_policy.named_parameters(): 197 | params.append(x) 198 | i=0 199 | for name, x in self.target_policy.named_parameters(): 200 | x.data = deepcopy(params[i].data * (tau) + x.data * (1-tau)) 201 | i+=1 202 | 203 | def assign_active_network(self): 204 | params = [] 205 | for name, x in self.target_policy.named_parameters(): 206 | params.append(x) 207 | i=0 208 | for name, x in self.active_policy.named_parameters(): 209 | x.data = deepcopy(params[i].data) 210 | i+=1 211 | 212 | def train_model(criticModel, actorModel, train_iter, epoch, RL_train = True, LSTM_train = True): 213 | total_epoch_loss = 0 214 | total_epoch_acc = 0 215 | criticModel.cuda() 216 | actorModel.cuda() 217 | critic_target_optimizer = torch.optim.Adam(criticModel.target_pred.parameters()) 218 | critic_active_optimizer = torch.optim.Adam(criticModel.active_pred.parameters()) 219 | 220 | actor_target_optimizer = torch.optim.Adam(actorModel.target_policy.parameters()) 221 | actor_active_optimizer = torch.optim.Adam(actorModel.active_policy.parameters()) 222 | steps = 0 223 | for idx, batch in enumerate(train_iter): 224 | if idx % 100 == 0: 225 | print(idx , "/", len(train_iter)) 226 | totloss = 0. 227 | text = batch.text[0] 228 | target = batch.label 229 | lengths = batch.text[1] 230 | target = torch.autograd.Variable(target).long() 231 | pred = torch.zeros(batch_size, 2).cuda() 232 | if torch.cuda.is_available(): 233 | text = text.cuda() 234 | target = target.cuda() 235 | if (text.size()[0] is not batch_size):# One of the batch returned by BucketIterator has length different than 32. 236 | continue 237 | #if steps % 50 == 0: 238 | #print(actorModel.target_policy.b.data, actorModel.active_policy.b.data) 239 | criticModel.assign_active_network() 240 | actorModel.assign_active_network() 241 | #if steps % 50 == 0: 242 | #print(actorModel.target_policy.b.data, actorModel.active_policy.b.data) 243 | #print(actorModel.target_policy.W1, actorModel.active_policy.W1, "\n\n", criticModel.target_pred.label.bias, criticModel.active_pred.label.bias) 244 | avgloss = 0 245 | aveloss = 0. 246 | for i in range(batch_size): 247 | x = text[i].view(1,-1) 248 | y = target[i].view(1) 249 | length = int(lengths[i]) 250 | if RL_train: 251 | #print("RL True") 252 | criticModel.train(False) 253 | actorModel.train() 254 | actionlist, statelist, losslist = [], [], [] 255 | aveLoss = 0. 
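# (descriptive note) The loop below draws samplecnt rollouts of the actor's retain/drop policy for
# the current sentence. Each rollout is scored by the critic's loss on the retained sub-sequence
# plus a length penalty 0.15 * (Rlength / length)**2 that discourages keeping too many words.
# Further down, the REINFORCE-style update scales grad(log pi(chosen action)) at every position by
# alpha * (rollout_loss - average_loss); stepping the optimizer on these accumulated gradients
# lowers the probability of action sequences that scored worse than the sample average, and the
# result is applied to the active policy before the target policy is soft-updated with rate tau.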
256 | for i in range(samplecnt): 257 | actions, states, Rinput, Rlength = Sampling_RL(actorModel, criticModel, x, criticModel.wordvector_find(x), length, epsilon, Random=True) 258 | ''' 259 | if (steps) % 50 == 0: 260 | criticModel.eval() 261 | actorModel.eval() 262 | act, _, _, _ = Sampling_RL(actorModel, criticModel, x, criticModel.wordvector_find(x), length, epsilon, Random=False) 263 | print(act, "\n\n") 264 | criticModel.train() 265 | actorModel.train() 266 | ''' 267 | actionlist.append(actions) 268 | statelist.append(states) 269 | out = criticModel(Rinput, scope = "target") 270 | loss_ = loss_fn(out, y) 271 | loss_ += (float(Rlength) / length) **2 *0.15 272 | aveloss += loss_ 273 | losslist.append(loss_) 274 | ''' 275 | if (steps) % 50 == 0: 276 | print("-------------------------------------------") 277 | ''' 278 | aveloss /= samplecnt 279 | totloss += aveloss 280 | grad1 = None 281 | grad2 = None 282 | grad3 = None 283 | flag = 0 284 | if LSTM_train: 285 | #print("RL and LSTM True") 286 | criticModel.train() 287 | actorModel.train() 288 | critic_active_optimizer.zero_grad() 289 | critic_target_optimizer.zero_grad() 290 | prediction = criticModel(Rinput, scope = "target") 291 | pred[i] = prediction 292 | loss = loss_fn(prediction, y) 293 | loss.backward() 294 | #print(criticModel.active_pred.label.bias.grad, criticModel.target_pred.label.bias.grad) 295 | #print(criticModel.active_pred.label.bias, criticModel.target_pred.label.bias) 296 | criticModel.assign_active_network_gradients() 297 | #print(criticModel.active_pred.label.bias.grad, criticModel.target_pred.label.bias.grad) 298 | critic_active_optimizer.step() 299 | #print(criticModel.active_pred.label.bias, criticModel.target_pred.label.bias) 300 | for i in range(samplecnt): 301 | for pos in range(len(actionlist[i])): 302 | rr = [0, 0] 303 | rr[actionlist[i][pos]] = ((losslist[i] - aveloss) * alpha).cpu().item() 304 | g = actorModel.get_gradient(statelist[i][pos][0], statelist[i][pos][1], rr, scope = "target") 305 | if flag == 0: 306 | grad1 = g[0] 307 | grad2 = g[1] 308 | grad3 = g[2] 309 | flag = 1 310 | else: 311 | grad1 += g[0] 312 | grad2 += g[1] 313 | grad3 += g[2] 314 | #print("++", grad3) 315 | #print("\n\n before: active: ", actorModel.active_policy.b, "target: ", actorModel.target_policy.b, "gradient to be applied: ", grad3) 316 | actor_target_optimizer.zero_grad() 317 | actor_active_optimizer.zero_grad() 318 | #print("previous grad: ", actorModel.active_policy.b.grad) 319 | actorModel.assign_active_network_gradients(grad1, grad2, grad3) 320 | actor_active_optimizer.step() 321 | #print("after: active: ", actorModel.active_policy.b, "target: ", actorModel.target_policy.b) 322 | else: 323 | #print("RL False LSTM True") 324 | criticModel.train() 325 | actorModel.train(False) 326 | critic_active_optimizer.zero_grad() 327 | critic_target_optimizer.zero_grad() 328 | prediction = criticModel(x, scope = "target") 329 | pred[i] = prediction 330 | loss = loss_fn(prediction, y) 331 | avgloss += loss.item() 332 | loss.backward() 333 | criticModel.assign_active_network_gradients() 334 | critic_active_optimizer.step() 335 | 336 | if RL_train: 337 | #print("Again RL True") 338 | criticModel.train(False) 339 | actorModel.train() 340 | #print(actorModel.target_policy.b.data, actorModel.active_policy.b.data) 341 | actorModel.update_target_network() 342 | #print(actorModel.target_policy.b.data, actorModel.active_policy.b.data) 343 | if LSTM_train: 344 | #print("Again RL AND LSTM True") 345 | criticModel.train() 346 | actorModel.train() 347 
| #print(criticModel.active_pred.label.bias, criticModel.target_pred.label.bias) 348 | criticModel.update_target_network() 349 | #print(criticModel.active_pred.label.bias, criticModel.target_pred.label.bias) 350 | 351 | else: 352 | #print("Again RL False and LSTM True") 353 | criticModel.train() 354 | actorModel.train(False) 355 | criticModel.assign_target_network() 356 | avgloss /= batch_size 357 | num_corrects = (torch.max(pred, 1)[1].view(target.size()).data == target.data).float().sum() 358 | acc = 100.0 * num_corrects/len(batch) 359 | steps += 1 360 | 361 | #if steps % 50 == 0: 362 | #print (f'Epoch: {epoch+1}, Idx: {idx+1}, Training Loss: {avgloss:.4f}, Training Accuracy: {acc.item(): .2f}%') 363 | #print(actorModel.target_policy.b.data, actorModel.active_policy.b.data) 364 | 365 | total_epoch_loss += avgloss 366 | total_epoch_acc += acc.item() 367 | 368 | return total_epoch_loss/len(train_iter), total_epoch_acc/len(train_iter) 369 | 370 | 371 | def train_model_without_delay(model, train_iter, epoch): 372 | total_epoch_loss = 0 373 | total_epoch_acc = 0 374 | model.cuda() 375 | optim = torch.optim.Adam(filter(lambda p: p.requires_grad, model.target_pred.parameters())) 376 | steps = 0 377 | model.train() 378 | for idx, batch in enumerate(train_iter): 379 | text = batch.text[0] 380 | target = batch.label 381 | target = torch.autograd.Variable(target).long() 382 | if torch.cuda.is_available(): 383 | text = text.cuda() 384 | target = target.cuda() 385 | if (text.size()[0] is not 32):# One of the batch returned by BucketIterator has length different than 32. 386 | continue 387 | optim.zero_grad() 388 | prediction = model(text, scope = "target") 389 | loss = loss_fn(prediction, target) 390 | num_corrects = (torch.max(prediction, 1)[1].view(target.size()).data == target.data).float().sum() 391 | acc = 100.0 * num_corrects/len(batch) 392 | loss.backward() 393 | clip_gradient(model, 1e-1) 394 | optim.step() 395 | steps += 1 396 | 397 | if steps % 100 == 0: 398 | print (f'Epoch: {epoch+1}, Idx: {idx+1}, Training Loss: {loss.item():.4f}, Training Accuracy: {acc.item(): .2f}%') 399 | 400 | total_epoch_loss += loss.item() 401 | total_epoch_acc += acc.item() 402 | 403 | return total_epoch_loss/len(train_iter), total_epoch_acc/len(train_iter) 404 | 405 | def eval_model(model, val_iter): 406 | total_epoch_loss = 0 407 | total_epoch_acc = 0 408 | model.eval() 409 | with torch.no_grad(): 410 | for idx, batch in enumerate(val_iter): 411 | text = batch.text[0] 412 | if (text.size()[0] is not batch_size): 413 | continue 414 | target = batch.label 415 | target = torch.autograd.Variable(target).long() 416 | if torch.cuda.is_available(): 417 | text = text.cuda() 418 | target = target.cuda() 419 | prediction = model(text, scope = "target") 420 | loss = loss_fn(prediction, target) 421 | num_corrects = (torch.max(prediction, 1)[1].view(target.size()).data == target.data).sum() 422 | acc = 100.0 * num_corrects/len(batch) 423 | total_epoch_loss += loss.item() 424 | total_epoch_acc += acc.item() 425 | 426 | return total_epoch_loss/len(val_iter), total_epoch_acc/len(val_iter) 427 | 428 | def eval_model_RL(criticModel, actorModel, val_iter): 429 | total_epoch_loss = 0 430 | total_epoch_acc = 0 431 | criticModel.eval() 432 | actorModel.eval() 433 | with torch.no_grad(): 434 | for idx, batch in enumerate(val_iter): 435 | if idx % 100 == 0: 436 | print(idx, "/", len(val_iter)) 437 | text = batch.text[0] 438 | if (text.size()[0] is not batch_size): 439 | continue 440 | target = batch.label 441 | lengths = batch.text[1] 
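# (descriptive note) Evaluation runs the actor greedily: the Sampling_RL call below passes
# Random=False, so each token is kept or dropped by the argmax of the policy output (no
# exploration) and the retained sub-sequence is classified by the critic's target network.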
442 | target = torch.autograd.Variable(target).long() 443 | if torch.cuda.is_available(): 444 | text = text.cuda() 445 | target = target.cuda() 446 | batch_loss = 0 447 | pred = torch.zeros(batch_size, 2).cuda() 448 | for i in range(batch_size): 449 | x = text[i].view(1,-1) 450 | y = target[i].view(1) 451 | length = int(lengths[i]) 452 | 453 | actions, states, Rinput, Rlenth = Sampling_RL(actorModel, criticModel, x, criticModel.wordvector_find(x), length, epsilon, Random=False) 454 | #print(x, Rinput, length, Rlenth) 455 | #if (i % 50) == 0: 456 | #print(actions) 457 | prediction = criticModel(Rinput, scope = "target") 458 | loss = loss_fn(prediction, y) 459 | batch_loss += loss 460 | pred[i] = prediction 461 | num_corrects = (torch.max(pred, 1)[1].view(target.size()).data == target.data).sum() 462 | acc = 100.0 * num_corrects/len(batch) 463 | total_epoch_loss += batch_loss.item() 464 | total_epoch_acc += acc.item() 465 | 466 | return total_epoch_loss/len(val_iter), total_epoch_acc/len(val_iter) 467 | 468 | criticModel = critic() 469 | actorModel = actor() 470 | 471 | actorModel.cuda() 472 | criticModel.cuda() 473 | 474 | loss_fn = F.cross_entropy 475 | best_val_acc = 0. 476 | 477 | criticModel.load_state_dict(torch.load('savedModels/critic_with_delay.pt')) 478 | _, best_val_acc = eval_model(criticModel, valid_iter) 479 | print(best_val_acc) 480 | _, best_val_acc = eval_model(criticModel, train_iter) 481 | print(best_val_acc) 482 | 483 | if delay_critic: 484 | for epoch in range(0): 485 | print("Pre-training Critic...") 486 | train_loss, train_acc = train_model(criticModel, actorModel, train_iter, epoch, RL_train = False) 487 | val_loss, val_acc = eval_model(criticModel, valid_iter) 488 | if val_acc > best_val_acc: 489 | torch.save(criticModel.state_dict(), 'savedModels/critic_with_delay.pt') 490 | best_val_acc = val_acc 491 | print("saved Model with acc: ", val_acc) 492 | print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f}%, Val. Loss: {val_loss:3f}, Val. Acc: {val_acc:.2f}%') 493 | else: 494 | for epoch in range(0): 495 | train_loss, train_acc = train_model_without_delay(criticModel, train_iter, epoch) 496 | val_loss, val_acc = eval_model(criticModel, valid_iter) 497 | if val_acc > best_val_acc: 498 | torch.save(criticModel.state_dict(), 'savedModels/critic_without_delay.pt') 499 | best_val_acc = val_acc 500 | print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f}%, Val. Loss: {val_loss:3f}, Val. Acc: {val_acc:.2f}%') 501 | 502 | #val_loss, val_acc = eval_model(criticModel, valid_iter) 503 | #test_loss, test_acc = eval_model(criticModel, test_iter) 504 | #train_loss, train_acc = eval_model(criticModel, train_iter) 505 | epoch = 0 506 | #print("LSTM Pretraining Done: ") 507 | #print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:3f}, Train Acc: {train_acc:.2f}, Val. Loss: {val_loss:3f}, Val. Acc: {val_acc:.2f}, Test. Loss: {test_loss:3f}, Test. Acc: {test_acc:.2f}%') 508 | 509 | ''' 510 | val_loss, val_acc = eval_model_RL(criticModel, actorModel, valid_iter) 511 | print(val_loss, val_acc) 512 | asaas 513 | ''' 514 | 515 | ''' 516 | val_loss, val_acc = eval_model_RL(criticModel, actorModel, valid_iter) 517 | print(f'Epoch: {epoch+1:02}, Val. Loss: {val_loss:3f}, Val. 
Acc: {val_acc:.2f}%') 518 | asas 519 | ''' 520 | ''' 521 | actorModel.load_state_dict(torch.load('savedModels/actor_with_delay.pt')) 522 | print("Model loaded after epoch 10") 523 | print("Starting Reinforcement....") 524 | _, best_val_acc2 = eval_model_RL(criticModel, actorModel, valid_iter) 525 | print(best_val_acc2) 526 | _, best_val_acc2 = eval_model_RL(criticModel, actorModel, train_iter) 527 | print(best_val_acc2) 528 | ''' 529 | 530 | best_val_acc1 = 810.5 531 | for epoch in range(0): 532 | train_loss, train_acc = train_model(criticModel, actorModel, train_iter, epoch, LSTM_train = False) 533 | val_loss, val_acc = eval_model_RL(criticModel, actorModel, valid_iter) 534 | if val_acc > best_val_acc1: 535 | torch.save(actorModel.state_dict(), 'savedModels/actor_with_delay.pt') 536 | best_val_acc1 = val_acc 537 | print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f}%, Val. Loss: {val_loss:3f}, Val. Acc: {val_acc:.2f}%') 538 | print("Reinforcement Done!!!!") 539 | ''' 540 | actorModel.load_state_dict(torch.load('savedModels/actor_with_delay.pt')) 541 | print("Model Loaded..") 542 | 543 | val_loss, val_acc = eval_model_RL(criticModel, actorModel, valid_iter) 544 | print(val_acc) 545 | ''' 546 | criticModel.load_state_dict(torch.load('savedModels/critic_with_delay_joint.pt')) 547 | actorModel.load_state_dict(torch.load('savedModels/actor_with_delay_joint.pt')) 548 | _, best_val_acc2 = eval_model_RL(criticModel, actorModel, valid_iter) 549 | print(best_val_acc2) 550 | _, best_val_acc2 = eval_model_RL(criticModel, actorModel, train_iter) 551 | print(best_val_acc2) 552 | asasa 553 | for epoch in range(0): 554 | train_loss, train_acc = train_model(criticModel, actorModel, train_iter, epoch) 555 | val_loss, val_acc = eval_model_RL(criticModel, actorModel, valid_iter) 556 | print(val_acc) 557 | if val_acc > best_val_acc2: 558 | torch.save(actorModel.state_dict(), 'savedModels/actor_with_delay_joint.pt') 559 | torch.save(criticModel.state_dict(), 'savedModels/critic_with_delay_joint.pt') 560 | best_val_acc2 = val_acc 561 | print("----Mdoel Saved-----") 562 | 563 | 564 | criticModel.load_state_dict(torch.load('savedModels/critic_with_delay_joint.pt')) 565 | actorModel.load_state_dict(torch.load('savedModels/actor_with_delay_joint.pt')) 566 | test_loss, test_acc = eval_model_RL(criticModel, actorModel, valid_iter) 567 | print(test_acc) 568 | test_loss, test_acc = eval_model_RL(criticModel, actorModel, test_iter) 569 | print(test_acc) 570 | 571 | ''' 572 | #Let us now predict the sentiment on a single sentence just for the testing purpose 573 | 574 | test_sen1 = "This is one of the best creation of Nolan. I can say, it's his magnum opus. Loved the soundtrack and especially those creative dialogues." 575 | test_sen2 = "Ohh, such a ridiculous movie. Not gonna recommend it to anyone. Complete waste of time and money." 
576 | 577 | test_sen1 = TEXT.preprocess(test_sen1) 578 | test_sen1 = [[TEXT.vocab.stoi[x] for x in test_sen1]] 579 | 580 | test_sen2 = TEXT.preprocess(test_sen2) 581 | test_sen2 = [[TEXT.vocab.stoi[x] for x in test_sen2]] 582 | 583 | test_sen = np.asarray(test_sen1) 584 | test_sen = torch.LongTensor(test_sen) 585 | test_tensor = Variable(test_sen, volatile=True) 586 | test_tensor = test_tensor.cuda() 587 | model.eval() 588 | output = model(test_tensor, 1) 589 | out = F.softmax(output, 1) 590 | if (torch.argmax(out[0]) == 1): 591 | print ("Sentiment: Positive") 592 | else: 593 | print ("Sentiment: Negative") 594 | ''' -------------------------------------------------------------------------------- /models/CNN.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | import torch 4 | import torch.nn as nn 5 | from torch.autograd import Variable 6 | from torch.nn import functional as F 7 | 8 | class CNN(nn.Module): 9 | def __init__(self, batch_size, output_size, in_channels, out_channels, kernel_heights, stride, padding, keep_probab, vocab_size, embedding_length, weights): 10 | super(CNN, self).__init__() 11 | 12 | """ 13 | Arguments 14 | --------- 15 | batch_size : Size of each batch which is same as the batch_size of the data returned by the TorchText BucketIterator 16 | output_size : 2 = (pos, neg) 17 | in_channels : Number of input channels. Here it is 1 as the input data has dimension = (batch_size, num_seq, embedding_length) 18 | out_channels : Number of output channels after convolution operation performed on the input matrix 19 | kernel_heights : A list consisting of 3 different kernel_heights. Convolution will be performed 3 times and finally results from each kernel_height will be concatenated. 
20 | keep_probab : Probability of retaining an activation node during dropout operation 21 | vocab_size : Size of the vocabulary containing unique words 22 | embedding_length : Embedding dimension of GloVe word embeddings 23 | weights : Pre-trained GloVe word_embeddings which we will use to create our word_embedding look-up table 24 | -------- 25 | 26 | """ 27 | self.batch_size = batch_size 28 | self.output_size = output_size 29 | self.in_channels = in_channels 30 | self.out_channels = out_channels 31 | self.kernel_heights = kernel_heights 32 | self.stride = stride 33 | self.padding = padding 34 | self.vocab_size = vocab_size 35 | self.embedding_length = embedding_length 36 | 37 | self.word_embeddings = nn.Embedding(vocab_size, embedding_length) 38 | self.word_embeddings.weight = nn.Parameter(weights, requires_grad=False) 39 | self.conv1 = nn.Conv2d(in_channels, out_channels, (kernel_heights[0], embedding_length), stride, padding) 40 | self.conv2 = nn.Conv2d(in_channels, out_channels, (kernel_heights[1], embedding_length), stride, padding) 41 | self.conv3 = nn.Conv2d(in_channels, out_channels, (kernel_heights[2], embedding_length), stride, padding) 42 | self.dropout = nn.Dropout(keep_probab) 43 | self.label = nn.Linear(len(kernel_heights)*out_channels, output_size) 44 | 45 | def conv_block(self, input, conv_layer): 46 | conv_out = conv_layer(input)# conv_out.size() = (batch_size, out_channels, dim, 1) 47 | activation = F.relu(conv_out.squeeze(3))# activation.size() = (batch_size, out_channels, dim1) 48 | max_out = F.max_pool1d(activation, activation.size()[2]).squeeze(2)# maxpool_out.size() = (batch_size, out_channels) 49 | 50 | return max_out 51 | 52 | def forward(self, input_sentences, batch_size=None): 53 | 54 | """ 55 | The idea of the Convolutional Neural Netwok for Text Classification is very simple. We perform convolution operation on the embedding matrix 56 | whose shape for each batch is (num_seq, embedding_length) with kernel of varying height but constant width which is same as the embedding_length. 57 | We will be using ReLU activation after the convolution operation and then for each kernel height, we will use max_pool operation on each tensor 58 | and will filter all the maximum activation for every channel and then we will concatenate the resulting tensors. This output is then fully connected 59 | to the output layers consisting two units which basically gives us the logits for both positive and negative classes. 60 | 61 | Parameters 62 | ---------- 63 | input_sentences: input_sentences of shape = (batch_size, num_sequences) 64 | batch_size : default = None. Used only for prediction on a single sentence after training (batch_size = 1) 65 | 66 | Returns 67 | ------- 68 | Output of the linear layer containing logits for pos & neg class. 
69 | logits.size() = (batch_size, output_size) 70 | 71 | """ 72 | 73 | input = self.word_embeddings(input_sentences) 74 | # input.size() = (batch_size, num_seq, embedding_length) 75 | input = input.unsqueeze(1) 76 | # input.size() = (batch_size, 1, num_seq, embedding_length) 77 | max_out1 = self.conv_block(input, self.conv1) 78 | max_out2 = self.conv_block(input, self.conv2) 79 | max_out3 = self.conv_block(input, self.conv3) 80 | 81 | all_out = torch.cat((max_out1, max_out2, max_out3), 1) 82 | # all_out.size() = (batch_size, num_kernels*out_channels) 83 | fc_in = self.dropout(all_out) 84 | # fc_in.size()) = (batch_size, num_kernels*out_channels) 85 | logits = self.label(fc_in) 86 | 87 | return logits 88 | -------------------------------------------------------------------------------- /models/LSTM.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | import torch 4 | import torch.nn as nn 5 | from torch.autograd import Variable 6 | from torch.nn import functional as F 7 | 8 | class LSTMClassifier(nn.Module): 9 | def __init__(self, batch_size, output_size, hidden_size, vocab_size, embedding_length, weights): 10 | super(LSTMClassifier, self).__init__() 11 | 12 | """ 13 | Arguments 14 | --------- 15 | batch_size : Size of the batch which is same as the batch_size of the data returned by the TorchText BucketIterator 16 | output_size : 2 = (pos, neg) 17 | hidden_sie : Size of the hidden_state of the LSTM 18 | vocab_size : Size of the vocabulary containing unique words 19 | embedding_length : Embeddding dimension of GloVe word embeddings 20 | weights : Pre-trained GloVe word_embeddings which we will use to create our word_embedding look-up table 21 | 22 | """ 23 | 24 | self.batch_size = batch_size 25 | self.output_size = output_size 26 | self.hidden_size = hidden_size 27 | self.vocab_size = vocab_size 28 | self.embedding_length = embedding_length 29 | 30 | self.word_embeddings = nn.Embedding(vocab_size, embedding_length)# Initializing the look-up table. 31 | self.word_embeddings.weight = nn.Parameter(weights, requires_grad=False) # Assigning the look-up table to the pre-trained GloVe word embedding. 32 | self.lstm = nn.LSTM(embedding_length, hidden_size) 33 | self.label = nn.Linear(hidden_size, output_size) 34 | 35 | def forward(self, input_sentence, batch_size=None): 36 | 37 | """ 38 | Parameters 39 | ---------- 40 | input_sentence: input_sentence of shape = (batch_size, num_sequences) 41 | batch_size : default = None. 
Used only for prediction on a single sentence after training (batch_size = 1) 42 | 43 | Returns 44 | ------- 45 | Output of the linear layer containing logits for positive & negative class which receives its input as the final_hidden_state of the LSTM 46 | final_output.shape = (batch_size, output_size) 47 | 48 | """ 49 | 50 | ''' Here we will map all the indexes present in the input sequence to the corresponding word vector using our pre-trained word_embedddins.''' 51 | input = self.word_embeddings(input_sentence) # embedded input of shape = (batch_size, num_sequences, embedding_length) 52 | input = input.permute(1, 0, 2) # input.size() = (num_sequences, batch_size, embedding_length) 53 | batch_size = input.size(1) 54 | if batch_size is None: 55 | h_0 = Variable(torch.zeros(1, self.batch_size, self.hidden_size).cuda()) # Initial hidden state of the LSTM 56 | c_0 = Variable(torch.zeros(1, self.batch_size, self.hidden_size).cuda()) # Initial cell state of the LSTM 57 | else: 58 | h_0 = Variable(torch.zeros(1, batch_size, self.hidden_size).cuda()) 59 | c_0 = Variable(torch.zeros(1, batch_size, self.hidden_size).cuda()) 60 | output, (final_hidden_state, final_cell_state) = self.lstm(input, (h_0, c_0)) 61 | final_output = self.label(final_hidden_state[-1]) # final_hidden_state.size() = (1, batch_size, hidden_size) & final_output.size() = (batch_size, output_size) 62 | 63 | return final_output 64 | 65 | def wordvector_find(self, x): 66 | return self.word_embeddings(x) 67 | 68 | def getNextHiddenState(self, hc, x): 69 | hidden = hc[0,0:self.hidden_size].view(1,1,self.hidden_size) 70 | cell = hc[0,self.hidden_size:].view(1,1,self.hidden_size) 71 | input = self.word_embeddings(x).view(1,1,-1) 72 | out, hidden = self.lstm(input, [hidden, cell]) 73 | hidden = torch.cat([hidden[0], hidden[1]], -1).view(1, -1) 74 | return out, hidden -------------------------------------------------------------------------------- /models/LSTM_Attn.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | import torch 4 | import torch.nn as nn 5 | from torch.autograd import Variable 6 | from torch.nn import functional as F 7 | import numpy as np 8 | 9 | class AttentionModel(torch.nn.Module): 10 | def __init__(self, batch_size, output_size, hidden_size, vocab_size, embedding_length, weights): 11 | super(AttentionModel, self).__init__() 12 | 13 | """ 14 | Arguments 15 | --------- 16 | batch_size : Size of the batch which is same as the batch_size of the data returned by the TorchText BucketIterator 17 | output_size : 2 = (pos, neg) 18 | hidden_sie : Size of the hidden_state of the LSTM 19 | vocab_size : Size of the vocabulary containing unique words 20 | embedding_length : Embeddding dimension of GloVe word embeddings 21 | weights : Pre-trained GloVe word_embeddings which we will use to create our word_embedding look-up table 22 | 23 | -------- 24 | 25 | """ 26 | 27 | self.batch_size = batch_size 28 | self.output_size = output_size 29 | self.hidden_size = hidden_size 30 | self.vocab_size = vocab_size 31 | self.embedding_length = embedding_length 32 | 33 | self.word_embeddings = nn.Embedding(vocab_size, embedding_length) 34 | self.word_embeddings.weights = nn.Parameter(weights, requires_grad=False) 35 | self.lstm = nn.LSTM(embedding_length, hidden_size) 36 | self.label = nn.Linear(hidden_size, output_size) 37 | #self.attn_fc_layer = nn.Linear() 38 | 39 | def attention_net(self, lstm_output, final_state): 40 | 41 | """ 42 | Now we will incorporate Attention 
mechanism in our LSTM model. In this new model, we will use attention to compute soft alignment score corresponding 43 | between each of the hidden_state and the last hidden_state of the LSTM. We will be using torch.bmm for the batch matrix multiplication. 44 | 45 | Arguments 46 | --------- 47 | 48 | lstm_output : Final output of the LSTM which contains hidden layer outputs for each sequence. 49 | final_state : Final time-step hidden state (h_n) of the LSTM 50 | 51 | --------- 52 | 53 | Returns : It performs attention mechanism by first computing weights for each of the sequence present in lstm_output and and then finally computing the 54 | new hidden state. 55 | 56 | Tensor Size : 57 | hidden.size() = (batch_size, hidden_size) 58 | attn_weights.size() = (batch_size, num_seq) 59 | soft_attn_weights.size() = (batch_size, num_seq) 60 | new_hidden_state.size() = (batch_size, hidden_size) 61 | 62 | """ 63 | 64 | hidden = final_state.squeeze(0) 65 | attn_weights = torch.bmm(lstm_output, hidden.unsqueeze(2)).squeeze(2) 66 | soft_attn_weights = F.softmax(attn_weights, 1) 67 | new_hidden_state = torch.bmm(lstm_output.transpose(1, 2), soft_attn_weights.unsqueeze(2)).squeeze(2) 68 | 69 | return new_hidden_state 70 | 71 | def forward(self, input_sentences, batch_size=None): 72 | 73 | """ 74 | Parameters 75 | ---------- 76 | input_sentence: input_sentence of shape = (batch_size, num_sequences) 77 | batch_size : default = None. Used only for prediction on a single sentence after training (batch_size = 1) 78 | 79 | Returns 80 | ------- 81 | Output of the linear layer containing logits for pos & neg class which receives its input as the new_hidden_state which is basically the output of the Attention network. 82 | final_output.shape = (batch_size, output_size) 83 | 84 | """ 85 | 86 | input = self.word_embeddings(input_sentences) 87 | input = input.permute(1, 0, 2) 88 | if batch_size is None: 89 | h_0 = Variable(torch.zeros(1, self.batch_size, self.hidden_size).cuda()) 90 | c_0 = Variable(torch.zeros(1, self.batch_size, self.hidden_size).cuda()) 91 | else: 92 | h_0 = Variable(torch.zeros(1, batch_size, self.hidden_size).cuda()) 93 | c_0 = Variable(torch.zeros(1, batch_size, self.hidden_size).cuda()) 94 | 95 | output, (final_hidden_state, final_cell_state) = self.lstm(input, (h_0, c_0)) # final_hidden_state.size() = (1, batch_size, hidden_size) 96 | output = output.permute(1, 0, 2) # output.size() = (batch_size, num_seq, hidden_size) 97 | 98 | attn_output = self.attention_net(output, final_hidden_state) 99 | logits = self.label(attn_output) 100 | 101 | return logits 102 | -------------------------------------------------------------------------------- /models/RCNN.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | import torch 4 | import torch.nn as nn 5 | from torch.autograd import Variable 6 | from torch.nn import functional as F 7 | 8 | class RCNN(nn.Module): 9 | def __init__(self, batch_size, output_size, hidden_size, vocab_size, embedding_length, weights): 10 | super(RCNN, self).__init__() 11 | 12 | """ 13 | Arguments 14 | --------- 15 | batch_size : Size of the batch which is same as the batch_size of the data returned by the TorchText BucketIterator 16 | output_size : 2 = (pos, neg) 17 | hidden_sie : Size of the hidden_state of the LSTM 18 | vocab_size : Size of the vocabulary containing unique words 19 | embedding_length : Embedding dimension of GloVe word embeddings 20 | weights : Pre-trained GloVe word_embeddings which we will 
use to create our word_embedding look-up table 21 | 22 | """ 23 | 24 | self.batch_size = batch_size 25 | self.output_size = output_size 26 | self.hidden_size = hidden_size 27 | self.vocab_size = vocab_size 28 | self.embedding_length = embedding_length 29 | 30 | self.word_embeddings = nn.Embedding(vocab_size, embedding_length)# Initializing the look-up table. 31 | self.word_embeddings.weight = nn.Parameter(weights, requires_grad=False) # Assigning the look-up table to the pre-trained GloVe word embedding. 32 | self.dropout = 0.8 33 | self.lstm = nn.LSTM(embedding_length, hidden_size, dropout=self.dropout, bidirectional=True) 34 | self.W2 = nn.Linear(2*hidden_size+embedding_length, hidden_size) 35 | self.label = nn.Linear(hidden_size, output_size) 36 | 37 | def forward(self, input_sentence, batch_size=None): 38 | 39 | """ 40 | Parameters 41 | ---------- 42 | input_sentence: input_sentence of shape = (batch_size, num_sequences) 43 | batch_size : default = None. Used only for prediction on a single sentence after training (batch_size = 1) 44 | 45 | Returns 46 | ------- 47 | Output of the linear layer containing logits for positive & negative class which receives its input as the final_hidden_state of the LSTM 48 | final_output.shape = (batch_size, output_size) 49 | 50 | """ 51 | 52 | """ 53 | 54 | The idea of the paper "Recurrent Convolutional Neural Networks for Text Classification" is that we pass the embedding vector 55 | of the text sequences through a bidirectional LSTM and then for each sequence, our final embedding vector is the concatenation of 56 | its own GloVe embedding and the left and right contextual embedding which in bidirectional LSTM is same as the corresponding hidden 57 | state. This final embedding is passed through a linear layer which maps this long concatenated encoding vector back to the hidden_size 58 | vector. After this step, we use a max pooling layer across all sequences of texts. This converts any varying length text into a fixed 59 | dimension tensor of size (batch_size, hidden_size) and finally we map this to the output layer. 
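        As a concrete (hypothetical) shape walk-through using batch_size=5, num_sequences=200,
        embedding_length=300 and hidden_size=300: the embedded input permuted to (200, 5, 300)
        passes through the bidirectional LSTM to give output of shape (200, 5, 600); concatenating
        it with the embeddings and permuting yields final_encoding of shape (5, 200, 900); W2 maps
        this to (5, 200, 300); max-pooling over the sequence dimension gives (5, 300); and the
        label layer produces logits of shape (5, 2).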
60 | 61 | """ 62 | input = self.word_embeddings(input_sentence) # embedded input of shape = (batch_size, num_sequences, embedding_length) 63 | input = input.permute(1, 0, 2) # input.size() = (num_sequences, batch_size, embedding_length) 64 | if batch_size is None: 65 | h_0 = Variable(torch.zeros(2, self.batch_size, self.hidden_size).cuda()) # Initial hidden state of the LSTM 66 | c_0 = Variable(torch.zeros(2, self.batch_size, self.hidden_size).cuda()) # Initial cell state of the LSTM 67 | else: 68 | h_0 = Variable(torch.zeros(2, batch_size, self.hidden_size).cuda()) 69 | c_0 = Variable(torch.zeros(2, batch_size, self.hidden_size).cuda()) 70 | 71 | output, (final_hidden_state, final_cell_state) = self.lstm(input, (h_0, c_0)) 72 | 73 | final_encoding = torch.cat((output, input), 2).permute(1, 0, 2) 74 | y = self.W2(final_encoding) # y.size() = (batch_size, num_sequences, hidden_size) 75 | y = y.permute(0, 2, 1) # y.size() = (batch_size, hidden_size, num_sequences) 76 | y = F.max_pool1d(y, y.size()[2]) # y.size() = (batch_size, hidden_size, 1) 77 | y = y.squeeze(2) 78 | logits = self.label(y) 79 | 80 | return logits 81 | -------------------------------------------------------------------------------- /models/RNN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.autograd import Variable 4 | from torch.nn import functional as F 5 | 6 | class RNN(nn.Module): 7 | def __init__(self, batch_size, output_size, hidden_size, vocab_size, embedding_length, weights): 8 | super(RNN, self).__init__() 9 | 10 | """ 11 | Arguments 12 | --------- 13 | batch_size : Size of the batch which is same as the batch_size of the data returned by the TorchText BucketIterator 14 | output_size : 2 = (pos, neg) 15 | hidden_sie : Size of the hidden_state of the LSTM 16 | vocab_size : Size of the vocabulary containing unique words 17 | embedding_length : Embeddding dimension of GloVe word embeddings 18 | weights : Pre-trained GloVe word_embeddings which we will use to create our word_embedding look-up table 19 | 20 | """ 21 | 22 | self.batch_size = batch_size 23 | self.output_size = output_size 24 | self.hidden_size = hidden_size 25 | self.vocab_size = vocab_size 26 | self.embedding_length = embedding_length 27 | 28 | self.word_embeddings = nn.Embedding(vocab_size, embedding_length) 29 | self.word_embeddings.weight = nn.Parameter(weights, requires_grad=False) 30 | self.rnn = nn.RNN(embedding_length, hidden_size, num_layers=2, bidirectional=True) 31 | self.label = nn.Linear(4*hidden_size, output_size) 32 | 33 | def forward(self, input_sentences, batch_size=None): 34 | 35 | """ 36 | Parameters 37 | ---------- 38 | input_sentence: input_sentence of shape = (batch_size, num_sequences) 39 | batch_size : default = None. Used only for prediction on a single sentence after training (batch_size = 1) 40 | 41 | Returns 42 | ------- 43 | Output of the linear layer containing logits for pos & neg class which receives its input as the final_hidden_state of RNN. 
44 | logits.size() = (batch_size, output_size) 45 | 46 | """ 47 | 48 | input = self.word_embeddings(input_sentences) 49 | input = input.permute(1, 0, 2) 50 | if batch_size is None: 51 | h_0 = Variable(torch.zeros(4, self.batch_size, self.hidden_size).cuda()) # 4 = num_layers*num_directions 52 | else: 53 | h_0 = Variable(torch.zeros(4, batch_size, self.hidden_size).cuda()) 54 | output, h_n = self.rnn(input, h_0) 55 | # h_n.size() = (4, batch_size, hidden_size) 56 | h_n = h_n.permute(1, 0, 2) # h_n.size() = (batch_size, 4, hidden_size) 57 | h_n = h_n.contiguous().view(h_n.size()[0], h_n.size()[1]*h_n.size()[2]) 58 | # h_n.size() = (batch_size, 4*hidden_size) 59 | logits = self.label(h_n) # logits.size() = (batch_size, output_size) 60 | 61 | return logits 62 | -------------------------------------------------------------------------------- /models/__pycache__/LSTM.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/navid5792/ID-LSTM-pytorch/30f457f8c37f4b60f9a17b221f718a07e8a25ff3/models/__pycache__/LSTM.cpython-36.pyc -------------------------------------------------------------------------------- /models/__pycache__/LSTM.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/navid5792/ID-LSTM-pytorch/30f457f8c37f4b60f9a17b221f718a07e8a25ff3/models/__pycache__/LSTM.cpython-37.pyc -------------------------------------------------------------------------------- /models/selfAttention.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | import torch 4 | import torch.nn as nn 5 | from torch.autograd import Variable 6 | from torch.nn import functional as F 7 | 8 | class SelfAttention(nn.Module): 9 | def __init__(self, batch_size, output_size, hidden_size, vocab_size, embedding_length, weights): 10 | super(SelfAttention, self).__init__() 11 | 12 | """ 13 | Arguments 14 | --------- 15 | batch_size : Size of the batch which is same as the batch_size of the data returned by the TorchText BucketIterator 16 | output_size : 2 = (pos, neg) 17 | hidden_sie : Size of the hidden_state of the LSTM 18 | vocab_size : Size of the vocabulary containing unique words 19 | embedding_length : Embeddding dimension of GloVe word embeddings 20 | weights : Pre-trained GloVe word_embeddings which we will use to create our word_embedding look-up table 21 | 22 | -------- 23 | 24 | """ 25 | 26 | self.batch_size = batch_size 27 | self.output_size = output_size 28 | self.hidden_size = hidden_size 29 | self.vocab_size = vocab_size 30 | self.embedding_length = embedding_length 31 | self.weights = weights 32 | 33 | self.word_embeddings = nn.Embedding(vocab_size, embedding_length) 34 | self.word_embeddings.weights = nn.Parameter(weights, requires_grad=False) 35 | self.dropout = 0.8 36 | self.bilstm = nn.LSTM(embedding_length, hidden_size, dropout=self.dropout, bidirectional=True) 37 | # We will use da = 350, r = 30 & penalization_coeff = 1 as per given in the self-attention original ICLR paper 38 | self.W_s1 = nn.Linear(2*hidden_size, 350) 39 | self.W_s2 = nn.Linear(350, 30) 40 | self.fc_layer = nn.Linear(30*2*hidden_size, 2000) 41 | self.label = nn.Linear(2000, output_size) 42 | 43 | def attention_net(self, lstm_output): 44 | 45 | """ 46 | Now we will use self attention mechanism to produce a matrix embedding of the input sentence in which every row represents an 47 | encoding of the inout sentence but giving 
an attention to a specific part of the sentence. We will use 30 such embedding of 48 | the input sentence and then finally we will concatenate all the 30 sentence embedding vectors and connect it to a fully 49 | connected layer of size 2000 which will be connected to the output layer of size 2 returning logits for our two classes i.e., 50 | pos & neg. 51 | 52 | Arguments 53 | --------- 54 | 55 | lstm_output = A tensor containing hidden states corresponding to each time step of the LSTM network. 56 | --------- 57 | 58 | Returns : Final Attention weight matrix for all the 30 different sentence embedding in which each of 30 embeddings give 59 | attention to different parts of the input sentence. 60 | 61 | Tensor size : lstm_output.size() = (batch_size, num_seq, 2*hidden_size) 62 | attn_weight_matrix.size() = (batch_size, 30, num_seq) 63 | 64 | """ 65 | attn_weight_matrix = self.W_s2(F.tanh(self.W_s1(lstm_output))) 66 | attn_weight_matrix = attn_weight_matrix.permute(0, 2, 1) 67 | attn_weight_matrix = F.softmax(attn_weight_matrix, dim=2) 68 | 69 | return attn_weight_matrix 70 | 71 | def forward(self, input_sentences, batch_size=None): 72 | 73 | """ 74 | Parameters 75 | ---------- 76 | input_sentence: input_sentence of shape = (batch_size, num_sequences) 77 | batch_size : default = None. Used only for prediction on a single sentence after training (batch_size = 1) 78 | 79 | Returns 80 | ------- 81 | Output of the linear layer containing logits for pos & neg class. 82 | 83 | """ 84 | 85 | input = self.word_embeddings(input_sentences) 86 | input = input.permute(1, 0, 2) 87 | if batch_size is None: 88 | h_0 = Variable(torch.zeros(2, self.batch_size, self.hidden_size).cuda()) 89 | c_0 = Variable(torch.zeros(2, self.batch_size, self.hidden_size).cuda()) 90 | else: 91 | h_0 = Variable(torch.zeros(2, batch_size, self.hidden_size).cuda()) 92 | c_0 = Variable(torch.zeros(2, batch_size, self.hidden_size).cuda()) 93 | 94 | output, (h_n, c_n) = self.bilstm(input, (h_0, c_0)) 95 | output = output.permute(1, 0, 2) 96 | # output.size() = (batch_size, num_seq, 2*hidden_size) 97 | # h_n.size() = (1, batch_size, hidden_size) 98 | # c_n.size() = (1, batch_size, hidden_size) 99 | attn_weight_matrix = self.attention_net(output) 100 | # attn_weight_matrix.size() = (batch_size, r, num_seq) 101 | # output.size() = (batch_size, num_seq, 2*hidden_size) 102 | hidden_matrix = torch.bmm(attn_weight_matrix, output) 103 | # hidden_matrix.size() = (batch_size, r, 2*hidden_size) 104 | # Let's now concatenate the hidden_matrix and connect it to the fully connected layer. 105 | fc_out = self.fc_layer(hidden_matrix.view(-1, hidden_matrix.size()[1]*hidden_matrix.size()[2])) 106 | logits = self.label(fc_out) 107 | # logits.size() = (batch_size, output_size) 108 | 109 | return logits -------------------------------------------------------------------------------- /run history.txt: -------------------------------------------------------------------------------- 1 | without delay 82.40% 2 | 3 | Just Critic Val. Loss: 0.253031, Val. Acc: 90.45, Test. Loss: 0.427570, Test. Acc: 82.20% 4 | 5 | Just Actor Val. Acc: 81.54, Test. 
Acc: 78.456 6 | 7 | Critic + Actor Train, 8 | Save models based on just critic 84.20 79.692 9 | 10 | Critic was trained for 10 epochs 11 | Actor was trained for 10 epochs 12 | 13 | proper way after 10 epochs 88.94 82.068 14 | 15 | proper way after 30 epochs 94.76 83.568 16 | 17 | 18 | usually delay works better for smaller batch sizes: verified with TF code 19 | Epoch: 30 20 | BS 5 - 80.8 21 | BS 50 - 76.2 22 | 23 | 24 | 25 | Tensorflow 26 | 27 | train 4000 28 | test 1000 29 | validation 500 30 | 31 | delay with BS 5 87.0 loss 0.10 32 | delay with BS 1 83.0 loss 0.10 33 | immediate update target with BS 5 which means BS 1 88.0 34 | 35 | 36 | 37 | Just critic train 38 | valid 90.57333333333334 39 | train 90.56571428571428 40 | 41 | Just actor train 42 | valid 81.17 43 | train 81.32 44 | 45 | critic and actor train 46 | valid 94.73 47 | train 94.90 48 | --------------------------------------------------------------------------------