├── README.md ├── initial.py ├── load_data.py ├── main.py ├── models ├── CNN.py ├── LSTM.py ├── LSTM_Attn.py ├── RCNN.py ├── RNN.py ├── __pycache__ │ ├── LSTM.cpython-36.pyc │ └── LSTM.cpython-37.pyc └── selfAttention.py └── run history.txt /README.md: -------------------------------------------------------------------------------- 1 | # Text-Classification via RL Pytorch 2 | ## Description 3 | This repository contains the Pytorch implmentation of AAAI 2018 Paper "Learning Structured Representation for Text Classification via Reinforcement Learning". 4 | 5 | 6 | ## Requirements 7 | * Python==3.6.6 8 | * PyTorch==0.4.0 9 | * torchtext==0.2.3 10 | 11 | ## Downloads and Setup 12 | Once you clone this repo, run the main.py file to process the dataset and to train the model. 13 | ```shell 14 | $ python main.py 15 | ``` 16 | 17 | ## References 18 | * Learning Structured Representation for Text Classification via Reinforcement Learning : [Paper][1] 19 | 20 | 21 | [1]:https://www.microsoft.com/en-us/research/wp-content/uploads/2017/11/zhang.pdf 22 | 23 | -------------------------------------------------------------------------------- /initial.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import load_data 4 | import torch 5 | import torch.nn.functional as F 6 | from torch.autograd import Variable 7 | import torch.optim as optim 8 | import numpy as np 9 | from models.LSTM import LSTMClassifier 10 | 11 | TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter = load_data.load_dataset() 12 | 13 | def clip_gradient(model, clip_value): 14 | params = list(filter(lambda p: p.grad is not None, model.parameters())) 15 | for p in params: 16 | p.grad.data.clamp_(-clip_value, clip_value) 17 | 18 | def train_model(model, train_iter, epoch): 19 | total_epoch_loss = 0 20 | total_epoch_acc = 0 21 | model.cuda() 22 | optim = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters())) 23 | steps = 0 24 | model.train() 25 | for idx, batch in enumerate(train_iter): 26 | text = batch.text[0] 27 | target = batch.label 28 | target = torch.autograd.Variable(target).long() 29 | if torch.cuda.is_available(): 30 | text = text.cuda() 31 | target = target.cuda() 32 | if (text.size()[0] is not 32):# One of the batch returned by BucketIterator has length different than 32. 
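# NOTE (added comment): `is not 32` compares object identity, not value. It happens to work here
# because CPython interns small integers, but the intended check is `text.size(0) != 32` (skip the
# single partial batch that BucketIterator may return); Python 3.8+ even emits a SyntaxWarning for
# literal `is` comparisons like this one.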
33 | continue 34 | optim.zero_grad() 35 | prediction = model(text) 36 | loss = loss_fn(prediction, target) 37 | num_corrects = (torch.max(prediction, 1)[1].view(target.size()).data == target.data).float().sum() 38 | acc = 100.0 * num_corrects/len(batch) 39 | loss.backward() 40 | clip_gradient(model, 1e-1) 41 | optim.step() 42 | steps += 1 43 | 44 | if steps % 100 == 0: 45 | print (f'Epoch: {epoch+1}, Idx: {idx+1}, Training Loss: {loss.item():.4f}, Training Accuracy: {acc.item(): .2f}%') 46 | 47 | total_epoch_loss += loss.item() 48 | total_epoch_acc += acc.item() 49 | 50 | return total_epoch_loss/len(train_iter), total_epoch_acc/len(train_iter) 51 | 52 | def eval_model(model, val_iter): 53 | total_epoch_loss = 0 54 | total_epoch_acc = 0 55 | model.eval() 56 | with torch.no_grad(): 57 | for idx, batch in enumerate(val_iter): 58 | text = batch.text[0] 59 | if (text.size()[0] is not 32): 60 | continue 61 | target = batch.label 62 | target = torch.autograd.Variable(target).long() 63 | if torch.cuda.is_available(): 64 | text = text.cuda() 65 | target = target.cuda() 66 | prediction = model(text) 67 | loss = loss_fn(prediction, target) 68 | num_corrects = (torch.max(prediction, 1)[1].view(target.size()).data == target.data).sum() 69 | acc = 100.0 * num_corrects/len(batch) 70 | total_epoch_loss += loss.item() 71 | total_epoch_acc += acc.item() 72 | 73 | return total_epoch_loss/len(val_iter), total_epoch_acc/len(val_iter) 74 | 75 | 76 | learning_rate = 2e-5 77 | batch_size = 32 78 | output_size = 2 79 | hidden_size = 256 80 | embedding_length = 300 81 | 82 | model = LSTMClassifier(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings) 83 | loss_fn = F.cross_entropy 84 | 85 | for epoch in range(10): 86 | train_loss, train_acc = train_model(model, train_iter, epoch) 87 | val_loss, val_acc = eval_model(model, valid_iter) 88 | 89 | print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f}%, Val. Loss: {val_loss:3f}, Val. Acc: {val_acc:.2f}%') 90 | 91 | test_loss, test_acc = eval_model(model, test_iter) 92 | print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.2f}%') 93 | 94 | ''' Let us now predict the sentiment on a single sentence just for the testing purpose. ''' 95 | test_sen1 = "This is one of the best creation of Nolan. I can say, it's his magnum opus. Loved the soundtrack and especially those creative dialogues." 96 | test_sen2 = "Ohh, such a ridiculous movie. Not gonna recommend it to anyone. Complete waste of time and money." 
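# A minimal, reusable sketch of the single-sentence inference performed below.
# `predict_sentiment` is a hypothetical helper (not part of the original script); it assumes the
# `TEXT` field and trained `model` defined above and mirrors the same
# preprocess -> numericalize -> forward -> softmax steps.
def predict_sentiment(model, TEXT, sentence):
    model.eval()
    tokens = TEXT.preprocess(sentence)                    # tokenize and lowercase
    indices = [[TEXT.vocab.stoi[tok] for tok in tokens]]  # map tokens to vocabulary ids
    tensor = torch.LongTensor(indices).cuda()             # shape: (1, seq_len)
    with torch.no_grad():
        logits = model(tensor, 1)                         # forward pass with batch_size = 1
        probs = F.softmax(logits, dim=1)
    return "Positive" if torch.argmax(probs[0]) == 1 else "Negative"

# Hypothetical usage: print(predict_sentiment(model, TEXT, test_sen1))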
97 | 98 | test_sen1 = TEXT.preprocess(test_sen1) 99 | test_sen1 = [[TEXT.vocab.stoi[x] for x in test_sen1]] 100 | 101 | test_sen2 = TEXT.preprocess(test_sen2) 102 | test_sen2 = [[TEXT.vocab.stoi[x] for x in test_sen2]] 103 | 104 | test_sen = np.asarray(test_sen1) 105 | test_sen = torch.LongTensor(test_sen) 106 | test_tensor = Variable(test_sen, volatile=True) 107 | test_tensor = test_tensor.cuda() 108 | model.eval() 109 | output = model(test_tensor, 1) 110 | out = F.softmax(output, 1) 111 | if (torch.argmax(out[0]) == 1): 112 | print ("Sentiment: Positive") 113 | else: 114 | print ("Sentiment: Negative") 115 | -------------------------------------------------------------------------------- /load_data.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | import os 4 | import sys 5 | import torch 6 | from torch.nn import functional as F 7 | import numpy as np 8 | from torchtext import data 9 | from torchtext import datasets 10 | #from torchtext.vocab import Vectors, GloVe 11 | 12 | def load_dataset(test_sen=None, batch_size=32): 13 | 14 | """ 15 | tokenizer : Breaks sentences into a list of words. If sequential=False, no tokenization is applied 16 | Field : A class that stores information about the way of preprocessing 17 | fix_length : An important property of TorchText is that we can let the input to be variable length, and TorchText will 18 | dynamically pad each sequence to the longest sequence in that "batch". But here we are using fi_length which 19 | will pad each sequence to have a fix length of 200. 20 | 21 | build_vocab : It will first make a vocabulary or dictionary mapping all the unique words present in the train_data to an 22 | idx and then after it will use GloVe word embedding to map the index to the corresponding word embedding. 23 | 24 | vocab.vectors : This returns a torch tensor of shape (vocab_size x embedding_dim) containing the pre-trained word embeddings. 25 | BucketIterator : Defines an iterator that batches examples of similar lengths together to minimize the amount of padding needed. 
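include_lengths : Since include_lengths=True is passed to the TEXT Field, each batch exposes batch.text as a (padded_tensor, lengths) tuple rather than a bare tensor, which is why the training and evaluation code reads batch.text[0] for the token ids and batch.text[1] for the true sequence lengths.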
26 | 27 | """ 28 | 29 | tokenize = lambda x: x.split() 30 | TEXT = data.Field(sequential=True, tokenize=tokenize, lower=True, include_lengths=True, batch_first=True, fix_length=200) 31 | LABEL = data.LabelField() 32 | train_data, test_data = datasets.IMDB.splits(TEXT, LABEL) 33 | TEXT.build_vocab(train_data, vectors="glove.6B.300d") 34 | LABEL.build_vocab(train_data) 35 | 36 | word_embeddings = TEXT.vocab.vectors 37 | print ("Length of Text Vocabulary: " + str(len(TEXT.vocab))) 38 | print ("Vector size of Text Vocabulary: ", TEXT.vocab.vectors.size()) 39 | print ("Label Length: " + str(len(LABEL.vocab))) 40 | 41 | train_data, valid_data = train_data.split() # Further splitting of training_data to create new training_data & validation_data 42 | train_iter, valid_iter, test_iter = data.BucketIterator.splits((train_data, valid_data, test_data), batch_size=batch_size, sort_key=lambda x: len(x.text), repeat=False, shuffle=True) 43 | 44 | '''Alternatively we can also use the default configurations''' 45 | # train_iter, test_iter = datasets.IMDB.iters(batch_size=32) 46 | 47 | vocab_size = len(TEXT.vocab) 48 | 49 | return TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter 50 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import load_data 4 | import torch 5 | import torch.nn.functional as F 6 | from torch.autograd import Variable 7 | import torch.optim as optim 8 | import numpy as np 9 | from models.LSTM import LSTMClassifier 10 | import torch.nn as nn 11 | from copy import deepcopy 12 | import random 13 | from tqdm import tqdm 14 | 15 | learning_rate = 2e-5 16 | batch_size = 5 17 | global_batch_size = 5 18 | output_size = 2 19 | hidden_size = 300 20 | embedding_length = 300 21 | samplecnt = 5 22 | epsilon = 0.05 23 | maxlength = 200 24 | alpha = 0.1 25 | tau = 0.1 26 | delay_critic = True 27 | 28 | TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter = load_data.load_dataset(batch_size=batch_size) 29 | 30 | def clip_gradient(model, clip_value): 31 | params = list(filter(lambda p: p.grad is not None, model.parameters())) 32 | for p in params: 33 | p.grad.data.clamp_(-clip_value, clip_value) 34 | 35 | def Sampling_RL(actor, critic, inputs, vector, length, epsilon, Random = True): 36 | current_lower_state = torch.zeros(1,2*hidden_size).cuda() 37 | actions = [] 38 | states = [] 39 | for pos in range(length): 40 | predicted = actor.get_target_output(current_lower_state, vector[0][pos], scope = "target") 41 | states.append([current_lower_state, vector[0][pos]]) 42 | if Random: 43 | if random.random() > epsilon: 44 | action = (0 if random.random() < float(predicted[0].item()) else 1) 45 | else: 46 | action = (1 if random.random() < float(predicted[0].item()) else 0) 47 | else: 48 | action = np.argmax(predicted).item() 49 | actions.append(action) 50 | if action == 1: 51 | out_d, current_lower_state = critic.forward_lstm(current_lower_state, inputs[0][pos], scope = "target") 52 | Rinput = [] 53 | for (i, a) in enumerate(actions): 54 | if a == 1: 55 | Rinput.append(int(inputs[0][i].item())) #### 56 | Rlength = len(Rinput) 57 | #print("problem") 58 | if Rlength == 0: 59 | actions[length-2] = 1 60 | Rinput.append(inputs[0][length-2]) 61 | Rlength = 1 62 | Rinput += [1] * (maxlength - Rlength) 63 | 64 | Rinput = torch.tensor(Rinput).view(1,-1).cuda() 65 | 66 | return actions, states, Rinput, Rlength 67 | 68 | 69 | 
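# How Sampling_RL is used (descriptive note): for every position in the sentence the actor's policy
# net emits [P(drop), P(retain)] given the critic's current LSTM state and that word's embedding.
# With probability 1-epsilon the action is sampled from this distribution, with probability epsilon
# from the flipped one (exploration); Random=False instead takes the argmax (used at evaluation
# time). Retained tokens (action == 1) advance the critic's LSTM state, and the retained
# sub-sequence Rinput is padded with index 1 (the <pad> token under torchtext's default specials)
# up to maxlength before being fed to the critic as an ordinary classification input.
#
# Sketch of the call as it appears further below in train_model / eval_model_RL (hypothetical
# per-sentence tensors):
#   x = text[i].view(1, -1)               # (1, maxlength) token ids of one sentence
#   emb = criticModel.wordvector_find(x)  # (1, maxlength, embedding_length) embeddings
#   actions, states, Rinput, Rlength = Sampling_RL(
#       actorModel, criticModel, x, emb, int(lengths[i]), epsilon, Random=True)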
class policyNet(nn.Module): 70 | def __init__(self): 71 | super(policyNet, self).__init__() 72 | self.hidden = hidden_size 73 | self.W1 = nn.Parameter(torch.cuda.FloatTensor(2*self.hidden, 1).uniform_(-0.5, 0.5)) 74 | self.W2 = nn.Parameter(torch.cuda.FloatTensor(embedding_length, 1).uniform_(-0.5, 0.5)) 75 | self.b = nn.Parameter(torch.cuda.FloatTensor(1, 1).uniform_(-0.5, 0.5)) 76 | 77 | def forward(self, h, x): 78 | h_ = torch.matmul(h.view(1,-1), self.W1) # 1x1 79 | x_ = torch.matmul(x.view(1,-1), self.W2) # 1x1 80 | scaled_out = torch.sigmoid(h_ + x_ + self.b) # 1x1 81 | scaled_out = torch.clamp(scaled_out, min=1e-5, max=1 - 1e-5) 82 | scaled_out = torch.cat([1.0 - scaled_out, scaled_out],0) 83 | return scaled_out 84 | 85 | 86 | 87 | class critic(nn.Module): 88 | def __init__(self): 89 | super(critic, self).__init__() 90 | self.target_pred = LSTMClassifier(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings) 91 | self.active_pred = LSTMClassifier(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings) 92 | 93 | 94 | def forward(self, x, scope): 95 | if scope == "target": 96 | out = self.target_pred(x) 97 | if scope == "active": 98 | out = self.active_pred(x) 99 | return out 100 | 101 | def assign_target_network(self): 102 | params = [] 103 | for name, x in self.active_pred.named_parameters(): 104 | params.append(x) 105 | i=0 106 | for name, x in self.target_pred.named_parameters(): 107 | x.data = deepcopy(params[i].data) 108 | i+=1 109 | 110 | def update_target_network(self): 111 | params = [] 112 | for name, x in self.active_pred.named_parameters(): 113 | params.append(x) 114 | i=0 115 | for name, x in self.target_pred.named_parameters(): 116 | x.data = deepcopy(params[i].data * (tau) + x.data * (1-tau)) 117 | i+=1 118 | 119 | def assign_active_network(self): 120 | params = [] 121 | for name, x in self.target_pred.named_parameters(): 122 | params.append(x) 123 | i=0 124 | for name, x in self.active_pred.named_parameters(): 125 | x.data = deepcopy(params[i].data) 126 | i+=1 127 | 128 | def assign_active_network_gradients(self): 129 | params = [] 130 | for name, x in self.target_pred.named_parameters(): 131 | params.append(x) 132 | i=0 133 | for name, x in self.active_pred.named_parameters(): 134 | x.grad = deepcopy(params[i].grad) 135 | i+=1 136 | for name, x in self.target_pred.named_parameters(): 137 | x.grad = None 138 | 139 | def forward_lstm(self, hc, x, scope): 140 | if scope == "target": 141 | out, state = self.target_pred.getNextHiddenState(hc, x) 142 | if scope == "active": 143 | out, state = self.active_pred.getNextHiddenState(hc, x) 144 | return out, state 145 | 146 | def wordvector_find(self, x): 147 | return self.target_pred.wordvector_find(x) 148 | 149 | 150 | class actor(nn.Module): 151 | def __init__(self): 152 | super(actor, self).__init__() 153 | self.target_policy = policyNet() 154 | self.active_policy = policyNet() 155 | 156 | def get_target_logOutput(self, h, x): 157 | out = self.target_policy(h, x) 158 | logOut = torch.log(out) 159 | return logOut 160 | 161 | def get_target_output(self, h, x, scope): 162 | if scope == "target": 163 | out = self.target_policy(h, x) 164 | if scope == "active": 165 | out = self.active_policy(h, x) 166 | return out 167 | 168 | def get_gradient(self, h, x, reward, scope): 169 | if scope == "target": 170 | out = self.target_policy(h, x) 171 | logout = torch.log(out).view(-1) 172 | index = reward.index(0) 173 | index = (index + 1) % 2 174 | #print(out, reward, index, 
logout[index].view(-1), logout) 175 | #print(logout[index].view(-1)) 176 | grad = torch.autograd.grad(logout[index].view(-1), self.target_policy.parameters()) # torch.cuda.FloatTensor(reward[index]) 177 | #print(grad[0].size(), grad[1].size(), grad[2].size()) 178 | #print(grad[0], grad[1], grad[2]) 179 | grad[0].data = grad[0].data * reward[index] 180 | grad[1].data = grad[1].data * reward[index] 181 | grad[2].data = grad[2].data * reward[index] 182 | #print(grad[0], grad[1], grad[2]) 183 | return grad 184 | if scope == "active": 185 | out = self.active_policy(h, x) 186 | return out 187 | def assign_active_network_gradients(self, grad1, grad2, grad3): 188 | params = [grad1, grad2, grad3] 189 | i=0 190 | for name, x in self.active_policy.named_parameters(): 191 | x.grad = deepcopy(params[i]) 192 | i+=1 193 | 194 | def update_target_network(self): 195 | params = [] 196 | for name, x in self.active_policy.named_parameters(): 197 | params.append(x) 198 | i=0 199 | for name, x in self.target_policy.named_parameters(): 200 | x.data = deepcopy(params[i].data * (tau) + x.data * (1-tau)) 201 | i+=1 202 | 203 | def assign_active_network(self): 204 | params = [] 205 | for name, x in self.target_policy.named_parameters(): 206 | params.append(x) 207 | i=0 208 | for name, x in self.active_policy.named_parameters(): 209 | x.data = deepcopy(params[i].data) 210 | i+=1 211 | 212 | def train_model(criticModel, actorModel, train_iter, epoch, RL_train = True, LSTM_train = True): 213 | total_epoch_loss = 0 214 | total_epoch_acc = 0 215 | criticModel.cuda() 216 | actorModel.cuda() 217 | critic_target_optimizer = torch.optim.Adam(criticModel.target_pred.parameters()) 218 | critic_active_optimizer = torch.optim.Adam(criticModel.active_pred.parameters()) 219 | 220 | actor_target_optimizer = torch.optim.Adam(actorModel.target_policy.parameters()) 221 | actor_active_optimizer = torch.optim.Adam(actorModel.active_policy.parameters()) 222 | steps = 0 223 | for idx, batch in enumerate(train_iter): 224 | if idx % 100 == 0: 225 | print(idx , "/", len(train_iter)) 226 | totloss = 0. 227 | text = batch.text[0] 228 | target = batch.label 229 | lengths = batch.text[1] 230 | target = torch.autograd.Variable(target).long() 231 | pred = torch.zeros(batch_size, 2).cuda() 232 | if torch.cuda.is_available(): 233 | text = text.cuda() 234 | target = target.cuda() 235 | if (text.size()[0] is not batch_size):# One of the batch returned by BucketIterator has length different than 32. 236 | continue 237 | #if steps % 50 == 0: 238 | #print(actorModel.target_policy.b.data, actorModel.active_policy.b.data) 239 | criticModel.assign_active_network() 240 | actorModel.assign_active_network() 241 | #if steps % 50 == 0: 242 | #print(actorModel.target_policy.b.data, actorModel.active_policy.b.data) 243 | #print(actorModel.target_policy.W1, actorModel.active_policy.W1, "\n\n", criticModel.target_pred.label.bias, criticModel.active_pred.label.bias) 244 | avgloss = 0 245 | aveloss = 0. 246 | for i in range(batch_size): 247 | x = text[i].view(1,-1) 248 | y = target[i].view(1) 249 | length = int(lengths[i]) 250 | if RL_train: 251 | #print("RL True") 252 | criticModel.train(False) 253 | actorModel.train() 254 | actionlist, statelist, losslist = [], [], [] 255 | aveLoss = 0. 
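# (descriptive note) The loop below draws samplecnt rollouts of the actor's retain/drop policy for
# the current sentence. Each rollout is scored by the critic's loss on the retained sub-sequence
# plus a length penalty 0.15 * (Rlength / length)**2 that discourages keeping too many words.
# Further down, the REINFORCE-style update scales grad(log pi(chosen action)) at every position by
# alpha * (rollout_loss - average_loss); stepping the optimizer on these accumulated gradients
# lowers the probability of action sequences that scored worse than the sample average, and the
# result is applied to the active policy before the target policy is soft-updated with rate tau.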
256 | for i in range(samplecnt): 257 | actions, states, Rinput, Rlength = Sampling_RL(actorModel, criticModel, x, criticModel.wordvector_find(x), length, epsilon, Random=True) 258 | ''' 259 | if (steps) % 50 == 0: 260 | criticModel.eval() 261 | actorModel.eval() 262 | act, _, _, _ = Sampling_RL(actorModel, criticModel, x, criticModel.wordvector_find(x), length, epsilon, Random=False) 263 | print(act, "\n\n") 264 | criticModel.train() 265 | actorModel.train() 266 | ''' 267 | actionlist.append(actions) 268 | statelist.append(states) 269 | out = criticModel(Rinput, scope = "target") 270 | loss_ = loss_fn(out, y) 271 | loss_ += (float(Rlength) / length) **2 *0.15 272 | aveloss += loss_ 273 | losslist.append(loss_) 274 | ''' 275 | if (steps) % 50 == 0: 276 | print("-------------------------------------------") 277 | ''' 278 | aveloss /= samplecnt 279 | totloss += aveloss 280 | grad1 = None 281 | grad2 = None 282 | grad3 = None 283 | flag = 0 284 | if LSTM_train: 285 | #print("RL and LSTM True") 286 | criticModel.train() 287 | actorModel.train() 288 | critic_active_optimizer.zero_grad() 289 | critic_target_optimizer.zero_grad() 290 | prediction = criticModel(Rinput, scope = "target") 291 | pred[i] = prediction 292 | loss = loss_fn(prediction, y) 293 | loss.backward() 294 | #print(criticModel.active_pred.label.bias.grad, criticModel.target_pred.label.bias.grad) 295 | #print(criticModel.active_pred.label.bias, criticModel.target_pred.label.bias) 296 | criticModel.assign_active_network_gradients() 297 | #print(criticModel.active_pred.label.bias.grad, criticModel.target_pred.label.bias.grad) 298 | critic_active_optimizer.step() 299 | #print(criticModel.active_pred.label.bias, criticModel.target_pred.label.bias) 300 | for i in range(samplecnt): 301 | for pos in range(len(actionlist[i])): 302 | rr = [0, 0] 303 | rr[actionlist[i][pos]] = ((losslist[i] - aveloss) * alpha).cpu().item() 304 | g = actorModel.get_gradient(statelist[i][pos][0], statelist[i][pos][1], rr, scope = "target") 305 | if flag == 0: 306 | grad1 = g[0] 307 | grad2 = g[1] 308 | grad3 = g[2] 309 | flag = 1 310 | else: 311 | grad1 += g[0] 312 | grad2 += g[1] 313 | grad3 += g[2] 314 | #print("++", grad3) 315 | #print("\n\n before: active: ", actorModel.active_policy.b, "target: ", actorModel.target_policy.b, "gradient to be applied: ", grad3) 316 | actor_target_optimizer.zero_grad() 317 | actor_active_optimizer.zero_grad() 318 | #print("previous grad: ", actorModel.active_policy.b.grad) 319 | actorModel.assign_active_network_gradients(grad1, grad2, grad3) 320 | actor_active_optimizer.step() 321 | #print("after: active: ", actorModel.active_policy.b, "target: ", actorModel.target_policy.b) 322 | else: 323 | #print("RL False LSTM True") 324 | criticModel.train() 325 | actorModel.train(False) 326 | critic_active_optimizer.zero_grad() 327 | critic_target_optimizer.zero_grad() 328 | prediction = criticModel(x, scope = "target") 329 | pred[i] = prediction 330 | loss = loss_fn(prediction, y) 331 | avgloss += loss.item() 332 | loss.backward() 333 | criticModel.assign_active_network_gradients() 334 | critic_active_optimizer.step() 335 | 336 | if RL_train: 337 | #print("Again RL True") 338 | criticModel.train(False) 339 | actorModel.train() 340 | #print(actorModel.target_policy.b.data, actorModel.active_policy.b.data) 341 | actorModel.update_target_network() 342 | #print(actorModel.target_policy.b.data, actorModel.active_policy.b.data) 343 | if LSTM_train: 344 | #print("Again RL AND LSTM True") 345 | criticModel.train() 346 | actorModel.train() 347 
| #print(criticModel.active_pred.label.bias, criticModel.target_pred.label.bias) 348 | criticModel.update_target_network() 349 | #print(criticModel.active_pred.label.bias, criticModel.target_pred.label.bias) 350 | 351 | else: 352 | #print("Again RL False and LSTM True") 353 | criticModel.train() 354 | actorModel.train(False) 355 | criticModel.assign_target_network() 356 | avgloss /= batch_size 357 | num_corrects = (torch.max(pred, 1)[1].view(target.size()).data == target.data).float().sum() 358 | acc = 100.0 * num_corrects/len(batch) 359 | steps += 1 360 | 361 | #if steps % 50 == 0: 362 | #print (f'Epoch: {epoch+1}, Idx: {idx+1}, Training Loss: {avgloss:.4f}, Training Accuracy: {acc.item(): .2f}%') 363 | #print(actorModel.target_policy.b.data, actorModel.active_policy.b.data) 364 | 365 | total_epoch_loss += avgloss 366 | total_epoch_acc += acc.item() 367 | 368 | return total_epoch_loss/len(train_iter), total_epoch_acc/len(train_iter) 369 | 370 | 371 | def train_model_without_delay(model, train_iter, epoch): 372 | total_epoch_loss = 0 373 | total_epoch_acc = 0 374 | model.cuda() 375 | optim = torch.optim.Adam(filter(lambda p: p.requires_grad, model.target_pred.parameters())) 376 | steps = 0 377 | model.train() 378 | for idx, batch in enumerate(train_iter): 379 | text = batch.text[0] 380 | target = batch.label 381 | target = torch.autograd.Variable(target).long() 382 | if torch.cuda.is_available(): 383 | text = text.cuda() 384 | target = target.cuda() 385 | if (text.size()[0] is not 32):# One of the batch returned by BucketIterator has length different than 32. 386 | continue 387 | optim.zero_grad() 388 | prediction = model(text, scope = "target") 389 | loss = loss_fn(prediction, target) 390 | num_corrects = (torch.max(prediction, 1)[1].view(target.size()).data == target.data).float().sum() 391 | acc = 100.0 * num_corrects/len(batch) 392 | loss.backward() 393 | clip_gradient(model, 1e-1) 394 | optim.step() 395 | steps += 1 396 | 397 | if steps % 100 == 0: 398 | print (f'Epoch: {epoch+1}, Idx: {idx+1}, Training Loss: {loss.item():.4f}, Training Accuracy: {acc.item(): .2f}%') 399 | 400 | total_epoch_loss += loss.item() 401 | total_epoch_acc += acc.item() 402 | 403 | return total_epoch_loss/len(train_iter), total_epoch_acc/len(train_iter) 404 | 405 | def eval_model(model, val_iter): 406 | total_epoch_loss = 0 407 | total_epoch_acc = 0 408 | model.eval() 409 | with torch.no_grad(): 410 | for idx, batch in enumerate(val_iter): 411 | text = batch.text[0] 412 | if (text.size()[0] is not batch_size): 413 | continue 414 | target = batch.label 415 | target = torch.autograd.Variable(target).long() 416 | if torch.cuda.is_available(): 417 | text = text.cuda() 418 | target = target.cuda() 419 | prediction = model(text, scope = "target") 420 | loss = loss_fn(prediction, target) 421 | num_corrects = (torch.max(prediction, 1)[1].view(target.size()).data == target.data).sum() 422 | acc = 100.0 * num_corrects/len(batch) 423 | total_epoch_loss += loss.item() 424 | total_epoch_acc += acc.item() 425 | 426 | return total_epoch_loss/len(val_iter), total_epoch_acc/len(val_iter) 427 | 428 | def eval_model_RL(criticModel, actorModel, val_iter): 429 | total_epoch_loss = 0 430 | total_epoch_acc = 0 431 | criticModel.eval() 432 | actorModel.eval() 433 | with torch.no_grad(): 434 | for idx, batch in enumerate(val_iter): 435 | if idx % 100 == 0: 436 | print(idx, "/", len(val_iter)) 437 | text = batch.text[0] 438 | if (text.size()[0] is not batch_size): 439 | continue 440 | target = batch.label 441 | lengths = batch.text[1] 
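# (descriptive note) Evaluation runs the actor greedily: the Sampling_RL call below passes
# Random=False, so each token is kept or dropped by the argmax of the policy output (no
# exploration) and the retained sub-sequence is classified by the critic's target network.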
442 | target = torch.autograd.Variable(target).long() 443 | if torch.cuda.is_available(): 444 | text = text.cuda() 445 | target = target.cuda() 446 | batch_loss = 0 447 | pred = torch.zeros(batch_size, 2).cuda() 448 | for i in range(batch_size): 449 | x = text[i].view(1,-1) 450 | y = target[i].view(1) 451 | length = int(lengths[i]) 452 | 453 | actions, states, Rinput, Rlenth = Sampling_RL(actorModel, criticModel, x, criticModel.wordvector_find(x), length, epsilon, Random=False) 454 | #print(x, Rinput, length, Rlenth) 455 | #if (i % 50) == 0: 456 | #print(actions) 457 | prediction = criticModel(Rinput, scope = "target") 458 | loss = loss_fn(prediction, y) 459 | batch_loss += loss 460 | pred[i] = prediction 461 | num_corrects = (torch.max(pred, 1)[1].view(target.size()).data == target.data).sum() 462 | acc = 100.0 * num_corrects/len(batch) 463 | total_epoch_loss += batch_loss.item() 464 | total_epoch_acc += acc.item() 465 | 466 | return total_epoch_loss/len(val_iter), total_epoch_acc/len(val_iter) 467 | 468 | criticModel = critic() 469 | actorModel = actor() 470 | 471 | actorModel.cuda() 472 | criticModel.cuda() 473 | 474 | loss_fn = F.cross_entropy 475 | best_val_acc = 0. 476 | 477 | criticModel.load_state_dict(torch.load('savedModels/critic_with_delay.pt')) 478 | _, best_val_acc = eval_model(criticModel, valid_iter) 479 | print(best_val_acc) 480 | _, best_val_acc = eval_model(criticModel, train_iter) 481 | print(best_val_acc) 482 | 483 | if delay_critic: 484 | for epoch in range(0): 485 | print("Pre-training Critic...") 486 | train_loss, train_acc = train_model(criticModel, actorModel, train_iter, epoch, RL_train = False) 487 | val_loss, val_acc = eval_model(criticModel, valid_iter) 488 | if val_acc > best_val_acc: 489 | torch.save(criticModel.state_dict(), 'savedModels/critic_with_delay.pt') 490 | best_val_acc = val_acc 491 | print("saved Model with acc: ", val_acc) 492 | print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f}%, Val. Loss: {val_loss:3f}, Val. Acc: {val_acc:.2f}%') 493 | else: 494 | for epoch in range(0): 495 | train_loss, train_acc = train_model_without_delay(criticModel, train_iter, epoch) 496 | val_loss, val_acc = eval_model(criticModel, valid_iter) 497 | if val_acc > best_val_acc: 498 | torch.save(criticModel.state_dict(), 'savedModels/critic_without_delay.pt') 499 | best_val_acc = val_acc 500 | print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f}%, Val. Loss: {val_loss:3f}, Val. Acc: {val_acc:.2f}%') 501 | 502 | #val_loss, val_acc = eval_model(criticModel, valid_iter) 503 | #test_loss, test_acc = eval_model(criticModel, test_iter) 504 | #train_loss, train_acc = eval_model(criticModel, train_iter) 505 | epoch = 0 506 | #print("LSTM Pretraining Done: ") 507 | #print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:3f}, Train Acc: {train_acc:.2f}, Val. Loss: {val_loss:3f}, Val. Acc: {val_acc:.2f}, Test. Loss: {test_loss:3f}, Test. Acc: {test_acc:.2f}%') 508 | 509 | ''' 510 | val_loss, val_acc = eval_model_RL(criticModel, actorModel, valid_iter) 511 | print(val_loss, val_acc) 512 | asaas 513 | ''' 514 | 515 | ''' 516 | val_loss, val_acc = eval_model_RL(criticModel, actorModel, valid_iter) 517 | print(f'Epoch: {epoch+1:02}, Val. Loss: {val_loss:3f}, Val. 
Acc: {val_acc:.2f}%') 518 | asas 519 | ''' 520 | ''' 521 | actorModel.load_state_dict(torch.load('savedModels/actor_with_delay.pt')) 522 | print("Model loaded after epoch 10") 523 | print("Starting Reinforcement....") 524 | _, best_val_acc2 = eval_model_RL(criticModel, actorModel, valid_iter) 525 | print(best_val_acc2) 526 | _, best_val_acc2 = eval_model_RL(criticModel, actorModel, train_iter) 527 | print(best_val_acc2) 528 | ''' 529 | 530 | best_val_acc1 = 810.5 531 | for epoch in range(0): 532 | train_loss, train_acc = train_model(criticModel, actorModel, train_iter, epoch, LSTM_train = False) 533 | val_loss, val_acc = eval_model_RL(criticModel, actorModel, valid_iter) 534 | if val_acc > best_val_acc1: 535 | torch.save(actorModel.state_dict(), 'savedModels/actor_with_delay.pt') 536 | best_val_acc1 = val_acc 537 | print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f}%, Val. Loss: {val_loss:3f}, Val. Acc: {val_acc:.2f}%') 538 | print("Reinforcement Done!!!!") 539 | ''' 540 | actorModel.load_state_dict(torch.load('savedModels/actor_with_delay.pt')) 541 | print("Model Loaded..") 542 | 543 | val_loss, val_acc = eval_model_RL(criticModel, actorModel, valid_iter) 544 | print(val_acc) 545 | ''' 546 | criticModel.load_state_dict(torch.load('savedModels/critic_with_delay_joint.pt')) 547 | actorModel.load_state_dict(torch.load('savedModels/actor_with_delay_joint.pt')) 548 | _, best_val_acc2 = eval_model_RL(criticModel, actorModel, valid_iter) 549 | print(best_val_acc2) 550 | _, best_val_acc2 = eval_model_RL(criticModel, actorModel, train_iter) 551 | print(best_val_acc2) 552 | asasa 553 | for epoch in range(0): 554 | train_loss, train_acc = train_model(criticModel, actorModel, train_iter, epoch) 555 | val_loss, val_acc = eval_model_RL(criticModel, actorModel, valid_iter) 556 | print(val_acc) 557 | if val_acc > best_val_acc2: 558 | torch.save(actorModel.state_dict(), 'savedModels/actor_with_delay_joint.pt') 559 | torch.save(criticModel.state_dict(), 'savedModels/critic_with_delay_joint.pt') 560 | best_val_acc2 = val_acc 561 | print("----Mdoel Saved-----") 562 | 563 | 564 | criticModel.load_state_dict(torch.load('savedModels/critic_with_delay_joint.pt')) 565 | actorModel.load_state_dict(torch.load('savedModels/actor_with_delay_joint.pt')) 566 | test_loss, test_acc = eval_model_RL(criticModel, actorModel, valid_iter) 567 | print(test_acc) 568 | test_loss, test_acc = eval_model_RL(criticModel, actorModel, test_iter) 569 | print(test_acc) 570 | 571 | ''' 572 | #Let us now predict the sentiment on a single sentence just for the testing purpose 573 | 574 | test_sen1 = "This is one of the best creation of Nolan. I can say, it's his magnum opus. Loved the soundtrack and especially those creative dialogues." 575 | test_sen2 = "Ohh, such a ridiculous movie. Not gonna recommend it to anyone. Complete waste of time and money." 
576 | 577 | test_sen1 = TEXT.preprocess(test_sen1) 578 | test_sen1 = [[TEXT.vocab.stoi[x] for x in test_sen1]] 579 | 580 | test_sen2 = TEXT.preprocess(test_sen2) 581 | test_sen2 = [[TEXT.vocab.stoi[x] for x in test_sen2]] 582 | 583 | test_sen = np.asarray(test_sen1) 584 | test_sen = torch.LongTensor(test_sen) 585 | test_tensor = Variable(test_sen, volatile=True) 586 | test_tensor = test_tensor.cuda() 587 | model.eval() 588 | output = model(test_tensor, 1) 589 | out = F.softmax(output, 1) 590 | if (torch.argmax(out[0]) == 1): 591 | print ("Sentiment: Positive") 592 | else: 593 | print ("Sentiment: Negative") 594 | ''' -------------------------------------------------------------------------------- /models/CNN.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | import torch 4 | import torch.nn as nn 5 | from torch.autograd import Variable 6 | from torch.nn import functional as F 7 | 8 | class CNN(nn.Module): 9 | def __init__(self, batch_size, output_size, in_channels, out_channels, kernel_heights, stride, padding, keep_probab, vocab_size, embedding_length, weights): 10 | super(CNN, self).__init__() 11 | 12 | """ 13 | Arguments 14 | --------- 15 | batch_size : Size of each batch which is same as the batch_size of the data returned by the TorchText BucketIterator 16 | output_size : 2 = (pos, neg) 17 | in_channels : Number of input channels. Here it is 1 as the input data has dimension = (batch_size, num_seq, embedding_length) 18 | out_channels : Number of output channels after convolution operation performed on the input matrix 19 | kernel_heights : A list consisting of 3 different kernel_heights. Convolution will be performed 3 times and finally results from each kernel_height will be concatenated. 
20 | keep_probab : Probability of retaining an activation node during dropout operation 21 | vocab_size : Size of the vocabulary containing unique words 22 | embedding_length : Embedding dimension of GloVe word embeddings 23 | weights : Pre-trained GloVe word_embeddings which we will use to create our word_embedding look-up table 24 | -------- 25 | 26 | """ 27 | self.batch_size = batch_size 28 | self.output_size = output_size 29 | self.in_channels = in_channels 30 | self.out_channels = out_channels 31 | self.kernel_heights = kernel_heights 32 | self.stride = stride 33 | self.padding = padding 34 | self.vocab_size = vocab_size 35 | self.embedding_length = embedding_length 36 | 37 | self.word_embeddings = nn.Embedding(vocab_size, embedding_length) 38 | self.word_embeddings.weight = nn.Parameter(weights, requires_grad=False) 39 | self.conv1 = nn.Conv2d(in_channels, out_channels, (kernel_heights[0], embedding_length), stride, padding) 40 | self.conv2 = nn.Conv2d(in_channels, out_channels, (kernel_heights[1], embedding_length), stride, padding) 41 | self.conv3 = nn.Conv2d(in_channels, out_channels, (kernel_heights[2], embedding_length), stride, padding) 42 | self.dropout = nn.Dropout(keep_probab) 43 | self.label = nn.Linear(len(kernel_heights)*out_channels, output_size) 44 | 45 | def conv_block(self, input, conv_layer): 46 | conv_out = conv_layer(input)# conv_out.size() = (batch_size, out_channels, dim, 1) 47 | activation = F.relu(conv_out.squeeze(3))# activation.size() = (batch_size, out_channels, dim1) 48 | max_out = F.max_pool1d(activation, activation.size()[2]).squeeze(2)# maxpool_out.size() = (batch_size, out_channels) 49 | 50 | return max_out 51 | 52 | def forward(self, input_sentences, batch_size=None): 53 | 54 | """ 55 | The idea of the Convolutional Neural Netwok for Text Classification is very simple. We perform convolution operation on the embedding matrix 56 | whose shape for each batch is (num_seq, embedding_length) with kernel of varying height but constant width which is same as the embedding_length. 57 | We will be using ReLU activation after the convolution operation and then for each kernel height, we will use max_pool operation on each tensor 58 | and will filter all the maximum activation for every channel and then we will concatenate the resulting tensors. This output is then fully connected 59 | to the output layers consisting two units which basically gives us the logits for both positive and negative classes. 60 | 61 | Parameters 62 | ---------- 63 | input_sentences: input_sentences of shape = (batch_size, num_sequences) 64 | batch_size : default = None. Used only for prediction on a single sentence after training (batch_size = 1) 65 | 66 | Returns 67 | ------- 68 | Output of the linear layer containing logits for pos & neg class. 
69 | logits.size() = (batch_size, output_size) 70 | 71 | """ 72 | 73 | input = self.word_embeddings(input_sentences) 74 | # input.size() = (batch_size, num_seq, embedding_length) 75 | input = input.unsqueeze(1) 76 | # input.size() = (batch_size, 1, num_seq, embedding_length) 77 | max_out1 = self.conv_block(input, self.conv1) 78 | max_out2 = self.conv_block(input, self.conv2) 79 | max_out3 = self.conv_block(input, self.conv3) 80 | 81 | all_out = torch.cat((max_out1, max_out2, max_out3), 1) 82 | # all_out.size() = (batch_size, num_kernels*out_channels) 83 | fc_in = self.dropout(all_out) 84 | # fc_in.size()) = (batch_size, num_kernels*out_channels) 85 | logits = self.label(fc_in) 86 | 87 | return logits 88 | -------------------------------------------------------------------------------- /models/LSTM.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | import torch 4 | import torch.nn as nn 5 | from torch.autograd import Variable 6 | from torch.nn import functional as F 7 | 8 | class LSTMClassifier(nn.Module): 9 | def __init__(self, batch_size, output_size, hidden_size, vocab_size, embedding_length, weights): 10 | super(LSTMClassifier, self).__init__() 11 | 12 | """ 13 | Arguments 14 | --------- 15 | batch_size : Size of the batch which is same as the batch_size of the data returned by the TorchText BucketIterator 16 | output_size : 2 = (pos, neg) 17 | hidden_sie : Size of the hidden_state of the LSTM 18 | vocab_size : Size of the vocabulary containing unique words 19 | embedding_length : Embeddding dimension of GloVe word embeddings 20 | weights : Pre-trained GloVe word_embeddings which we will use to create our word_embedding look-up table 21 | 22 | """ 23 | 24 | self.batch_size = batch_size 25 | self.output_size = output_size 26 | self.hidden_size = hidden_size 27 | self.vocab_size = vocab_size 28 | self.embedding_length = embedding_length 29 | 30 | self.word_embeddings = nn.Embedding(vocab_size, embedding_length)# Initializing the look-up table. 31 | self.word_embeddings.weight = nn.Parameter(weights, requires_grad=False) # Assigning the look-up table to the pre-trained GloVe word embedding. 32 | self.lstm = nn.LSTM(embedding_length, hidden_size) 33 | self.label = nn.Linear(hidden_size, output_size) 34 | 35 | def forward(self, input_sentence, batch_size=None): 36 | 37 | """ 38 | Parameters 39 | ---------- 40 | input_sentence: input_sentence of shape = (batch_size, num_sequences) 41 | batch_size : default = None. 
Used only for prediction on a single sentence after training (batch_size = 1) 42 | 43 | Returns 44 | ------- 45 | Output of the linear layer containing logits for positive & negative class which receives its input as the final_hidden_state of the LSTM 46 | final_output.shape = (batch_size, output_size) 47 | 48 | """ 49 | 50 | ''' Here we will map all the indexes present in the input sequence to the corresponding word vector using our pre-trained word_embedddins.''' 51 | input = self.word_embeddings(input_sentence) # embedded input of shape = (batch_size, num_sequences, embedding_length) 52 | input = input.permute(1, 0, 2) # input.size() = (num_sequences, batch_size, embedding_length) 53 | batch_size = input.size(1) 54 | if batch_size is None: 55 | h_0 = Variable(torch.zeros(1, self.batch_size, self.hidden_size).cuda()) # Initial hidden state of the LSTM 56 | c_0 = Variable(torch.zeros(1, self.batch_size, self.hidden_size).cuda()) # Initial cell state of the LSTM 57 | else: 58 | h_0 = Variable(torch.zeros(1, batch_size, self.hidden_size).cuda()) 59 | c_0 = Variable(torch.zeros(1, batch_size, self.hidden_size).cuda()) 60 | output, (final_hidden_state, final_cell_state) = self.lstm(input, (h_0, c_0)) 61 | final_output = self.label(final_hidden_state[-1]) # final_hidden_state.size() = (1, batch_size, hidden_size) & final_output.size() = (batch_size, output_size) 62 | 63 | return final_output 64 | 65 | def wordvector_find(self, x): 66 | return self.word_embeddings(x) 67 | 68 | def getNextHiddenState(self, hc, x): 69 | hidden = hc[0,0:self.hidden_size].view(1,1,self.hidden_size) 70 | cell = hc[0,self.hidden_size:].view(1,1,self.hidden_size) 71 | input = self.word_embeddings(x).view(1,1,-1) 72 | out, hidden = self.lstm(input, [hidden, cell]) 73 | hidden = torch.cat([hidden[0], hidden[1]], -1).view(1, -1) 74 | return out, hidden -------------------------------------------------------------------------------- /models/LSTM_Attn.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | import torch 4 | import torch.nn as nn 5 | from torch.autograd import Variable 6 | from torch.nn import functional as F 7 | import numpy as np 8 | 9 | class AttentionModel(torch.nn.Module): 10 | def __init__(self, batch_size, output_size, hidden_size, vocab_size, embedding_length, weights): 11 | super(AttentionModel, self).__init__() 12 | 13 | """ 14 | Arguments 15 | --------- 16 | batch_size : Size of the batch which is same as the batch_size of the data returned by the TorchText BucketIterator 17 | output_size : 2 = (pos, neg) 18 | hidden_sie : Size of the hidden_state of the LSTM 19 | vocab_size : Size of the vocabulary containing unique words 20 | embedding_length : Embeddding dimension of GloVe word embeddings 21 | weights : Pre-trained GloVe word_embeddings which we will use to create our word_embedding look-up table 22 | 23 | -------- 24 | 25 | """ 26 | 27 | self.batch_size = batch_size 28 | self.output_size = output_size 29 | self.hidden_size = hidden_size 30 | self.vocab_size = vocab_size 31 | self.embedding_length = embedding_length 32 | 33 | self.word_embeddings = nn.Embedding(vocab_size, embedding_length) 34 | self.word_embeddings.weights = nn.Parameter(weights, requires_grad=False) 35 | self.lstm = nn.LSTM(embedding_length, hidden_size) 36 | self.label = nn.Linear(hidden_size, output_size) 37 | #self.attn_fc_layer = nn.Linear() 38 | 39 | def attention_net(self, lstm_output, final_state): 40 | 41 | """ 42 | Now we will incorporate Attention 
mechanism in our LSTM model. In this new model, we will use attention to compute soft alignment score corresponding 43 | between each of the hidden_state and the last hidden_state of the LSTM. We will be using torch.bmm for the batch matrix multiplication. 44 | 45 | Arguments 46 | --------- 47 | 48 | lstm_output : Final output of the LSTM which contains hidden layer outputs for each sequence. 49 | final_state : Final time-step hidden state (h_n) of the LSTM 50 | 51 | --------- 52 | 53 | Returns : It performs attention mechanism by first computing weights for each of the sequence present in lstm_output and and then finally computing the 54 | new hidden state. 55 | 56 | Tensor Size : 57 | hidden.size() = (batch_size, hidden_size) 58 | attn_weights.size() = (batch_size, num_seq) 59 | soft_attn_weights.size() = (batch_size, num_seq) 60 | new_hidden_state.size() = (batch_size, hidden_size) 61 | 62 | """ 63 | 64 | hidden = final_state.squeeze(0) 65 | attn_weights = torch.bmm(lstm_output, hidden.unsqueeze(2)).squeeze(2) 66 | soft_attn_weights = F.softmax(attn_weights, 1) 67 | new_hidden_state = torch.bmm(lstm_output.transpose(1, 2), soft_attn_weights.unsqueeze(2)).squeeze(2) 68 | 69 | return new_hidden_state 70 | 71 | def forward(self, input_sentences, batch_size=None): 72 | 73 | """ 74 | Parameters 75 | ---------- 76 | input_sentence: input_sentence of shape = (batch_size, num_sequences) 77 | batch_size : default = None. Used only for prediction on a single sentence after training (batch_size = 1) 78 | 79 | Returns 80 | ------- 81 | Output of the linear layer containing logits for pos & neg class which receives its input as the new_hidden_state which is basically the output of the Attention network. 82 | final_output.shape = (batch_size, output_size) 83 | 84 | """ 85 | 86 | input = self.word_embeddings(input_sentences) 87 | input = input.permute(1, 0, 2) 88 | if batch_size is None: 89 | h_0 = Variable(torch.zeros(1, self.batch_size, self.hidden_size).cuda()) 90 | c_0 = Variable(torch.zeros(1, self.batch_size, self.hidden_size).cuda()) 91 | else: 92 | h_0 = Variable(torch.zeros(1, batch_size, self.hidden_size).cuda()) 93 | c_0 = Variable(torch.zeros(1, batch_size, self.hidden_size).cuda()) 94 | 95 | output, (final_hidden_state, final_cell_state) = self.lstm(input, (h_0, c_0)) # final_hidden_state.size() = (1, batch_size, hidden_size) 96 | output = output.permute(1, 0, 2) # output.size() = (batch_size, num_seq, hidden_size) 97 | 98 | attn_output = self.attention_net(output, final_hidden_state) 99 | logits = self.label(attn_output) 100 | 101 | return logits 102 | -------------------------------------------------------------------------------- /models/RCNN.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | import torch 4 | import torch.nn as nn 5 | from torch.autograd import Variable 6 | from torch.nn import functional as F 7 | 8 | class RCNN(nn.Module): 9 | def __init__(self, batch_size, output_size, hidden_size, vocab_size, embedding_length, weights): 10 | super(RCNN, self).__init__() 11 | 12 | """ 13 | Arguments 14 | --------- 15 | batch_size : Size of the batch which is same as the batch_size of the data returned by the TorchText BucketIterator 16 | output_size : 2 = (pos, neg) 17 | hidden_sie : Size of the hidden_state of the LSTM 18 | vocab_size : Size of the vocabulary containing unique words 19 | embedding_length : Embedding dimension of GloVe word embeddings 20 | weights : Pre-trained GloVe word_embeddings which we will 
use to create our word_embedding look-up table 21 | 22 | """ 23 | 24 | self.batch_size = batch_size 25 | self.output_size = output_size 26 | self.hidden_size = hidden_size 27 | self.vocab_size = vocab_size 28 | self.embedding_length = embedding_length 29 | 30 | self.word_embeddings = nn.Embedding(vocab_size, embedding_length)# Initializing the look-up table. 31 | self.word_embeddings.weight = nn.Parameter(weights, requires_grad=False) # Assigning the look-up table to the pre-trained GloVe word embedding. 32 | self.dropout = 0.8 33 | self.lstm = nn.LSTM(embedding_length, hidden_size, dropout=self.dropout, bidirectional=True) 34 | self.W2 = nn.Linear(2*hidden_size+embedding_length, hidden_size) 35 | self.label = nn.Linear(hidden_size, output_size) 36 | 37 | def forward(self, input_sentence, batch_size=None): 38 | 39 | """ 40 | Parameters 41 | ---------- 42 | input_sentence: input_sentence of shape = (batch_size, num_sequences) 43 | batch_size : default = None. Used only for prediction on a single sentence after training (batch_size = 1) 44 | 45 | Returns 46 | ------- 47 | Output of the linear layer containing logits for positive & negative class which receives its input as the final_hidden_state of the LSTM 48 | final_output.shape = (batch_size, output_size) 49 | 50 | """ 51 | 52 | """ 53 | 54 | The idea of the paper "Recurrent Convolutional Neural Networks for Text Classification" is that we pass the embedding vector 55 | of the text sequences through a bidirectional LSTM and then for each sequence, our final embedding vector is the concatenation of 56 | its own GloVe embedding and the left and right contextual embedding which in bidirectional LSTM is same as the corresponding hidden 57 | state. This final embedding is passed through a linear layer which maps this long concatenated encoding vector back to the hidden_size 58 | vector. After this step, we use a max pooling layer across all sequences of texts. This converts any varying length text into a fixed 59 | dimension tensor of size (batch_size, hidden_size) and finally we map this to the output layer. 
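        As a concrete (hypothetical) shape walk-through using batch_size=5, num_sequences=200,
        embedding_length=300 and hidden_size=300: the embedded input permuted to (200, 5, 300)
        passes through the bidirectional LSTM to give output of shape (200, 5, 600); concatenating
        it with the embeddings and permuting yields final_encoding of shape (5, 200, 900); W2 maps
        this to (5, 200, 300); max-pooling over the sequence dimension gives (5, 300); and the
        label layer produces logits of shape (5, 2).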
60 | 61 | """ 62 | input = self.word_embeddings(input_sentence) # embedded input of shape = (batch_size, num_sequences, embedding_length) 63 | input = input.permute(1, 0, 2) # input.size() = (num_sequences, batch_size, embedding_length) 64 | if batch_size is None: 65 | h_0 = Variable(torch.zeros(2, self.batch_size, self.hidden_size).cuda()) # Initial hidden state of the LSTM 66 | c_0 = Variable(torch.zeros(2, self.batch_size, self.hidden_size).cuda()) # Initial cell state of the LSTM 67 | else: 68 | h_0 = Variable(torch.zeros(2, batch_size, self.hidden_size).cuda()) 69 | c_0 = Variable(torch.zeros(2, batch_size, self.hidden_size).cuda()) 70 | 71 | output, (final_hidden_state, final_cell_state) = self.lstm(input, (h_0, c_0)) 72 | 73 | final_encoding = torch.cat((output, input), 2).permute(1, 0, 2) 74 | y = self.W2(final_encoding) # y.size() = (batch_size, num_sequences, hidden_size) 75 | y = y.permute(0, 2, 1) # y.size() = (batch_size, hidden_size, num_sequences) 76 | y = F.max_pool1d(y, y.size()[2]) # y.size() = (batch_size, hidden_size, 1) 77 | y = y.squeeze(2) 78 | logits = self.label(y) 79 | 80 | return logits 81 | -------------------------------------------------------------------------------- /models/RNN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.autograd import Variable 4 | from torch.nn import functional as F 5 | 6 | class RNN(nn.Module): 7 | def __init__(self, batch_size, output_size, hidden_size, vocab_size, embedding_length, weights): 8 | super(RNN, self).__init__() 9 | 10 | """ 11 | Arguments 12 | --------- 13 | batch_size : Size of the batch which is same as the batch_size of the data returned by the TorchText BucketIterator 14 | output_size : 2 = (pos, neg) 15 | hidden_sie : Size of the hidden_state of the LSTM 16 | vocab_size : Size of the vocabulary containing unique words 17 | embedding_length : Embeddding dimension of GloVe word embeddings 18 | weights : Pre-trained GloVe word_embeddings which we will use to create our word_embedding look-up table 19 | 20 | """ 21 | 22 | self.batch_size = batch_size 23 | self.output_size = output_size 24 | self.hidden_size = hidden_size 25 | self.vocab_size = vocab_size 26 | self.embedding_length = embedding_length 27 | 28 | self.word_embeddings = nn.Embedding(vocab_size, embedding_length) 29 | self.word_embeddings.weight = nn.Parameter(weights, requires_grad=False) 30 | self.rnn = nn.RNN(embedding_length, hidden_size, num_layers=2, bidirectional=True) 31 | self.label = nn.Linear(4*hidden_size, output_size) 32 | 33 | def forward(self, input_sentences, batch_size=None): 34 | 35 | """ 36 | Parameters 37 | ---------- 38 | input_sentence: input_sentence of shape = (batch_size, num_sequences) 39 | batch_size : default = None. Used only for prediction on a single sentence after training (batch_size = 1) 40 | 41 | Returns 42 | ------- 43 | Output of the linear layer containing logits for pos & neg class which receives its input as the final_hidden_state of RNN. 
44 | logits.size() = (batch_size, output_size) 45 | 46 | """ 47 | 48 | input = self.word_embeddings(input_sentences) 49 | input = input.permute(1, 0, 2) 50 | if batch_size is None: 51 | h_0 = Variable(torch.zeros(4, self.batch_size, self.hidden_size).cuda()) # 4 = num_layers*num_directions 52 | else: 53 | h_0 = Variable(torch.zeros(4, batch_size, self.hidden_size).cuda()) 54 | output, h_n = self.rnn(input, h_0) 55 | # h_n.size() = (4, batch_size, hidden_size) 56 | h_n = h_n.permute(1, 0, 2) # h_n.size() = (batch_size, 4, hidden_size) 57 | h_n = h_n.contiguous().view(h_n.size()[0], h_n.size()[1]*h_n.size()[2]) 58 | # h_n.size() = (batch_size, 4*hidden_size) 59 | logits = self.label(h_n) # logits.size() = (batch_size, output_size) 60 | 61 | return logits 62 | -------------------------------------------------------------------------------- /models/__pycache__/LSTM.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/navid5792/ID-LSTM-pytorch/30f457f8c37f4b60f9a17b221f718a07e8a25ff3/models/__pycache__/LSTM.cpython-36.pyc -------------------------------------------------------------------------------- /models/__pycache__/LSTM.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/navid5792/ID-LSTM-pytorch/30f457f8c37f4b60f9a17b221f718a07e8a25ff3/models/__pycache__/LSTM.cpython-37.pyc -------------------------------------------------------------------------------- /models/selfAttention.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | import torch 4 | import torch.nn as nn 5 | from torch.autograd import Variable 6 | from torch.nn import functional as F 7 | 8 | class SelfAttention(nn.Module): 9 | def __init__(self, batch_size, output_size, hidden_size, vocab_size, embedding_length, weights): 10 | super(SelfAttention, self).__init__() 11 | 12 | """ 13 | Arguments 14 | --------- 15 | batch_size : Size of the batch which is same as the batch_size of the data returned by the TorchText BucketIterator 16 | output_size : 2 = (pos, neg) 17 | hidden_sie : Size of the hidden_state of the LSTM 18 | vocab_size : Size of the vocabulary containing unique words 19 | embedding_length : Embeddding dimension of GloVe word embeddings 20 | weights : Pre-trained GloVe word_embeddings which we will use to create our word_embedding look-up table 21 | 22 | -------- 23 | 24 | """ 25 | 26 | self.batch_size = batch_size 27 | self.output_size = output_size 28 | self.hidden_size = hidden_size 29 | self.vocab_size = vocab_size 30 | self.embedding_length = embedding_length 31 | self.weights = weights 32 | 33 | self.word_embeddings = nn.Embedding(vocab_size, embedding_length) 34 | self.word_embeddings.weights = nn.Parameter(weights, requires_grad=False) 35 | self.dropout = 0.8 36 | self.bilstm = nn.LSTM(embedding_length, hidden_size, dropout=self.dropout, bidirectional=True) 37 | # We will use da = 350, r = 30 & penalization_coeff = 1 as per given in the self-attention original ICLR paper 38 | self.W_s1 = nn.Linear(2*hidden_size, 350) 39 | self.W_s2 = nn.Linear(350, 30) 40 | self.fc_layer = nn.Linear(30*2*hidden_size, 2000) 41 | self.label = nn.Linear(2000, output_size) 42 | 43 | def attention_net(self, lstm_output): 44 | 45 | """ 46 | Now we will use self attention mechanism to produce a matrix embedding of the input sentence in which every row represents an 47 | encoding of the inout sentence but giving 
an attention to a specific part of the sentence. We will use 30 such embedding of 48 | the input sentence and then finally we will concatenate all the 30 sentence embedding vectors and connect it to a fully 49 | connected layer of size 2000 which will be connected to the output layer of size 2 returning logits for our two classes i.e., 50 | pos & neg. 51 | 52 | Arguments 53 | --------- 54 | 55 | lstm_output = A tensor containing hidden states corresponding to each time step of the LSTM network. 56 | --------- 57 | 58 | Returns : Final Attention weight matrix for all the 30 different sentence embedding in which each of 30 embeddings give 59 | attention to different parts of the input sentence. 60 | 61 | Tensor size : lstm_output.size() = (batch_size, num_seq, 2*hidden_size) 62 | attn_weight_matrix.size() = (batch_size, 30, num_seq) 63 | 64 | """ 65 | attn_weight_matrix = self.W_s2(F.tanh(self.W_s1(lstm_output))) 66 | attn_weight_matrix = attn_weight_matrix.permute(0, 2, 1) 67 | attn_weight_matrix = F.softmax(attn_weight_matrix, dim=2) 68 | 69 | return attn_weight_matrix 70 | 71 | def forward(self, input_sentences, batch_size=None): 72 | 73 | """ 74 | Parameters 75 | ---------- 76 | input_sentence: input_sentence of shape = (batch_size, num_sequences) 77 | batch_size : default = None. Used only for prediction on a single sentence after training (batch_size = 1) 78 | 79 | Returns 80 | ------- 81 | Output of the linear layer containing logits for pos & neg class. 82 | 83 | """ 84 | 85 | input = self.word_embeddings(input_sentences) 86 | input = input.permute(1, 0, 2) 87 | if batch_size is None: 88 | h_0 = Variable(torch.zeros(2, self.batch_size, self.hidden_size).cuda()) 89 | c_0 = Variable(torch.zeros(2, self.batch_size, self.hidden_size).cuda()) 90 | else: 91 | h_0 = Variable(torch.zeros(2, batch_size, self.hidden_size).cuda()) 92 | c_0 = Variable(torch.zeros(2, batch_size, self.hidden_size).cuda()) 93 | 94 | output, (h_n, c_n) = self.bilstm(input, (h_0, c_0)) 95 | output = output.permute(1, 0, 2) 96 | # output.size() = (batch_size, num_seq, 2*hidden_size) 97 | # h_n.size() = (1, batch_size, hidden_size) 98 | # c_n.size() = (1, batch_size, hidden_size) 99 | attn_weight_matrix = self.attention_net(output) 100 | # attn_weight_matrix.size() = (batch_size, r, num_seq) 101 | # output.size() = (batch_size, num_seq, 2*hidden_size) 102 | hidden_matrix = torch.bmm(attn_weight_matrix, output) 103 | # hidden_matrix.size() = (batch_size, r, 2*hidden_size) 104 | # Let's now concatenate the hidden_matrix and connect it to the fully connected layer. 105 | fc_out = self.fc_layer(hidden_matrix.view(-1, hidden_matrix.size()[1]*hidden_matrix.size()[2])) 106 | logits = self.label(fc_out) 107 | # logits.size() = (batch_size, output_size) 108 | 109 | return logits -------------------------------------------------------------------------------- /run history.txt: -------------------------------------------------------------------------------- 1 | without delay 82.40% 2 | 3 | Just Critic Val. Loss: 0.253031, Val. Acc: 90.45, Test. Loss: 0.427570, Test. Acc: 82.20% 4 | 5 | Just Actor Val. Acc: 81.54, Test. 
Acc: 78.456 6 | 7 | Critic + Actor Train, 8 | Save models based on just critic 84.20 79.692 9 | 10 | Critic was trained for 10 epochs 11 | Actor was trained for 10 epochs 12 | 13 | proper way after 10 epochs 88.94 82.068 14 | 15 | proper way after 30 epochs 94.76 83.568 16 | 17 | 18 | usually delay works better for smaller batch sizes: verified with TF code 19 | Epoch: 30 20 | BS 5 - 80.8 21 | BS 50 - 76.2 22 | 23 | 24 | 25 | Tensorflow 26 | 27 | train 4000 28 | test 1000 29 | validation 500 30 | 31 | delay with BS 5 87.0 loss 0.10 32 | delay with BS 1 83.0 loss 0.10 33 | immediate update target with BS 5 which means BS 1 88.0 34 | 35 | 36 | 37 | Just critic train 38 | valid 90.57333333333334 39 | train 90.56571428571428 40 | 41 | Just actor train 42 | valid 81.17 43 | train 81.32 44 | 45 | critic and actor train 46 | valid 94.73 47 | train 94.90 48 | --------------------------------------------------------------------------------