├── README.md
├── rnn_attention.py
└── main.py

/README.md:
--------------------------------------------------------------------------------
# PTB Language Modelling with RNNs (LSTM, GRU) and an Attention Layer

This repository was built for a language-modelling Pareto competition at TTIC.
I implemented an attention layer on top of the RNN model.
TODO: Lei Mao suggests an alternative implementation that integrates the attention layer directly into the LSTM class.

## Software Requirements

This codebase requires Python 3 and [PyTorch](http://pytorch.org/).

## Usage

```bash
python main.py --att --att_width 20  # Train an LSTM on PTB with an attention layer of width 20
python generate.py                   # Generate samples from the trained LSTM model
```

## Acknowledgements

This code was originally forked from the PyTorch [Word-level language modeling RNN](https://github.com/pytorch/examples/tree/master/word_language_model) example and modified to add an attention layer to the model.
--------------------------------------------------------------------------------
/rnn_attention.py:
--------------------------------------------------------------------------------
import torch
from torch.autograd import Variable
import torch.nn as nn
import numpy as np


def batch_matmul(seq, weight, nonlinearity=''):
    """Multiply every time step of `seq` (seq_len, batch, nhid) by `weight`,
    optionally apply a tanh, and stack the results along dim 0."""
    s = None
    for i in range(seq.size(0)):
        _s = torch.mm(seq[i], weight)
        if nonlinearity == 'tanh':
            _s = torch.tanh(_s)
        _s = _s.unsqueeze(0)
        s = _s if s is None else torch.cat((s, _s), 0)
    return s.squeeze()


class RNNModel(nn.Module):
    """Container module with an encoder, a recurrent module, and a decoder."""

    def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, dropout=0.5, tie_weights=False, attention=False,
                 attention_width=3, cuda=False):
        super(RNNModel, self).__init__()
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        if rnn_type in ['LSTM', 'GRU']:
            self.rnn = getattr(nn, rnn_type)(ninp, nhid, nlayers, dropout=dropout)
        else:
            try:
                nonlinearity = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}[rnn_type]
            except KeyError:
                raise ValueError("""An invalid option for `--model` was supplied,
                                 options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']""")
            self.rnn = nn.RNN(ninp, nhid, nlayers, nonlinearity=nonlinearity, dropout=dropout)
        # The attention layer preserves the hidden size, so the decoder is the
        # same with or without attention.
        self.decoder = nn.Linear(nhid, ntoken)
        if tie_weights:
            if nhid != ninp:
                raise ValueError('When using the tied flag, nhid must be equal to emsize')
            self.decoder.weight = self.encoder.weight

        self.softmax = nn.Softmax()
        if attention:
            self.AttentionLayer = AttentionLayer(cuda, nhid)
        self.init_weights()

        self.rnn_type = rnn_type
        self.nhid = nhid
        self.nlayers = nlayers
        self.attention = attention
        self.attention_width = attention_width

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, input, hidden):
        # input: (seq_len, batch) token indices
        emb = self.drop(self.encoder(input))          # (seq_len, batch, ninp)
        output, hidden = self.rnn(emb, hidden)        # (seq_len, batch, nhid)
        if self.attention:
            output = self.AttentionLayer(output, self.attention_width)
        output = self.drop(output)
        decoded = self.decoder(output.view(output.size(0) * output.size(1), output.size(2)))
        return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden

    def init_hidden(self, bsz):
        weight = next(self.parameters()).data
        if self.rnn_type == 'LSTM':
            return (Variable(weight.new(self.nlayers, bsz, self.nhid).zero_()),
                    Variable(weight.new(self.nlayers, bsz, self.nhid).zero_()))
        else:
            return Variable(weight.new(self.nlayers, bsz, self.nhid).zero_())


class AttentionLayer(nn.Module):
    """Implements an attention layer over a sliding window of RNN hidden states."""

    def __init__(self, cuda, nhid):
        super(AttentionLayer, self).__init__()
        self.nhid = nhid
        self.weight_W = nn.Parameter(torch.Tensor(nhid, nhid))
        self.weight_proj = nn.Parameter(torch.Tensor(nhid, 1))
        self.softmax = nn.Softmax()
        self.weight_W.data.uniform_(-0.1, 0.1)
        self.weight_proj.data.uniform_(-0.1, 0.1)
        # Stored as `use_cuda` so we do not shadow nn.Module.cuda().
        self.use_cuda = cuda

    def forward(self, inputs, attention_width=3):
        # inputs: (seq_len, batch, nhid). The original source is truncated at this
        # point in the dump; the body below is a hedged reconstruction of the
        # sliding-window attention implied by weight_W / weight_proj, not the
        # author's verbatim code.
        results = None
        for i in range(inputs.size(0)):
            if i < attention_width:
                # Not enough history yet: pass the hidden state through unchanged.
                output = inputs[i].unsqueeze(0)
            else:
                window = inputs[i - attention_width:i]                        # (width, batch, nhid)
                u = batch_matmul(window, self.weight_W, nonlinearity='tanh')  # (width, batch, nhid)
                scores = batch_matmul(u, self.weight_proj)                    # (width, batch)
                alpha = self.softmax(scores.t()).t()                          # normalise over the window
                weighted = window * alpha.unsqueeze(2).expand_as(window)
                output = torch.sum(weighted, 0).unsqueeze(0)                  # (1, batch, nhid)
            results = output if results is None else torch.cat((results, output), 0)
        return results
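

# ---------------------------------------------------------------------------
# The sketch below is NOT part of the original repository: it is a minimal,
# hedged usage example showing how RNNModel can be instantiated with the
# attention layer and pushed through one forward pass on random token ids.
# All sizes (vocabulary, batch, sequence length) are illustrative assumptions.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    ntoken, ninp, nhid, nlayers = 1000, 128, 128, 2   # assumed toy sizes
    seq_len, bsz = 35, 4

    model = RNNModel('LSTM', ntoken, ninp, nhid, nlayers,
                     dropout=0.5, attention=True, attention_width=5)
    hidden = model.init_hidden(bsz)

    # Random batch of token indices shaped (seq_len, batch), as in the PTB example.
    data = Variable(torch.LongTensor(seq_len, bsz).random_(0, ntoken))
    output, hidden = model(data, hidden)

    # The decoder produces one score per vocabulary entry at every position.
    print(output.size())   # expected: (seq_len, bsz, ntoken)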