├── README.md
├── rnn_attention.py
└── main.py

/README.md:
--------------------------------------------------------------------------------
# PTB Language Modelling with RNNs (LSTM, GRU) and an Attention Layer

This repository was built for a language-modelling Pareto competition at TTIC.
I implemented an attention layer on top of the RNN model.
TODO: Lei Mao suggests an alternative implementation that integrates the attention layer directly into the LSTM class.

## Software Requirements

This codebase requires Python 3 and [PyTorch](http://pytorch.org/).

## Usage

```bash
python main.py --att --att_width 20  # Train an LSTM on PTB with an attention layer of width 20
python generate.py                   # Generate samples from the trained LSTM model
```

## Acknowledgements

This code was originally forked from the PyTorch [Word-level language modeling RNN](https://github.com/pytorch/examples/tree/master/word_language_model) example and modified to add an attention layer to the model.
--------------------------------------------------------------------------------
/rnn_attention.py:
--------------------------------------------------------------------------------
import torch
from torch.autograd import Variable
import torch.nn as nn
import numpy as np


def batch_matmul(seq, weight, nonlinearity=''):
    """Multiply every time step of `seq` (seq_len, batch, nhid) by `weight`,
    optionally apply a tanh, and stack the results along dim 0."""
    s = None
    for i in range(seq.size(0)):
        _s = torch.mm(seq[i], weight)
        if nonlinearity == 'tanh':
            _s = torch.tanh(_s)
        _s = _s.unsqueeze(0)
        s = _s if s is None else torch.cat((s, _s), 0)
    return s.squeeze()


class RNNModel(nn.Module):
    """Container module with an encoder, a recurrent module, and a decoder."""

    def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, dropout=0.5, tie_weights=False, attention=False,
                 attention_width=3, cuda=False):
        super(RNNModel, self).__init__()
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        if rnn_type in ['LSTM', 'GRU']:
            self.rnn = getattr(nn, rnn_type)(ninp, nhid, nlayers, dropout=dropout)
        else:
            try:
                nonlinearity = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}[rnn_type]
            except KeyError:
                raise ValueError("""An invalid option for `--model` was supplied,
                                 options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']""")
            self.rnn = nn.RNN(ninp, nhid, nlayers, nonlinearity=nonlinearity, dropout=dropout)
        # The attention layer preserves the hidden size, so the decoder is the
        # same with or without attention.
        self.decoder = nn.Linear(nhid, ntoken)
        if tie_weights:
            if nhid != ninp:
                raise ValueError('When using the tied flag, nhid must be equal to emsize')
            self.decoder.weight = self.encoder.weight

        self.softmax = nn.Softmax()
        if attention:
            self.AttentionLayer = AttentionLayer(cuda, nhid)
        self.init_weights()

        self.rnn_type = rnn_type
        self.nhid = nhid
        self.nlayers = nlayers
        self.attention = attention
        self.attention_width = attention_width

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, input, hidden):
        # input: (seq_len, batch) token indices
        emb = self.drop(self.encoder(input))          # (seq_len, batch, ninp)
        output, hidden = self.rnn(emb, hidden)        # (seq_len, batch, nhid)
        if self.attention:
            output = self.AttentionLayer(output, self.attention_width)
        output = self.drop(output)
        decoded = self.decoder(output.view(output.size(0) * output.size(1), output.size(2)))
        return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden

    def init_hidden(self, bsz):
        weight = next(self.parameters()).data
        if self.rnn_type == 'LSTM':
            return (Variable(weight.new(self.nlayers, bsz, self.nhid).zero_()),
                    Variable(weight.new(self.nlayers, bsz, self.nhid).zero_()))
        else:
            return Variable(weight.new(self.nlayers, bsz, self.nhid).zero_())


class AttentionLayer(nn.Module):
    """Implements an attention layer over a sliding window of RNN hidden states."""

    def __init__(self, cuda, nhid):
        super(AttentionLayer, self).__init__()
        self.nhid = nhid
        self.weight_W = nn.Parameter(torch.Tensor(nhid, nhid))
        self.weight_proj = nn.Parameter(torch.Tensor(nhid, 1))
        self.softmax = nn.Softmax()
        self.weight_W.data.uniform_(-0.1, 0.1)
        self.weight_proj.data.uniform_(-0.1, 0.1)
        # Stored as `use_cuda` so we do not shadow nn.Module.cuda().
        self.use_cuda = cuda

    def forward(self, inputs, attention_width=3):
        # inputs: (seq_len, batch, nhid). The original source is truncated at this
        # point in the dump; the body below is a hedged reconstruction of the
        # sliding-window attention implied by weight_W / weight_proj, not the
        # author's verbatim code.
        results = None
        for i in range(inputs.size(0)):
            if i < attention_width:
                # Not enough history yet: pass the hidden state through unchanged.
                output = inputs[i].unsqueeze(0)
            else:
                window = inputs[i - attention_width:i]                        # (width, batch, nhid)
                u = batch_matmul(window, self.weight_W, nonlinearity='tanh')  # (width, batch, nhid)
                scores = batch_matmul(u, self.weight_proj)                    # (width, batch)
                alpha = self.softmax(scores.t()).t()                          # normalise over the window
                weighted = window * alpha.unsqueeze(2).expand_as(window)
                output = torch.sum(weighted, 0).unsqueeze(0)                  # (1, batch, nhid)
            results = output if results is None else torch.cat((results, output), 0)
        return results
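

# ---------------------------------------------------------------------------
# The sketch below is NOT part of the original repository: it is a minimal,
# hedged usage example showing how RNNModel can be instantiated with the
# attention layer and pushed through one forward pass on random token ids.
# All sizes (vocabulary, batch, sequence length) are illustrative assumptions.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    ntoken, ninp, nhid, nlayers = 1000, 128, 128, 2   # assumed toy sizes
    seq_len, bsz = 35, 4

    model = RNNModel('LSTM', ntoken, ninp, nhid, nlayers,
                     dropout=0.5, attention=True, attention_width=5)
    hidden = model.init_hidden(bsz)

    # Random batch of token indices shaped (seq_len, batch), as in the PTB example.
    data = Variable(torch.LongTensor(seq_len, bsz).random_(0, ntoken))
    output, hidden = model(data, hidden)

    # The decoder produces one score per vocabulary entry at every position.
    print(output.size())   # expected: (seq_len, bsz, ntoken)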