├── .gitignore ├── LICENSE ├── README.md ├── model.py ├── train.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | # training 104 | .data 105 | .save 106 | .samples 107 | .tmp 108 | 109 | .vscode/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Keon 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # mini seq2seq 2 | Minimal Seq2Seq model with attention for neural machine translation in PyTorch. 3 | 4 | This implementation focuses on the following features: 5 | 6 | - Modular structure to be used in other projects 7 | - Minimal code for readability 8 | - Full utilization of batches and GPU. 9 | 10 | This implementation relies on [torchtext](https://github.com/pytorch/text) to minimize dataset management and preprocessing parts. 11 | 12 | ## Model description 13 | 14 | * Encoder: Bidirectional GRU 15 | * Decoder: GRU with Attention Mechanism 16 | * Attention: [Neural Machine Translation by Jointly Learning to Align and Translate](https://arxiv.org/abs/1409.0473) 17 | 18 | ![](http://www.wildml.com/wp-content/uploads/2015/12/Screen-Shot-2015-12-30-at-1.16.08-PM.png) 19 | 20 | ## Requirements 21 | 22 | * GPU & CUDA 23 | * Python3 24 | * PyTorch 25 | * torchtext 26 | * Spacy 27 | * numpy 28 | * Visdom (optional) 29 | 30 | download tokenizers by doing so: 31 | ``` 32 | python -m spacy download de 33 | python -m spacy download en 34 | ``` 35 | 36 | 37 | ## References 38 | 39 | Based on the following implementations 40 | 41 | * [PyTorch Tutorial](http://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html) 42 | * [@spro/practical-pytorch](https://github.com/spro/practical-pytorch) 43 | * [@AuCson/PyTorch-Batch-Attention-Seq2seq](https://github.com/AuCson/PyTorch-Batch-Attention-Seq2seq) 44 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import random 4 | from torch import nn 5 | from torch.autograd import Variable 6 | import torch.nn.functional as F 7 | 8 | 9 | class Encoder(nn.Module): 10 | def __init__(self, input_size, embed_size, hidden_size, 11 | n_layers=1, dropout=0.5): 12 | super(Encoder, self).__init__() 13 | self.input_size = input_size 14 | self.hidden_size = hidden_size 15 | self.embed_size = embed_size 16 | self.embed = nn.Embedding(input_size, embed_size) 17 | self.gru = nn.GRU(embed_size, hidden_size, n_layers, 18 | dropout=dropout, bidirectional=True) 19 | 20 | def forward(self, src, hidden=None): 21 | embedded = self.embed(src) 22 | outputs, hidden = self.gru(embedded, hidden) 23 | # sum bidirectional outputs 24 | outputs = (outputs[:, :, :self.hidden_size] + 25 | outputs[:, :, self.hidden_size:]) 26 | return outputs, hidden 27 | 28 | 29 | class Attention(nn.Module): 30 | def __init__(self, hidden_size): 31 | super(Attention, self).__init__() 32 | self.hidden_size = hidden_size 33 | self.attn = nn.Linear(self.hidden_size * 2, hidden_size) 34 | self.v = nn.Parameter(torch.rand(hidden_size)) 35 | stdv = 1. / math.sqrt(self.v.size(0)) 36 | self.v.data.uniform_(-stdv, stdv) 37 | 38 | def forward(self, hidden, encoder_outputs): 39 | timestep = encoder_outputs.size(0) 40 | h = hidden.repeat(timestep, 1, 1).transpose(0, 1) 41 | encoder_outputs = encoder_outputs.transpose(0, 1) # [B*T*H] 42 | attn_energies = self.score(h, encoder_outputs) 43 | return F.softmax(attn_energies, dim=1).unsqueeze(1) 44 | 45 | def score(self, hidden, encoder_outputs): 46 | # [B*T*2H]->[B*T*H] 47 | energy = F.relu(self.attn(torch.cat([hidden, encoder_outputs], 2))) 48 | energy = energy.transpose(1, 2) # [B*H*T] 49 | v = self.v.repeat(encoder_outputs.size(0), 1).unsqueeze(1) # [B*1*H] 50 | energy = torch.bmm(v, energy) # [B*1*T] 51 | return energy.squeeze(1) # [B*T] 52 | 53 | 54 | class Decoder(nn.Module): 55 | def __init__(self, embed_size, hidden_size, output_size, 56 | n_layers=1, dropout=0.2): 57 | super(Decoder, self).__init__() 58 | self.embed_size = embed_size 59 | self.hidden_size = hidden_size 60 | self.output_size = output_size 61 | self.n_layers = n_layers 62 | 63 | self.embed = nn.Embedding(output_size, embed_size) 64 | self.dropout = nn.Dropout(dropout, inplace=True) 65 | self.attention = Attention(hidden_size) 66 | self.gru = nn.GRU(hidden_size + embed_size, hidden_size, 67 | n_layers, dropout=dropout) 68 | self.out = nn.Linear(hidden_size * 2, output_size) 69 | 70 | def forward(self, input, last_hidden, encoder_outputs): 71 | # Get the embedding of the current input word (last output word) 72 | embedded = self.embed(input).unsqueeze(0) # (1,B,N) 73 | embedded = self.dropout(embedded) 74 | # Calculate attention weights and apply to encoder outputs 75 | attn_weights = self.attention(last_hidden[-1], encoder_outputs) 76 | context = attn_weights.bmm(encoder_outputs.transpose(0, 1)) # (B,1,N) 77 | context = context.transpose(0, 1) # (1,B,N) 78 | # Combine embedded input word and attended context, run through RNN 79 | rnn_input = torch.cat([embedded, context], 2) 80 | output, hidden = self.gru(rnn_input, last_hidden) 81 | output = output.squeeze(0) # (1,B,N) -> (B,N) 82 | context = context.squeeze(0) 83 | output = self.out(torch.cat([output, context], 1)) 84 | output = F.log_softmax(output, dim=1) 85 | return output, hidden, attn_weights 86 | 87 | 88 | class Seq2Seq(nn.Module): 89 | def __init__(self, encoder, decoder): 90 | super(Seq2Seq, self).__init__() 91 | self.encoder = encoder 92 | self.decoder = decoder 93 | 94 | def forward(self, src, trg, teacher_forcing_ratio=0.5): 95 | batch_size = src.size(1) 96 | max_len = trg.size(0) 97 | vocab_size = self.decoder.output_size 98 | outputs = Variable(torch.zeros(max_len, batch_size, vocab_size)).cuda() 99 | 100 | encoder_output, hidden = self.encoder(src) 101 | hidden = hidden[:self.decoder.n_layers] 102 | output = Variable(trg.data[0, :]) # sos 103 | for t in range(1, max_len): 104 | output, hidden, attn_weights = self.decoder( 105 | output, hidden, encoder_output) 106 | outputs[t] = output 107 | is_teacher = random.random() < teacher_forcing_ratio 108 | top1 = output.data.max(1)[1] 109 | output = Variable(trg.data[t] if is_teacher else top1).cuda() 110 | return outputs 111 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import math 3 | import argparse 4 | import torch 5 | from torch import optim 6 | from torch.autograd import Variable 7 | from torch.nn.utils import clip_grad_norm_ 8 | from torch.nn import functional as F 9 | from model import Encoder, Decoder, Seq2Seq 10 | from utils import load_dataset 11 | 12 | 13 | def parse_arguments(): 14 | p = argparse.ArgumentParser(description='Hyperparams') 15 | p.add_argument('-epochs', type=int, default=100, 16 | help='number of epochs for train') 17 | p.add_argument('-batch_size', type=int, default=32, 18 | help='number of epochs for train') 19 | p.add_argument('-lr', type=float, default=0.0001, 20 | help='initial learning rate') 21 | p.add_argument('-grad_clip', type=float, default=10.0, 22 | help='in case of gradient explosion') 23 | return p.parse_args() 24 | 25 | 26 | def evaluate(model, val_iter, vocab_size, DE, EN): 27 | with torch.no_grad(): 28 | model.eval() 29 | pad = EN.vocab.stoi[''] 30 | total_loss = 0 31 | for b, batch in enumerate(val_iter): 32 | src, len_src = batch.src 33 | trg, len_trg = batch.trg 34 | src = src.data.cuda() 35 | trg = trg.data.cuda() 36 | output = model(src, trg, teacher_forcing_ratio=0.0) 37 | loss = F.nll_loss(output[1:].view(-1, vocab_size), 38 | trg[1:].contiguous().view(-1), 39 | ignore_index=pad) 40 | total_loss += loss.data.item() 41 | return total_loss / len(val_iter) 42 | 43 | 44 | def train(e, model, optimizer, train_iter, vocab_size, grad_clip, DE, EN): 45 | model.train() 46 | total_loss = 0 47 | pad = EN.vocab.stoi[''] 48 | for b, batch in enumerate(train_iter): 49 | src, len_src = batch.src 50 | trg, len_trg = batch.trg 51 | src, trg = src.cuda(), trg.cuda() 52 | optimizer.zero_grad() 53 | output = model(src, trg) 54 | loss = F.nll_loss(output[1:].view(-1, vocab_size), 55 | trg[1:].contiguous().view(-1), 56 | ignore_index=pad) 57 | loss.backward() 58 | clip_grad_norm_(model.parameters(), grad_clip) 59 | optimizer.step() 60 | total_loss += loss.data.item() 61 | 62 | if b % 100 == 0 and b != 0: 63 | total_loss = total_loss / 100 64 | print("[%d][loss:%5.2f][pp:%5.2f]" % 65 | (b, total_loss, math.exp(total_loss))) 66 | total_loss = 0 67 | 68 | 69 | def main(): 70 | args = parse_arguments() 71 | hidden_size = 512 72 | embed_size = 256 73 | assert torch.cuda.is_available() 74 | 75 | print("[!] preparing dataset...") 76 | train_iter, val_iter, test_iter, DE, EN = load_dataset(args.batch_size) 77 | de_size, en_size = len(DE.vocab), len(EN.vocab) 78 | print("[TRAIN]:%d (dataset:%d)\t[TEST]:%d (dataset:%d)" 79 | % (len(train_iter), len(train_iter.dataset), 80 | len(test_iter), len(test_iter.dataset))) 81 | print("[DE_vocab]:%d [en_vocab]:%d" % (de_size, en_size)) 82 | 83 | print("[!] Instantiating models...") 84 | encoder = Encoder(de_size, embed_size, hidden_size, 85 | n_layers=2, dropout=0.5) 86 | decoder = Decoder(embed_size, hidden_size, en_size, 87 | n_layers=1, dropout=0.5) 88 | seq2seq = Seq2Seq(encoder, decoder).cuda() 89 | optimizer = optim.Adam(seq2seq.parameters(), lr=args.lr) 90 | print(seq2seq) 91 | 92 | best_val_loss = None 93 | for e in range(1, args.epochs+1): 94 | train(e, seq2seq, optimizer, train_iter, 95 | en_size, args.grad_clip, DE, EN) 96 | val_loss = evaluate(seq2seq, val_iter, en_size, DE, EN) 97 | print("[Epoch:%d] val_loss:%5.3f | val_pp:%5.2fS" 98 | % (e, val_loss, math.exp(val_loss))) 99 | 100 | # Save the model if the validation loss is the best we've seen so far. 101 | if not best_val_loss or val_loss < best_val_loss: 102 | print("[!] saving model...") 103 | if not os.path.isdir(".save"): 104 | os.makedirs(".save") 105 | torch.save(seq2seq.state_dict(), './.save/seq2seq_%d.pt' % (e)) 106 | best_val_loss = val_loss 107 | test_loss = evaluate(seq2seq, test_iter, en_size, DE, EN) 108 | print("[TEST] loss:%5.2f" % test_loss) 109 | 110 | 111 | if __name__ == "__main__": 112 | try: 113 | main() 114 | except KeyboardInterrupt as e: 115 | print("[STOP]", e) 116 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | import spacy 3 | from torchtext.data import Field, BucketIterator 4 | from torchtext.datasets import Multi30k 5 | 6 | 7 | def load_dataset(batch_size): 8 | spacy_de = spacy.load('de') 9 | spacy_en = spacy.load('en') 10 | url = re.compile('(.*)') 11 | 12 | def tokenize_de(text): 13 | return [tok.text for tok in spacy_de.tokenizer(url.sub('@URL@', text))] 14 | 15 | def tokenize_en(text): 16 | return [tok.text for tok in spacy_en.tokenizer(url.sub('@URL@', text))] 17 | 18 | DE = Field(tokenize=tokenize_de, include_lengths=True, 19 | init_token='', eos_token='') 20 | EN = Field(tokenize=tokenize_en, include_lengths=True, 21 | init_token='', eos_token='') 22 | train, val, test = Multi30k.splits(exts=('.de', '.en'), fields=(DE, EN)) 23 | DE.build_vocab(train.src, min_freq=2) 24 | EN.build_vocab(train.trg, max_size=10000) 25 | train_iter, val_iter, test_iter = BucketIterator.splits( 26 | (train, val, test), batch_size=batch_size, repeat=False) 27 | return train_iter, val_iter, test_iter, DE, EN 28 | --------------------------------------------------------------------------------