├── .gitignore
├── LICENSE
├── README.md
├── model.py
├── train.py
└── utils.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | env/
 12 | build/
 13 | develop-eggs/
 14 | dist/
 15 | downloads/
 16 | eggs/
 17 | .eggs/
 18 | lib/
 19 | lib64/
 20 | parts/
 21 | sdist/
 22 | var/
 23 | wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | 
 49 | # Translations
 50 | *.mo
 51 | *.pot
 52 | 
 53 | # Django stuff:
 54 | *.log
 55 | local_settings.py
 56 | 
 57 | # Flask stuff:
 58 | instance/
 59 | .webassets-cache
 60 | 
 61 | # Scrapy stuff:
 62 | .scrapy
 63 | 
 64 | # Sphinx documentation
 65 | docs/_build/
 66 | 
 67 | # PyBuilder
 68 | target/
 69 | 
 70 | # Jupyter Notebook
 71 | .ipynb_checkpoints
 72 | 
 73 | # pyenv
 74 | .python-version
 75 | 
 76 | # celery beat schedule file
 77 | celerybeat-schedule
 78 | 
 79 | # SageMath parsed files
 80 | *.sage.py
 81 | 
 82 | # dotenv
 83 | .env
 84 | 
 85 | # virtualenv
 86 | .venv
 87 | venv/
 88 | ENV/
 89 | 
 90 | # Spyder project settings
 91 | .spyderproject
 92 | .spyproject
 93 | 
 94 | # Rope project settings
 95 | .ropeproject
 96 | 
 97 | # mkdocs documentation
 98 | /site
 99 | 
100 | # mypy
101 | .mypy_cache/
102 | 
103 | # training
104 | .data
105 | .save
106 | .samples
107 | .tmp
108 | 
109 | .vscode/


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2017 Keon
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # mini seq2seq
 2 | Minimal Seq2Seq model with attention for neural machine translation in PyTorch.
 3 | 
 4 | This implementation focuses on the following features:
 5 | 
 6 | - Modular structure to be used in other projects
 7 | - Minimal code for readability
 8 | - Full utilization of batches and GPU.
 9 | 
10 | This implementation relies on [torchtext](https://github.com/pytorch/text) to minimize dataset management and preprocessing parts.
11 | 
12 | ## Model description
13 | 
14 | * Encoder: Bidirectional GRU
15 | * Decoder: GRU with Attention Mechanism
16 | * Attention: [Neural Machine Translation by Jointly Learning to Align and Translate](https://arxiv.org/abs/1409.0473)
17 | 
18 | ![](http://www.wildml.com/wp-content/uploads/2015/12/Screen-Shot-2015-12-30-at-1.16.08-PM.png)
19 | 
20 | ## Requirements
21 | 
22 | * GPU & CUDA
23 | * Python3
24 | * PyTorch
25 | * torchtext
26 | * Spacy
27 | * numpy
28 | * Visdom (optional)
29 | 
30 | download tokenizers by doing so:
31 | ```
32 | python -m spacy download de
33 | python -m spacy download en
34 | ```
35 | 
36 | 
37 | ## References
38 | 
39 | Based on the following implementations
40 | 
41 | * [PyTorch Tutorial](http://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html)
42 | * [@spro/practical-pytorch](https://github.com/spro/practical-pytorch)
43 | * [@AuCson/PyTorch-Batch-Attention-Seq2seq](https://github.com/AuCson/PyTorch-Batch-Attention-Seq2seq)
44 | 


--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import torch
  3 | import random
  4 | from torch import nn
  5 | from torch.autograd import Variable
  6 | import torch.nn.functional as F
  7 | 
  8 | 
  9 | class Encoder(nn.Module):
 10 |     def __init__(self, input_size, embed_size, hidden_size,
 11 |                  n_layers=1, dropout=0.5):
 12 |         super(Encoder, self).__init__()
 13 |         self.input_size = input_size
 14 |         self.hidden_size = hidden_size
 15 |         self.embed_size = embed_size
 16 |         self.embed = nn.Embedding(input_size, embed_size)
 17 |         self.gru = nn.GRU(embed_size, hidden_size, n_layers,
 18 |                           dropout=dropout, bidirectional=True)
 19 | 
 20 |     def forward(self, src, hidden=None):
 21 |         embedded = self.embed(src)
 22 |         outputs, hidden = self.gru(embedded, hidden)
 23 |         # sum bidirectional outputs
 24 |         outputs = (outputs[:, :, :self.hidden_size] +
 25 |                    outputs[:, :, self.hidden_size:])
 26 |         return outputs, hidden
 27 | 
 28 | 
 29 | class Attention(nn.Module):
 30 |     def __init__(self, hidden_size):
 31 |         super(Attention, self).__init__()
 32 |         self.hidden_size = hidden_size
 33 |         self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
 34 |         self.v = nn.Parameter(torch.rand(hidden_size))
 35 |         stdv = 1. / math.sqrt(self.v.size(0))
 36 |         self.v.data.uniform_(-stdv, stdv)
 37 | 
 38 |     def forward(self, hidden, encoder_outputs):
 39 |         timestep = encoder_outputs.size(0)
 40 |         h = hidden.repeat(timestep, 1, 1).transpose(0, 1)
 41 |         encoder_outputs = encoder_outputs.transpose(0, 1)  # [B*T*H]
 42 |         attn_energies = self.score(h, encoder_outputs)
 43 |         return F.softmax(attn_energies, dim=1).unsqueeze(1)
 44 | 
 45 |     def score(self, hidden, encoder_outputs):
 46 |         # [B*T*2H]->[B*T*H]
 47 |         energy = F.relu(self.attn(torch.cat([hidden, encoder_outputs], 2)))
 48 |         energy = energy.transpose(1, 2)  # [B*H*T]
 49 |         v = self.v.repeat(encoder_outputs.size(0), 1).unsqueeze(1)  # [B*1*H]
 50 |         energy = torch.bmm(v, energy)  # [B*1*T]
 51 |         return energy.squeeze(1)  # [B*T]
 52 | 
 53 | 
 54 | class Decoder(nn.Module):
 55 |     def __init__(self, embed_size, hidden_size, output_size,
 56 |                  n_layers=1, dropout=0.2):
 57 |         super(Decoder, self).__init__()
 58 |         self.embed_size = embed_size
 59 |         self.hidden_size = hidden_size
 60 |         self.output_size = output_size
 61 |         self.n_layers = n_layers
 62 | 
 63 |         self.embed = nn.Embedding(output_size, embed_size)
 64 |         self.dropout = nn.Dropout(dropout, inplace=True)
 65 |         self.attention = Attention(hidden_size)
 66 |         self.gru = nn.GRU(hidden_size + embed_size, hidden_size,
 67 |                           n_layers, dropout=dropout)
 68 |         self.out = nn.Linear(hidden_size * 2, output_size)
 69 | 
 70 |     def forward(self, input, last_hidden, encoder_outputs):
 71 |         # Get the embedding of the current input word (last output word)
 72 |         embedded = self.embed(input).unsqueeze(0)  # (1,B,N)
 73 |         embedded = self.dropout(embedded)
 74 |         # Calculate attention weights and apply to encoder outputs
 75 |         attn_weights = self.attention(last_hidden[-1], encoder_outputs)
 76 |         context = attn_weights.bmm(encoder_outputs.transpose(0, 1))  # (B,1,N)
 77 |         context = context.transpose(0, 1)  # (1,B,N)
 78 |         # Combine embedded input word and attended context, run through RNN
 79 |         rnn_input = torch.cat([embedded, context], 2)
 80 |         output, hidden = self.gru(rnn_input, last_hidden)
 81 |         output = output.squeeze(0)  # (1,B,N) -> (B,N)
 82 |         context = context.squeeze(0)
 83 |         output = self.out(torch.cat([output, context], 1))
 84 |         output = F.log_softmax(output, dim=1)
 85 |         return output, hidden, attn_weights
 86 | 
 87 | 
 88 | class Seq2Seq(nn.Module):
 89 |     def __init__(self, encoder, decoder):
 90 |         super(Seq2Seq, self).__init__()
 91 |         self.encoder = encoder
 92 |         self.decoder = decoder
 93 | 
 94 |     def forward(self, src, trg, teacher_forcing_ratio=0.5):
 95 |         batch_size = src.size(1)
 96 |         max_len = trg.size(0)
 97 |         vocab_size = self.decoder.output_size
 98 |         outputs = Variable(torch.zeros(max_len, batch_size, vocab_size)).cuda()
 99 | 
100 |         encoder_output, hidden = self.encoder(src)
101 |         hidden = hidden[:self.decoder.n_layers]
102 |         output = Variable(trg.data[0, :])  # sos
103 |         for t in range(1, max_len):
104 |             output, hidden, attn_weights = self.decoder(
105 |                     output, hidden, encoder_output)
106 |             outputs[t] = output
107 |             is_teacher = random.random() < teacher_forcing_ratio
108 |             top1 = output.data.max(1)[1]
109 |             output = Variable(trg.data[t] if is_teacher else top1).cuda()
110 |         return outputs
111 | 


--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import math
  3 | import argparse
  4 | import torch
  5 | from torch import optim
  6 | from torch.autograd import Variable
  7 | from torch.nn.utils import clip_grad_norm_
  8 | from torch.nn import functional as F
  9 | from model import Encoder, Decoder, Seq2Seq
 10 | from utils import load_dataset
 11 | 
 12 | 
 13 | def parse_arguments():
 14 |     p = argparse.ArgumentParser(description='Hyperparams')
 15 |     p.add_argument('-epochs', type=int, default=100,
 16 |                    help='number of epochs for train')
 17 |     p.add_argument('-batch_size', type=int, default=32,
 18 |                    help='number of epochs for train')
 19 |     p.add_argument('-lr', type=float, default=0.0001,
 20 |                    help='initial learning rate')
 21 |     p.add_argument('-grad_clip', type=float, default=10.0,
 22 |                    help='in case of gradient explosion')
 23 |     return p.parse_args()
 24 | 
 25 | 
 26 | def evaluate(model, val_iter, vocab_size, DE, EN):
 27 |     with torch.no_grad():
 28 |         model.eval()
 29 |         pad = EN.vocab.stoi['<pad>']
 30 |         total_loss = 0
 31 |         for b, batch in enumerate(val_iter):
 32 |             src, len_src = batch.src
 33 |             trg, len_trg = batch.trg
 34 |             src = src.data.cuda()
 35 |             trg = trg.data.cuda()
 36 |             output = model(src, trg, teacher_forcing_ratio=0.0)
 37 |             loss = F.nll_loss(output[1:].view(-1, vocab_size),
 38 |                                    trg[1:].contiguous().view(-1),
 39 |                                    ignore_index=pad)
 40 |             total_loss += loss.data.item()
 41 |         return total_loss / len(val_iter)
 42 | 
 43 | 
 44 | def train(e, model, optimizer, train_iter, vocab_size, grad_clip, DE, EN):
 45 |     model.train()
 46 |     total_loss = 0
 47 |     pad = EN.vocab.stoi['<pad>']
 48 |     for b, batch in enumerate(train_iter):
 49 |         src, len_src = batch.src
 50 |         trg, len_trg = batch.trg
 51 |         src, trg = src.cuda(), trg.cuda()
 52 |         optimizer.zero_grad()
 53 |         output = model(src, trg)
 54 |         loss = F.nll_loss(output[1:].view(-1, vocab_size),
 55 |                                trg[1:].contiguous().view(-1),
 56 |                                ignore_index=pad)
 57 |         loss.backward()
 58 |         clip_grad_norm_(model.parameters(), grad_clip)
 59 |         optimizer.step()
 60 |         total_loss += loss.data.item()
 61 | 
 62 |         if b % 100 == 0 and b != 0:
 63 |             total_loss = total_loss / 100
 64 |             print("[%d][loss:%5.2f][pp:%5.2f]" %
 65 |                   (b, total_loss, math.exp(total_loss)))
 66 |             total_loss = 0
 67 | 
 68 | 
 69 | def main():
 70 |     args = parse_arguments()
 71 |     hidden_size = 512
 72 |     embed_size = 256
 73 |     assert torch.cuda.is_available()
 74 | 
 75 |     print("[!] preparing dataset...")
 76 |     train_iter, val_iter, test_iter, DE, EN = load_dataset(args.batch_size)
 77 |     de_size, en_size = len(DE.vocab), len(EN.vocab)
 78 |     print("[TRAIN]:%d (dataset:%d)\t[TEST]:%d (dataset:%d)"
 79 |           % (len(train_iter), len(train_iter.dataset),
 80 |              len(test_iter), len(test_iter.dataset)))
 81 |     print("[DE_vocab]:%d [en_vocab]:%d" % (de_size, en_size))
 82 | 
 83 |     print("[!] Instantiating models...")
 84 |     encoder = Encoder(de_size, embed_size, hidden_size,
 85 |                       n_layers=2, dropout=0.5)
 86 |     decoder = Decoder(embed_size, hidden_size, en_size,
 87 |                       n_layers=1, dropout=0.5)
 88 |     seq2seq = Seq2Seq(encoder, decoder).cuda()
 89 |     optimizer = optim.Adam(seq2seq.parameters(), lr=args.lr)
 90 |     print(seq2seq)
 91 | 
 92 |     best_val_loss = None
 93 |     for e in range(1, args.epochs+1):
 94 |         train(e, seq2seq, optimizer, train_iter,
 95 |               en_size, args.grad_clip, DE, EN)
 96 |         val_loss = evaluate(seq2seq, val_iter, en_size, DE, EN)
 97 |         print("[Epoch:%d] val_loss:%5.3f | val_pp:%5.2fS"
 98 |               % (e, val_loss, math.exp(val_loss)))
 99 | 
100 |         # Save the model if the validation loss is the best we've seen so far.
101 |         if not best_val_loss or val_loss < best_val_loss:
102 |             print("[!] saving model...")
103 |             if not os.path.isdir(".save"):
104 |                 os.makedirs(".save")
105 |             torch.save(seq2seq.state_dict(), './.save/seq2seq_%d.pt' % (e))
106 |             best_val_loss = val_loss
107 |     test_loss = evaluate(seq2seq, test_iter, en_size, DE, EN)
108 |     print("[TEST] loss:%5.2f" % test_loss)
109 | 
110 | 
111 | if __name__ == "__main__":
112 |     try:
113 |         main()
114 |     except KeyboardInterrupt as e:
115 |         print("[STOP]", e)
116 | 


--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import spacy
 3 | from torchtext.data import Field, BucketIterator
 4 | from torchtext.datasets import Multi30k
 5 | 
 6 | 
 7 | def load_dataset(batch_size):
 8 |     spacy_de = spacy.load('de')
 9 |     spacy_en = spacy.load('en')
10 |     url = re.compile('(<url>.*</url>)')
11 | 
12 |     def tokenize_de(text):
13 |         return [tok.text for tok in spacy_de.tokenizer(url.sub('@URL@', text))]
14 | 
15 |     def tokenize_en(text):
16 |         return [tok.text for tok in spacy_en.tokenizer(url.sub('@URL@', text))]
17 | 
18 |     DE = Field(tokenize=tokenize_de, include_lengths=True,
19 |                init_token='<sos>', eos_token='<eos>')
20 |     EN = Field(tokenize=tokenize_en, include_lengths=True,
21 |                init_token='<sos>', eos_token='<eos>')
22 |     train, val, test = Multi30k.splits(exts=('.de', '.en'), fields=(DE, EN))
23 |     DE.build_vocab(train.src, min_freq=2)
24 |     EN.build_vocab(train.trg, max_size=10000)
25 |     train_iter, val_iter, test_iter = BucketIterator.splits(
26 |             (train, val, test), batch_size=batch_size, repeat=False)
27 |     return train_iter, val_iter, test_iter, DE, EN
28 | 


--------------------------------------------------------------------------------