├── .gitignore ├── LICENSE ├── README.md ├── model.py ├── number_loader.py └── predict_odd_numbers.py /.gitignore: -------------------------------------------------------------------------------- 1 | /venv/ 2 | /.idea/ 3 | /model/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Kenneth111 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Pytorch nn.Transformer Demo 2 | A demo to predict odd numbers. Given the input [2, 4, 6], this program generates the output [3, 5, 7]. Given the input [100, 102, 104], this program generates the output [101, 103, 105]. 3 | 4 | Create a folder named "model", where the weights of trained model will be saved, and train the model using 5 | ```shell script 6 | python predict_odd_numbers.py 7 | ``` 8 | The validation loss will be around 1.7. 9 | 10 | Test the model using 11 | ```shell script 12 | python predict_odd_numbers.py --test_model model/xxx.pt 13 | ``` 14 | 15 | Codes in the model.py come from [this notebook](https://colab.research.google.com/drive/1g4ZFCGegOmD-xXL-Ggu7K5LVoJeXYJ75). 16 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import math 4 | 5 | 6 | class PositionalEncoding(nn.Module): 7 | def __init__(self, d_model, dropout=0.1, max_len=100): 8 | super(PositionalEncoding, self).__init__() 9 | self.dropout = nn.Dropout(p=dropout) 10 | 11 | pe = torch.zeros(max_len, d_model) 12 | position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) 13 | div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) 14 | pe[:, 0::2] = torch.sin(position * div_term) 15 | pe[:, 1::2] = torch.cos(position * div_term) 16 | pe = pe.unsqueeze(0).transpose(0, 1) 17 | self.register_buffer('pe', pe) 18 | 19 | def forward(self, x): 20 | x = x + self.pe[:x.size(0), :] 21 | return self.dropout(x) 22 | 23 | 24 | class TransformerModel(nn.Module): 25 | def __init__(self, intoken, outtoken, hidden, nlayers=3, dropout=0.1): 26 | super(TransformerModel, self).__init__() 27 | nhead = hidden // 64 28 | 29 | self.encoder = nn.Embedding(intoken, hidden) 30 | self.pos_encoder = PositionalEncoding(hidden, dropout) 31 | 32 | self.decoder = nn.Embedding(outtoken, hidden) 33 | self.pos_decoder = PositionalEncoding(hidden, dropout) 34 | 35 | self.inscale = math.sqrt(intoken) 36 | self.outscale = math.sqrt(outtoken) 37 | 38 | self.transformer = nn.Transformer(d_model=hidden, nhead=nhead, num_encoder_layers=nlayers, 39 | num_decoder_layers=nlayers, dim_feedforward=hidden, dropout=dropout) 40 | self.fc_out = nn.Linear(hidden, outtoken) 41 | 42 | self.src_mask = None 43 | self.trg_mask = None 44 | self.memory_mask = None 45 | 46 | def generate_square_subsequent_mask(self, sz): 47 | mask = torch.triu(torch.ones(sz, sz), 1) 48 | mask = mask.masked_fill(mask == 1, float('-inf')) 49 | return mask 50 | 51 | def make_len_mask(self, inp): 52 | return (inp == 0).transpose(0, 1) 53 | 54 | def forward(self, src, trg): 55 | if self.trg_mask is None or self.trg_mask.size(0) != len(trg): 56 | self.trg_mask = self.generate_square_subsequent_mask(len(trg)).to(trg.device) 57 | 58 | src_pad_mask = self.make_len_mask(src) 59 | trg_pad_mask = self.make_len_mask(trg) 60 | 61 | src = self.encoder(src) 62 | src = self.pos_encoder(src) 63 | 64 | trg = self.decoder(trg) 65 | trg = self.pos_decoder(trg) 66 | output = self.transformer(src, trg, tgt_mask=self.trg_mask) 67 | # output = self.transformer(src, trg, src_mask=self.src_mask, tgt_mask=self.trg_mask, 68 | # memory_mask=self.memory_mask, 69 | # src_key_padding_mask=src_pad_mask, tgt_key_padding_mask=trg_pad_mask, 70 | # memory_key_padding_mask=src_pad_mask) 71 | output = self.fc_out(output) 72 | 73 | return output -------------------------------------------------------------------------------- /number_loader.py: -------------------------------------------------------------------------------- 1 | from torch import LongTensor 2 | from torch.utils.data import Dataset 3 | 4 | 5 | class NumberLoader(Dataset): 6 | def __init__(self, x, y, inp_len=3, out_len=3): 7 | if len(x) != len(y): 8 | raise ValueError("len(x) != len(y)") 9 | self.x = [[x[i + j] for j in range(inp_len)] for i in range(len(x) - inp_len + 1)] 10 | self.y = [[y[i + j] for j in range(out_len)] for i in range(len(y) - out_len + 1)] 11 | 12 | def __getitem__(self, index): 13 | return LongTensor(self.x[index]), LongTensor([0] + self.y[index]) 14 | 15 | def __len__(self): 16 | return len(self.x) -------------------------------------------------------------------------------- /predict_odd_numbers.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from numpy import arange, random 3 | from torch import save, load, no_grad, LongTensor 4 | import torch.nn as nn 5 | import torch.optim as optim 6 | from torch.utils.data import DataLoader, random_split 7 | from number_loader import NumberLoader 8 | from model import TransformerModel 9 | 10 | 11 | def train(model, criterion, optimizer, loader): 12 | model.train() 13 | epoch_loss = 0 14 | for i, batch in enumerate(loader): 15 | src, tgt = batch 16 | src, tgt = src.transpose(1, 0).cuda(), tgt.transpose(1, 0).cuda() 17 | optimizer.zero_grad() 18 | output = model(src, tgt[:-1, :]) 19 | n = output.shape[-1] 20 | loss = criterion(output.reshape(-1, n), tgt[1:, :].reshape(-1)) 21 | loss.backward() 22 | nn.utils.clip_grad_norm_(model.parameters(), 1.0) 23 | optimizer.step() 24 | epoch_loss += loss.item() 25 | return epoch_loss / len(loader) 26 | 27 | 28 | def validation(model, criterion, loader): 29 | model.eval() 30 | epoch_loss = 0 31 | with no_grad(): 32 | for i, batch in enumerate(loader): 33 | src, tgt = batch 34 | src, tgt = src.transpose(1, 0).cuda(), tgt.transpose(1, 0).cuda() 35 | output = model(src, tgt[:-1, :]) 36 | n = output.shape[-1] 37 | loss = criterion(output.reshape(-1, n), tgt[1:, :].reshape(-1)) 38 | epoch_loss += loss.item() 39 | return epoch_loss / len(loader) 40 | 41 | 42 | def test(model, max_len=3, test_times=1): 43 | model = model.cuda() 44 | model.eval() 45 | with no_grad(): 46 | for i in range(test_times): 47 | s = random.randint(1, 4998) 48 | cpu_src = [(s + j) * 2 for j in range(max_len)] 49 | src = LongTensor(cpu_src).unsqueeze(1).cuda() 50 | tgt = [0] + [(s + j) * 2 + 1 for j in range(max_len)] 51 | pred = [0] 52 | for j in range(max_len): 53 | inp = LongTensor(pred).unsqueeze(1).cuda() 54 | output = model(src, inp) 55 | out_num = output.argmax(2)[-1].item() 56 | pred.append(out_num) 57 | print("input: ", cpu_src) 58 | print("target: ", tgt) 59 | print("predict: ", pred) 60 | 61 | 62 | def main(model_name=None, hidden=64, nlayers=1): 63 | voc_size = 10000 64 | inp = arange(2, voc_size, 2) 65 | tgt = arange(3, voc_size, 2) 66 | batch_size = 128 67 | epochs = 30 68 | dataset = NumberLoader(inp, tgt) 69 | train_len = int(len(dataset) * 0.9) 70 | val_len = len(dataset) - train_len 71 | train_set, val_set = random_split(dataset, [train_len, val_len]) 72 | train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=1) 73 | val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=True, num_workers=1) 74 | model = TransformerModel(voc_size, voc_size, hidden=hidden, nlayers=nlayers) 75 | if model_name is not None: 76 | model.load_state_dict(load(model_name)) 77 | model = model.cuda() 78 | # optimizer = optim.SGD(model.parameters(), lr=0.5) 79 | optimizer = optim.Adam(model.parameters()) 80 | # scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5) 81 | criterion = nn.CrossEntropyLoss() 82 | best_loss = 100 83 | for i in range(epochs): 84 | epoch_loss = train(model, criterion, optimizer, train_loader) 85 | epoch_loss_val = validation(model, criterion, val_loader) 86 | # scheduler.step() 87 | print("epoch: {} train loss: {}".format(i, epoch_loss)) 88 | print("epoch: {} val loss: {}".format(i, epoch_loss_val)) 89 | if epoch_loss_val < best_loss: 90 | best_loss = epoch_loss_val 91 | model_name = "model/model_{0:.5f}.pt".format(epoch_loss_val) 92 | save(model.state_dict(), model_name) 93 | return model_name 94 | 95 | 96 | if __name__ == "__main__": 97 | parser = argparse.ArgumentParser(description='A PyTorch Transformer Language Model for Predicting Odd Numbers') 98 | parser.add_argument('--test_model', type=str, help='the model file to load') 99 | parser.add_argument('--train_model', type=str, help='the model file to load') 100 | args = parser.parse_args() 101 | hidden = 128 102 | nlayers = 2 103 | if args.test_model is None: 104 | if args.train_model is not None: 105 | model_name = main(args.train_model, hidden=hidden, nlayers=nlayers) 106 | else: 107 | model_name = main(hidden=hidden, nlayers=nlayers) 108 | else: 109 | model_name = args.test_model 110 | model = TransformerModel(10000, 10000, hidden=hidden, nlayers=nlayers) 111 | model.load_state_dict(load(model_name)) 112 | test(model, test_times=10) 113 | --------------------------------------------------------------------------------