├── mnist ├── requirements.txt ├── README.md └── main.py ├── snli ├── requirements.txt ├── util.py ├── model.py └── train.py ├── word_language_model ├── requirements.txt ├── README.md ├── data.py ├── model.py ├── generate.py └── main.py ├── imagenet ├── requirements.txt ├── README.md └── main.py ├── dcgan ├── requirements.txt ├── README.md └── main.py ├── mnist_hogwild ├── requirements.txt ├── main.py └── train.py ├── .gitignore ├── vae ├── requirements.txt ├── README.md └── main.py ├── reinforcement_learning ├── requirements.txt ├── README.md ├── reinforce.py └── actor_critic.py ├── OpenNMT ├── onmt │ ├── modules │ │ ├── __init__.py │ │ └── GlobalAttention.py │ ├── Constants.py │ ├── __init__.py │ ├── Dataset.py │ ├── Optim.py │ ├── Beam.py │ ├── Dict.py │ ├── Models.py │ └── Translator.py ├── LICENSE.md ├── README.md ├── translate.py ├── preprocess.py └── train.py ├── regression ├── README.md └── main.py ├── README.md ├── super_resolution ├── dataset.py ├── super_resolve.py ├── model.py ├── README.md ├── data.py └── main.py └── LICENSE /mnist/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | torchvision 3 | -------------------------------------------------------------------------------- /snli/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | torchtext 3 | -------------------------------------------------------------------------------- /word_language_model/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | -------------------------------------------------------------------------------- /imagenet/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | torchvision 3 | -------------------------------------------------------------------------------- /dcgan/requirements.txt: 
-------------------------------------------------------------------------------- 1 | torch 2 | torchvision 3 | lmdb 4 | -------------------------------------------------------------------------------- /mnist_hogwild/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | torchvision 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | dcgan/data 2 | data 3 | *.pyc 4 | OpenNMT/data 5 | -------------------------------------------------------------------------------- /vae/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | torchvision 3 | tqdm 4 | six 5 | -------------------------------------------------------------------------------- /reinforcement_learning/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | numpy 3 | gym 4 | -------------------------------------------------------------------------------- /OpenNMT/onmt/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from onmt.modules.GlobalAttention import GlobalAttention 2 | -------------------------------------------------------------------------------- /regression/README.md: -------------------------------------------------------------------------------- 1 | # Linear regression example 2 | 3 | Trains a single fully-connected layer to fit a 4th degree polynomial. 
# Reserved vocabulary indices of the four special tokens.
PAD = 0
UNK = 1
BOS = 2
EOS = 3

# Surface forms of the special tokens.  Each one must be a distinct,
# non-empty string: identical values would be indistinguishable from one
# another once stored in a vocabulary.  The spellings follow the upstream
# OpenNMT convention.
# NOTE(review): presumably these are registered with onmt.Dict during
# preprocessing -- confirm against preprocess.py.
PAD_WORD = '<blank>'
UNK_WORD = '<unk>'
BOS_WORD = '<s>'
EOS_WORD = '</s>'
These changes make the network converge much faster. 6 | 7 | ```bash 8 | pip install -r requirements.txt 9 | python main.py 10 | ``` 11 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PyTorch Examples 2 | 3 | A repository showcasing examples of using pytorch 4 | 5 | - MNIST Convnets 6 | - Word level Language Modeling using LSTM RNNs 7 | - Training Imagenet Classifiers with Residual Networks 8 | - Generative Adversarial Networks (DCGAN) 9 | - Variational Auto-Encoders 10 | - Superresolution using an efficient sub-pixel convolutional neural network 11 | - Hogwild training of shared ConvNets across multiple processes on MNIST 12 | - Training a CartPole to balance in OpenAI Gym with actor-critic 13 | - Natural Language Inference (SNLI) with GloVe vectors, LSTMs, and torchtext 14 | - Neural Machine Translation using sequence-to-sequence RNN with attention (OpenNMT) 15 | -------------------------------------------------------------------------------- /OpenNMT/LICENSE.md: -------------------------------------------------------------------------------- 1 | This software is derived from the OpenNMT project at 2 | https://github.com/OpenNMT/OpenNMT. 3 | 4 | The MIT License (MIT) 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in 14 | all copies or substantial portions of the Software. 
# Lower-case suffixes recognised as images; str.endswith accepts a tuple.
_IMAGE_EXTENSIONS = (".png", ".jpg", ".jpeg")


def is_image_file(filename):
    """Return True if *filename* ends with a supported image extension.

    The comparison is case-insensitive, so ``photo.JPG`` is accepted as well
    as ``photo.jpg``.
    """
    return filename.lower().endswith(_IMAGE_EXTENSIONS)


def load_img(filepath):
    """Open the image at *filepath* and return only its luminance channel.

    The image is converted to YCbCr; the Cb and Cr channels are discarded.
    """
    img = Image.open(filepath).convert('YCbCr')
    y, _, _ = img.split()
    return y


class DatasetFromFolder(data.Dataset):
    """Dataset of (input, target) pairs built from the images in one folder.

    Both elements of a pair start from the same luminance image; they differ
    only by the transform applied to each.
    """

    def __init__(self, image_dir, input_transform=None, target_transform=None):
        """Collect the image files found directly inside *image_dir*.

        Args:
            image_dir: directory to scan (not recursive).
            input_transform: optional callable applied to the input image.
            target_transform: optional callable applied to the target image.
        """
        super(DatasetFromFolder, self).__init__()
        # listdir() returns entries in arbitrary order; sorting keeps the
        # index -> file mapping stable across runs and machines.
        self.image_filenames = [join(image_dir, x)
                                for x in sorted(listdir(image_dir))
                                if is_image_file(x)]

        self.input_transform = input_transform
        self.target_transform = target_transform

    def __getitem__(self, index):
        """Return the ``(input, target)`` pair for the image at *index*."""
        input = load_img(self.image_filenames[index])
        # Copy before transforming so the two transforms never share an object.
        target = input.copy()
        if self.input_transform:
            input = self.input_transform(input)
        if self.target_transform:
            target = self.target_transform(target)

        return input, target

    def __len__(self):
        """Return the number of image files found."""
        return len(self.image_filenames)
# Inference settings
parser = argparse.ArgumentParser(description='PyTorch Super Res Example')
parser.add_argument('--input_image', type=str, required=True, help='input image to use')
parser.add_argument('--model', type=str, required=True, help='model file to use')
# Required: the result is always written to this path at the end of the
# script, so a missing value would otherwise only fail after inference ran.
parser.add_argument('--output_filename', type=str, required=True, help='where to save the output image')
parser.add_argument('--cuda', action='store_true', help='use cuda')
opt = parser.parse_args()

print(opt)
# Work in YCbCr: only the luminance (Y) channel is passed through the model.
img = Image.open(opt.input_image).convert('YCbCr')
y, cb, cr = img.split()

# NOTE: torch.load unpickles the file -- only load model files you trust.
model = torch.load(opt.model)
# PIL reports size as (width, height); the view puts height before width.
input = Variable(ToTensor()(y)).view(1, -1, y.size[1], y.size[0])

if opt.cuda:
    model = model.cuda()
    input = input.cuda()

out = model(input)
out = out.cpu()
# Scale to 0..255 and clip before the uint8 cast so values cannot wrap.
out_img_y = out.data[0].numpy()
out_img_y *= 255.0
out_img_y = out_img_y.clip(0, 255)
out_img_y = Image.fromarray(np.uint8(out_img_y[0]), mode='L')

# The chroma channels bypass the model: resize them with bicubic
# interpolation to the size of the new luminance image, then recombine.
out_img_cb = cb.resize(out_img_y.size, Image.BICUBIC)
out_img_cr = cr.resize(out_img_y.size, Image.BICUBIC)
out_img = Image.merge('YCbCr', [out_img_y, out_img_cb, out_img_cr]).convert('RGB')

out_img.save(opt.output_filename)
print('output image saved to ', opt.output_filename)
15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
class Dataset(object):
    """Groups parallel source/target sequences into padded mini-batches.

    ``dataset[i]`` returns batch ``i`` as ``(srcBatch, tgtBatch)``;
    ``tgtBatch`` is ``None`` when no target data was supplied.  Only full
    batches are served (``len(srcData) // batchSize`` of them), so a
    trailing partial batch is never returned.
    """

    def __init__(self, srcData, tgtData, batchSize, cuda):
        """Store the data and compute the number of full batches.

        Args:
            srcData: sliceable collection of source sequences.
            tgtData: matching target sequences, or a falsy value for none.
            batchSize: number of sequences per batch.
            cuda: when true, every batch is moved to the GPU.
        """
        self.src = srcData
        if tgtData:
            self.tgt = tgtData
            assert(len(self.src) == len(self.tgt))
        else:
            self.tgt = None
        self.cuda = cuda

        self.batchSize = batchSize
        # Floor division: sequences past the last full batch are dropped.
        self.numBatches = len(self.src) // batchSize

    def _batchify(self, data, align_right=False):
        """Pad the tensors in *data* to equal length and stack them.

        Args:
            data: non-empty list of 1-D tensors.
            align_right: pad on the left (sequence flush right) when true,
                otherwise pad on the right.

        Returns:
            A ``Variable`` of shape ``max_length x len(data)``.
        """
        max_length = max(x.size(0) for x in data)
        # Start from a (batch x max_length) tensor full of padding.
        out = data[0].new(len(data), max_length).fill_(onmt.Constants.PAD)
        for i, seq in enumerate(data):
            data_length = seq.size(0)
            offset = max_length - data_length if align_right else 0
            out[i].narrow(0, offset, data_length).copy_(seq)

        # Transpose to (max_length x batch).
        out = out.t().contiguous()
        if self.cuda:
            out = out.cuda()

        return Variable(out)

    def __getitem__(self, index):
        """Return batch number *index* as ``(srcBatch, tgtBatch)``.

        Source batches are right-aligned, target batches left-aligned.

        Raises:
            IndexError: if *index* is outside ``0 <= index < len(self)``.
        """
        # IndexError rather than an assert: the check survives ``python -O``
        # and a plain ``for batch in dataset`` loop terminates cleanly.
        if not 0 <= index < self.numBatches:
            raise IndexError("batch index %d out of range (%d batches)"
                             % (index, self.numBatches))

        start = index * self.batchSize
        stop = start + self.batchSize
        srcBatch = self._batchify(self.src[start:stop], align_right=True)

        if self.tgt:
            tgtBatch = self._batchify(self.tgt[start:stop])
        else:
            tgtBatch = None

        return srcBatch, tgtBatch

    def __len__(self):
        """Return the number of full batches."""
        return self.numBatches
def get_args(argv=None):
    """Parse the command-line options of the SNLI example.

    Args:
        argv: optional list of argument strings.  ``None`` (the default)
            makes argparse read ``sys.argv[1:]``.

    Returns:
        The populated ``argparse.Namespace``.  The switches
        ``--no-bidirectional``, ``--preserve-case``, ``--no-projection`` and
        ``--train_embed`` are ``store_false`` flags: their destinations
        (``birnn``, ``lower``, ``projection``, ``fix_emb``) default to True
        and become False when the switch is given.
    """
    parser = ArgumentParser(description='PyTorch/torchtext SNLI example')
    parser.add_argument('--epochs', type=int, default=50)
    parser.add_argument('--batch_size', type=int, default=128)
    parser.add_argument('--d_embed', type=int, default=300)
    parser.add_argument('--d_proj', type=int, default=300)
    parser.add_argument('--d_hidden', type=int, default=300)
    parser.add_argument('--n_layers', type=int, default=1)
    parser.add_argument('--log_every', type=int, default=50)
    parser.add_argument('--lr', type=float, default=.001)
    parser.add_argument('--dev_every', type=int, default=1000)
    parser.add_argument('--save_every', type=int, default=1000)
    # A ratio is fractional: with type=int any value such as 0.5 given on
    # the command line was rejected by argparse.
    parser.add_argument('--dp_ratio', type=float, default=0.2)
    parser.add_argument('--no-bidirectional', action='store_false', dest='birnn')
    parser.add_argument('--preserve-case', action='store_false', dest='lower')
    parser.add_argument('--no-projection', action='store_false', dest='projection')
    parser.add_argument('--train_embed', action='store_false', dest='fix_emb')
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--save_path', type=str, default='results')
    parser.add_argument('--data_cache', type=str, default=os.path.join(os.getcwd(), '.data_cache'))
    parser.add_argument('--vector_cache', type=str, default=os.path.join(os.getcwd(), '.vector_cache/input_vectors.pt'))
    parser.add_argument('--word_vectors', type=str, default='glove.42B')
    parser.add_argument('--resume_snapshot', type=str, default='')
    return parser.parse_args(argv)
nn.Conv2d(32, upscale_factor ** 2, (3, 3), (1, 1), (1, 1)) 29 | self.pixel_shuffle = nn.PixelShuffle(upscale_factor) 30 | 31 | self._initialize_weights() 32 | 33 | def forward(self, x): 34 | x = self.relu(self.conv1(x)) 35 | x = self.relu(self.conv2(x)) 36 | x = self.relu(self.conv3(x)) 37 | x = self.pixel_shuffle(self.conv4(x)) 38 | return x 39 | 40 | def _initialize_weights(self): 41 | self.conv1.weight.data.copy_(_get_orthogonal_init_weights(self.conv1.weight) * sqrt(2)) 42 | self.conv2.weight.data.copy_(_get_orthogonal_init_weights(self.conv2.weight) * sqrt(2)) 43 | self.conv3.weight.data.copy_(_get_orthogonal_init_weights(self.conv3.weight) * sqrt(2)) 44 | self.conv4.weight.data.copy_(_get_orthogonal_init_weights(self.conv4.weight)) 45 | -------------------------------------------------------------------------------- /super_resolution/README.md: -------------------------------------------------------------------------------- 1 | # Superresolution using an efficient sub-pixel convolutional neural network 2 | 3 | This example illustrates how to use the efficient sub-pixel convolution layer described in ["Real-Time Single Image and Video Super-Resolution Using an Efficient Sub-Pixel Convolutional Neural Network" - Shi et al.](https://arxiv.org/abs/1609.05158) for increasing spatial resolution within your network for tasks such as superresolution. 4 | 5 | ``` 6 | usage: main.py [-h] --upscale_factor UPSCALE_FACTOR [--batchSize BATCHSIZE] 7 | [--testBatchSize TESTBATCHSIZE] [--nEpochs NEPOCHS] [--lr LR] 8 | [--cuda] [--threads THREADS] [--seed SEED] 9 | 10 | PyTorch Super Res Example 11 | 12 | optional arguments: 13 | -h, --help show this help message and exit 14 | --upscale_factor super resolution upscale factor 15 | --batchSize training batch size 16 | --testBatchSize testing batch size 17 | --nEpochs number of epochs to train for 18 | --lr Learning Rate. 
def poly_desc(W, b):
    """Creates a string description of a polynomial.

    Args:
        W: sequence of coefficients in ascending order of power, i.e.
            ``W[i]`` multiplies ``x^(i+1)``.  This matches the feature
            columns ``[x, x^2, x^3, x^4]`` built by ``make_features``.
        b: sequence whose first element is the constant term.

    Returns:
        A string such as ``'y = +1.00 x^1 -2.00 x^2 +0.50'``.
    """
    result = 'y = '
    for i, w in enumerate(W):
        # Weight i belongs to the (i+1)-th power, not to len(W) - i.
        result += '{:+.2f} x^{} '.format(w, i + 1)
    result += '{:+.2f}'.format(b[0])
    return result
(x, f(x)) pair.""" 37 | random = torch.randn(batch_size) 38 | x = make_features(random) 39 | y = f(x) 40 | return Variable(x), Variable(y) 41 | 42 | 43 | # Define model 44 | fc = torch.nn.Linear(W_target.size(0), 1) 45 | 46 | for batch_idx in count(1): 47 | # Get data 48 | batch_x, batch_y = get_batch() 49 | 50 | # Reset gradients 51 | fc.zero_grad() 52 | 53 | # Forward pass 54 | output = F.smooth_l1_loss(fc(batch_x), batch_y) 55 | loss = output.data[0] 56 | 57 | # Backward pass 58 | output.backward() 59 | 60 | # Apply gradients 61 | for param in fc.parameters(): 62 | param.data.add_(-0.1 * param.grad.data) 63 | 64 | # Stop criterion 65 | if loss < 1e-3: 66 | break 67 | 68 | print('Loss: {:.6f} after {} batches'.format(loss, batch_idx)) 69 | print('==> Learned function:\t' + poly_desc(fc.weight.data.view(-1), fc.bias.data)) 70 | print('==> Actual function:\t' + poly_desc(W_target.view(-1), b_target)) 71 | -------------------------------------------------------------------------------- /OpenNMT/onmt/modules/GlobalAttention.py: -------------------------------------------------------------------------------- 1 | """ 2 | Global attention takes a matrix and a query vector. It 3 | then computes a parameterized convex combination of the matrix 4 | based on the input query. 5 | 6 | 7 | H_1 H_2 H_3 ... H_n 8 | q q q q 9 | | | | | 10 | \ | | / 11 | ..... 12 | \ | / 13 | a 14 | 15 | Constructs a unit mapping. 16 | $$(H_1 + H_n, q) => (a)$$ 17 | Where H is of `batch x n x dim` and q is of `batch x dim`. 
class GlobalAttention(nn.Module):
    """Attention of one query vector over a set of context vectors.

    The query is projected by a bias-free linear layer, scored against every
    context vector with a dot product, and the softmax of the scores weights
    the context vectors.  The weighted context is concatenated with the
    query, projected back to ``dim`` and passed through tanh.
    """

    def __init__(self, dim):
        super(GlobalAttention, self).__init__()
        self.linear_in = nn.Linear(dim, dim, bias=False)
        self.sm = nn.Softmax()
        self.linear_out = nn.Linear(dim*2, dim, bias=False)
        self.tanh = nn.Tanh()
        # No mask until applyMask() is called.
        self.mask = None

    def applyMask(self, mask):
        """Remember *mask*; forward() fills the masked scores with -inf."""
        self.mask = mask

    def forward(self, input, context):
        """
        input: batch x dim
        context: batch x sourceL x dim

        Returns (output, attention): batch x dim and batch x sourceL.
        """
        # Project the query and make it a column: batch x dim x 1
        query = self.linear_in(input).unsqueeze(2)

        # Dot product with every context vector: batch x sourceL
        scores = torch.bmm(context, query).squeeze(2)
        if self.mask is not None:
            scores.data.masked_fill_(self.mask, -float('inf'))
        attn = self.sm(scores)

        # Row of weights times the context matrix: batch x 1 x sourceL -> batch x dim
        weights_row = attn.unsqueeze(1)
        weightedContext = torch.bmm(weights_row, context).squeeze(1)

        combined = torch.cat((weightedContext, input), 1)
        projected = self.linear_out(combined)

        return self.tanh(projected), attn
9 | 10 | After every epoch, models are saved to: `netG_epoch_%d.pth` and `netD_epoch_%d.pth` 11 | 12 | ## Downloading the dataset 13 | You can download the LSUN dataset by cloning [this repo](https://github.com/fyu/lsun) and running 14 | ``` 15 | python download.py -c bedroom 16 | ``` 17 | 18 | ## Usage 19 | ``` 20 | usage: main.py [-h] --dataset DATASET --dataroot DATAROOT [--workers WORKERS] 21 | [--batchSize BATCHSIZE] [--imageSize IMAGESIZE] [--nz NZ] 22 | [--ngf NGF] [--ndf NDF] [--niter NITER] [--lr LR] 23 | [--beta1 BETA1] [--cuda] [--ngpu NGPU] [--netG NETG] 24 | [--netD NETD] 25 | 26 | optional arguments: 27 | -h, --help show this help message and exit 28 | --dataset DATASET cifar10 | lsun | imagenet | folder | lfw 29 | --dataroot DATAROOT path to dataset 30 | --workers WORKERS number of data loading workers 31 | --batchSize BATCHSIZE 32 | input batch size 33 | --imageSize IMAGESIZE 34 | the height / width of the input image to network 35 | --nz NZ size of the latent z vector 36 | --ngf NGF 37 | --ndf NDF 38 | --niter NITER number of epochs to train for 39 | --lr LR learning rate, default=0.0002 40 | --beta1 BETA1 beta1 for adam. 
default=0.5 41 | --cuda enables cuda 42 | --ngpu NGPU number of GPUs to use 43 | --netG NETG path to netG (to continue training) 44 | --netD NETD path to netD (to continue training) 45 | ``` 46 | -------------------------------------------------------------------------------- /word_language_model/model.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from torch.autograd import Variable 3 | 4 | class RNNModel(nn.Module): 5 | """Container module with an encoder, a recurrent module, and a decoder.""" 6 | 7 | def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers): 8 | super(RNNModel, self).__init__() 9 | self.encoder = nn.Embedding(ntoken, ninp) 10 | if rnn_type in ['LSTM', 'GRU']: 11 | self.rnn = getattr(nn, rnn_type)(ninp, nhid, nlayers, bias=False) 12 | else: 13 | try: 14 | nonlinearity = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}[rnn_type] 15 | except KeyError: 16 | raise ValueError( """An invalid option for `--model` was supplied, 17 | options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']""") 18 | self.rnn = nn.RNN(ninp, nhid, nlayers, nonlinearity=nonlinearity, bias=False) 19 | self.decoder = nn.Linear(nhid, ntoken) 20 | 21 | self.init_weights() 22 | 23 | self.rnn_type = rnn_type 24 | self.nhid = nhid 25 | self.nlayers = nlayers 26 | 27 | def init_weights(self): 28 | initrange = 0.1 29 | self.encoder.weight.data.uniform_(-initrange, initrange) 30 | self.decoder.bias.data.fill_(0) 31 | self.decoder.weight.data.uniform_(-initrange, initrange) 32 | 33 | def forward(self, input, hidden): 34 | emb = self.encoder(input) 35 | output, hidden = self.rnn(emb, hidden) 36 | decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2))) 37 | return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden 38 | 39 | def init_hidden(self, bsz): 40 | weight = next(self.parameters()).data 41 | if self.rnn_type == 'LSTM': 42 | return (Variable(weight.new(self.nlayers, bsz, 
class Optim(object):
    """Wraps a ``torch.optim`` optimizer with gradient clipping and LR decay.

    ``step()`` rescales all gradients so that their global L2 norm does not
    exceed ``max_grad_norm`` before delegating to the wrapped optimizer.
    ``updateLearningRate()`` multiplies the learning rate by ``lr_decay``
    once decay has been triggered.
    """

    def _makeOptimizer(self):
        """Build ``self.optimizer`` for ``self.method`` at the current lr.

        Raises:
            RuntimeError: if ``self.method`` is not one of ``'sgd'``,
                ``'adagrad'``, ``'adadelta'`` or ``'adam'``.
        """
        if self.method == 'sgd':
            self.optimizer = optim.SGD(self.params, lr=self.lr)
        elif self.method == 'adagrad':
            self.optimizer = optim.Adagrad(self.params, lr=self.lr)
        elif self.method == 'adadelta':
            self.optimizer = optim.Adadelta(self.params, lr=self.lr)
        elif self.method == 'adam':
            self.optimizer = optim.Adam(self.params, lr=self.lr)
        else:
            raise RuntimeError("Invalid optim method: " + self.method)

    def __init__(self, params, method, lr, max_grad_norm, lr_decay=1, start_decay_at=None):
        """Create the wrapper and its underlying optimizer.

        Args:
            params: iterable of parameters to optimise.
            method: ``'sgd'``, ``'adagrad'``, ``'adadelta'`` or ``'adam'``.
            lr: initial learning rate.
            max_grad_norm: upper bound for the global gradient norm.
            lr_decay: factor applied to the learning rate on each decay.
            start_decay_at: epoch from which decay is always applied, or
                ``None`` to decay only when validation perplexity worsens.
        """
        self.params = list(params)  # careful: params may be a generator
        self.last_ppl = None
        self.lr = lr
        self.max_grad_norm = max_grad_norm
        self.method = method
        self.lr_decay = lr_decay
        self.start_decay_at = start_decay_at
        self.start_decay = False

        self._makeOptimizer()

    def step(self):
        """Clip the gradients, take one optimizer step, return the norm.

        Returns:
            The global L2 norm of the gradients before clipping, as a float.
        """
        # Parameters that took no part in the backward pass have no gradient.
        with_grad = [p for p in self.params if p.grad is not None]

        # Compute gradients norm.
        grad_norm = 0
        for param in with_grad:
            grad_norm += math.pow(param.grad.data.norm(), 2)
        grad_norm = math.sqrt(grad_norm)

        # An all-zero gradient needs no clipping; guarding here also avoids
        # dividing by zero.
        if grad_norm > 0:
            shrinkage = self.max_grad_norm / grad_norm
            if shrinkage < 1:
                for param in with_grad:
                    param.grad.data.mul_(shrinkage)

        self.optimizer.step()
        return grad_norm

    # decay learning rate if val perf does not improve or we hit the start_decay_at limit
    def updateLearningRate(self, ppl, epoch):
        """Decay the learning rate when validation stops improving.

        Decay starts once ``epoch >= start_decay_at`` or once *ppl* is worse
        than the previous call's value; after that it is applied on every
        call.

        Args:
            ppl: validation perplexity of the epoch just finished.
            epoch: number of that epoch.
        """
        if self.start_decay_at is not None and epoch >= self.start_decay_at:
            self.start_decay = True
        if self.last_ppl is not None and ppl > self.last_ppl:
            self.start_decay = True

        if self.start_decay:
            self.lr = self.lr * self.lr_decay
            print("Decaying learning rate to %g" % self.lr)
            # Update the existing optimizer in place: rebuilding it would
            # throw away accumulated state such as Adam's moment estimates.
            for group in self.optimizer.param_groups:
                group['lr'] = self.lr

        self.last_ppl = ppl
calculate_valid_crop_size(crop_size, upscale_factor): 35 | return crop_size - (crop_size % upscale_factor) 36 | 37 | 38 | def input_transform(crop_size, upscale_factor): 39 | return Compose([ 40 | CenterCrop(crop_size), 41 | Scale(crop_size // upscale_factor), 42 | ToTensor(), 43 | ]) 44 | 45 | 46 | def target_transform(crop_size): 47 | return Compose([ 48 | CenterCrop(crop_size), 49 | ToTensor(), 50 | ]) 51 | 52 | 53 | def get_training_set(upscale_factor): 54 | root_dir = download_bsd300() 55 | train_dir = join(root_dir, "train") 56 | crop_size = calculate_valid_crop_size(256, upscale_factor) 57 | 58 | return DatasetFromFolder(train_dir, 59 | input_transform=input_transform(crop_size, upscale_factor), 60 | target_transform=target_transform(crop_size)) 61 | 62 | 63 | def get_test_set(upscale_factor): 64 | root_dir = download_bsd300() 65 | test_dir = join(root_dir, "test") 66 | crop_size = calculate_valid_crop_size(256, upscale_factor) 67 | 68 | return DatasetFromFolder(test_dir, 69 | input_transform=input_transform(crop_size, upscale_factor), 70 | target_transform=target_transform(crop_size)) 71 | -------------------------------------------------------------------------------- /imagenet/README.md: -------------------------------------------------------------------------------- 1 | # ImageNet training in PyTorch 2 | 3 | This implements training of popular model architectures, such as ResNet, AlexNet, and VGG on the ImageNet dataset. 
4 | 5 | ## Requirements 6 | 7 | - Install PyTorch ([pytorch.org](http://pytorch.org)) 8 | - `pip install -r requirements.txt` 9 | - Download the ImageNet dataset and move validation images to labeled subfolders 10 | 11 | ## Training 12 | 13 | To train a model, run `main.py` with the desired model architecture and the path to the ImageNet dataset: 14 | 15 | ```bash 16 | python main.py -a resnet18 [imagenet-folder with train and val folders] 17 | ``` 18 | 19 | The default learning rate schedule starts at 0.1 and decays by a factor of 10 every 30 epochs. This is appropriate for ResNet and models with batch normalization, but too high for AlexNet and VGG. Use 0.01 as the initial learning rate for AlexNet or VGG: 20 | 21 | ```bash 22 | python main.py -a alexnet --lr 0.01 [imagenet-folder with train and val folders] 23 | ``` 24 | 25 | ## Usage 26 | 27 | ``` 28 | usage: main.py [-h] [--arch ARCH] [-j N] [--epochs N] [--start-epoch N] [-b N] 29 | [--lr LR] [--momentum M] [--weight-decay W] [--print-freq N] 30 | [--resume PATH] [-e] [--pretrained] 31 | DIR 32 | 33 | PyTorch ImageNet Training 34 | 35 | positional arguments: 36 | DIR path to dataset 37 | 38 | optional arguments: 39 | -h, --help show this help message and exit 40 | --arch ARCH, -a ARCH model architecture: alexnet | resnet | resnet101 | 41 | resnet152 | resnet18 | resnet34 | resnet50 | vgg | 42 | vgg11 | vgg11_bn | vgg13 | vgg13_bn | vgg16 | vgg16_bn 43 | | vgg19 | vgg19_bn (default: resnet18) 44 | -j N, --workers N number of data loading workers (default: 4) 45 | --epochs N number of total epochs to run 46 | --start-epoch N manual epoch number (useful on restarts) 47 | -b N, --batch-size N mini-batch size (default: 256) 48 | --lr LR, --learning-rate LR 49 | initial learning rate 50 | --momentum M momentum 51 | --weight-decay W, --wd W 52 | weight decay (default: 1e-4) 53 | --print-freq N, -p N print frequency (default: 10) 54 | --resume PATH path to latest checkpoint (default: none) 55 | -e, --evaluate 
evaluate model on validation set 56 | --pretrained use pre-trained model 57 | ``` 58 | -------------------------------------------------------------------------------- /OpenNMT/README.md: -------------------------------------------------------------------------------- 1 | # OpenNMT: Open-Source Neural Machine Translation 2 | 3 | This is a [Pytorch](https://github.com/pytorch/pytorch) 4 | port of [OpenNMT](https://github.com/OpenNMT/OpenNMT), 5 | an open-source (MIT) neural machine translation system. 6 | 7 |
8 | 9 | ## Quickstart 10 | 11 | OpenNMT consists of three commands: 12 | 13 | 0) Download the data. 14 | 15 | ```wget https://s3.amazonaws.com/pytorch/examples/opennmt/data/onmt-data.tar && tar -xf onmt-data.tar``` 16 | 17 | 1) Preprocess the data. 18 | 19 | ```python preprocess.py -train_src data/src-train.txt -train_tgt data/tgt-train.txt -valid_src data/src-val.txt -valid_tgt data/tgt-val.txt -save_data data/demo``` 20 | 21 | 2) Train the model. 22 | 23 | ```python train.py -data data/demo-train.pt -save_model model -cuda``` 24 | 25 | 3) Translate sentences. 26 | 27 | ```python translate.py -cuda -model model_e13_*.pt -src data/src-test.txt -tgt data/tgt-test.txt -replace_unk -verbose``` 28 | 29 | ## Pretrained Models 30 | 31 | The following pretrained models can be downloaded and used with translate.py. 32 | 33 | - [onmt_model_en_de_200k](https://s3.amazonaws.com/pytorch/examples/opennmt/models/onmt_model_en_de_200k-4783d9c3.pt): An English-German translation model based on the 200k sentence dataset at [OpenNMT/IntegrationTesting](https://github.com/OpenNMT/IntegrationTesting/tree/master/data). Perplexity: 21. 34 | - [onmt_model_en_fr_b1M](https://s3.amazonaws.com/pytorch/examples/opennmt/models/onmt_model_en_fr_b1M-261c69a7.pt): An English-French model trained on benchmark-1M. Perplexity: 4.85. 35 | 36 | ## Release Notes 37 | 38 | The following OpenNMT features are implemented: 39 | 40 | - multi-layer bidirectional RNNs with attention and dropout 41 | - data preprocessing 42 | - saving and loading from checkpoints 43 | - inference (translation) with batching and beam search 44 | 45 | Not yet implemented: 46 | 47 | - word features 48 | - multi-GPU 49 | - residual connections 50 | 51 | ## Performance 52 | 53 | With default parameters on a single Maxwell GPU, this version runs about 70% faster than the Lua torch OpenNMT. 
class Net(nn.Module):
    """Small convolutional classifier for 1x28x28 digit images (10 classes).

    Two 5x5 convolutions, each followed by 2x2 max-pooling and ReLU, feed two
    fully-connected layers. ``forward`` returns per-class log-probabilities
    suitable for ``F.nll_loss``.
    """

    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        # 20 channels * 4 * 4 spatial positions remain after the two
        # conv + pool stages on a 28x28 input.
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        """Return log-probabilities of shape (batch, 10) for input ``x``."""
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        # The logits go into log_softmax unrectified: a ReLU here clamps every
        # negative logit to 0, so the model can never push a class below the
        # "zero logit" level and low-confidence classes become indistinguishable.
        x = self.fc2(x)
        # NOTE(review): newer PyTorch releases want an explicit ``dim=1`` here;
        # left implicit to stay compatible with the API level this file uses.
        return F.log_softmax(x)
optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum) 29 | for epoch in range(1, args.epochs + 1): 30 | train_epoch(epoch, args, model, train_loader, optimizer) 31 | test_epoch(model, test_loader) 32 | 33 | 34 | def train_epoch(epoch, args, model, data_loader, optimizer): 35 | model.train() 36 | pid = os.getpid() 37 | for batch_idx, (data, target) in enumerate(data_loader): 38 | data, target = Variable(data), Variable(target) 39 | optimizer.zero_grad() 40 | output = model(data) 41 | loss = F.nll_loss(output, target) 42 | loss.backward() 43 | optimizer.step() 44 | if batch_idx % args.log_interval == 0: 45 | print('{}\tTrain Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( 46 | pid, epoch, batch_idx * len(data), len(data_loader.dataset), 47 | 100. * batch_idx / len(data_loader), loss.data[0])) 48 | 49 | 50 | def test_epoch(model, data_loader): 51 | model.eval() 52 | test_loss = 0 53 | correct = 0 54 | for data, target in data_loader: 55 | data, target = Variable(data, volatile=True), Variable(target) 56 | output = model(data) 57 | test_loss += F.nll_loss(output, target).data[0] 58 | pred = output.data.max(1)[1] # get the index of the max log-probability 59 | correct += pred.eq(target.data).cpu().sum() 60 | 61 | test_loss = test_loss 62 | test_loss /= len(data_loader) # loss function already averages over batch size 63 | print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( 64 | test_loss, correct, len(data_loader.dataset), 65 | 100. 
* correct / len(data_loader.dataset))) 66 | -------------------------------------------------------------------------------- /word_language_model/generate.py: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Language Modeling on Penn Tree Bank 3 | # 4 | # This file generates new sentences sampled from the language model 5 | # 6 | ############################################################################### 7 | 8 | import argparse 9 | 10 | import torch 11 | from torch.autograd import Variable 12 | 13 | import data 14 | 15 | parser = argparse.ArgumentParser(description='PyTorch PTB Language Model') 16 | 17 | # Model parameters. 18 | parser.add_argument('--data', type=str, default='./data/penn', 19 | help='location of the data corpus') 20 | parser.add_argument('--checkpoint', type=str, default='./model.pt', 21 | help='model checkpoint to use') 22 | parser.add_argument('--outf', type=str, default='generated.txt', 23 | help='output file for generated text') 24 | parser.add_argument('--words', type=int, default='1000', 25 | help='number of words to generate') 26 | parser.add_argument('--seed', type=int, default=1111, 27 | help='random seed') 28 | parser.add_argument('--cuda', action='store_true', 29 | help='use CUDA') 30 | parser.add_argument('--temperature', type=float, default=1.0, 31 | help='temperature - higher will increase diversity') 32 | parser.add_argument('--log-interval', type=int, default=100, 33 | help='reporting interval') 34 | args = parser.parse_args() 35 | 36 | # Set the random seed manually for reproducibility. 
class Encoder(nn.Module):
    """LSTM sentence encoder that returns the final hidden state per batch item.

    All sizes come from ``config``: ``d_proj``/``d_embed`` (input width,
    selected by ``projection``), ``d_hidden``, ``n_layers``, ``dp_ratio``,
    ``birnn`` and ``n_cells`` (first dimension of the initial state).
    """

    def __init__(self, config):
        super(Encoder, self).__init__()
        self.config = config
        # The LSTM consumes projected embeddings when a projection layer is
        # enabled, raw embeddings otherwise.
        if config.projection:
            rnn_input_dim = config.d_proj
        else:
            rnn_input_dim = config.d_embed
        self.rnn = nn.LSTM(
            input_size=rnn_input_dim,
            hidden_size=config.d_hidden,
            num_layers=config.n_layers,
            dropout=config.dp_ratio,
            bidirectional=config.birnn,
        )

    def forward(self, inputs):
        """Encode ``inputs`` (batch on dimension 1) into one vector per batch item.

        Returns the last layer's final hidden state; for a bidirectional LSTM
        the last two final states are concatenated per batch item.
        """
        cfg = self.config
        n_batch = inputs.size(1)
        # A single zero tensor serves as both the initial hidden and cell state.
        zero_state = Variable(
            inputs.data.new(cfg.n_cells, n_batch, cfg.d_hidden).zero_())
        _, (final_h, _) = self.rnn(inputs, (zero_state, zero_state))
        if cfg.birnn:
            last_two = final_h[-2:]
            return last_two.transpose(0, 1).contiguous().view(n_batch, -1)
        return final_h[-1]
class Policy(nn.Module):
    """Two-layer policy network: 4 input features -> 128 hidden units -> 2 action probabilities.

    The instance also carries the per-episode buffers ``saved_actions`` and
    ``rewards``, which start out empty.
    """

    def __init__(self):
        super(Policy, self).__init__()
        # Layers are created in this order so seeded initialisation is unchanged.
        self.affine1 = nn.Linear(4, 128)
        self.affine2 = nn.Linear(128, 2)

        # Episode buffers.
        self.saved_actions = []
        self.rewards = []

    def forward(self, x):
        """Return the softmax over the two action scores for input ``x``."""
        hidden = F.relu(self.affine1(x))
        return F.softmax(self.affine2(hidden))
= torch.Tensor(rewards) 66 | rewards = (rewards - rewards.mean()) / (rewards.std() + np.finfo(np.float32).eps) 67 | for action, r in zip(model.saved_actions, rewards): 68 | action.reinforce(r) 69 | optimizer.zero_grad() 70 | autograd.backward(model.saved_actions, [None for _ in model.saved_actions]) 71 | optimizer.step() 72 | del model.rewards[:] 73 | del model.saved_actions[:] 74 | 75 | 76 | running_reward = 10 77 | for i_episode in count(1): 78 | state = env.reset() 79 | for t in range(10000): # Don't infinite loop while learning 80 | action = select_action(state) 81 | state, reward, done, _ = env.step(action[0,0]) 82 | if args.render: 83 | env.render() 84 | model.rewards.append(reward) 85 | if done: 86 | break 87 | 88 | running_reward = running_reward * 0.99 + t * 0.01 89 | finish_episode() 90 | if i_episode % args.log_interval == 0: 91 | print('Episode {}\tLast length: {:5d}\tAverage length: {:.2f}'.format( 92 | i_episode, t, running_reward)) 93 | if running_reward > 200: 94 | print("Solved! 
Running reward is now {} and " 95 | "the last episode runs to {} time steps!".format(running_reward, t)) 96 | break 97 | -------------------------------------------------------------------------------- /super_resolution/main.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import argparse 3 | from math import log10 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.optim as optim 8 | from torch.autograd import Variable 9 | from torch.utils.data import DataLoader 10 | from model import Net 11 | from data import get_training_set, get_test_set 12 | 13 | # Training settings 14 | parser = argparse.ArgumentParser(description='PyTorch Super Res Example') 15 | parser.add_argument('--upscale_factor', type=int, required=True, help="super resolution upscale factor") 16 | parser.add_argument('--batchSize', type=int, default=64, help='training batch size') 17 | parser.add_argument('--testBatchSize', type=int, default=10, help='testing batch size') 18 | parser.add_argument('--nEpochs', type=int, default=2, help='number of epochs to train for') 19 | parser.add_argument('--lr', type=float, default=0.01, help='Learning Rate. Default=0.01') 20 | parser.add_argument('--cuda', action='store_true', help='use cuda?') 21 | parser.add_argument('--threads', type=int, default=4, help='number of threads for data loader to use') 22 | parser.add_argument('--seed', type=int, default=123, help='random seed to use. 
Default=123') 23 | opt = parser.parse_args() 24 | 25 | print(opt) 26 | 27 | cuda = opt.cuda 28 | if cuda and not torch.cuda.is_available(): 29 | raise Exception("No GPU found, please run without --cuda") 30 | 31 | torch.manual_seed(opt.seed) 32 | if cuda: 33 | torch.cuda.manual_seed(opt.seed) 34 | 35 | print('===> Loading datasets') 36 | train_set = get_training_set(opt.upscale_factor) 37 | test_set = get_test_set(opt.upscale_factor) 38 | training_data_loader = DataLoader(dataset=train_set, num_workers=opt.threads, batch_size=opt.batchSize, shuffle=True) 39 | testing_data_loader = DataLoader(dataset=test_set, num_workers=opt.threads, batch_size=opt.testBatchSize, shuffle=False) 40 | 41 | print('===> Building model') 42 | model = Net(upscale_factor=opt.upscale_factor) 43 | criterion = nn.MSELoss() 44 | 45 | if cuda: 46 | model = model.cuda() 47 | criterion = criterion.cuda() 48 | 49 | optimizer = optim.Adam(model.parameters(), lr=opt.lr) 50 | 51 | 52 | def train(epoch): 53 | epoch_loss = 0 54 | for iteration, batch in enumerate(training_data_loader, 1): 55 | input, target = Variable(batch[0]), Variable(batch[1]) 56 | if cuda: 57 | input = input.cuda() 58 | target = target.cuda() 59 | 60 | optimizer.zero_grad() 61 | loss = criterion(model(input), target) 62 | epoch_loss += loss.data[0] 63 | loss.backward() 64 | optimizer.step() 65 | 66 | print("===> Epoch[{}]({}/{}): Loss: {:.4f}".format(epoch, iteration, len(training_data_loader), loss.data[0])) 67 | 68 | print("===> Epoch {} Complete: Avg. Loss: {:.4f}".format(epoch, epoch_loss / len(training_data_loader))) 69 | 70 | 71 | def test(): 72 | avg_psnr = 0 73 | for batch in testing_data_loader: 74 | input, target = Variable(batch[0]), Variable(batch[1]) 75 | if cuda: 76 | input = input.cuda() 77 | target = target.cuda() 78 | 79 | prediction = model(input) 80 | mse = criterion(prediction, target) 81 | psnr = 10 * log10(1 / mse.data[0]) 82 | avg_psnr += psnr 83 | print("===> Avg. 
class Beam(object):
    """State of a beam search: scores, back-pointers, outputs and attention."""

    def __init__(self, size, cuda=False):
        """Create an empty beam of width ``size`` (CUDA tensors if ``cuda``)."""
        self.size = size
        self.done = False

        # Tensor namespace used for allocation: torch.cuda or torch.
        self.tt = torch.cuda if cuda else torch

        # The score for each translation on the beam.
        self.scores = self.tt.FloatTensor(size).zero_()

        # The backpointers at each time-step.
        self.prevKs = []

        # The outputs at each time-step.
        self.nextYs = [self.tt.LongTensor(size).fill_(onmt.Constants.PAD)]
        self.nextYs[0][0] = onmt.Constants.BOS

        # The attentions (matrix) for each time.
        self.attn = []

    # Get the outputs for the current timestep.
    def getCurrentState(self):
        return self.nextYs[-1]

    # Get the backpointers for the current timestep.
    def getCurrentOrigin(self):
        return self.prevKs[-1]

    # Given prob over words for every last beam `wordLk` and attention
    # `attnOut`: Compute and update the beam search.
    #
    # Parameters:
    #
    # * `wordLk`- probs of advancing from the last step (K x words)
    # * `attnOut`- attention at the last step
    #
    # Returns: True if beam search is complete.
    def advance(self, wordLk, attnOut):
        numWords = wordLk.size(1)

        # Sum the previous scores.
        if len(self.prevKs) > 0:
            beamLk = wordLk + self.scores.unsqueeze(1).expand_as(wordLk)
        else:
            # First step: only row 0 is expanded (position 0 holds BOS).
            beamLk = wordLk[0]

        flatBeamLk = beamLk.view(-1)

        bestScores, bestScoresId = flatBeamLk.topk(self.size, 0, True, True)
        self.scores = bestScores

        # bestScoresId is flattened beam x word array, so calculate which
        # word and beam each score came from
        # NOTE(review): this relies on `/` being integer division for a
        # LongTensor; confirm against the installed torch version.
        prevK = bestScoresId / numWords
        self.prevKs.append(prevK)
        self.nextYs.append(bestScoresId - prevK * numWords)
        self.attn.append(attnOut.index_select(0, prevK))

        # End condition is when top-of-beam is EOS.
        if self.nextYs[-1][0] == onmt.Constants.EOS:
            self.done = True

        return self.done

    def sortBest(self):
        """Return ``(scores, ids)`` of the beam sorted by descending score."""
        return torch.sort(self.scores, 0, True)

    # Get the score of the best in the beam.
    def getBest(self):
        """Return ``(score, beam position)`` of the highest-scoring entry."""
        scores, ids = self.sortBest()
        # The sort is descending, so the best entry is at index 0. Index 1
        # (a leftover from 1-based indexing) returned the runner-up instead.
        return scores[0], ids[0]

    # Walk back to construct the full hypothesis.
    #
    # Parameters.
    #
    # * `k` - the position in the beam to construct.
    #
    # Returns.
    #
    # 1. The hypothesis
    # 2. The attention at each time step.
    def getHyp(self, k):
        hyp, attn = [], []
        # Follow the back-pointers from the last step to the first, then
        # reverse so the result reads in generation order.
        for j in range(len(self.prevKs) - 1, -1, -1):
            hyp.append(self.nextYs[j+1][k])
            attn.append(self.attn[j][k])
            k = self.prevKs[j][k]

        return hyp[::-1], torch.stack(attn[::-1])
self.action_head(x) 46 | state_values = self.value_head(x) 47 | return F.softmax(action_scores), state_values 48 | 49 | 50 | model = Policy() 51 | optimizer = optim.Adam(model.parameters(), lr=3e-2) 52 | 53 | 54 | def select_action(state): 55 | state = torch.from_numpy(state).float().unsqueeze(0) 56 | probs, state_value = model(Variable(state)) 57 | action = probs.multinomial() 58 | model.saved_actions.append(SavedAction(action, state_value)) 59 | return action.data 60 | 61 | 62 | def finish_episode(): 63 | R = 0 64 | saved_actions = model.saved_actions 65 | value_loss = 0 66 | rewards = [] 67 | for r in model.rewards[::-1]: 68 | R = r + args.gamma * R 69 | rewards.insert(0, R) 70 | rewards = torch.Tensor(rewards) 71 | rewards = (rewards - rewards.mean()) / (rewards.std() + np.finfo(np.float32).eps) 72 | for (action, value), r in zip(saved_actions, rewards): 73 | action.reinforce(r - value.data.squeeze()) 74 | value_loss += F.smooth_l1_loss(value, Variable(torch.Tensor([r]))) 75 | optimizer.zero_grad() 76 | final_nodes = [value_loss] + list(map(lambda p: p.action, saved_actions)) 77 | gradients = [torch.ones(1)] + [None] * len(saved_actions) 78 | autograd.backward(final_nodes, gradients) 79 | optimizer.step() 80 | del model.rewards[:] 81 | del model.saved_actions[:] 82 | 83 | 84 | running_reward = 10 85 | for i_episode in count(1): 86 | state = env.reset() 87 | for t in range(10000): # Don't infinite loop while learning 88 | action = select_action(state) 89 | state, reward, done, _ = env.step(action[0,0]) 90 | if args.render: 91 | env.render() 92 | model.rewards.append(reward) 93 | if done: 94 | break 95 | 96 | running_reward = running_reward * 0.99 + t * 0.01 97 | finish_episode() 98 | if i_episode % args.log_interval == 0: 99 | print('Episode {}\tLast length: {:5d}\tAverage length: {:.2f}'.format( 100 | i_episode, t, running_reward)) 101 | if running_reward > 200: 102 | print("Solved! 
Running reward is now {} and " 103 | "the last episode runs to {} time steps!".format(running_reward, t)) 104 | break 105 | -------------------------------------------------------------------------------- /OpenNMT/onmt/Dict.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class Dict(object): 5 | def __init__(self, data=None): 6 | self.idxToLabel = {} 7 | self.labelToIdx = {} 8 | self.frequencies = {} 9 | 10 | # Special entries will not be pruned. 11 | self.special = [] 12 | 13 | if data is not None: 14 | if type(data) == str: 15 | self.loadFile(data) 16 | else: 17 | self.addSpecials(data) 18 | 19 | def size(self): 20 | return len(self.idxToLabel) 21 | 22 | # Load entries from a file. 23 | def loadFile(self, filename): 24 | for line in open(filename): 25 | fields = line.split() 26 | label = fields[0] 27 | idx = int(fields[1]) 28 | self.add(label, idx) 29 | 30 | # Write entries to a file. 31 | def writeFile(self, filename): 32 | with open(filename, 'w') as file: 33 | for i in range(self.size()): 34 | label = self.idxToLabel[i] 35 | file.write('%s %d\n' % (label, i)) 36 | 37 | file.close() 38 | 39 | def lookup(self, key, default=None): 40 | try: 41 | return self.labelToIdx[key] 42 | except KeyError: 43 | return default 44 | 45 | def getLabel(self, idx, default=None): 46 | try: 47 | return self.idxToLabel[idx] 48 | except KeyError: 49 | return default 50 | 51 | # Mark this `label` and `idx` as special (i.e. will not be pruned). 52 | def addSpecial(self, label, idx=None): 53 | idx = self.add(label, idx) 54 | self.special += [idx] 55 | 56 | # Mark all labels in `labels` as specials (i.e. will not be pruned). 57 | def addSpecials(self, labels): 58 | for label in labels: 59 | self.addSpecial(label) 60 | 61 | # Add `label` in the dictionary. Use `idx` as its index if given. 
# NOTE: `add`, `prune` and `convertToIdx` below are methods of `Dict`
# (OpenNMT/onmt/Dict.py); the class header precedes this span.
def add(self, label, idx=None):
    """Register `label` in the dictionary and return its index.

    Args:
        label: the entry to add.
        idx: optional explicit index. When given, both mappings are
            overwritten for this (label, idx) pair. When omitted, the
            existing index of `label` is reused, or the next free index
            (`len(self.idxToLabel)`) is assigned.

    Returns:
        The index associated with `label`.

    Every call also increments the frequency counter of the returned index.
    """
    if idx is not None:
        self.idxToLabel[idx] = label
        self.labelToIdx[label] = idx
    elif label in self.labelToIdx:
        idx = self.labelToIdx[label]
    else:
        idx = len(self.idxToLabel)
        self.idxToLabel[idx] = label
        self.labelToIdx[label] = idx

    self.frequencies[idx] = self.frequencies.get(idx, 0) + 1

    return idx


def prune(self, size):
    """Return a dictionary holding the `size` most frequent entries.

    Args:
        size: number of most-frequent entries to keep.

    Returns:
        `self` unchanged when `size >= self.size()`; otherwise a new `Dict`
        in which the special entries are added first (they are kept in all
        cases), followed by the `size` most frequent labels.
    """
    if size >= self.size():
        return self

    # Only keep the `size` most frequent entries.
    # Indexing by range() assumes the indices are contiguous from 0.
    freq = torch.Tensor(
        [self.frequencies[i] for i in range(len(self.frequencies))])
    _, idx = torch.sort(freq, 0, True)  # dim 0, descending

    newDict = Dict()

    # Add special entries in all cases.
    for i in self.special:
        newDict.addSpecial(self.idxToLabel[i])

    # Convert to plain Python ints before the lookup: tensor elements are
    # not reliable keys for the int-keyed `idxToLabel` mapping.
    for i in idx[:size].tolist():
        newDict.add(self.idxToLabel[i])

    return newDict


def convertToIdx(self, labels, unkWord, bosWord=None, eosWord=None):
    """Convert `labels` to a `torch.LongTensor` of indices.

    Args:
        labels: iterable of labels to look up.
        unkWord: label whose index replaces any label that is not found.
        bosWord: optional label whose index is inserted at the beginning.
        eosWord: optional label whose index is appended at the end.

    Returns:
        A `torch.LongTensor` with one index per label, plus the optional
        begin/end markers.
    """
    vec = []

    if bosWord is not None:
        vec.append(self.lookup(bosWord))

    unk = self.lookup(unkWord)
    vec.extend(self.lookup(label, default=unk) for label in labels)

    if eosWord is not None:
        vec.append(self.lookup(eosWord))

    return torch.LongTensor(vec)

# Convert `idx` to labels. If index `stop` is reached, convert it and return.
119 | def convertToLabels(self, idx, stop): 120 | labels = [] 121 | 122 | for i in idx: 123 | labels += [self.getLabel(i)] 124 | if i == stop: 125 | break 126 | 127 | return labels 128 | -------------------------------------------------------------------------------- /OpenNMT/translate.py: -------------------------------------------------------------------------------- 1 | import onmt 2 | import torch 3 | import argparse 4 | import math 5 | 6 | parser = argparse.ArgumentParser(description='translate.py') 7 | 8 | parser.add_argument('-model', required=True, 9 | help='Path to model .pt file') 10 | parser.add_argument('-src', required=True, 11 | help='Source sequence to decode (one line per sequence)') 12 | parser.add_argument('-tgt', 13 | help='True target sequence (optional)') 14 | parser.add_argument('-output', default='pred.txt', 15 | help="""Path to output the predictions (each line will 16 | be the decoded sequence""") 17 | parser.add_argument('-beam_size', type=int, default=5, 18 | help='Beam size') 19 | parser.add_argument('-batch_size', type=int, default=30, 20 | help='Batch size') 21 | parser.add_argument('-max_sent_length', default=100, 22 | help='Maximum sentence length.') 23 | parser.add_argument('-replace_unk', action="store_true", 24 | help="""Replace the generated UNK tokens with the source 25 | token that had the highest attention weight. If phrase_table 26 | is provided, it will lookup the identified source token and 27 | give the corresponding target token. If it is not provided 28 | (or the identified source token does not exist in the 29 | table) then it will copy the source token""") 30 | # parser.add_argument('-phrase_table', 31 | # help="""Path to source-target dictionary to replace UNK 32 | # tokens. 
See README.md for the format of this file.""") 33 | parser.add_argument('-verbose', action="store_true", 34 | help='Print scores and predictions for each sentence') 35 | parser.add_argument('-n_best', type=int, default=1, 36 | help="""If verbose is set, will output the n_best 37 | decoded sentences""") 38 | 39 | parser.add_argument('-gpu', type=int, default=-1, 40 | help="Device to run on") 41 | 42 | 43 | 44 | def reportScore(name, scoreTotal, wordsTotal): 45 | print("%s AVG SCORE: %.4f, %s PPL: %.4f" % ( 46 | name, scoreTotal / wordsTotal, 47 | name, math.exp(-scoreTotal/wordsTotal))) 48 | 49 | 50 | def main(): 51 | opt = parser.parse_args() 52 | opt.cuda = opt.gpu > -1 53 | torch.cuda.set_device(opt.gpu) 54 | 55 | translator = onmt.Translator(opt) 56 | 57 | outF = open(opt.output, 'w') 58 | 59 | predScoreTotal, predWordsTotal, goldScoreTotal, goldWordsTotal = 0, 0, 0, 0 60 | 61 | srcBatch, tgtBatch = [], [] 62 | 63 | count = 0 64 | 65 | tgtF = open(opt.tgt) if opt.tgt else None 66 | for line in open(opt.src): 67 | 68 | srcTokens = line.split() 69 | srcBatch += [srcTokens] 70 | if tgtF: 71 | tgtTokens = tgtF.readline().split() if tgtF else None 72 | tgtBatch += [tgtTokens] 73 | 74 | if len(srcBatch) < opt.batch_size: 75 | continue 76 | 77 | predBatch, predScore, goldScore = translator.translate(srcBatch, tgtBatch) 78 | 79 | predScoreTotal += sum(score[0] for score in predScore) 80 | predWordsTotal += sum(len(x) for x in predBatch) 81 | if tgtF is not None: 82 | goldScoreTotal += sum(goldScore) 83 | goldWordsTotal += sum(len(x) for x in tgtBatch) 84 | 85 | for b in range(len(predBatch)): 86 | count += 1 87 | outF.write(" ".join(predBatch[b][0]) + '\n') 88 | 89 | if opt.verbose: 90 | print('SENT %d: %s' % (count, " ".join(srcBatch[b]))) 91 | print('PRED %d: %s' % (count, " ".join(predBatch[b][0]))) 92 | print("PRED SCORE: %.4f" % predScore[b][0]) 93 | 94 | if tgtF is not None: 95 | print('GOLD %d: %s ' % (count, " ".join(tgtBatch[b]))) 96 | print("GOLD SCORE: %.4f" 
% goldScore[b]) 97 | 98 | if opt.n_best > 1: 99 | print('\nBEST HYP:') 100 | for n in range(opt.n_best): 101 | print("[%.4f] %s" % (predScore[b][n], " ".join(predBatch[b][0]))) 102 | 103 | print('') 104 | 105 | srcBatch, tgtBatch = [], [] 106 | 107 | reportScore('PRED', predScoreTotal, predWordsTotal) 108 | if tgtF: 109 | reportScore('GOLD', goldScoreTotal, goldWordsTotal) 110 | 111 | if tgtF: 112 | tgtF.close() 113 | 114 | 115 | if __name__ == "__main__": 116 | main() 117 | -------------------------------------------------------------------------------- /snli/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import glob 4 | 5 | import torch 6 | import torch.optim as O 7 | import torch.nn as nn 8 | 9 | from torchtext import data 10 | from torchtext import datasets 11 | 12 | from model import SNLIClassifier 13 | from util import get_args 14 | 15 | 16 | args = get_args() 17 | torch.cuda.set_device(args.gpu) 18 | 19 | inputs = data.Field(lower=args.lower) 20 | answers = data.Field(sequential=False) 21 | 22 | train, dev, test = datasets.SNLI.splits(inputs, answers) 23 | 24 | inputs.build_vocab(train, dev, test) 25 | if args.word_vectors: 26 | if os.path.isfile(args.vector_cache): 27 | inputs.vocab.vectors = torch.load(args.vector_cache) 28 | else: 29 | inputs.vocab.load_vectors(wv_dir=args.data_cache, wv_type=args.word_vectors, wv_dim=args.d_embed) 30 | os.makedirs(os.path.dirname(args.vector_cache), exist_ok=True) 31 | torch.save(inputs.vocab.vectors, args.vector_cache) 32 | answers.build_vocab(train) 33 | 34 | train_iter, dev_iter, test_iter = data.BucketIterator.splits( 35 | (train, dev, test), batch_size=args.batch_size, device=args.gpu) 36 | 37 | config = args 38 | config.n_embed = len(inputs.vocab) 39 | config.d_out = len(answers.vocab) 40 | config.n_cells = config.n_layers 41 | if config.birnn: 42 | config.n_cells *= 2 43 | 44 | if args.resume_snapshot: 45 | model = 
torch.load(args.resume_snapshot, map_location=lambda storage, locatoin: storage.cuda(args.gpu)) 46 | else: 47 | model = SNLIClassifier(config) 48 | if args.word_vectors: 49 | model.embed.weight.data = inputs.vocab.vectors 50 | model.cuda() 51 | 52 | criterion = nn.CrossEntropyLoss() 53 | opt = O.Adam(model.parameters(), lr=args.lr) 54 | 55 | iterations = 0 56 | start = time.time() 57 | best_dev_acc = -1 58 | train_iter.repeat = False 59 | header = ' Time Epoch Iteration Progress (%Epoch) Loss Dev/Loss Accuracy Dev/Accuracy' 60 | dev_log_template = ' '.join('{:>6.0f},{:>5.0f},{:>9.0f},{:>5.0f}/{:<5.0f} {:>7.0f}%,{:>8.6f},{:8.6f},{:12.4f},{:12.4f}'.split(',')) 61 | log_template = ' '.join('{:>6.0f},{:>5.0f},{:>9.0f},{:>5.0f}/{:<5.0f} {:>7.0f}%,{:>8.6f},{},{:12.4f},{}'.split(',')) 62 | os.makedirs(args.save_path, exist_ok=True) 63 | print(header) 64 | 65 | for epoch in range(args.epochs): 66 | train_iter.init_epoch() 67 | n_correct, n_total = 0, 0 68 | for batch_idx, batch in enumerate(train_iter): 69 | model.train(); opt.zero_grad() 70 | iterations += 1 71 | answer = model(batch) 72 | n_correct += (torch.max(answer, 1)[1].view(batch.label.size()).data == batch.label.data).sum() 73 | n_total += batch.batch_size 74 | train_acc = 100. 
* n_correct/n_total 75 | loss = criterion(answer, batch.label) 76 | loss.backward(); opt.step() 77 | if iterations % args.save_every == 0: 78 | snapshot_prefix = os.path.join(args.save_path, 'snapshot') 79 | snapshot_path = snapshot_prefix + '_acc_{:.4f}_loss_{:.6f}_iter_{}_model.pt'.format(train_acc, loss.data[0], iterations) 80 | torch.save(model, snapshot_path) 81 | for f in glob.glob(snapshot_prefix + '*'): 82 | if f != snapshot_path: 83 | os.remove(f) 84 | if iterations % args.dev_every == 0: 85 | model.eval(); dev_iter.init_epoch() 86 | n_dev_correct, dev_loss = 0, 0 87 | for dev_batch_idx, dev_batch in enumerate(dev_iter): 88 | answer = model(dev_batch) 89 | n_dev_correct += (torch.max(answer, 1)[1].view(dev_batch.label.size()).data == dev_batch.label.data).sum() 90 | dev_loss = criterion(answer, dev_batch.label) 91 | dev_acc = 100. * n_dev_correct / len(dev) 92 | print(dev_log_template.format(time.time()-start, 93 | epoch, iterations, 1+batch_idx, len(train_iter), 94 | 100. * (1+batch_idx) / len(train_iter), loss.data[0], dev_loss.data[0], train_acc, dev_acc)) 95 | if dev_acc > best_dev_acc: 96 | best_dev_acc = dev_acc 97 | snapshot_prefix = os.path.join(args.save_path, 'best_snapshot') 98 | snapshot_path = snapshot_prefix + '_devacc_{}_devloss_{}__iter_{}_model.pt'.format(dev_acc, dev_loss.data[0], iterations) 99 | torch.save(model, snapshot_path) 100 | for f in glob.glob(snapshot_prefix + '*'): 101 | if f != snapshot_path: 102 | os.remove(f) 103 | elif iterations % args.log_every == 0: 104 | print(log_template.format(time.time()-start, 105 | epoch, iterations, 1+batch_idx, len(train_iter), 106 | 100. 
* (1+batch_idx) / len(train_iter), loss.data[0], ' '*8, n_correct/n_total*100, ' '*12)) 107 | 108 | 109 | -------------------------------------------------------------------------------- /mnist/main.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import argparse 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import torch.optim as optim 7 | from torchvision import datasets, transforms 8 | from torch.autograd import Variable 9 | 10 | # Training settings 11 | parser = argparse.ArgumentParser(description='PyTorch MNIST Example') 12 | parser.add_argument('--batch-size', type=int, default=64, metavar='N', 13 | help='input batch size for training (default: 64)') 14 | parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', 15 | help='input batch size for testing (default: 1000)') 16 | parser.add_argument('--epochs', type=int, default=10, metavar='N', 17 | help='number of epochs to train (default: 10)') 18 | parser.add_argument('--lr', type=float, default=0.01, metavar='LR', 19 | help='learning rate (default: 0.01)') 20 | parser.add_argument('--momentum', type=float, default=0.5, metavar='M', 21 | help='SGD momentum (default: 0.5)') 22 | parser.add_argument('--no-cuda', action='store_true', default=False, 23 | help='enables CUDA training') 24 | parser.add_argument('--seed', type=int, default=1, metavar='S', 25 | help='random seed (default: 1)') 26 | parser.add_argument('--log-interval', type=int, default=10, metavar='N', 27 | help='how many batches to wait before logging training status') 28 | args = parser.parse_args() 29 | args.cuda = not args.no_cuda and torch.cuda.is_available() 30 | 31 | torch.manual_seed(args.seed) 32 | if args.cuda: 33 | torch.cuda.manual_seed(args.seed) 34 | 35 | 36 | kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {} 37 | train_loader = torch.utils.data.DataLoader( 38 | 
datasets.MNIST('../data', train=True, download=True, 39 | transform=transforms.Compose([ 40 | transforms.ToTensor(), 41 | transforms.Normalize((0.1307,), (0.3081,)) 42 | ])), 43 | batch_size=args.batch_size, shuffle=True, **kwargs) 44 | test_loader = torch.utils.data.DataLoader( 45 | datasets.MNIST('../data', train=False, transform=transforms.Compose([ 46 | transforms.ToTensor(), 47 | transforms.Normalize((0.1307,), (0.3081,)) 48 | ])), 49 | batch_size=args.batch_size, shuffle=True, **kwargs) 50 | 51 | 52 | class Net(nn.Module): 53 | def __init__(self): 54 | super(Net, self).__init__() 55 | self.conv1 = nn.Conv2d(1, 10, kernel_size=5) 56 | self.conv2 = nn.Conv2d(10, 20, kernel_size=5) 57 | self.conv2_drop = nn.Dropout2d() 58 | self.fc1 = nn.Linear(320, 50) 59 | self.fc2 = nn.Linear(50, 10) 60 | 61 | def forward(self, x): 62 | x = F.relu(F.max_pool2d(self.conv1(x), 2)) 63 | x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2)) 64 | x = x.view(-1, 320) 65 | x = F.relu(self.fc1(x)) 66 | x = F.dropout(x, training=self.training) 67 | x = F.relu(self.fc2(x)) 68 | return F.log_softmax(x) 69 | 70 | model = Net() 71 | if args.cuda: 72 | model.cuda() 73 | 74 | optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum) 75 | 76 | def train(epoch): 77 | model.train() 78 | for batch_idx, (data, target) in enumerate(train_loader): 79 | if args.cuda: 80 | data, target = data.cuda(), target.cuda() 81 | data, target = Variable(data), Variable(target) 82 | optimizer.zero_grad() 83 | output = model(data) 84 | loss = F.nll_loss(output, target) 85 | loss.backward() 86 | optimizer.step() 87 | if batch_idx % args.log_interval == 0: 88 | print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( 89 | epoch, batch_idx * len(data), len(train_loader.dataset), 90 | 100. 
* batch_idx / len(train_loader), loss.data[0])) 91 | 92 | def test(epoch): 93 | model.eval() 94 | test_loss = 0 95 | correct = 0 96 | for data, target in test_loader: 97 | if args.cuda: 98 | data, target = data.cuda(), target.cuda() 99 | data, target = Variable(data, volatile=True), Variable(target) 100 | output = model(data) 101 | test_loss += F.nll_loss(output, target).data[0] 102 | pred = output.data.max(1)[1] # get the index of the max log-probability 103 | correct += pred.eq(target.data).cpu().sum() 104 | 105 | test_loss = test_loss 106 | test_loss /= len(test_loader) # loss function already averages over batch size 107 | print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( 108 | test_loss, correct, len(test_loader.dataset), 109 | 100. * correct / len(test_loader.dataset))) 110 | 111 | 112 | for epoch in range(1, args.epochs + 1): 113 | train(epoch) 114 | test(epoch) 115 | -------------------------------------------------------------------------------- /vae/main.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import argparse 3 | import torch 4 | import torch.utils.data 5 | import torch.nn as nn 6 | import torch.optim as optim 7 | from torch.autograd import Variable 8 | from torchvision import datasets, transforms 9 | 10 | parser = argparse.ArgumentParser(description='PyTorch MNIST Example') 11 | parser.add_argument('--batch-size', type=int, default=128, metavar='N', 12 | help='input batch size for training (default: 64)') 13 | parser.add_argument('--epochs', type=int, default=10, metavar='N', 14 | help='number of epochs to train (default: 2)') 15 | parser.add_argument('--no-cuda', action='store_true', default=False, 16 | help='enables CUDA training') 17 | parser.add_argument('--seed', type=int, default=1, metavar='S', 18 | help='random seed (default: 1)') 19 | parser.add_argument('--log-interval', type=int, default=10, metavar='N', 20 | help='how many 
batches to wait before logging training status') 21 | args = parser.parse_args() 22 | args.cuda = not args.no_cuda and torch.cuda.is_available() 23 | 24 | 25 | torch.manual_seed(args.seed) 26 | if args.cuda: 27 | torch.cuda.manual_seed(args.seed) 28 | 29 | 30 | kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {} 31 | train_loader = torch.utils.data.DataLoader( 32 | datasets.MNIST('../data', train=True, download=True, 33 | transform=transforms.ToTensor()), 34 | batch_size=args.batch_size, shuffle=True, **kwargs) 35 | test_loader = torch.utils.data.DataLoader( 36 | datasets.MNIST('../data', train=False, transform=transforms.ToTensor()), 37 | batch_size=args.batch_size, shuffle=True, **kwargs) 38 | 39 | 40 | class VAE(nn.Module): 41 | def __init__(self): 42 | super(VAE, self).__init__() 43 | 44 | self.fc1 = nn.Linear(784, 400) 45 | self.fc21 = nn.Linear(400, 20) 46 | self.fc22 = nn.Linear(400, 20) 47 | self.fc3 = nn.Linear(20, 400) 48 | self.fc4 = nn.Linear(400, 784) 49 | 50 | self.relu = nn.ReLU() 51 | self.sigmoid = nn.Sigmoid() 52 | 53 | def encode(self, x): 54 | h1 = self.relu(self.fc1(x)) 55 | return self.fc21(h1), self.fc22(h1) 56 | 57 | def reparametrize(self, mu, logvar): 58 | std = logvar.mul(0.5).exp_() 59 | if args.cuda: 60 | eps = torch.cuda.FloatTensor(std.size()).normal_() 61 | else: 62 | eps = torch.FloatTensor(std.size()).normal_() 63 | eps = Variable(eps) 64 | return eps.mul(std).add_(mu) 65 | 66 | def decode(self, z): 67 | h3 = self.relu(self.fc3(z)) 68 | return self.sigmoid(self.fc4(h3)) 69 | 70 | def forward(self, x): 71 | mu, logvar = self.encode(x.view(-1, 784)) 72 | z = self.reparametrize(mu, logvar) 73 | return self.decode(z), mu, logvar 74 | 75 | 76 | model = VAE() 77 | if args.cuda: 78 | model.cuda() 79 | 80 | reconstruction_function = nn.BCELoss() 81 | reconstruction_function.size_average = False 82 | 83 | 84 | def loss_function(recon_x, x, mu, logvar): 85 | BCE = reconstruction_function(recon_x, x) 86 | 87 | # see Appendix 
B from VAE paper: 88 | # Kingma and Welling. Auto-Encoding Variational Bayes. ICLR, 2014 89 | # https://arxiv.org/abs/1312.6114 90 | # 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2) 91 | KLD_element = mu.pow(2).add_(logvar.exp()).mul_(-1).add_(1).add_(logvar) 92 | KLD = torch.sum(KLD_element).mul_(-0.5) 93 | 94 | return BCE + KLD 95 | 96 | 97 | optimizer = optim.Adam(model.parameters(), lr=1e-3) 98 | 99 | 100 | def train(epoch): 101 | model.train() 102 | train_loss = 0 103 | for batch_idx, (data, _) in enumerate(train_loader): 104 | data = Variable(data) 105 | if args.cuda: 106 | data = data.cuda() 107 | optimizer.zero_grad() 108 | recon_batch, mu, logvar = model(data) 109 | loss = loss_function(recon_batch, data, mu, logvar) 110 | loss.backward() 111 | train_loss += loss.data[0] 112 | optimizer.step() 113 | if batch_idx % args.log_interval == 0: 114 | print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( 115 | epoch, batch_idx * len(data), len(train_loader.dataset), 116 | 100. * batch_idx / len(train_loader), 117 | loss.data[0] / len(data))) 118 | 119 | print('====> Epoch: {} Average loss: {:.4f}'.format( 120 | epoch, train_loss / len(train_loader.dataset))) 121 | 122 | 123 | def test(epoch): 124 | model.eval() 125 | test_loss = 0 126 | for data, _ in test_loader: 127 | if args.cuda: 128 | data = data.cuda() 129 | data = Variable(data, volatile=True) 130 | recon_batch, mu, logvar = model(data) 131 | test_loss += loss_function(recon_batch, data, mu, logvar).data[0] 132 | 133 | test_loss /= len(test_loader.dataset) 134 | print('====> Test set loss: {:.4f}'.format(test_loss)) 135 | 136 | 137 | for epoch in range(1, args.epochs + 1): 138 | train(epoch) 139 | test(epoch) 140 | -------------------------------------------------------------------------------- /OpenNMT/onmt/Models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.autograd import Variable 4 | import 
onmt.modules 5 | 6 | class Encoder(nn.Module): 7 | 8 | def __init__(self, opt, dicts): 9 | self.layers = opt.layers 10 | self.num_directions = 2 if opt.brnn else 1 11 | assert opt.rnn_size % self.num_directions == 0 12 | self.hidden_size = opt.rnn_size // self.num_directions 13 | inputSize = opt.word_vec_size 14 | 15 | super(Encoder, self).__init__() 16 | self.word_lut = nn.Embedding(dicts.size(), 17 | opt.word_vec_size, 18 | padding_idx=onmt.Constants.PAD) 19 | self.rnn = nn.LSTM(inputSize, self.hidden_size, 20 | num_layers=opt.layers, 21 | dropout=opt.dropout, 22 | bidirectional=opt.brnn) 23 | 24 | # self.rnn.bias_ih_l0.data.div_(2) 25 | # self.rnn.bias_hh_l0.data.copy_(self.rnn.bias_ih_l0.data) 26 | 27 | if opt.pre_word_vecs_enc is not None: 28 | pretrained = torch.load(opt.pre_word_vecs_enc) 29 | self.word_lut.weight.copy_(pretrained) 30 | 31 | def forward(self, input, hidden=None): 32 | batch_size = input.size(0) # batch first for multi-gpu compatibility 33 | emb = self.word_lut(input).transpose(0, 1) 34 | if hidden is None: 35 | h_size = (self.layers * self.num_directions, batch_size, self.hidden_size) 36 | h_0 = Variable(emb.data.new(*h_size).zero_(), requires_grad=False) 37 | c_0 = Variable(emb.data.new(*h_size).zero_(), requires_grad=False) 38 | hidden = (h_0, c_0) 39 | 40 | outputs, hidden_t = self.rnn(emb, hidden) 41 | return hidden_t, outputs 42 | 43 | 44 | class StackedLSTM(nn.Module): 45 | def __init__(self, num_layers, input_size, rnn_size, dropout): 46 | super(StackedLSTM, self).__init__() 47 | self.dropout = nn.Dropout(dropout) 48 | self.num_layers = num_layers 49 | 50 | for i in range(num_layers): 51 | layer = nn.LSTMCell(input_size, rnn_size) 52 | self.add_module('layer_%d' % i, layer) 53 | input_size = rnn_size 54 | 55 | def forward(self, input, hidden): 56 | h_0, c_0 = hidden 57 | h_1, c_1 = [], [] 58 | for i in range(self.num_layers): 59 | layer = getattr(self, 'layer_%d' % i) 60 | h_1_i, c_1_i = layer(input, (h_0[i], c_0[i])) 61 | input = 
h_1_i 62 | if i != self.num_layers: 63 | input = self.dropout(input) 64 | h_1 += [h_1_i] 65 | c_1 += [c_1_i] 66 | 67 | h_1 = torch.stack(h_1) 68 | c_1 = torch.stack(c_1) 69 | 70 | return input, (h_1, c_1) 71 | 72 | 73 | class Decoder(nn.Module): 74 | 75 | def __init__(self, opt, dicts): 76 | self.layers = opt.layers 77 | self.input_feed = opt.input_feed 78 | input_size = opt.word_vec_size 79 | if self.input_feed: 80 | input_size += opt.rnn_size 81 | 82 | super(Decoder, self).__init__() 83 | self.word_lut = nn.Embedding(dicts.size(), 84 | opt.word_vec_size, 85 | padding_idx=onmt.Constants.PAD) 86 | self.rnn = StackedLSTM(opt.layers, input_size, opt.rnn_size, opt.dropout) 87 | self.attn = onmt.modules.GlobalAttention(opt.rnn_size) 88 | self.dropout = nn.Dropout(opt.dropout) 89 | 90 | # self.rnn.bias_ih.data.div_(2) 91 | # self.rnn.bias_hh.data.copy_(self.rnn.bias_ih.data) 92 | 93 | self.hidden_size = opt.rnn_size 94 | 95 | if opt.pre_word_vecs_enc is not None: 96 | pretrained = torch.load(opt.pre_word_vecs_dec) 97 | self.word_lut.weight.copy_(pretrained) 98 | 99 | 100 | def forward(self, input, hidden, context, init_output): 101 | emb = self.word_lut(input).transpose(0, 1) 102 | 103 | batch_size = input.size(0) 104 | 105 | h_size = (batch_size, self.hidden_size) 106 | output = Variable(emb.data.new(*h_size).zero_(), requires_grad=False) 107 | 108 | # n.b. 
you can increase performance if you compute W_ih * x for all 109 | # iterations in parallel, but that's only possible if 110 | # self.input_feed=False 111 | outputs = [] 112 | output = init_output 113 | for i, emb_t in enumerate(emb.chunk(emb.size(0), dim=0)): 114 | emb_t = emb_t.squeeze(0) 115 | if self.input_feed: 116 | emb_t = torch.cat([emb_t, output], 1) 117 | 118 | output, h = self.rnn(emb_t, hidden) 119 | output, attn = self.attn(output, context.t()) 120 | output = self.dropout(output) 121 | outputs += [output] 122 | 123 | outputs = torch.stack(outputs) 124 | return outputs.transpose(0, 1), h, attn 125 | 126 | 127 | class NMTModel(nn.Module): 128 | 129 | def __init__(self, encoder, decoder, generator): 130 | super(NMTModel, self).__init__() 131 | self.encoder = encoder 132 | self.decoder = decoder 133 | self.generator = generator 134 | self.generate = False 135 | 136 | def set_generate(self, enabled): 137 | self.generate = enabled 138 | 139 | def make_init_decoder_output(self, context): 140 | batch_size = context.size(1) 141 | h_size = (batch_size, self.decoder.hidden_size) 142 | return Variable(context.data.new(*h_size).zero_(), requires_grad=False) 143 | 144 | def _fix_enc_hidden(self, h): 145 | # the encoder hidden is (layers*directions) x batch x dim 146 | # we need to convert it to layers x batch x (directions*dim) 147 | if self.encoder.num_directions == 2: 148 | return h.view(h.size(0) // 2, 2, h.size(1), h.size(2)) \ 149 | .transpose(1, 2).contiguous() \ 150 | .view(h.size(0) // 2, h.size(1), h.size(2) * 2) 151 | else: 152 | return h 153 | 154 | def forward(self, input): 155 | src = input[0] 156 | tgt = input[1][:, :-1] # exclude last target from inputs 157 | enc_hidden, context = self.encoder(src) 158 | init_output = self.make_init_decoder_output(context) 159 | 160 | enc_hidden = (self._fix_enc_hidden(enc_hidden[0]), 161 | self._fix_enc_hidden(enc_hidden[1])) 162 | 163 | out, dec_hidden, _attn = self.decoder(tgt, enc_hidden, context, init_output) 164 
| if self.generate: 165 | out = self.generator(out) 166 | 167 | return out 168 | -------------------------------------------------------------------------------- /OpenNMT/preprocess.py: -------------------------------------------------------------------------------- 1 | import onmt 2 | 3 | import argparse 4 | import torch 5 | 6 | parser = argparse.ArgumentParser(description='preprocess.lua') 7 | 8 | ## 9 | ## **Preprocess Options** 10 | ## 11 | 12 | parser.add_argument('-config', help="Read options from this file") 13 | 14 | parser.add_argument('-train_src', required=True, 15 | help="Path to the training source data") 16 | parser.add_argument('-train_tgt', required=True, 17 | help="Path to the training target data") 18 | parser.add_argument('-valid_src', required=True, 19 | help="Path to the validation source data") 20 | parser.add_argument('-valid_tgt', required=True, 21 | help="Path to the validation target data") 22 | 23 | parser.add_argument('-save_data', required=True, 24 | help="Output file for the prepared data") 25 | 26 | parser.add_argument('-src_vocab_size', type=int, default=50000, 27 | help="Size of the source vocabulary") 28 | parser.add_argument('-tgt_vocab_size', type=int, default=50000, 29 | help="Size of the target vocabulary") 30 | parser.add_argument('-src_vocab', 31 | help="Path to an existing source vocabulary") 32 | parser.add_argument('-tgt_vocab', 33 | help="Path to an existing target vocabulary") 34 | 35 | 36 | parser.add_argument('-seq_length', type=int, default=50, 37 | help="Maximum sequence length") 38 | parser.add_argument('-shuffle', type=int, default=1, 39 | help="Shuffle data") 40 | parser.add_argument('-seed', type=int, default=3435, 41 | help="Random seed") 42 | 43 | parser.add_argument('-report_every', type=int, default=100000, 44 | help="Report status every this many sentences") 45 | 46 | opt = parser.parse_args() 47 | 48 | 49 | def makeVocabulary(filename, size): 50 | vocab = onmt.Dict([onmt.Constants.PAD_WORD, 
onmt.Constants.UNK_WORD, 51 | onmt.Constants.BOS_WORD, onmt.Constants.EOS_WORD]) 52 | 53 | with open(filename) as f: 54 | for sent in f.readlines(): 55 | for word in sent.split(): 56 | vocab.add(word) 57 | 58 | originalSize = vocab.size() 59 | vocab = vocab.prune(size) 60 | print('Created dictionary of size %d (pruned from %d)' % 61 | (vocab.size(), originalSize)) 62 | 63 | return vocab 64 | 65 | 66 | def initVocabulary(name, dataFile, vocabFile, vocabSize): 67 | 68 | vocab = None 69 | if vocabFile is not None: 70 | # If given, load existing word dictionary. 71 | print('Reading ' + name + ' vocabulary from \'' + vocabFile + '\'...') 72 | vocab = onmt.Dict() 73 | vocab.loadFile(vocabFile) 74 | print('Loaded ' + vocab.size() + ' ' + name + ' words') 75 | 76 | if vocab is None: 77 | # If a dictionary is still missing, generate it. 78 | print('Building ' + name + ' vocabulary...') 79 | genWordVocab = makeVocabulary(dataFile, vocabSize) 80 | 81 | vocab = genWordVocab 82 | 83 | print() 84 | return vocab 85 | 86 | 87 | def saveVocabulary(name, vocab, file): 88 | print('Saving ' + name + ' vocabulary to \'' + file + '\'...') 89 | vocab.writeFile(file) 90 | 91 | 92 | def makeData(srcFile, tgtFile, srcDicts, tgtDicts): 93 | src, tgt = [], [] 94 | sizes = [] 95 | count, ignored = 0, 0 96 | 97 | print('Processing %s & %s ...' 
% (srcFile, tgtFile)) 98 | srcF = open(srcFile) 99 | tgtF = open(tgtFile) 100 | 101 | while True: 102 | srcWords = srcF.readline().split() 103 | tgtWords = tgtF.readline().split() 104 | 105 | if not srcWords or not tgtWords: 106 | if srcWords and not tgtWords or not srcWords and tgtWords: 107 | print('WARNING: source and target do not have the same number of sentences') 108 | break 109 | 110 | if len(srcWords) <= opt.seq_length and len(tgtWords) <= opt.seq_length: 111 | 112 | src += [srcDicts.convertToIdx(srcWords, 113 | onmt.Constants.UNK_WORD)] 114 | tgt += [tgtDicts.convertToIdx(tgtWords, 115 | onmt.Constants.UNK_WORD, 116 | onmt.Constants.BOS_WORD, 117 | onmt.Constants.EOS_WORD)] 118 | 119 | sizes += [len(srcWords)] 120 | else: 121 | ignored += 1 122 | 123 | count += 1 124 | 125 | if count % opt.report_every == 0: 126 | print('... %d sentences prepared' % count) 127 | 128 | srcF.close() 129 | tgtF.close() 130 | 131 | if opt.shuffle == 1: 132 | print('... shuffling sentences') 133 | perm = torch.randperm(len(src)) 134 | src = [src[idx] for idx in perm] 135 | tgt = [tgt[idx] for idx in perm] 136 | sizes = [sizes[idx] for idx in perm] 137 | 138 | print('... 
sorting sentences by size') 139 | _, perm = torch.sort(torch.Tensor(sizes)) 140 | src = [src[idx] for idx in perm] 141 | tgt = [tgt[idx] for idx in perm] 142 | 143 | print('Prepared %d sentences (%d ignored due to length == 0 or > %d)' % 144 | (len(src), ignored, opt.seq_length)) 145 | 146 | return src, tgt 147 | 148 | 149 | def main(): 150 | 151 | dicts = {} 152 | dicts['src'] = initVocabulary('source', opt.train_src, opt.src_vocab, 153 | opt.src_vocab_size) 154 | dicts['tgt'] = initVocabulary('target', opt.train_tgt, opt.tgt_vocab, 155 | opt.tgt_vocab_size) 156 | 157 | print('Preparing training ...') 158 | train = {} 159 | train['src'], train['tgt'] = makeData(opt.train_src, opt.train_tgt, 160 | dicts['src'], dicts['tgt']) 161 | 162 | print('Preparing validation ...') 163 | valid = {} 164 | valid['src'], valid['tgt'] = makeData(opt.valid_src, opt.valid_tgt, 165 | dicts['src'], dicts['tgt']) 166 | 167 | if opt.src_vocab is None: 168 | saveVocabulary('source', dicts['src'], opt.save_data + '.src.dict') 169 | if opt.tgt_vocab is None: 170 | saveVocabulary('target', dicts['tgt'], opt.save_data + '.tgt.dict') 171 | 172 | 173 | print('Saving data to \'' + opt.save_data + '-train.pt\'...') 174 | save_data = {'dicts': dicts, 175 | 'train': train, 176 | 'valid': valid} 177 | torch.save(save_data, opt.save_data + '-train.pt') 178 | 179 | 180 | if __name__ == "__main__": 181 | main() 182 | -------------------------------------------------------------------------------- /word_language_model/main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import time 3 | import math 4 | import torch 5 | import torch.nn as nn 6 | from torch.autograd import Variable 7 | 8 | import data 9 | import model 10 | 11 | parser = argparse.ArgumentParser(description='PyTorch PennTreeBank RNN/LSTM Language Model') 12 | parser.add_argument('--data', type=str, default='./data/penn', 13 | help='location of the data corpus') 14 | 
# Command-line options for the word-level language model script.
# Model architecture options.
parser.add_argument('--model', type=str, default='LSTM',
                    help='type of recurrent net (RNN_TANH, RNN_RELU, LSTM, GRU)')
parser.add_argument('--emsize', type=int, default=200,
                    help='size of word embeddings')
parser.add_argument('--nhid', type=int, default=200,
                    help='number of hidden units per layer')
parser.add_argument('--nlayers', type=int, default=2,
                    help='number of layers')
# Optimization options.
parser.add_argument('--lr', type=float, default=20,
                    help='initial learning rate')
parser.add_argument('--clip', type=float, default=0.5,
                    help='gradient clipping')
parser.add_argument('--epochs', type=int, default=6,
                    help='upper epoch limit')
parser.add_argument('--batch-size', type=int, default=20, metavar='N',
                    help='batch size')
parser.add_argument('--bptt', type=int, default=20,
                    help='sequence length')
# Runtime options.
parser.add_argument('--seed', type=int, default=1111,
                    help='random seed')
parser.add_argument('--cuda', action='store_true',
                    help='use CUDA')
parser.add_argument('--log-interval', type=int, default=200, metavar='N',
                    help='report interval')
parser.add_argument('--save', type=str, default='model.pt',
                    help='path to save the final model')
# Parsed once at import time; the rest of the script reads options from `args`.
args = parser.parse_args()

# Set the random seed manually for reproducibility.
torch.manual_seed(args.seed)
if torch.cuda.is_available():
    if not args.cuda:
        print("WARNING: You have a CUDA device, so you should probably run with --cuda")
    else:
        # Seed the CUDA generator too, so GPU runs use the same seed.
        torch.cuda.manual_seed(args.seed)

###############################################################################
# Load data
###############################################################################

corpus = data.Corpus(args.data)


def batchify(data, bsz):
    """Arrange a token tensor into ``bsz`` parallel columns.

    Trailing elements that do not fill a whole row of ``bsz`` items are
    dropped.  The remainder is viewed as ``bsz`` rows and transposed, so the
    result has ``data.size(0) // bsz`` rows and ``bsz`` columns.  The result
    is moved to the GPU when ``--cuda`` was given.

    NOTE(review): assumes ``data`` is a 1-D tensor of token ids -- confirm
    against ``data.Corpus``.  The parameter name shadows the imported ``data``
    module inside this function only.
    """
    nbatch = data.size(0) // bsz
    # Trim off the remainder that would not cleanly fit.
    data = data.narrow(0, 0, nbatch * bsz)
    # Split evenly across the bsz columns.
    data = data.view(bsz, -1).t().contiguous()
    if args.cuda:
        data = data.cuda()
    return data


eval_batch_size = 10
train_data = batchify(corpus.train, args.batch_size)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)

###############################################################################
# Build the model
###############################################################################

ntokens = len(corpus.dictionary)
# NOTE: this rebinds the name `model` from the imported module to the network
# instance; the module is not reachable under that name afterwards.
model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers)
if args.cuda:
    model.cuda()

criterion = nn.CrossEntropyLoss()

###############################################################################
# Training code
###############################################################################


def repackage_hidden(h):
    """Wraps hidden states in new Variables, to detach them from their history.

    ``h`` is either a single ``Variable`` or an iterable of hidden states;
    iterables are processed recursively and returned as a tuple.
    """
    # isinstance (rather than an exact type comparison) also accepts
    # Variable subclasses instead of trying to iterate over them.
    if isinstance(h, Variable):
        return Variable(h.data)
    return tuple(repackage_hidden(v) for v in h)


def get_batch(source, i, evaluation=False):
    """Return the ``(data, target)`` pair for the chunk starting at row ``i``.

    The chunk is at most ``args.bptt`` rows long and is shortened near the end
    of ``source`` so the targets, which are the same rows shifted by one, stay
    in range.  ``target`` is flattened to 1-D.  ``evaluation`` is forwarded as
    the ``volatile`` flag of the input ``Variable``.
    """
    seq_len = min(args.bptt, len(source) - 1 - i)
    data = Variable(source[i:i+seq_len], volatile=evaluation)
    target = Variable(source[i+1:i+1+seq_len].view(-1))
    return data, target


def evaluate(data_source):
    """Run the model over ``data_source`` and return the averaged loss.

    The source is walked in chunks of ``args.bptt`` rows.  Each chunk's
    criterion value is weighted by the chunk length, and the accumulated sum
    is divided by ``len(data_source)``.  Batches are requested with
    ``evaluation=True`` and the hidden state is repackaged after every chunk.
    """
    vocab_size = len(corpus.dictionary)
    state = model.init_hidden(eval_batch_size)
    loss_sum = 0
    last_start = data_source.size(0) - 1
    for offset in range(0, last_start, args.bptt):
        inputs, targets = get_batch(data_source, offset, evaluation=True)
        logits, state = model(inputs, state)
        chunk_loss = criterion(logits.view(-1, vocab_size), targets).data
        loss_sum += len(inputs) * chunk_loss
        state = repackage_hidden(state)
    return loss_sum[0] / len(data_source)


def train():
    """Train for one pass over ``train_data`` and log progress periodically.

    Reads ``lr`` and ``epoch`` from module scope.  For every chunk the hidden
    state is detached, gradients are cleared, and the loss is backpropagated.
    The gradient norm is then clipped with ``args.clip`` and each parameter is
    updated in place by adding ``-lr`` times its gradient.  Every
    ``args.log_interval`` batches (except batch 0) the running loss is printed
    and reset.
    """
    vocab_size = len(corpus.dictionary)
    state = model.init_hidden(args.batch_size)
    running_loss = 0
    tick = time.time()
    offsets = range(0, train_data.size(0) - 1, args.bptt)
    for batch, offset in enumerate(offsets):
        inputs, targets = get_batch(train_data, offset)
        # Detach the carried-over hidden state before the new forward pass.
        state = repackage_hidden(state)
        model.zero_grad()
        logits, state = model(inputs, state)
        loss = criterion(logits.view(-1, vocab_size), targets)
        loss.backward()

        torch.nn.utils.clip_grad_norm(model.parameters(), args.clip)
        for param in model.parameters():
            param.data.add_(-lr, param.grad.data)

        running_loss += loss.data

        # Only report on non-zero multiples of the logging interval.
        if batch == 0 or batch % args.log_interval != 0:
            continue

        cur_loss = running_loss[0] / args.log_interval
        elapsed = time.time() - tick
        total_batches = len(train_data) // args.bptt
        ms_per_batch = elapsed * 1000 / args.log_interval
        print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
              'loss {:5.2f} | ppl {:8.2f}'.format(
                  epoch, batch, total_batches, lr,
                  ms_per_batch, cur_loss, math.exp(cur_loss)))
        running_loss = 0
        tick = time.time()


# Loop over epochs.
143 | lr = args.lr 144 | prev_val_loss = None 145 | for epoch in range(1, args.epochs+1): 146 | epoch_start_time = time.time() 147 | train() 148 | val_loss = evaluate(val_data) 149 | print('-' * 89) 150 | print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | ' 151 | 'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time), 152 | val_loss, math.exp(val_loss))) 153 | print('-' * 89) 154 | # Anneal the learning rate. 155 | if prev_val_loss and val_loss > prev_val_loss: 156 | lr /= 4 157 | prev_val_loss = val_loss 158 | 159 | 160 | # Run on test data and save the model. 161 | test_loss = evaluate(test_data) 162 | print('=' * 89) 163 | print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format( 164 | test_loss, math.exp(test_loss))) 165 | print('=' * 89) 166 | if args.save != '': 167 | with open(args.save, 'wb') as f: 168 | torch.save(model, f) 169 | -------------------------------------------------------------------------------- /OpenNMT/onmt/Translator.py: -------------------------------------------------------------------------------- 1 | import onmt 2 | import torch 3 | from torch.autograd import Variable 4 | 5 | 6 | class Translator(object): 7 | def __init__(self, opt): 8 | self.opt = opt 9 | self.tt = torch.cuda if opt.cuda else torch 10 | 11 | checkpoint = torch.load(opt.model) 12 | self.model = checkpoint['model'] 13 | 14 | self.model.eval() 15 | 16 | if opt.cuda: 17 | self.model.cuda() 18 | else: 19 | self.model.cpu() 20 | 21 | self.src_dict = checkpoint['dicts']['src'] 22 | self.tgt_dict = checkpoint['dicts']['tgt'] 23 | 24 | def buildData(self, srcBatch, goldBatch): 25 | srcData = [self.src_dict.convertToIdx(b, 26 | onmt.Constants.UNK_WORD) for b in srcBatch] 27 | tgtData = None 28 | if goldBatch: 29 | tgtData = [self.tgt_dict.convertToIdx(b, 30 | onmt.Constants.UNK_WORD, 31 | onmt.Constants.BOS_WORD, 32 | onmt.Constants.EOS_WORD) for b in goldBatch] 33 | 34 | return onmt.Dataset(srcData, tgtData, 35 | self.opt.batch_size, 
self.opt.cuda) 36 | 37 | def buildTargetTokens(self, pred, src, attn): 38 | tokens = self.tgt_dict.convertToLabels(pred, onmt.Constants.EOS) 39 | tokens = tokens[:-1] # EOS 40 | if self.opt.replace_unk: 41 | for i in range(len(tokens)): 42 | if tokens[i] == onmt.Constants.UNK_WORD: 43 | _, maxIndex = attn[i].max(0) 44 | # FIXME phrase table 45 | tokens[i] = src[maxIndex[0]] 46 | 47 | return tokens 48 | 49 | def translateBatch(self, batch): 50 | srcBatch, tgtBatch = batch 51 | batchSize = srcBatch.size(0) 52 | beamSize = self.opt.beam_size 53 | 54 | # (1) run the encoder on the src 55 | 56 | # have to execute the encoder manually to deal with padding 57 | encStates = None 58 | context = [] 59 | for srcBatch_t in srcBatch.chunk(srcBatch.size(1), dim=1): 60 | encStates, context_t = self.model.encoder(srcBatch_t, hidden=encStates) 61 | batchPadIdx = srcBatch_t.data.squeeze(1).eq(onmt.Constants.PAD).nonzero() 62 | if batchPadIdx.nelement() > 0: 63 | batchPadIdx = batchPadIdx.squeeze(1) 64 | encStates[0].data.index_fill_(1, batchPadIdx, 0) 65 | encStates[1].data.index_fill_(1, batchPadIdx, 0) 66 | context += [context_t] 67 | 68 | encStates = (self.model._fix_enc_hidden(encStates[0]), 69 | self.model._fix_enc_hidden(encStates[1])) 70 | 71 | context = torch.cat(context) 72 | rnnSize = context.size(2) 73 | 74 | # This mask is applied to the attention model inside the decoder 75 | # so that the attention ignores source padding 76 | padMask = srcBatch.data.eq(onmt.Constants.PAD) 77 | def applyContextMask(m): 78 | if isinstance(m, onmt.modules.GlobalAttention): 79 | m.applyMask(padMask) 80 | 81 | # (2) if a target is specified, compute the 'goldScore' 82 | # (i.e. 
log likelihood) of the target under the model 83 | goldScores = context.data.new(batchSize).zero_() 84 | if tgtBatch is not None: 85 | decStates = encStates 86 | decOut = self.model.make_init_decoder_output(context) 87 | self.model.decoder.apply(applyContextMask) 88 | initOutput = self.model.make_init_decoder_output(context) 89 | 90 | decOut, decStates, attn = self.model.decoder( 91 | tgtBatch[:, :-1], decStates, context, initOutput) 92 | for dec_t, tgt_t in zip(decOut.transpose(0, 1), tgtBatch.transpose(0, 1)[1:].data): 93 | gen_t = self.model.generator.forward(dec_t) 94 | tgt_t = tgt_t.unsqueeze(1) 95 | scores = gen_t.data.gather(1, tgt_t) 96 | scores.masked_fill_(tgt_t.eq(onmt.Constants.PAD), 0) 97 | goldScores += scores 98 | 99 | # (3) run the decoder to generate sentences, using beam search 100 | 101 | # Expand tensors for each beam. 102 | context = Variable(context.data.repeat(1, beamSize, 1)) 103 | decStates = (Variable(encStates[0].data.repeat(1, beamSize, 1)), 104 | Variable(encStates[1].data.repeat(1, beamSize, 1))) 105 | 106 | beam = [onmt.Beam(beamSize, self.opt.cuda) for k in range(batchSize)] 107 | 108 | decOut = self.model.make_init_decoder_output(context) 109 | 110 | padMask = srcBatch.data.eq(onmt.Constants.PAD).unsqueeze(0).repeat(beamSize, 1, 1) 111 | 112 | batchIdx = list(range(batchSize)) 113 | remainingSents = batchSize 114 | for i in range(self.opt.max_sent_length): 115 | 116 | self.model.decoder.apply(applyContextMask) 117 | 118 | # Prepare decoder input. 
119 | input = torch.stack([b.getCurrentState() for b in beam 120 | if not b.done]).t().contiguous().view(1, -1) 121 | 122 | decOut, decStates, attn = self.model.decoder( 123 | Variable(input).transpose(0, 1), decStates, context, decOut) 124 | # decOut: 1 x (beam*batch) x numWords 125 | decOut = decOut.transpose(0, 1).squeeze(0) 126 | out = self.model.generator.forward(decOut) 127 | 128 | # batch x beam x numWords 129 | wordLk = out.view(beamSize, remainingSents, -1).transpose(0, 1).contiguous() 130 | attn = attn.view(beamSize, remainingSents, -1).transpose(0, 1).contiguous() 131 | 132 | active = [] 133 | for b in range(batchSize): 134 | if beam[b].done: 135 | continue 136 | 137 | idx = batchIdx[b] 138 | if not beam[b].advance(wordLk.data[idx], attn.data[idx]): 139 | active += [b] 140 | 141 | for decState in decStates: # iterate over h, c 142 | # layers x beam*sent x dim 143 | sentStates = decState.view( 144 | -1, beamSize, remainingSents, decState.size(2))[:, :, idx] 145 | sentStates.data.copy_( 146 | sentStates.data.index_select(1, beam[b].getCurrentOrigin())) 147 | 148 | if not active: 149 | break 150 | 151 | # in this section, the sentences that are still active are 152 | # compacted so that the decoder is not run on completed sentences 153 | activeIdx = self.tt.LongTensor([batchIdx[k] for k in active]) 154 | batchIdx = {beam: idx for idx, beam in enumerate(active)} 155 | 156 | def updateActive(t): 157 | # select only the remaining active sentences 158 | view = t.data.view(-1, remainingSents, rnnSize) 159 | newSize = list(t.size()) 160 | newSize[-2] = newSize[-2] * len(activeIdx) // remainingSents 161 | return Variable(view.index_select(1, activeIdx) \ 162 | .view(*newSize)) 163 | 164 | decStates = (updateActive(decStates[0]), updateActive(decStates[1])) 165 | decOut = updateActive(decOut) 166 | context = updateActive(context) 167 | padMask = padMask.index_select(1, activeIdx) 168 | 169 | remainingSents = len(active) 170 | 171 | # (4) package everything up 172 | 
173 | allHyp, allScores, allAttn = [], [], [] 174 | n_best = self.opt.n_best 175 | 176 | for b in range(batchSize): 177 | scores, ks = beam[b].sortBest() 178 | 179 | allScores += [scores[:n_best]] 180 | valid_attn = srcBatch.transpose(0, 1).data[:, b].ne(onmt.Constants.PAD).nonzero().squeeze(1) 181 | hyps, attn = zip(*[beam[b].getHyp(k) for k in ks[:n_best]]) 182 | attn = [a.index_select(1, valid_attn) for a in attn] 183 | allHyp += [hyps] 184 | allAttn += [attn] 185 | 186 | return allHyp, allScores, allAttn, goldScores 187 | 188 | def translate(self, srcBatch, goldBatch): 189 | # (1) convert words to indexes 190 | dataset = self.buildData(srcBatch, goldBatch) 191 | batch = dataset[0] 192 | batch = [x.transpose(0, 1) for x in batch] 193 | 194 | # (2) translate 195 | pred, predScore, attn, goldScore = self.translateBatch(batch) 196 | 197 | # (3) convert indexes to words 198 | predBatch = [] 199 | for b in range(batch[0].size(0)): 200 | predBatch.append( 201 | [self.buildTargetTokens(pred[b][n], srcBatch[b], attn[b][n]) 202 | for n in range(self.opt.n_best)] 203 | ) 204 | 205 | return predBatch, predScore, goldScore 206 | -------------------------------------------------------------------------------- /dcgan/main.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import argparse 3 | import os 4 | import random 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.parallel 8 | import torch.backends.cudnn as cudnn 9 | import torch.optim as optim 10 | import torch.utils.data 11 | import torchvision.datasets as dset 12 | import torchvision.transforms as transforms 13 | import torchvision.utils as vutils 14 | from torch.autograd import Variable 15 | 16 | 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--dataset', required=True, help='cifar10 | lsun | imagenet | folder | lfw ') 19 | parser.add_argument('--dataroot', required=True, help='path to dataset') 20 | 
parser.add_argument('--workers', type=int, help='number of data loading workers', default=2) 21 | parser.add_argument('--batchSize', type=int, default=64, help='input batch size') 22 | parser.add_argument('--imageSize', type=int, default=64, help='the height / width of the input image to network') 23 | parser.add_argument('--nz', type=int, default=100, help='size of the latent z vector') 24 | parser.add_argument('--ngf', type=int, default=64) 25 | parser.add_argument('--ndf', type=int, default=64) 26 | parser.add_argument('--niter', type=int, default=25, help='number of epochs to train for') 27 | parser.add_argument('--lr', type=float, default=0.0002, help='learning rate, default=0.0002') 28 | parser.add_argument('--beta1', type=float, default=0.5, help='beta1 for adam. default=0.5') 29 | parser.add_argument('--cuda' , action='store_true', help='enables cuda') 30 | parser.add_argument('--ngpu' , type=int, default=1, help='number of GPUs to use') 31 | parser.add_argument('--netG', default='', help="path to netG (to continue training)") 32 | parser.add_argument('--netD', default='', help="path to netD (to continue training)") 33 | parser.add_argument('--outf', default='.', help='folder to output images and model checkpoints') 34 | 35 | opt = parser.parse_args() 36 | print(opt) 37 | 38 | try: 39 | os.makedirs(opt.outf) 40 | except OSError: 41 | pass 42 | opt.manualSeed = random.randint(1, 10000) # fix seed 43 | print("Random Seed: ", opt.manualSeed) 44 | random.seed(opt.manualSeed) 45 | torch.manual_seed(opt.manualSeed) 46 | 47 | cudnn.benchmark = True 48 | 49 | if torch.cuda.is_available() and not opt.cuda: 50 | print("WARNING: You have a CUDA device, so you should probably run with --cuda") 51 | 52 | if opt.dataset in ['imagenet', 'folder', 'lfw']: 53 | # folder dataset 54 | dataset = dset.ImageFolder(root=opt.dataroot, 55 | transform=transforms.Compose([ 56 | transforms.Scale(opt.imageSize), 57 | transforms.CenterCrop(opt.imageSize), 58 | transforms.ToTensor(), 59 
| transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), 60 | ])) 61 | elif opt.dataset == 'lsun': 62 | dataset = dset.LSUN(db_path=opt.dataroot, classes=['bedroom_train'], 63 | transform=transforms.Compose([ 64 | transforms.Scale(opt.imageSize), 65 | transforms.CenterCrop(opt.imageSize), 66 | transforms.ToTensor(), 67 | transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), 68 | ])) 69 | elif opt.dataset == 'cifar10': 70 | dataset = dset.CIFAR10(root=opt.dataroot, download=True, 71 | transform=transforms.Compose([ 72 | transforms.Scale(opt.imageSize), 73 | transforms.ToTensor(), 74 | transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), 75 | ]) 76 | ) 77 | assert dataset 78 | dataloader = torch.utils.data.DataLoader(dataset, batch_size=opt.batchSize, 79 | shuffle=True, num_workers=int(opt.workers)) 80 | 81 | ngpu = int(opt.ngpu) 82 | nz = int(opt.nz) 83 | ngf = int(opt.ngf) 84 | ndf = int(opt.ndf) 85 | nc = 3 86 | 87 | # custom weights initialization called on netG and netD 88 | def weights_init(m): 89 | classname = m.__class__.__name__ 90 | if classname.find('Conv') != -1: 91 | m.weight.data.normal_(0.0, 0.02) 92 | elif classname.find('BatchNorm') != -1: 93 | m.weight.data.normal_(1.0, 0.02) 94 | m.bias.data.fill_(0) 95 | 96 | class _netG(nn.Module): 97 | def __init__(self, ngpu): 98 | super(_netG, self).__init__() 99 | self.ngpu = ngpu 100 | self.main = nn.Sequential( 101 | # input is Z, going into a convolution 102 | nn.ConvTranspose2d( nz, ngf * 8, 4, 1, 0, bias=False), 103 | nn.BatchNorm2d(ngf * 8), 104 | nn.ReLU(True), 105 | # state size. (ngf*8) x 4 x 4 106 | nn.ConvTranspose2d(ngf * 8, ngf * 4, 4, 2, 1, bias=False), 107 | nn.BatchNorm2d(ngf * 4), 108 | nn.ReLU(True), 109 | # state size. (ngf*4) x 8 x 8 110 | nn.ConvTranspose2d(ngf * 4, ngf * 2, 4, 2, 1, bias=False), 111 | nn.BatchNorm2d(ngf * 2), 112 | nn.ReLU(True), 113 | # state size. 
(ngf*2) x 16 x 16 114 | nn.ConvTranspose2d(ngf * 2, ngf, 4, 2, 1, bias=False), 115 | nn.BatchNorm2d(ngf), 116 | nn.ReLU(True), 117 | # state size. (ngf) x 32 x 32 118 | nn.ConvTranspose2d( ngf, nc, 4, 2, 1, bias=False), 119 | nn.Tanh() 120 | # state size. (nc) x 64 x 64 121 | ) 122 | def forward(self, input): 123 | gpu_ids = None 124 | if isinstance(input.data, torch.cuda.FloatTensor) and self.ngpu > 1: 125 | gpu_ids = range(self.ngpu) 126 | return nn.parallel.data_parallel(self.main, input, gpu_ids) 127 | 128 | netG = _netG(ngpu) 129 | netG.apply(weights_init) 130 | if opt.netG != '': 131 | netG.load_state_dict(torch.load(opt.netG)) 132 | print(netG) 133 | 134 | class _netD(nn.Module): 135 | def __init__(self, ngpu): 136 | super(_netD, self).__init__() 137 | self.ngpu = ngpu 138 | self.main = nn.Sequential( 139 | # input is (nc) x 64 x 64 140 | nn.Conv2d(nc, ndf, 4, 2, 1, bias=False), 141 | nn.LeakyReLU(0.2, inplace=True), 142 | # state size. (ndf) x 32 x 32 143 | nn.Conv2d(ndf, ndf * 2, 4, 2, 1, bias=False), 144 | nn.BatchNorm2d(ndf * 2), 145 | nn.LeakyReLU(0.2, inplace=True), 146 | # state size. (ndf*2) x 16 x 16 147 | nn.Conv2d(ndf * 2, ndf * 4, 4, 2, 1, bias=False), 148 | nn.BatchNorm2d(ndf * 4), 149 | nn.LeakyReLU(0.2, inplace=True), 150 | # state size. (ndf*4) x 8 x 8 151 | nn.Conv2d(ndf * 4, ndf * 8, 4, 2, 1, bias=False), 152 | nn.BatchNorm2d(ndf * 8), 153 | nn.LeakyReLU(0.2, inplace=True), 154 | # state size. 
(ndf*8) x 4 x 4 155 | nn.Conv2d(ndf * 8, 1, 4, 1, 0, bias=False), 156 | nn.Sigmoid() 157 | ) 158 | def forward(self, input): 159 | gpu_ids = None 160 | if isinstance(input.data, torch.cuda.FloatTensor) and self.ngpu > 1: 161 | gpu_ids = range(self.ngpu) 162 | output = nn.parallel.data_parallel(self.main, input, gpu_ids) 163 | return output.view(-1, 1) 164 | 165 | netD = _netD(ngpu) 166 | netD.apply(weights_init) 167 | if opt.netD != '': 168 | netD.load_state_dict(torch.load(opt.netD)) 169 | print(netD) 170 | 171 | criterion = nn.BCELoss() 172 | 173 | input = torch.FloatTensor(opt.batchSize, 3, opt.imageSize, opt.imageSize) 174 | noise = torch.FloatTensor(opt.batchSize, nz, 1, 1) 175 | fixed_noise = torch.FloatTensor(opt.batchSize, nz, 1, 1).normal_(0, 1) 176 | label = torch.FloatTensor(opt.batchSize) 177 | real_label = 1 178 | fake_label = 0 179 | 180 | if opt.cuda: 181 | netD.cuda() 182 | netG.cuda() 183 | criterion.cuda() 184 | input, label = input.cuda(), label.cuda() 185 | noise, fixed_noise = noise.cuda(), fixed_noise.cuda() 186 | 187 | input = Variable(input) 188 | label = Variable(label) 189 | noise = Variable(noise) 190 | fixed_noise = Variable(fixed_noise) 191 | 192 | # setup optimizer 193 | optimizerD = optim.Adam(netD.parameters(), lr = opt.lr, betas = (opt.beta1, 0.999)) 194 | optimizerG = optim.Adam(netG.parameters(), lr = opt.lr, betas = (opt.beta1, 0.999)) 195 | 196 | for epoch in range(opt.niter): 197 | for i, data in enumerate(dataloader, 0): 198 | ############################ 199 | # (1) Update D network: maximize log(D(x)) + log(1 - D(G(z))) 200 | ########################### 201 | # train with real 202 | netD.zero_grad() 203 | real_cpu, _ = data 204 | batch_size = real_cpu.size(0) 205 | input.data.resize_(real_cpu.size()).copy_(real_cpu) 206 | label.data.resize_(batch_size).fill_(real_label) 207 | 208 | output = netD(input) 209 | errD_real = criterion(output, label) 210 | errD_real.backward() 211 | D_x = output.data.mean() 212 | 213 | # train 
with fake 214 | noise.data.resize_(batch_size, nz, 1, 1) 215 | noise.data.normal_(0, 1) 216 | fake = netG(noise) 217 | label.data.fill_(fake_label) 218 | output = netD(fake.detach()) 219 | errD_fake = criterion(output, label) 220 | errD_fake.backward() 221 | D_G_z1 = output.data.mean() 222 | errD = errD_real + errD_fake 223 | optimizerD.step() 224 | 225 | ############################ 226 | # (2) Update G network: maximize log(D(G(z))) 227 | ########################### 228 | netG.zero_grad() 229 | label.data.fill_(real_label) # fake labels are real for generator cost 230 | output = netD(fake) 231 | errG = criterion(output, label) 232 | errG.backward() 233 | D_G_z2 = output.data.mean() 234 | optimizerG.step() 235 | 236 | print('[%d/%d][%d/%d] Loss_D: %.4f Loss_G: %.4f D(x): %.4f D(G(z)): %.4f / %.4f' 237 | % (epoch, opt.niter, i, len(dataloader), 238 | errD.data[0], errG.data[0], D_x, D_G_z1, D_G_z2)) 239 | if i % 100 == 0: 240 | vutils.save_image(real_cpu, 241 | '%s/real_samples.png' % opt.outf) 242 | fake = netG(fixed_noise) 243 | vutils.save_image(fake.data, 244 | '%s/fake_samples_epoch_%03d.png' % (opt.outf, epoch)) 245 | 246 | # do checkpointing 247 | torch.save(netG.state_dict(), '%s/netG_epoch_%d.pth' % (opt.outf, epoch)) 248 | torch.save(netD.state_dict(), '%s/netD_epoch_%d.pth' % (opt.outf, epoch)) 249 | -------------------------------------------------------------------------------- /imagenet/main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import shutil 4 | import time 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.parallel 9 | import torch.backends.cudnn as cudnn 10 | import torch.optim 11 | import torch.utils.data 12 | import torchvision.transforms as transforms 13 | import torchvision.datasets as datasets 14 | import torchvision.models as models 15 | 16 | 17 | model_names = sorted(name for name in models.__dict__ 18 | if name.islower() and not 
name.startswith("__") 19 | and callable(models.__dict__[name])) 20 | 21 | 22 | parser = argparse.ArgumentParser(description='PyTorch ImageNet Training') 23 | parser.add_argument('data', metavar='DIR', 24 | help='path to dataset') 25 | parser.add_argument('--arch', '-a', metavar='ARCH', default='resnet18', 26 | choices=model_names, 27 | help='model architecture: ' + 28 | ' | '.join(model_names) + 29 | ' (default: resnet18)') 30 | parser.add_argument('-j', '--workers', default=4, type=int, metavar='N', 31 | help='number of data loading workers (default: 4)') 32 | parser.add_argument('--epochs', default=90, type=int, metavar='N', 33 | help='number of total epochs to run') 34 | parser.add_argument('--start-epoch', default=0, type=int, metavar='N', 35 | help='manual epoch number (useful on restarts)') 36 | parser.add_argument('-b', '--batch-size', default=256, type=int, 37 | metavar='N', help='mini-batch size (default: 256)') 38 | parser.add_argument('--lr', '--learning-rate', default=0.1, type=float, 39 | metavar='LR', help='initial learning rate') 40 | parser.add_argument('--momentum', default=0.9, type=float, metavar='M', 41 | help='momentum') 42 | parser.add_argument('--weight-decay', '--wd', default=1e-4, type=float, 43 | metavar='W', help='weight decay (default: 1e-4)') 44 | parser.add_argument('--print-freq', '-p', default=10, type=int, 45 | metavar='N', help='print frequency (default: 10)') 46 | parser.add_argument('--resume', default='', type=str, metavar='PATH', 47 | help='path to latest checkpoint (default: none)') 48 | parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true', 49 | help='evaluate model on validation set') 50 | parser.add_argument('--pretrained', dest='pretrained', action='store_true', 51 | help='use pre-trained model') 52 | 53 | best_prec1 = 0 54 | 55 | 56 | def main(): 57 | global args, best_prec1 58 | args = parser.parse_args() 59 | 60 | # create model 61 | if args.pretrained: 62 | print("=> using pre-trained model 
'{}'".format(args.arch)) 63 | model = models.__dict__[args.arch](pretrained=True) 64 | else: 65 | print("=> creating model '{}'".format(args.arch)) 66 | model = models.__dict__[args.arch]() 67 | 68 | if args.arch.startswith('alexnet') or args.arch.startswith('vgg'): 69 | model.features = torch.nn.DataParallel(model.features) 70 | model.cuda() 71 | else: 72 | model = torch.nn.DataParallel(model).cuda() 73 | 74 | # optionally resume from a checkpoint 75 | if args.resume: 76 | if os.path.isfile(args.resume): 77 | print("=> loading checkpoint '{}'".format(args.resume)) 78 | checkpoint = torch.load(args.resume) 79 | args.start_epoch = checkpoint['epoch'] 80 | best_prec1 = checkpoint['best_prec1'] 81 | model.load_state_dict(checkpoint['state_dict']) 82 | print("=> loaded checkpoint '{}' (epoch {})" 83 | .format(args.resume, checkpoint['epoch'])) 84 | else: 85 | print("=> no checkpoint found at '{}'".format(args.resume)) 86 | 87 | cudnn.benchmark = True 88 | 89 | # Data loading code 90 | traindir = os.path.join(args.data, 'train') 91 | valdir = os.path.join(args.data, 'val') 92 | normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], 93 | std=[0.229, 0.224, 0.225]) 94 | 95 | train_loader = torch.utils.data.DataLoader( 96 | datasets.ImageFolder(traindir, transforms.Compose([ 97 | transforms.RandomSizedCrop(224), 98 | transforms.RandomHorizontalFlip(), 99 | transforms.ToTensor(), 100 | normalize, 101 | ])), 102 | batch_size=args.batch_size, shuffle=True, 103 | num_workers=args.workers, pin_memory=True) 104 | 105 | val_loader = torch.utils.data.DataLoader( 106 | datasets.ImageFolder(valdir, transforms.Compose([ 107 | transforms.Scale(256), 108 | transforms.CenterCrop(224), 109 | transforms.ToTensor(), 110 | normalize, 111 | ])), 112 | batch_size=args.batch_size, shuffle=False, 113 | num_workers=args.workers, pin_memory=True) 114 | 115 | # define loss function (criterion) and pptimizer 116 | criterion = nn.CrossEntropyLoss().cuda() 117 | 118 | optimizer = 
torch.optim.SGD(model.parameters(), args.lr, 119 | momentum=args.momentum, 120 | weight_decay=args.weight_decay) 121 | 122 | if args.evaluate: 123 | validate(val_loader, model, criterion) 124 | return 125 | 126 | for epoch in range(args.start_epoch, args.epochs): 127 | adjust_learning_rate(optimizer, epoch) 128 | 129 | # train for one epoch 130 | train(train_loader, model, criterion, optimizer, epoch) 131 | 132 | # evaluate on validation set 133 | prec1 = validate(val_loader, model, criterion) 134 | 135 | # remember best prec@1 and save checkpoint 136 | is_best = prec1 > best_prec1 137 | best_prec1 = max(prec1, best_prec1) 138 | save_checkpoint({ 139 | 'epoch': epoch + 1, 140 | 'arch': args.arch, 141 | 'state_dict': model.state_dict(), 142 | 'best_prec1': best_prec1, 143 | }, is_best) 144 | 145 | 146 | def train(train_loader, model, criterion, optimizer, epoch): 147 | batch_time = AverageMeter() 148 | data_time = AverageMeter() 149 | losses = AverageMeter() 150 | top1 = AverageMeter() 151 | top5 = AverageMeter() 152 | 153 | # switch to train mode 154 | model.train() 155 | 156 | end = time.time() 157 | for i, (input, target) in enumerate(train_loader): 158 | # measure data loading time 159 | data_time.update(time.time() - end) 160 | 161 | target = target.cuda(async=True) 162 | input_var = torch.autograd.Variable(input) 163 | target_var = torch.autograd.Variable(target) 164 | 165 | # compute output 166 | output = model(input_var) 167 | loss = criterion(output, target_var) 168 | 169 | # measure accuracy and record loss 170 | prec1, prec5 = accuracy(output.data, target, topk=(1, 5)) 171 | losses.update(loss.data[0], input.size(0)) 172 | top1.update(prec1[0], input.size(0)) 173 | top5.update(prec5[0], input.size(0)) 174 | 175 | # compute gradient and do SGD step 176 | optimizer.zero_grad() 177 | loss.backward() 178 | optimizer.step() 179 | 180 | # measure elapsed time 181 | batch_time.update(time.time() - end) 182 | end = time.time() 183 | 184 | if i % args.print_freq 
== 0: 185 | print('Epoch: [{0}][{1}/{2}]\t' 186 | 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 187 | 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' 188 | 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 189 | 'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t' 190 | 'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format( 191 | epoch, i, len(train_loader), batch_time=batch_time, 192 | data_time=data_time, loss=losses, top1=top1, top5=top5)) 193 | 194 | 195 | def validate(val_loader, model, criterion): 196 | batch_time = AverageMeter() 197 | losses = AverageMeter() 198 | top1 = AverageMeter() 199 | top5 = AverageMeter() 200 | 201 | # switch to evaluate mode 202 | model.eval() 203 | 204 | end = time.time() 205 | for i, (input, target) in enumerate(val_loader): 206 | target = target.cuda(async=True) 207 | input_var = torch.autograd.Variable(input, volatile=True) 208 | target_var = torch.autograd.Variable(target, volatile=True) 209 | 210 | # compute output 211 | output = model(input_var) 212 | loss = criterion(output, target_var) 213 | 214 | # measure accuracy and record loss 215 | prec1, prec5 = accuracy(output.data, target, topk=(1, 5)) 216 | losses.update(loss.data[0], input.size(0)) 217 | top1.update(prec1[0], input.size(0)) 218 | top5.update(prec5[0], input.size(0)) 219 | 220 | # measure elapsed time 221 | batch_time.update(time.time() - end) 222 | end = time.time() 223 | 224 | if i % args.print_freq == 0: 225 | print('Test: [{0}/{1}]\t' 226 | 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 227 | 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 228 | 'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t' 229 | 'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format( 230 | i, len(val_loader), batch_time=batch_time, loss=losses, 231 | top1=top1, top5=top5)) 232 | 233 | print(' * Prec@1 {top1.avg:.3f} Prec@5 {top5.avg:.3f}' 234 | .format(top1=top1, top5=top5)) 235 | 236 | return top1.avg 237 | 238 | 239 | def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'): 240 | torch.save(state, 
filename) 241 | if is_best: 242 | shutil.copyfile(filename, 'model_best.pth.tar') 243 | 244 | 245 | class AverageMeter(object): 246 | """Computes and stores the average and current value""" 247 | def __init__(self): 248 | self.reset() 249 | 250 | def reset(self): 251 | self.val = 0 252 | self.avg = 0 253 | self.sum = 0 254 | self.count = 0 255 | 256 | def update(self, val, n=1): 257 | self.val = val 258 | self.sum += val * n 259 | self.count += n 260 | self.avg = self.sum / self.count 261 | 262 | 263 | def adjust_learning_rate(optimizer, epoch): 264 | """Sets the learning rate to the initial LR decayed by 10 every 30 epochs""" 265 | lr = args.lr * (0.1 ** (epoch // 30)) 266 | for param_group in optimizer.param_groups: 267 | param_group['lr'] = lr 268 | 269 | 270 | def accuracy(output, target, topk=(1,)): 271 | """Computes the precision@k for the specified values of k""" 272 | maxk = max(topk) 273 | batch_size = target.size(0) 274 | 275 | _, pred = output.topk(maxk, 1, True, True) 276 | pred = pred.t() 277 | correct = pred.eq(target.view(1, -1).expand_as(pred)) 278 | 279 | res = [] 280 | for k in topk: 281 | correct_k = correct[:k].view(-1).float().sum(0) 282 | res.append(correct_k.mul_(100.0 / batch_size)) 283 | return res 284 | 285 | 286 | if __name__ == '__main__': 287 | main() 288 | -------------------------------------------------------------------------------- /OpenNMT/train.py: -------------------------------------------------------------------------------- 1 | import onmt 2 | import argparse 3 | import torch 4 | import torch.nn as nn 5 | from torch import cuda 6 | from torch.autograd import Variable 7 | import math 8 | import time 9 | 10 | parser = argparse.ArgumentParser(description='train.py') 11 | 12 | ## Data options 13 | 14 | parser.add_argument('-data', required=True, 15 | help='Path to the *-train.pt file from preprocess.py') 16 | parser.add_argument('-save_model', default='model', 17 | help="""Model filename (the model will be saved as 18 | 
_epochN_PPL.pt where PPL is the 19 | validation perplexity""") 20 | parser.add_argument('-train_from', 21 | help="""If training from a checkpoint then this is the 22 | path to the pretrained model.""") 23 | 24 | ## Model options 25 | 26 | parser.add_argument('-layers', type=int, default=2, 27 | help='Number of layers in the LSTM encoder/decoder') 28 | parser.add_argument('-rnn_size', type=int, default=500, 29 | help='Size of LSTM hidden states') 30 | parser.add_argument('-word_vec_size', type=int, default=500, 31 | help='Word embedding sizes') 32 | parser.add_argument('-input_feed', type=int, default=1, 33 | help="""Feed the context vector at each time step as 34 | additional input (via concatenation with the word 35 | embeddings) to the decoder.""") 36 | # parser.add_argument('-residual', action="store_true", 37 | # help="Add residual connections between RNN layers.") 38 | parser.add_argument('-brnn', action='store_true', 39 | help='Use a bidirectional encoder') 40 | parser.add_argument('-brnn_merge', default='concat', 41 | help="""Merge action for the bidirectional hidden states: 42 | [concat|sum]""") 43 | 44 | ## Optimization options 45 | 46 | parser.add_argument('-batch_size', type=int, default=64, 47 | help='Maximum batch size') 48 | parser.add_argument('-max_generator_batches', type=int, default=32, 49 | help="""Maximum batches of words in a sequence to run 50 | the generator on in parallel. Higher is faster, but uses 51 | more memory.""") 52 | parser.add_argument('-epochs', type=int, default=13, 53 | help='Number of training epochs') 54 | parser.add_argument('-start_epoch', type=int, default=1, 55 | help='The epoch from which to start') 56 | parser.add_argument('-param_init', type=float, default=0.1, 57 | help="""Parameters are initialized over uniform distribution 58 | with support (-param_init, param_init)""") 59 | parser.add_argument('-optim', default='sgd', 60 | help="Optimization method. 
[sgd|adagrad|adadelta|adam]") 61 | parser.add_argument('-learning_rate', type=float, default=1.0, 62 | help="""Starting learning rate. If adagrad/adadelta/adam is 63 | used, then this is the global learning rate. Recommended 64 | settings: sgd = 1, adagrad = 0.1, adadelta = 1, adam = 0.1""") 65 | parser.add_argument('-max_grad_norm', type=float, default=5, 66 | help="""If the norm of the gradient vector exceeds this, 67 | renormalize it to have the norm equal to max_grad_norm""") 68 | parser.add_argument('-dropout', type=float, default=0.3, 69 | help='Dropout probability; applied between LSTM stacks.') 70 | parser.add_argument('-learning_rate_decay', type=float, default=0.5, 71 | help="""Decay learning rate by this much if (i) perplexity 72 | does not decrease on the validation set or (ii) epoch has 73 | gone past the start_decay_at_limit""") 74 | parser.add_argument('-start_decay_at', default=8, 75 | help="Start decay after this epoch") 76 | parser.add_argument('-curriculum', action="store_true", 77 | help="""For this many epochs, order the minibatches based 78 | on source sequence length. Sometimes setting this to 1 will 79 | increase convergence speed.""") 80 | parser.add_argument('-pre_word_vecs_enc', 81 | help="""If a valid path is specified, then this will load 82 | pretrained word embeddings on the encoder side. 83 | See README for specific formatting instructions.""") 84 | parser.add_argument('-pre_word_vecs_dec', 85 | help="""If a valid path is specified, then this will load 86 | pretrained word embeddings on the decoder side. 
def memoryEfficientLoss(outputs, targets, generator, crit, eval=False,
                        max_generator_batches=None):
    """Run the generator and criterion over ``outputs`` one chunk at a time.

    ``outputs`` is cut off from the graph that produced it, so only one
    chunk's generator activations are alive at any moment. The gradient with
    respect to ``outputs`` is accumulated here and returned, and the caller
    is expected to continue back-propagation with
    ``outputs.backward(grad_output)``.

    Args:
        outputs: 3-D tensor of decoder states. It is split along dimension 0
            and each chunk is flattened to ``(-1, outputs.size(2))`` before
            being passed to ``generator``.
        targets: tensor of target indices, split along dimension 0 in
            lock-step with ``outputs`` and flattened for ``crit``.
            NOTE(review): this assumes both tensors share the layout of
            their leading dimension - confirm against the caller.
        generator: callable mapping flattened states to predictions.
        crit: loss callable invoked as ``crit(pred, target)``.
        eval: when true, no gradients are computed and the returned gradient
            is ``None``. (The name shadows the builtin but is part of the
            call interface.)
        max_generator_batches: chunk size along dimension 0; defaults to
            ``opt.max_generator_batches``.

    Returns:
        ``(loss, grad_output)`` where ``loss`` is the summed criterion value
        as a Python number and ``grad_output`` is the gradient of
        ``loss / outputs.size(1)`` with respect to ``outputs``, or ``None``
        when no backward pass was run.
    """
    if max_generator_batches is None:
        max_generator_batches = opt.max_generator_batches

    # compute generations one piece at a time
    loss = 0
    # Detach first, make contiguous second, enable grad last: the tensor must
    # remain a leaf or ``.grad`` would never be populated on it.
    outputs = outputs.detach().contiguous()
    outputs.requires_grad_(not eval)

    batch_size = outputs.size(1)
    outputs_split = torch.split(outputs, max_generator_batches)
    targets_split = torch.split(targets.contiguous(), max_generator_batches)
    # ``set_grad_enabled`` replaces the removed ``volatile`` flag.
    with torch.set_grad_enabled(not eval):
        for out_t, targ_t in zip(outputs_split, targets_split):
            out_t = out_t.view(-1, out_t.size(2))
            pred_t = generator(out_t)
            loss_t = crit(pred_t, targ_t.view(-1))
            # 0-dim losses cannot be indexed with [0]; ``item()`` extracts
            # the Python number.
            loss += loss_t.item()
            if not eval:
                # Each chunk's backward accumulates into ``outputs.grad``.
                loss_t.div(batch_size).backward()

    grad_output = None if outputs.grad is None else outputs.grad.detach()
    return loss, grad_output
def trainModel(model, trainData, validData, dataset, optim):
    """Train ``model`` from ``opt.start_epoch`` through ``opt.epochs``.

    After every epoch the model is evaluated on ``validData``, the learning
    rate is possibly updated (SGD only) and a checkpoint is written to
    ``'<opt.save_model>_e<epoch>_<valid_ppl>.pt'``.

    Args:
        model: the network; must expose ``generator`` and be callable on a
            batch.
        trainData: indexable collection of training batches supporting
            ``len()``.
        validData: validation batches, passed to the module-level ``eval``.
        dataset: dict providing ``dataset['dicts']`` (stored in checkpoints)
            and ``dataset['dicts']['tgt'].size()`` (target vocabulary size).
        optim: optimizer wrapper exposing ``last_ppl``, ``step()`` and
            ``updateLearningRate(valid_loss, epoch)``.
    """
    print(model)
    model.train()
    # ``last_ppl`` being unset is used as the marker of a fresh (not
    # resumed) run, in which case parameters are (re-)initialised uniformly.
    if optim.last_ppl is None:
        for p in model.parameters():
            p.data.uniform_(-opt.param_init, opt.param_init)

    # define criterion of each GPU
    criterion = NMTCriterion(dataset['dicts']['tgt'].size())

    start_time = time.time()

    def trainEpoch(epoch):
        """Run one pass over ``trainData``; return the mean loss per word."""
        # shuffle mini batch order; plain ints keep the indexing independent
        # of how tensor elements behave as indices
        batchOrder = torch.randperm(len(trainData)).tolist()

        total_loss, report_loss = 0, 0
        total_words, report_words = 0, 0
        start = time.time()
        for i in range(len(trainData)):

            batchIdx = batchOrder[i] if epoch >= opt.curriculum else i
            batch = trainData[batchIdx]
            # must be batch first for gather/scatter in DataParallel
            batch = [x.transpose(0, 1) for x in batch]

            model.zero_grad()
            outputs = model(batch)
            targets = batch[1][:, 1:]  # exclude the first target position
            loss, gradOutput = memoryEfficientLoss(
                outputs, targets, model.generator, criterion)

            # the generator/criterion were back-propagated chunk-wise above;
            # continue through the rest of the model from their gradient
            outputs.backward(gradOutput)

            # update the parameters
            optim.step()

            report_loss += loss
            total_loss += loss
            # int() keeps the counters plain numbers whether the reduction
            # returns a Python int or a 0-dim tensor
            num_words = int(targets.data.ne(onmt.Constants.PAD).sum())
            total_words += num_words
            report_words += num_words
            if i % opt.log_interval == 0 and i > 0:
                # clamp the exponent exactly like the epoch-level reports so
                # a diverging loss cannot raise OverflowError mid-epoch
                print("Epoch %2d, %5d/%5d batches; perplexity: %6.2f; %3.0f tokens/s; %6.0f s elapsed" %
                      (epoch, i, len(trainData),
                       math.exp(min(report_loss / report_words, 100)),
                       report_words / (time.time() - start),
                       time.time() - start_time))

                report_loss = report_words = 0
                start = time.time()

        return total_loss / total_words

    for epoch in range(opt.start_epoch, opt.epochs + 1):
        print('')

        #  (1) train for one epoch on the training set
        train_loss = trainEpoch(epoch)
        print('Train perplexity: %g' % math.exp(min(train_loss, 100)))

        #  (2) evaluate on the validation set
        valid_loss = eval(model, criterion, validData)
        valid_ppl = math.exp(min(valid_loss, 100))
        print('Validation perplexity: %g' % valid_ppl)

        #  (3) maybe update the learning rate
        if opt.optim == 'sgd':
            optim.updateLearningRate(valid_loss, epoch)

        #  (4) drop a checkpoint
        checkpoint = {
            'model': model,
            'dicts': dataset['dicts'],
            'opt': opt,
            'epoch': epoch,
            'optim': optim,
        }
        torch.save(checkpoint,
                   '%s_e%d_%.2f.pt' % (opt.save_model, epoch, valid_ppl))
%d' % opt.batch_size) 254 | 255 | print('Building model...') 256 | 257 | if opt.train_from is None: 258 | encoder = onmt.Models.Encoder(opt, dicts['src']) 259 | decoder = onmt.Models.Decoder(opt, dicts['tgt']) 260 | generator = nn.Sequential( 261 | nn.Linear(opt.rnn_size, dicts['tgt'].size()), 262 | nn.LogSoftmax()) 263 | if opt.cuda > 1: 264 | generator = nn.DataParallel(generator, device_ids=opt.gpus) 265 | model = onmt.Models.NMTModel(encoder, decoder, generator) 266 | if opt.cuda > 1: 267 | model = nn.DataParallel(model, device_ids=opt.gpus) 268 | if opt.cuda: 269 | model.cuda() 270 | else: 271 | model.cpu() 272 | 273 | model.generator = generator 274 | 275 | for p in model.parameters(): 276 | p.data.uniform_(-opt.param_init, opt.param_init) 277 | 278 | optim = onmt.Optim( 279 | model.parameters(), opt.optim, opt.learning_rate, opt.max_grad_norm, 280 | lr_decay=opt.learning_rate_decay, 281 | start_decay_at=opt.start_decay_at 282 | ) 283 | else: 284 | print('Loading from checkpoint at %s' % opt.train_from) 285 | checkpoint = torch.load(opt.train_from) 286 | model = checkpoint['model'] 287 | if opt.cuda: 288 | model.cuda() 289 | else: 290 | model.cpu() 291 | optim = checkpoint['optim'] 292 | opt.start_epoch = checkpoint['epoch'] + 1 293 | 294 | nParams = sum([p.nelement() for p in model.parameters()]) 295 | print('* number of parameters: %d' % nParams) 296 | 297 | trainModel(model, trainData, validData, dataset, optim) 298 | 299 | 300 | if __name__ == "__main__": 301 | main() 302 | --------------------------------------------------------------------------------