├── floyd_requirements.txt
├── .floydignore
├── LICENSE
├── data.py
├── .gitignore
├── model.py
├── generate.py
├── app.py
├── README.md
└── main.py

--------------------------------------------------------------------------------
/floyd_requirements.txt:
--------------------------------------------------------------------------------
flask

--------------------------------------------------------------------------------
/.floydignore:
--------------------------------------------------------------------------------

# Directories and files to ignore when uploading code to floyd

FLOYD_README.md
.git
.eggs
eggs
lib
lib64
parts
sdist
core
var
*.pyc
*.swp
.DS_Store
data/

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
BSD 3-Clause License

Copyright (c) 2017,
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of the copyright holder nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/data.py:
--------------------------------------------------------------------------------
import os
import torch


class Dictionary(object):
    """Build word2idx and idx2word from the corpus (train/valid/test)."""
    def __init__(self):
        self.word2idx = {}  # word -> index
        self.idx2word = []  # index (position) -> word

    def add_word(self, word):
        """Create/update word2idx and idx2word."""
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)


class Corpus(object):
    """Corpus tokenizer."""
    def __init__(self, path):
        self.dictionary = Dictionary()
        self.train = self.tokenize(os.path.join(path, 'train.txt'))
        self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
        self.test = self.tokenize(os.path.join(path, 'test.txt'))

    def tokenize(self, path):
        """Tokenizes a text file."""
        assert os.path.exists(path)
        # Add words to the dictionary
        with open(path, 'r') as f:
            tokens = 0
            for line in f:
                # Split the line into tokens and append an end-of-sentence marker
                words = line.split() + ['<eos>']
                tokens += len(words)
                for word in words:
                    self.dictionary.add_word(word)

        # Tokenize file content
        with open(path, 'r') as f:
            ids = torch.LongTensor(tokens)
            token = 0
            for line in f:
                words = line.split() + ['<eos>']
                for word in words:
                    ids[token] = self.dictionary.word2idx[word]
                    token += 1

        return ids

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------

# Created by https://www.gitignore.io/api/python

### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
*.pyc

# Floyd things
FLOYD_README.md
.floydexpt
.DS_Store
core
data/

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

# End of https://www.gitignore.io/api/python

--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
import torch.nn as nn
from torch.autograd import Variable


class RNNModel(nn.Module):
    """Container module with an encoder, a recurrent module, and a decoder."""

    def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, dropout=0.5, tie_weights=False):
        super(RNNModel, self).__init__()
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)  # token ids -> embeddings
        if rnn_type in ['LSTM', 'GRU']:
            self.rnn = getattr(nn, rnn_type)(ninp, nhid, nlayers, dropout=dropout)
        else:
            try:
                nonlinearity = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}[rnn_type]
            except KeyError:
                raise ValueError("""An invalid option for `--model` was supplied,
                                 options are ['LSTM', 'GRU', 'RNN_TANH', 'RNN_RELU']""")
            self.rnn = nn.RNN(ninp, nhid, nlayers, nonlinearity=nonlinearity, dropout=dropout)
        self.decoder = nn.Linear(nhid, ntoken)

        # Optionally tie weights as in:
        # "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016)
        # https://arxiv.org/abs/1608.05859
        # and
        # "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016)
        # https://arxiv.org/abs/1611.01462
        if tie_weights:
            if nhid != ninp:
                raise ValueError('When using the tied flag, nhid must be equal to emsize')
            self.decoder.weight = self.encoder.weight

        self.init_weights()

        self.rnn_type = rnn_type
        self.nhid = nhid
        self.nlayers = nlayers

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, input, hidden):
        emb = self.drop(self.encoder(input))
        output, hidden = self.rnn(emb, hidden)
        output = self.drop(output)
        decoded = self.decoder(output.view(output.size(0) * output.size(1), output.size(2)))
        return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden

    def init_hidden(self, bsz):
        weight = next(self.parameters()).data
        if self.rnn_type == 'LSTM':
            return (Variable(weight.new(self.nlayers, bsz, self.nhid).zero_()),
                    Variable(weight.new(self.nlayers, bsz, self.nhid).zero_()))
        else:
            return Variable(weight.new(self.nlayers, bsz, self.nhid).zero_())

--------------------------------------------------------------------------------
/generate.py:
--------------------------------------------------------------------------------
###############################################################################
# Language Modeling on Penn Tree Bank
#
# This file generates new sentences sampled from the language model
#
###############################################################################

import argparse

import torch
from torch.autograd import Variable

import data

parser = argparse.ArgumentParser(description='PyTorch PTB Language Model')

# Model parameters.
parser.add_argument('--data', type=str, default='/input',
                    help='location of the data corpus')
parser.add_argument('--checkpoint', type=str, default='/model/model.pt',
                    help='model checkpoint to use')
parser.add_argument('--outf', type=str, default='/output/generated.txt',
                    help='output file for generated text')
parser.add_argument('--words', type=int, default=1000,
                    help='number of words to generate')
parser.add_argument('--seed', type=int, default=1111,
                    help='random seed')
parser.add_argument('--cuda', action='store_true',
                    help='use CUDA')
parser.add_argument('--temperature', type=float, default=1.0,
                    help='temperature - higher will increase diversity')
parser.add_argument('--log-interval', type=int, default=100,
                    help='reporting interval')
args = parser.parse_args()

# Set the random seed manually for reproducibility.
torch.manual_seed(args.seed)
if torch.cuda.is_available():
    if not args.cuda:
        print("WARNING: You have a CUDA device, so you should probably run with --cuda")
    else:
        torch.cuda.manual_seed(args.seed)

if args.temperature < 1e-3:
    parser.error("--temperature has to be greater than or equal to 1e-3")

# Load checkpoint
if args.checkpoint != '':
    if args.cuda:
        model = torch.load(args.checkpoint)
    else:
        # Load a GPU-trained model on the CPU
        model = torch.load(args.checkpoint, map_location=lambda storage, loc: storage)
model.eval()

if args.cuda:
    model.cuda()
else:
    model.cpu()

corpus = data.Corpus(args.data)
ntokens = len(corpus.dictionary)
hidden = model.init_hidden(1)
# Start generation from a random token id in [0, ntokens)
input = Variable(torch.rand(1, 1).mul(ntokens).long(), volatile=True)
if args.cuda:
    input.data = input.data.cuda()

with open(args.outf, 'w') as outf:
    for i in range(args.words):
        output, hidden = model(input, hidden)
        word_weights = output.squeeze().data.div(args.temperature).exp().cpu()
        word_idx = torch.multinomial(word_weights, 1)[0]
        input.data.fill_(word_idx)
        word = corpus.dictionary.idx2word[word_idx]
        # word = '\n' if word == '<eos>' else word

        outf.write(word + ('\n' if i % 20 == 19 else ' '))

        if i % args.log_interval == 0:
            print('| Generated {}/{} words'.format(i, args.words))

--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
"""
Flask Serving

This file is a sample Flask app that can be used to test your model with a REST API.
This app does the following:
    - Looks for the number of words to generate and the temperature
    - Returns the generated text

POST req:
    parameters:
        - words, required, how many words to generate
        - temperature, optional, degree of diversity
"""
import os
from flask import Flask, send_file, request
from werkzeug.exceptions import BadRequest
import torch
from torch.autograd import Variable
import data

DATA_PATH = '/input'
CHECKPOINT = '/model/model.pt'
OUTPUT_PATH = '/output/generated.txt'
LOG_INTERVAL = 50
print('Loading checkpoint: %s' % CHECKPOINT)

app = Flask('Language-Model-Text-Generator')

# Check that the checkpoint exists and is readable
if not os.path.isfile(CHECKPOINT):
    print("Unable to open checkpoint file")

cuda = torch.cuda.is_available()

# Load checkpoint
if cuda:
    model = torch.load(CHECKPOINT)
else:
    # Load a GPU-trained model on the CPU
    model = torch.load(CHECKPOINT, map_location=lambda storage, loc: storage)
model.eval()

if cuda:
    model.cuda()
else:
    model.cpu()

# Load data
corpus = data.Corpus(DATA_PATH)
ntokens = len(corpus.dictionary)


def generate(words, temperature):
    """Generate the given number of words with the given temperature."""
    hidden = model.init_hidden(1)
    input = Variable(torch.rand(1, 1).mul(ntokens).long(), volatile=True)
    if cuda:
        input.data = input.data.cuda()

    # Generate
    with open(OUTPUT_PATH, 'w') as outf:
        for i in range(words):
            output, hidden = model(input, hidden)
            word_weights = output.squeeze().data.div(temperature).exp().cpu()
            word_idx = torch.multinomial(word_weights, 1)[0]
            input.data.fill_(word_idx)
            word = corpus.dictionary.idx2word[word_idx]
            # word = '\n' if word == '<eos>' else word
            outf.write(word + ('\n' if i % 20 == 19 else ' '))

            if i % LOG_INTERVAL == 0:
                print('| Generated {}/{} words'.format(i, words))


# Return the generated text
@app.route('/<path:path>', methods=['POST'])
def generator_handler(path):
    # Parse the request parameters
    words = request.form.get("words")
    if words is None:
        return BadRequest("You must provide a words parameter")
    words = int(words)
    temp = request.form.get("temperature") or 1.0
    temp = float(temp)
    if temp < 1e-3:
        return BadRequest("Temperature has to be greater than or equal to 1e-3")
    print(words, temp)
    # Generate the words
    generate(words, temp)
    # Return the generated text
    return send_file(OUTPUT_PATH, mimetype='text/plain')


if __name__ == '__main__':
    app.run(host='0.0.0.0')

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Word-level language modeling RNN

This example trains a multi-layer RNN (Elman, GRU, or LSTM) on a language modeling task.
By default, the training script uses the provided PTB dataset.
The trained model can then be used by the generate script to generate new text.
This is a port of [pytorch/examples/word_language_model](https://github.com/pytorch/examples/tree/master/word_language_model) that makes it usable on [FloydHub](https://www.floydhub.com/).

## Usage

The `main.py` script accepts the following arguments:

```bash
optional arguments:
  -h, --help               show this help message and exit
  --data DATA              location of the data corpus
  --checkpoint CHECKPOINT  model checkpoint to use
  --model MODEL            type of recurrent net (RNN_TANH, RNN_RELU, LSTM, GRU)
  --emsize EMSIZE          size of word embeddings
  --nhid NHID              number of hidden units per layer
  --nlayers NLAYERS        number of layers
  --lr LR                  initial learning rate
  --clip CLIP              gradient clipping
  --epochs EPOCHS          upper epoch limit
  --batch_size N           batch size
  --bptt BPTT              sequence length
  --dropout DROPOUT        dropout applied to layers (0 = no dropout)
  --tied                   tie the word embedding and softmax weights
  --seed SEED              random seed
  --cuda                   use CUDA
  --log-interval N         report interval
  --save SAVE              path to save the final model
```

With these arguments, a variety of models can be tested.
As an example, the following arguments produce slower but better models:

```bash
python main.py --cuda --emsize 650 --nhid 650 --dropout 0.5 --epochs 40           # Test perplexity of 80.97
python main.py --cuda --emsize 650 --nhid 650 --dropout 0.5 --epochs 40 --tied    # Test perplexity of 75.96
python main.py --cuda --emsize 1500 --nhid 1500 --dropout 0.65 --epochs 40        # Test perplexity of 77.42
python main.py --cuda --emsize 1500 --nhid 1500 --dropout 0.65 --epochs 40 --tied # Test perplexity of 72.30
```
These perplexities are equal to or better than
[Recurrent Neural Network Regularization (Zaremba et al. 2014)](https://arxiv.org/pdf/1409.2329.pdf)
and are similar to [Using the Output Embedding to Improve Language Models (Press & Wolf 2016)](https://arxiv.org/abs/1608.05859) and [Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling (Inan et al. 2016)](https://arxiv.org/pdf/1611.01462.pdf), though both of these papers have improved perplexities by using a form of recurrent dropout ([variational dropout](http://papers.nips.cc/paper/6241-a-theoretically-grounded-application-of-dropout-in-recurrent-neural-networks)).


## Architecture

Coming soon.

## Run on FloydHub

Here are the commands to train, evaluate, and serve your language model on FloydHub.

### Project Setup

Before you start, log in to FloydHub with the [floyd login](http://docs.floydhub.com/commands/login/) command, then fork and init the project:

```bash
$ git clone https://github.com/floydhub/word-language-model.git
$ cd word-language-model
$ floyd init word-language-model
```

### Training

Before you start, you need to upload the [Penn Treebank-3 dataset](https://catalog.ldc.upenn.edu/ldc99t42) as a FloydHub dataset by following this guide: [create and upload a dataset](https://docs.floydhub.com/guides/create_and_upload_dataset/). Then you will be ready to play with different language models.

```bash
# Train an LSTM on PTB with CUDA, reaching a perplexity of 114.22
floyd run --gpu --env pytorch-0.2 --data <username>/dataset/<dataset_name>/<version>:input "python main.py --cuda --epochs 7"

# Train a tied LSTM on PTB with CUDA, reaching a perplexity of 110.44
floyd run --gpu --env pytorch-0.2 --data <username>/dataset/<dataset_name>/<version>:input "python main.py --cuda --epochs 7 --tied"

# Train a tied LSTM on PTB with CUDA for 40 epochs, reaching a perplexity of 87.17
floyd run --gpu --env pytorch-0.2 --data <username>/dataset/<dataset_name>/<version>:input "python main.py --cuda --tied"
```

Note:

- `--gpu` runs your job on a FloydHub GPU instance.
- `--env pytorch-0.2` prepares a PyTorch 0.2 environment with Python 3.
- `--data <username>/dataset/<dataset_name>/<version>:input` mounts the previously uploaded Penn Treebank-3 dataset in the `/input` folder inside the container for our job.

The model uses the `nn.RNN` module (and its sister modules `nn.GRU` and `nn.LSTM`),
which will automatically use the cuDNN backend if run on CUDA with cuDNN installed.

During training, if a keyboard interrupt (Ctrl-C) is received,
training is stopped and the current model is evaluated against the test dataset.

You can follow the progress by using the [logs](https://docs.floydhub.com/commands/logs/) command.
The first two training examples should complete in about 5 minutes on a GPU instance and about 40 minutes on a CPU one. The last example should take about 30 minutes on a GPU instance and over 3 hours on a CPU instance.

### Evaluating

It's time to evaluate our model by generating some text:

```bash
# Generate samples from the trained LSTM model.
floyd run --gpu --env pytorch-0.2 --data <username>/dataset/<dataset_name>/<version>:input --data <job_name>:model "python generate.py --cuda"
```

### Try our pre-trained model

We provide a pre-trained model, trained for 40 epochs and reaching a test perplexity of 87.17:

```bash
# Generate samples from the trained LSTM model.
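# Note: the ":model" mount makes the training job's saved /output/model.pt
# available under /model inside this container, while the dataset mount
# provides the corpus that generate.py needs to rebuild its dictionary.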
floyd run --gpu --env pytorch-0.2 --data <username>/dataset/<dataset_name>/<version>:input --data <job_name>:model "python generate.py --cuda"
```


### Serve model through REST API

FloydHub supports serving mode for demo and testing purposes. Before serving your model through a REST API,
you need to create a `floyd_requirements.txt` and declare the flask requirement in it. If you run a job
with the `--mode serve` flag, FloydHub will run the `app.py` file in your project
and attach it to a dynamic service endpoint:

```bash
floyd run --gpu --mode serve --env pytorch-0.2 --data <username>/dataset/<dataset_name>/<version>:input --data <job_name>:model
```

The above command will print out a service endpoint for this job in your terminal console.

The service endpoint will take a couple of minutes to become ready. Once it's up, you can interact with the model by sending a POST request with the number of words and the temperature that the model will use to generate text:

```bash
# Template
# curl -X POST -o <output_file> -F "words=<number_of_words>" -F "temperature=<temperature>" <service_endpoint>

curl -X POST -o generated.txt -F "words=100" -F "temperature=3" https://www.floydlabs.com/expose/vk47ixT8NeYBTFeMavbWta
```

Any job running in serving mode will stay up until it reaches its maximum runtime. So
once you are done testing, **remember to shut down the job!**

*Note that this feature is in preview mode and is not production ready yet.*
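You can also call the endpoint from Python. Below is a minimal sketch using the `requests` library; the `ENDPOINT` URL is a placeholder for whatever `floyd run --mode serve` prints in your terminal, and the form fields match the ones `app.py` reads from `request.form`:

```python
import requests

# Placeholder: replace with the endpoint printed by `floyd run --mode serve`
ENDPOINT = "https://www.floydlabs.com/expose/<service_id>"

# Form-encoded fields, exactly what the Flask handler expects
resp = requests.post(ENDPOINT, data={"words": 100, "temperature": 1.0})
resp.raise_for_status()

# The service responds with the generated text as text/plain
with open("generated.txt", "w") as f:
    f.write(resp.text)
```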
## More resources

Some useful resources on NLP for Deep Learning and language modeling tasks:

- [The Unreasonable Effectiveness of Recurrent Neural Networks](http://karpathy.github.io/2015/05/21/rnn-effectiveness/)
- [Natural Language Processing with Deep Learning - Stanford](https://youtu.be/OQQ-W_63UgQ)
- [Oxford Deep NLP 2017 course](https://github.com/oxford-cs-deepnlp-2017/lectures)

## Contributing

For any questions, bugs (even typos), and/or feature requests, do not hesitate to contact me or open an issue!

--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import argparse
import time
import math
import torch
import torch.nn as nn
from torch.autograd import Variable

import data
import model

# Checkpoint support added on top of the original example
parser = argparse.ArgumentParser(description='PyTorch PennTreeBank RNN/LSTM Language Model')
parser.add_argument('--data', type=str, default='/input',
                    help='location of the data corpus')
parser.add_argument('--checkpoint', type=str, default='',
                    help='model checkpoint to use')
parser.add_argument('--model', type=str, default='LSTM',
                    help='type of recurrent net (RNN_TANH, RNN_RELU, LSTM, GRU)')
parser.add_argument('--emsize', type=int, default=200,
                    help='size of word embeddings')
parser.add_argument('--nhid', type=int, default=200,
                    help='number of hidden units per layer')
parser.add_argument('--nlayers', type=int, default=2,
                    help='number of layers')
parser.add_argument('--lr', type=float, default=20,
                    help='initial learning rate')
parser.add_argument('--clip', type=float, default=0.25,
                    help='gradient clipping')
parser.add_argument('--epochs', type=int, default=40,
                    help='upper epoch limit')
parser.add_argument('--batch_size', type=int, default=20, metavar='N',
                    help='batch size')
parser.add_argument('--bptt', type=int, default=35,
                    help='sequence length')
parser.add_argument('--dropout', type=float, default=0.2,
                    help='dropout applied to layers (0 = no dropout)')
parser.add_argument('--tied', action='store_true',
                    help='tie the word embedding and softmax weights')
parser.add_argument('--seed', type=int, default=1111,
                    help='random seed')
parser.add_argument('--cuda', action='store_true',
                    help='use CUDA')
parser.add_argument('--log-interval', type=int, default=200, metavar='N',
                    help='report interval')
parser.add_argument('--save', type=str, default='/output/model.pt',
                    help='path to save the final model')
args = parser.parse_args()

# Set the random seed manually for reproducibility.
torch.manual_seed(args.seed)
if torch.cuda.is_available():
    if not args.cuda:
        print("WARNING: You have a CUDA device, so you should probably run with --cuda")
    else:
        torch.cuda.manual_seed(args.seed)

###############################################################################
# Load data
###############################################################################

corpus = data.Corpus(args.data)


def batchify(data, bsz):
    # Work out how cleanly we can divide the dataset into bsz parts.
    nbatch = data.size(0) // bsz
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * bsz)
    # Evenly divide the data across the bsz batches.
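    # Illustrative example: with bsz=2, the token stream [a b c d e f g h i j]
    # is reshaped by view(bsz, -1).t() into
    #     [[a f]
    #      [b g]
    #      [c h]
    #      [d i]
    #      [e j]]
    # so each column is a contiguous slice of the corpus, and get_batch()
    # later reads consecutive rows as (input, target) sequences.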
    data = data.view(bsz, -1).t().contiguous()
    if args.cuda:
        data = data.cuda()
    return data


eval_batch_size = 10
train_data = batchify(corpus.train, args.batch_size)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)

###############################################################################
# Build the model
###############################################################################

ntokens = len(corpus.dictionary)
model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.tied)

# Load checkpoint
if args.checkpoint != '':
    if args.cuda:
        model = torch.load(args.checkpoint)
    else:
        # Load a GPU-trained model on the CPU
        model = torch.load(args.checkpoint, map_location=lambda storage, loc: storage)

if args.cuda:
    model.cuda()
else:
    model.cpu()
print(model)

criterion = nn.CrossEntropyLoss()
if args.cuda:
    criterion.cuda()

###############################################################################
# Training code
###############################################################################

def repackage_hidden(h):
    """Wraps hidden states in new Variables, to detach them from their history."""
    if type(h) == Variable:
        return Variable(h.data)
    else:
        return tuple(repackage_hidden(v) for v in h)


def get_batch(source, i, evaluation=False):
    seq_len = min(args.bptt, len(source) - 1 - i)
    data = Variable(source[i:i+seq_len], volatile=evaluation)
    target = Variable(source[i+1:i+1+seq_len].view(-1))
    return data, target


def evaluate(data_source):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(eval_batch_size)
    for i in range(0, data_source.size(0) - 1, args.bptt):
        data, targets = get_batch(data_source, i, evaluation=True)
        output, hidden = model(data, hidden)
        output_flat = output.view(-1, ntokens)
        total_loss += len(data) * criterion(output_flat, targets).data
        hidden = repackage_hidden(hidden)
    return total_loss[0] / len(data_source)


def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to the start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
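        # Note: clipping rescales the global gradient norm down to at most
        # args.clip; the loop below then applies a plain SGD update,
        # p <- p - lr * grad, without using an optimizer object.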
        torch.nn.utils.clip_grad_norm(model.parameters(), args.clip)
        for p in model.parameters():
            p.data.add_(-lr, p.grad.data)

        total_loss += loss.data

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss[0] / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                      epoch, batch, len(train_data) // args.bptt, lr,
                      elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()


# Loop over epochs.
lr = args.lr
best_val_loss = None

# At any point you can hit Ctrl + C to break out of training early.
try:
    for epoch in range(1, args.epochs + 1):
        epoch_start_time = time.time()
        train()
        val_loss = evaluate(val_data)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
              'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                         val_loss, math.exp(val_loss)))
        print('-' * 89)
        # Save the model if the validation loss is the best we've seen so far.
        if not best_val_loss or val_loss < best_val_loss:
            with open(args.save, 'wb') as f:
                torch.save(model, f)
            best_val_loss = val_loss
        else:
            # Anneal the learning rate if no improvement has been seen on the validation dataset.
            lr /= 4.0
except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')

# Load the best saved model.
with open(args.save, 'rb') as f:
    model = torch.load(f)

# Run on test data.
test_loss = evaluate(test_data)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('=' * 89)

--------------------------------------------------------------------------------