├── .gitignore
├── LICENSE
├── README.md
├── lm.py
├── ptb-lm.py
└── reader.py

/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | *.so
8 | 
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | 
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 | 
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 | 
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | 
49 | # Translations
50 | *.mo
51 | *.pot
52 | 
53 | # Django stuff:
54 | *.log
55 | local_settings.py
56 | 
57 | # Flask stuff:
58 | instance/
59 | .webassets-cache
60 | 
61 | # Scrapy stuff:
62 | .scrapy
63 | 
64 | # Sphinx documentation
65 | docs/_build/
66 | 
67 | # PyBuilder
68 | target/
69 | 
70 | # Jupyter Notebook
71 | .ipynb_checkpoints
72 | 
73 | # pyenv
74 | .python-version
75 | 
76 | # celery beat schedule file
77 | celerybeat-schedule
78 | 
79 | # SageMath parsed files
80 | *.sage.py
81 | 
82 | # dotenv
83 | .env
84 | 
85 | # virtualenv
86 | .venv
87 | venv/
88 | ENV/
89 | 
90 | # Spyder project settings
91 | .spyderproject
92 | .spyproject
93 | 
94 | # Rope project settings
95 | .ropeproject
96 | 
97 | # mkdocs documentation
98 | /site
99 | 
100 | # mypy
101 | .mypy_cache/
102 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2016 deeplearningathome
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Simple Word-based Language Model in PyTorch
2 | This model is directly analogous to [TensorFlow's LM tutorial](https://www.tensorflow.org/tutorials/recurrent).
3 | In fact, the reader is taken directly from an older version of that tutorial.
4 | 
5 | See this [blog post](http://deeplearningathome.com/2017/06/PyTorch-vs-Tensorflow-lstm-language-model.html).
6 | 
7 | ## How to run
8 | ```
9 | python ptb-lm.py --data=[PATH_TO_DATA]
10 | ```
11 | Default params should result in a test perplexity of ~78.04.
12 | Your actual result will vary due to random initialization.
13 | This essentially matches the results from TF's tutorial, only faster.
14 | 
15 | On a GTX 1080 I get around 7,400 wps.
16 | 
17 | ## Files
18 | * lm.py - language model definition
19 | * reader.py - slightly older version of TF's PTB reader, which yields numpy arrays as batches
20 | * ptb-lm.py - driver script
21 | 
22 | ## Requirements
23 | * Python 3 (I used the Anaconda distribution)
24 | * PyTorch (I used 0.1.12)
--------------------------------------------------------------------------------
/lm.py:
--------------------------------------------------------------------------------
1 | import torch.nn as nn
2 | from torch.autograd import Variable
3 | 
4 | class LM_LSTM(nn.Module):
5 |     """Simple LSTM-based language model"""
6 |     def __init__(self, embedding_dim, num_steps, batch_size, vocab_size, num_layers, dp_keep_prob):
7 |         super(LM_LSTM, self).__init__()
8 |         self.embedding_dim = embedding_dim
9 |         self.num_steps = num_steps
10 |         self.batch_size = batch_size
11 |         self.vocab_size = vocab_size
12 |         self.dp_keep_prob = dp_keep_prob
13 |         self.num_layers = num_layers
14 |         self.dropout = nn.Dropout(1 - dp_keep_prob)
15 |         self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
16 |         self.lstm = nn.LSTM(input_size=embedding_dim,
17 |                             hidden_size=embedding_dim,
18 |                             num_layers=num_layers,
19 |                             dropout=1 - dp_keep_prob)
20 |         self.sm_fc = nn.Linear(in_features=embedding_dim,
21 |                                out_features=vocab_size)
22 |         self.init_weights()
23 | 
24 |     def init_weights(self):
25 |         init_range = 0.1
26 |         self.word_embeddings.weight.data.uniform_(-init_range, init_range)
27 |         self.sm_fc.bias.data.fill_(0.0)
28 |         self.sm_fc.weight.data.uniform_(-init_range, init_range)
29 | 
30 |     def init_hidden(self):
31 |         weight = next(self.parameters()).data
32 |         return (Variable(weight.new(self.num_layers, self.batch_size, self.embedding_dim).zero_()),
33 |                 Variable(weight.new(self.num_layers, self.batch_size, self.embedding_dim).zero_()))
34 | 
35 |     def forward(self, inputs, hidden):
36 |         embeds = self.dropout(self.word_embeddings(inputs))
37 |         lstm_out, hidden = self.lstm(embeds, hidden)
38 |         lstm_out = self.dropout(lstm_out)
39 |         logits = self.sm_fc(lstm_out.view(-1, self.embedding_dim))
40 |         return logits.view(self.num_steps, self.batch_size, self.vocab_size), hidden
41 | 
42 | def repackage_hidden(h):
43 |     """Wraps hidden states in new Variables, to detach them from their history."""
44 |     if type(h) == Variable:
45 |         return Variable(h.data)
46 |     else:
47 |         return tuple(repackage_hidden(v) for v in h)
--------------------------------------------------------------------------------
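A minimal usage sketch for `LM_LSTM` (not part of the repo): it builds a toy-sized model on the CPU and checks the tensor shapes the training script relies on. The hyper-parameter values below are arbitrary.

```python
# Sketch only: toy-sized LM_LSTM on CPU, illustrating the expected shapes.
import torch
from torch.autograd import Variable
from lm import LM_LSTM, repackage_hidden

model = LM_LSTM(embedding_dim=16, num_steps=5, batch_size=3,
                vocab_size=100, num_layers=2, dp_keep_prob=0.9)
hidden = model.init_hidden()                       # (h_0, c_0), each [num_layers, batch_size, embedding_dim]
inputs = Variable(torch.LongTensor(5, 3).zero_())  # word ids, shaped [num_steps, batch_size]
logits, hidden = model(inputs, hidden)             # logits: [num_steps, batch_size, vocab_size]
hidden = repackage_hidden(hidden)                  # detach before feeding the next unrolled segment
```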
/ptb-lm.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import time
3 | import torch
4 | import torch.nn
5 | from torch.autograd import Variable
6 | import torch.nn as nn
7 | from lm import repackage_hidden, LM_LSTM
8 | import reader
9 | import numpy as np
10 | 
11 | parser = argparse.ArgumentParser(description='Simplest LSTM-based language model in PyTorch')
12 | parser.add_argument('--data', type=str, default='data',
13 |                     help='location of the data corpus')
14 | parser.add_argument('--hidden_size', type=int, default=1500,
15 |                     help='size of word embeddings and LSTM hidden state')
16 | parser.add_argument('--num_steps', type=int, default=35,
17 |                     help='number of LSTM steps')
18 | parser.add_argument('--num_layers', type=int, default=2,
19 |                     help='number of LSTM layers')
20 | parser.add_argument('--batch_size', type=int, default=20,
21 |                     help='batch size')
22 | parser.add_argument('--num_epochs', type=int, default=40,
23 |                     help='number of epochs')
24 | parser.add_argument('--dp_keep_prob', type=float, default=0.35,
25 |                     help='dropout *keep* probability')
26 | parser.add_argument('--initial_lr', type=float, default=20.0,
27 |                     help='initial learning rate')
28 | parser.add_argument('--save', type=str, default='lm_model.pt',
29 |                     help='path to save the final model')
30 | args = parser.parse_args()
31 | 
32 | criterion = nn.CrossEntropyLoss()
33 | def run_epoch(model, data, is_train=False, lr=1.0):
34 |     """Runs the model on the given data."""
35 |     if is_train:
36 |         model.train()
37 |     else:
38 |         model.eval()
39 |     epoch_size = ((len(data) // model.batch_size) - 1) // model.num_steps
40 |     start_time = time.time()
41 |     hidden = model.init_hidden()
42 |     costs = 0.0
43 |     iters = 0
44 |     for step, (x, y) in enumerate(reader.ptb_iterator(data, model.batch_size, model.num_steps)):
45 |         inputs = Variable(torch.from_numpy(x.astype(np.int64)).transpose(0, 1).contiguous()).cuda()
46 |         model.zero_grad()
47 |         hidden = repackage_hidden(hidden)
48 |         outputs, hidden = model(inputs, hidden)
49 |         targets = Variable(torch.from_numpy(y.astype(np.int64)).transpose(0, 1).contiguous()).cuda()
50 |         tt = torch.squeeze(targets.view(-1, model.batch_size * model.num_steps))
51 | 
52 |         loss = criterion(outputs.view(-1, model.vocab_size), tt)
53 |         costs += loss.data[0] * model.num_steps
54 |         iters += model.num_steps
55 | 
56 |         if is_train:
57 |             loss.backward()
58 |             torch.nn.utils.clip_grad_norm(model.parameters(), 0.25)
59 |             for p in model.parameters():
60 |                 p.data.add_(-lr, p.grad.data)
61 |             if step % (epoch_size // 10) == 10:
62 |                 print("{} perplexity: {:8.2f} speed: {} wps".format(step * 1.0 / epoch_size, np.exp(costs / iters),
63 |                                                                     iters * model.batch_size / (time.time() - start_time)))
64 |     return np.exp(costs / iters)
65 | 
66 | 
67 | if __name__ == "__main__":
68 |     raw_data = reader.ptb_raw_data(data_path=args.data)
69 |     train_data, valid_data, test_data, word_to_id, id_2_word = raw_data
70 |     vocab_size = len(word_to_id)
71 |     print('Vocabulary size: {}'.format(vocab_size))
72 |     model = LM_LSTM(embedding_dim=args.hidden_size, num_steps=args.num_steps, batch_size=args.batch_size,
73 |                     vocab_size=vocab_size, num_layers=args.num_layers, dp_keep_prob=args.dp_keep_prob)
74 |     model.cuda()
75 |     lr = args.initial_lr
76 |     # decay factor for learning rate
77 |     lr_decay_base = 1 / 1.15
78 |     # we will not touch lr for the first m_flat_lr epochs
79 |     m_flat_lr = 14.0
80 | 
81 |     print("########## Training ##########################")
82 |     for epoch in range(args.num_epochs):
83 |         lr_decay = lr_decay_base ** max(epoch - m_flat_lr, 0)
84 |         lr = lr * lr_decay  # decay lr if it is time
85 |         train_p = run_epoch(model, train_data, True, lr)
86 |         print('Train perplexity at epoch {}: {:8.2f}'.format(epoch, train_p))
87 |         print('Validation perplexity at epoch {}: {:8.2f}'.format(epoch, run_epoch(model, valid_data)))
88 |     print("########## Testing ##########################")
89 |     model.batch_size = 1  # to make sure we process all the data
90 |     print('Test Perplexity: {:8.2f}'.format(run_epoch(model, test_data)))
91 |     with open(args.save, 'wb') as f:
92 |         torch.save(model, f)
93 |     print("########## Done! ##########################")
--------------------------------------------------------------------------------
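Since the script above pickles the entire model object with `torch.save(model, f)`, reloading it requires `lm.py` to be importable and, because the parameters were saved from the GPU, a CUDA device. A minimal reload sketch (not part of the repo), using the default `--save` path:

```python
# Sketch only: reload the model saved by ptb-lm.py for later evaluation or sampling.
# Assumes lm.py is on the import path and a CUDA device is available, as during training.
import torch

with open('lm_model.pt', 'rb') as f:  # default --save path
    model = torch.load(f)
model.eval()  # switch dropout off for evaluation
print(model)
```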
/reader.py:
--------------------------------------------------------------------------------
1 | # Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | 
16 | 
17 | """Utilities for parsing PTB text files."""
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 | 
22 | import collections
23 | import os
24 | 
25 | import numpy as np
26 | import tensorflow as tf
27 | 
28 | 
29 | def _read_words(filename):
30 |   with tf.gfile.GFile(filename, "r") as f:
31 |     return f.read().replace("\n", "<eos>").split()
32 | 
33 | 
34 | def _build_vocab(filename):
35 |   data = _read_words(filename)
36 | 
37 |   counter = collections.Counter(data)
38 |   count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
39 | 
40 |   words, _ = list(zip(*count_pairs))
41 |   word_to_id = dict(zip(words, range(len(words))))
42 |   id_to_word = dict((v, k) for k, v in word_to_id.items())
43 | 
44 |   return word_to_id, id_to_word
45 | 
46 | 
47 | def _file_to_word_ids(filename, word_to_id):
48 |   data = _read_words(filename)
49 |   return [word_to_id[word] for word in data if word in word_to_id]
50 | 
51 | 
52 | def ptb_raw_data(data_path=None, prefix="ptb"):
53 |   """Load PTB raw data from data directory "data_path".
54 |   Reads PTB text files, converts strings to integer ids,
55 |   and performs mini-batching of the inputs.
56 |   The PTB dataset comes from Tomas Mikolov's webpage:
57 |   http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz
58 |   Args:
59 |     data_path: string path to the directory where simple-examples.tgz has
60 |       been extracted.
61 |   Returns:
62 |     tuple (train_data, valid_data, test_data, word_to_id, id_2_word)
63 |     where each of the data objects can be passed to ptb_iterator.
64 |   """
65 | 
66 |   train_path = os.path.join(data_path, prefix + ".train.txt")
67 |   valid_path = os.path.join(data_path, prefix + ".valid.txt")
68 |   test_path = os.path.join(data_path, prefix + ".test.txt")
69 | 
70 |   word_to_id, id_2_word = _build_vocab(train_path)
71 |   train_data = _file_to_word_ids(train_path, word_to_id)
72 |   valid_data = _file_to_word_ids(valid_path, word_to_id)
73 |   test_data = _file_to_word_ids(test_path, word_to_id)
74 |   return train_data, valid_data, test_data, word_to_id, id_2_word
75 | 
76 | 
77 | def ptb_iterator(raw_data, batch_size, num_steps):
78 |   """Iterate on the raw PTB data.
79 |   This generates batch_size pointers into the raw PTB data, and allows
80 |   minibatch iteration along these pointers.
81 |   Args:
82 |     raw_data: one of the raw data outputs from ptb_raw_data.
83 |     batch_size: int, the batch size.
84 |     num_steps: int, the number of unrolls.
85 |   Yields:
86 |     Pairs of the batched data, each a matrix of shape [batch_size, num_steps].
87 |     The second element of the tuple is the same data time-shifted to the
88 |     right by one.
89 |   Raises:
90 |     ValueError: if batch_size or num_steps are too high.
91 |   """
92 |   raw_data = np.array(raw_data, dtype=np.int32)
93 | 
94 |   data_len = len(raw_data)
95 |   batch_len = data_len // batch_size
96 |   data = np.zeros([batch_size, batch_len], dtype=np.int32)
97 |   for i in range(batch_size):
98 |     data[i] = raw_data[batch_len * i:batch_len * (i + 1)]
99 | 
100 | 
101 |   epoch_size = (batch_len - 1) // num_steps
102 | 
103 |   if epoch_size == 0:
104 |     raise ValueError("epoch_size == 0, decrease batch_size or num_steps")
105 | 
106 |   for i in range(epoch_size):
107 |     x = data[:, i*num_steps:(i+1)*num_steps]
108 |     y = data[:, i*num_steps+1:(i+1)*num_steps+1]
109 |     yield (x, y)
--------------------------------------------------------------------------------
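A small sketch (not part of the repo) of what `ptb_iterator` yields on a toy id sequence. Note that importing `reader` pulls in TensorFlow because of `tf.gfile`, even though `ptb_iterator` itself only needs numpy.

```python
# Sketch only: ptb_iterator on a toy sequence of "word ids" 0..19.
import reader

toy_ids = list(range(20))
for x, y in reader.ptb_iterator(toy_ids, batch_size=2, num_steps=4):
    # x has shape [batch_size, num_steps]; y is x time-shifted to the right by one.
    # First pair: x rows are [0 1 2 3] and [10 11 12 13],
    #             y rows are [1 2 3 4] and [11 12 13 14].
    print(x)
    print(y)
    break
```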