├── .gitignore
├── LICENSE
├── README.md
├── lm.py
├── ptb-lm.py
└── reader.py

/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | *.so
8 | 
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | 
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 | 
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 | 
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | 
49 | # Translations
50 | *.mo
51 | *.pot
52 | 
53 | # Django stuff:
54 | *.log
55 | local_settings.py
56 | 
57 | # Flask stuff:
58 | instance/
59 | .webassets-cache
60 | 
61 | # Scrapy stuff:
62 | .scrapy
63 | 
64 | # Sphinx documentation
65 | docs/_build/
66 | 
67 | # PyBuilder
68 | target/
69 | 
70 | # Jupyter Notebook
71 | .ipynb_checkpoints
72 | 
73 | # pyenv
74 | .python-version
75 | 
76 | # celery beat schedule file
77 | celerybeat-schedule
78 | 
79 | # SageMath parsed files
80 | *.sage.py
81 | 
82 | # dotenv
83 | .env
84 | 
85 | # virtualenv
86 | .venv
87 | venv/
88 | ENV/
89 | 
90 | # Spyder project settings
91 | .spyderproject
92 | .spyproject
93 | 
94 | # Rope project settings
95 | .ropeproject
96 | 
97 | # mkdocs documentation
98 | /site
99 | 
100 | # mypy
101 | .mypy_cache/
102 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2016 deeplearningathome
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Simple Word-based Language Model in PyTorch
2 | This model is directly analogous to [TensorFlow's LM tutorial](https://www.tensorflow.org/tutorials/recurrent).
3 | In fact, the reader is taken directly from an older version of that tutorial.
4 | 
5 | See this [blog post](http://deeplearningathome.com/2017/06/PyTorch-vs-Tensorflow-lstm-language-model.html).
6 | 
7 | ## How to run
8 | ```
9 | python ptb-lm.py --data=[PATH_TO_DATA]
10 | ```
11 | Default params should result in a test perplexity of ~78.04.
12 | Your actual result will vary due to random initialization.
13 | This essentially matches the results from TF's tutorial, only faster.
14 | 
15 | On a GTX 1080 I get around 7,400 wps.
16 | 
17 | ## Files
18 | * lm.py - language model definition
19 | * reader.py - slightly older version of TF's PTB reader, which yields numpy arrays as batches
20 | * ptb-lm.py - driver script
21 | 
22 | ## Requirements
23 | * Python 3 (I used the Anaconda distribution)
24 | * PyTorch (I used 0.1.12)
--------------------------------------------------------------------------------
/lm.py:
--------------------------------------------------------------------------------
1 | import torch.nn as nn
2 | from torch.autograd import Variable
3 | 
4 | class LM_LSTM(nn.Module):
5 |     """Simple LSTM-based language model"""
6 |     def __init__(self, embedding_dim, num_steps, batch_size, vocab_size, num_layers, dp_keep_prob):
7 |         super(LM_LSTM, self).__init__()
8 |         self.embedding_dim = embedding_dim
9 |         self.num_steps = num_steps
10 |         self.batch_size = batch_size
11 |         self.vocab_size = vocab_size
12 |         self.dp_keep_prob = dp_keep_prob
13 |         self.num_layers = num_layers
14 |         self.dropout = nn.Dropout(1 - dp_keep_prob)
15 |         self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
16 |         self.lstm = nn.LSTM(input_size=embedding_dim,
17 |                             hidden_size=embedding_dim,
18 |                             num_layers=num_layers,
19 |                             dropout=1 - dp_keep_prob)
20 |         self.sm_fc = nn.Linear(in_features=embedding_dim,
21 |                                out_features=vocab_size)
22 |         self.init_weights()
23 | 
24 |     def init_weights(self):
25 |         init_range = 0.1
26 |         self.word_embeddings.weight.data.uniform_(-init_range, init_range)
27 |         self.sm_fc.bias.data.fill_(0.0)
28 |         self.sm_fc.weight.data.uniform_(-init_range, init_range)
29 | 
30 |     def init_hidden(self):
31 |         weight = next(self.parameters()).data
32 |         return (Variable(weight.new(self.num_layers, self.batch_size, self.embedding_dim).zero_()),
33 |                 Variable(weight.new(self.num_layers, self.batch_size, self.embedding_dim).zero_()))
34 | 
35 |     def forward(self, inputs, hidden):
36 |         embeds = self.dropout(self.word_embeddings(inputs))
37 |         lstm_out, hidden = self.lstm(embeds, hidden)
38 |         lstm_out = self.dropout(lstm_out)
39 |         logits = self.sm_fc(lstm_out.view(-1, self.embedding_dim))
40 |         return logits.view(self.num_steps, self.batch_size, self.vocab_size), hidden
41 | 
42 | def repackage_hidden(h):
43 |     """Wraps hidden states in new Variables, to detach them from their history."""
44 |     if type(h) == Variable:
45 |         return Variable(h.data)
46 |     else:
47 |         return tuple(repackage_hidden(v) for v in h)
--------------------------------------------------------------------------------
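A minimal usage sketch for `LM_LSTM` (not part of the repo): it builds a toy-sized model on the CPU and checks the tensor shapes the training script relies on. The hyper-parameter values below are arbitrary.

```python
# Sketch only: toy-sized LM_LSTM on CPU, illustrating the expected shapes.
import torch
from torch.autograd import Variable
from lm import LM_LSTM, repackage_hidden

model = LM_LSTM(embedding_dim=16, num_steps=5, batch_size=3,
                vocab_size=100, num_layers=2, dp_keep_prob=0.9)
hidden = model.init_hidden()                       # (h_0, c_0), each [num_layers, batch_size, embedding_dim]
inputs = Variable(torch.LongTensor(5, 3).zero_())  # word ids, shaped [num_steps, batch_size]
logits, hidden = model(inputs, hidden)             # logits: [num_steps, batch_size, vocab_size]
hidden = repackage_hidden(hidden)                  # detach before feeding the next unrolled segment
```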
/ptb-lm.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import time
3 | import torch
4 | import torch.nn
5 | from torch.autograd import Variable
6 | import torch.nn as nn
7 | from lm import repackage_hidden, LM_LSTM
8 | import reader
9 | import numpy as np
10 | 
11 | parser = argparse.ArgumentParser(description='Simplest LSTM-based language model in PyTorch')
12 | parser.add_argument('--data', type=str, default='data',
13 |                     help='location of the data corpus')
14 | parser.add_argument('--hidden_size', type=int, default=1500,
15 |                     help='size of word embeddings and LSTM hidden state')
16 | parser.add_argument('--num_steps', type=int, default=35,
17 |                     help='number of LSTM steps')
18 | parser.add_argument('--num_layers', type=int, default=2,
19 |                     help='number of LSTM layers')
20 | parser.add_argument('--batch_size', type=int, default=20,
21 |                     help='batch size')
22 | parser.add_argument('--num_epochs', type=int, default=40,
23 |                     help='number of epochs')
24 | parser.add_argument('--dp_keep_prob', type=float, default=0.35,
25 |                     help='dropout *keep* probability')
26 | parser.add_argument('--initial_lr', type=float, default=20.0,
27 |                     help='initial learning rate')
28 | parser.add_argument('--save', type=str, default='lm_model.pt',
29 |                     help='path to save the final model')
30 | args = parser.parse_args()
31 | 
32 | criterion = nn.CrossEntropyLoss()
33 | def run_epoch(model, data, is_train=False, lr=1.0):
34 |     """Runs the model on the given data."""
35 |     if is_train:
36 |         model.train()
37 |     else:
38 |         model.eval()
39 |     epoch_size = ((len(data) // model.batch_size) - 1) // model.num_steps
40 |     start_time = time.time()
41 |     hidden = model.init_hidden()
42 |     costs = 0.0
43 |     iters = 0
44 |     for step, (x, y) in enumerate(reader.ptb_iterator(data, model.batch_size, model.num_steps)):
45 |         inputs = Variable(torch.from_numpy(x.astype(np.int64)).transpose(0, 1).contiguous()).cuda()
46 |         model.zero_grad()
47 |         hidden = repackage_hidden(hidden)
48 |         outputs, hidden = model(inputs, hidden)
49 |         targets = Variable(torch.from_numpy(y.astype(np.int64)).transpose(0, 1).contiguous()).cuda()
50 |         tt = torch.squeeze(targets.view(-1, model.batch_size * model.num_steps))
51 | 
52 |         loss = criterion(outputs.view(-1, model.vocab_size), tt)
53 |         costs += loss.data[0] * model.num_steps
54 |         iters += model.num_steps
55 | 
56 |         if is_train:
57 |             loss.backward()
58 |             torch.nn.utils.clip_grad_norm(model.parameters(), 0.25)
59 |             for p in model.parameters():
60 |                 p.data.add_(-lr, p.grad.data)
61 |             if step % (epoch_size // 10) == 10:
62 |                 print("{} perplexity: {:8.2f} speed: {} wps".format(step * 1.0 / epoch_size, np.exp(costs / iters),
63 |                                                                     iters * model.batch_size / (time.time() - start_time)))
64 |     return np.exp(costs / iters)
65 | 
66 | 
67 | if __name__ == "__main__":
68 |     raw_data = reader.ptb_raw_data(data_path=args.data)
69 |     train_data, valid_data, test_data, word_to_id, id_2_word = raw_data
70 |     vocab_size = len(word_to_id)
71 |     print('Vocabulary size: {}'.format(vocab_size))
72 |     model = LM_LSTM(embedding_dim=args.hidden_size, num_steps=args.num_steps, batch_size=args.batch_size,
73 |                     vocab_size=vocab_size, num_layers=args.num_layers, dp_keep_prob=args.dp_keep_prob)
74 |     model.cuda()
75 |     lr = args.initial_lr
76 |     # decay factor for learning rate
77 |     lr_decay_base = 1 / 1.15
78 |     # we will not touch lr for the first m_flat_lr epochs
79 |     m_flat_lr = 14.0
80 | 
81 |     print("########## Training ##########################")
82 |     for epoch in range(args.num_epochs):
83 |         lr_decay = lr_decay_base ** max(epoch - m_flat_lr, 0)
84 |         lr = lr * lr_decay  # decay lr if it is time
85 |         train_p = run_epoch(model, train_data, True, lr)
86 |         print('Train perplexity at epoch {}: {:8.2f}'.format(epoch, train_p))
87 |         print('Validation perplexity at epoch {}: {:8.2f}'.format(epoch, run_epoch(model, valid_data)))
88 |     print("########## Testing ##########################")
89 |     model.batch_size = 1  # to make sure we process all the data
90 |     print('Test Perplexity: {:8.2f}'.format(run_epoch(model, test_data)))
91 |     with open(args.save, 'wb') as f:
92 |         torch.save(model, f)
93 |     print("########## Done! ##########################")
--------------------------------------------------------------------------------
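Since the script above pickles the entire model object with `torch.save(model, f)`, reloading it requires `lm.py` to be importable and, because the parameters were saved from the GPU, a CUDA device. A minimal reload sketch (not part of the repo), using the default `--save` path:

```python
# Sketch only: reload the model saved by ptb-lm.py for later evaluation or sampling.
# Assumes lm.py is on the import path and a CUDA device is available, as during training.
import torch

with open('lm_model.pt', 'rb') as f:  # default --save path
    model = torch.load(f)
model.eval()  # switch dropout off for evaluation
print(model)
```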
/reader.py:
--------------------------------------------------------------------------------
1 | # Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | 
16 | 
17 | """Utilities for parsing PTB text files."""
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 | 
22 | import collections
23 | import os
24 | 
25 | import numpy as np
26 | import tensorflow as tf
27 | 
28 | 
29 | def _read_words(filename):
30 |   with tf.gfile.GFile(filename, "r") as f:
31 |     return f.read().replace("\n", "<eos>").split()
32 | 
33 | 
34 | def _build_vocab(filename):
35 |   data = _read_words(filename)
36 | 
37 |   counter = collections.Counter(data)
38 |   count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
39 | 
40 |   words, _ = list(zip(*count_pairs))
41 |   word_to_id = dict(zip(words, range(len(words))))
42 |   id_to_word = dict((v, k) for k, v in word_to_id.items())
43 | 
44 |   return word_to_id, id_to_word
45 | 
46 | 
47 | def _file_to_word_ids(filename, word_to_id):
48 |   data = _read_words(filename)
49 |   return [word_to_id[word] for word in data if word in word_to_id]
50 | 
51 | 
52 | def ptb_raw_data(data_path=None, prefix="ptb"):
53 |   """Load PTB raw data from data directory "data_path".
54 |   Reads PTB text files, converts strings to integer ids,
55 |   and performs mini-batching of the inputs.
56 |   The PTB dataset comes from Tomas Mikolov's webpage:
57 |   http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz
58 |   Args:
59 |     data_path: string path to the directory where simple-examples.tgz has
60 |       been extracted.
61 |   Returns:
62 |     tuple (train_data, valid_data, test_data, word_to_id, id_2_word)
63 |     where each of the data objects can be passed to ptb_iterator.
64 |   """
65 | 
66 |   train_path = os.path.join(data_path, prefix + ".train.txt")
67 |   valid_path = os.path.join(data_path, prefix + ".valid.txt")
68 |   test_path = os.path.join(data_path, prefix + ".test.txt")
69 | 
70 |   word_to_id, id_2_word = _build_vocab(train_path)
71 |   train_data = _file_to_word_ids(train_path, word_to_id)
72 |   valid_data = _file_to_word_ids(valid_path, word_to_id)
73 |   test_data = _file_to_word_ids(test_path, word_to_id)
74 |   return train_data, valid_data, test_data, word_to_id, id_2_word
75 | 
76 | 
77 | def ptb_iterator(raw_data, batch_size, num_steps):
78 |   """Iterate on the raw PTB data.
79 |   This generates batch_size pointers into the raw PTB data, and allows
80 |   minibatch iteration along these pointers.
81 |   Args:
82 |     raw_data: one of the raw data outputs from ptb_raw_data.
83 |     batch_size: int, the batch size.
84 |     num_steps: int, the number of unrolls.
85 |   Yields:
86 |     Pairs of the batched data, each a matrix of shape [batch_size, num_steps].
87 |     The second element of the tuple is the same data time-shifted to the
88 |     right by one.
89 |   Raises:
90 |     ValueError: if batch_size or num_steps are too high.
91 |   """
92 |   raw_data = np.array(raw_data, dtype=np.int32)
93 | 
94 |   data_len = len(raw_data)
95 |   batch_len = data_len // batch_size
96 |   data = np.zeros([batch_size, batch_len], dtype=np.int32)
97 |   for i in range(batch_size):
98 |     data[i] = raw_data[batch_len * i:batch_len * (i + 1)]
99 | 
100 | 
101 |   epoch_size = (batch_len - 1) // num_steps
102 | 
103 |   if epoch_size == 0:
104 |     raise ValueError("epoch_size == 0, decrease batch_size or num_steps")
105 | 
106 |   for i in range(epoch_size):
107 |     x = data[:, i*num_steps:(i+1)*num_steps]
108 |     y = data[:, i*num_steps+1:(i+1)*num_steps+1]
109 |     yield (x, y)
--------------------------------------------------------------------------------
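A small sketch (not part of the repo) of what `ptb_iterator` yields on a toy id sequence. Note that importing `reader` pulls in TensorFlow because of `tf.gfile`, even though `ptb_iterator` itself only needs numpy.

```python
# Sketch only: ptb_iterator on a toy sequence of "word ids" 0..19.
import reader

toy_ids = list(range(20))
for x, y in reader.ptb_iterator(toy_ids, batch_size=2, num_steps=4):
    # x has shape [batch_size, num_steps]; y is x time-shifted to the right by one.
    # First pair: x rows are [0 1 2 3] and [10 11 12 13],
    #             y rows are [1 2 3 4] and [11 12 13 14].
    print(x)
    print(y)
    break
```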