├── README.md
├── .gitignore
├── LICENSE
├── data_util.py
└── model.py

/README.md:
--------------------------------------------------------------------------------
# tf-rnnlm

A word-level RNN language model (GRU or LSTM) for the Penn Treebank (PTB) dataset, implemented in TensorFlow. Training is started with `python model.py`, which expects the PTB data files under `simple-examples/data` next to the code.
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
simple-examples
ss.txt
__pycache__
.DS_Store
lm.py
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2017 dzkang

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/data_util.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
# -*- coding: utf-8 -*-

from collections import Counter
import os

import numpy as np


def _read_words(filename):
    """Read a whitespace-tokenized text file into a flat list of words."""
    with open(filename, 'r', encoding='utf-8') as f:
        return f.read().replace('\n', ' ').split()


def _build_vocab(filename):
    """Build the vocabulary from a file, most frequent words first."""
    data = _read_words(filename)

    counter = Counter(data)
    count_pairs = sorted(counter.items(), key=lambda x: -x[1])

    words, _ = list(zip(*count_pairs))
    word_to_id = dict(zip(words, range(len(words))))

    return words, word_to_id


def _file_to_word_ids(filename, word_to_id):
    """Convert a file to a list of word ids, skipping out-of-vocabulary words."""
    data = _read_words(filename)
    return [word_to_id[x] for x in data if x in word_to_id]


def to_words(sentence, words):
    """Map a sequence of word ids back to words."""
    return list(map(lambda x: words[x], sentence))


def ptb_raw_data(data_path=None):
    """Load the PTB train/valid/test sets and the vocabulary built from the training set."""
    train_path = os.path.join(data_path, 'ptb.train.txt')
    valid_path = os.path.join(data_path, 'ptb.valid.txt')
    test_path = os.path.join(data_path, 'ptb.test.txt')

    words, word_to_id = _build_vocab(train_path)
    train_data = _file_to_word_ids(train_path, word_to_id)
    valid_data = _file_to_word_ids(valid_path, word_to_id)
    test_data = _file_to_word_ids(test_path, word_to_id)

    return train_data, valid_data, test_data, words, word_to_id


def ptb_producer(raw_data, batch_size=64, num_steps=20, stride=3):
    """Slide a num_steps-word window over the corpus (with the given stride),
    pair each window with the word that follows it, and group into batches."""
    data_len = len(raw_data)

    sentences = []
    next_words = []
    for i in range(0, data_len - num_steps, stride):
        sentences.append(raw_data[i:(i + num_steps)])
        next_words.append(raw_data[i + num_steps])

    sentences = np.array(sentences)
    next_words = np.array(next_words)

    batch_len = len(sentences) // batch_size
    x = np.reshape(sentences[:(batch_len * batch_size)],
                   [batch_len, batch_size, -1])
    y = np.reshape(next_words[:(batch_len * batch_size)],
                   [batch_len, batch_size])

    return x, y


def main():
    train_data, valid_data, test_data, words, word_to_id = \
        ptb_raw_data('simple-examples/data')

    x_train, y_train = ptb_producer(train_data)

    print(x_train.shape)

    print(to_words(x_train[100, 3], words))

    # y_train[100, 3] is already a word id, so index the vocabulary directly.
    print(words[y_train[100, 3]])


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
# -*- coding: utf-8 -*-

import numpy as np
import tensorflow as tf

from data_util import ptb_raw_data, ptb_producer


class LMConfig(object):
    """Configuration of the language model."""
    batch_size = 64      # sequences per batch
    num_steps = 20       # words fed to the RNN per sequence
    stride = 3           # step between consecutive training windows

    embedding_dim = 64   # word embedding dimension
    hidden_dim = 128     # RNN hidden state dimension
    num_layers = 2       # number of stacked RNN layers
    rnn_model = 'gru'    # 'gru' or 'lstm'

    learning_rate = 0.05
    dropout = 0.2        # passed to DropoutWrapper as the output *keep* probability


class PTBInput(object):
    """Batch iterator over the arrays produced by ptb_producer."""

    def __init__(self, config, data):
        self.batch_size = config.batch_size
        self.num_steps = config.num_steps
        self.vocab_size = config.vocab_size

        self.input_data, self.targets = ptb_producer(data,
                                                     self.batch_size,
                                                     self.num_steps)

        self.batch_len = self.input_data.shape[0]
        self.cur_batch = 0

    def next_batch(self):
        """Return the next batch: word-id inputs x and one-hot targets y_."""
        x = self.input_data[self.cur_batch]
        y = self.targets[self.cur_batch]

        # One-hot encode the next-word targets.
        y_ = np.zeros((y.shape[0], self.vocab_size), dtype=bool)
        for i in range(y.shape[0]):
            y_[i][y[i]] = 1

        self.cur_batch = (self.cur_batch + 1) % self.batch_len

        return x, y_
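
# Added note (not part of the original file): with the default LMConfig and the
# PTB training set, next_batch() returns
#   x  -- word ids of shape (batch_size, num_steps) = (64, 20)
#   y_ -- a boolean one-hot matrix of shape (batch_size, vocab_size), marking
#         the word that follows each 20-word window.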

class PTBModel(object):
    """RNN language model: embedding -> multi-layer GRU/LSTM -> softmax."""

    def __init__(self, config, is_training=True):
        self.num_steps = config.num_steps
        self.vocab_size = config.vocab_size

        self.embedding_dim = config.embedding_dim
        self.hidden_dim = config.hidden_dim
        self.num_layers = config.num_layers
        self.rnn_model = config.rnn_model

        self.learning_rate = config.learning_rate
        self.dropout = config.dropout

        self.placeholders()
        self.rnn()
        self.cost()
        self.optimize()
        self.error()

    def placeholders(self):
        """Word-id inputs and one-hot next-word targets."""
        self._inputs = tf.placeholder(tf.int32, [None, self.num_steps])
        self._targets = tf.placeholder(tf.int32, [None, self.vocab_size])

    def input_embedding(self):
        """Look up the embeddings of the input word ids."""
        with tf.device("/cpu:0"):
            embedding = tf.get_variable(
                "embedding", [self.vocab_size, self.embedding_dim],
                dtype=tf.float32)
            _inputs = tf.nn.embedding_lookup(embedding, self._inputs)

        return _inputs

    def rnn(self):
        """Build the multi-layer RNN and the softmax output layer."""
        def lstm_cell():
            return tf.contrib.rnn.BasicLSTMCell(self.hidden_dim,
                                                state_is_tuple=True)

        def gru_cell():
            return tf.contrib.rnn.GRUCell(self.hidden_dim)

        def dropout_cell():
            cell = lstm_cell() if self.rnn_model == 'lstm' else gru_cell()
            # Note: output_keep_prob is a keep probability, so dropout = 0.2
            # keeps only 20% of each cell's outputs.
            return tf.contrib.rnn.DropoutWrapper(
                cell, output_keep_prob=self.dropout)

        cells = [dropout_cell() for _ in range(self.num_layers)]
        cell = tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=True)

        _inputs = self.input_embedding()
        _outputs, _ = tf.nn.dynamic_rnn(cell=cell, inputs=_inputs,
                                        dtype=tf.float32)

        # Predict the next word from the last time step only.
        last = _outputs[:, -1, :]
        logits = tf.layers.dense(inputs=last, units=self.vocab_size)
        prediction = tf.nn.softmax(logits)

        self._logits = logits
        self._pred = prediction

    def cost(self):
        """Mean cross-entropy between the logits and the one-hot targets."""
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
            logits=self._logits, labels=self._targets)
        cost = tf.reduce_mean(cross_entropy)
        # Rebinds self.cost from this method to the loss tensor; optimize()
        # and run_epoch() rely on that.
        self.cost = cost

    def optimize(self):
        optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
        self.optim = optimizer.minimize(self.cost)

    def error(self):
        """Fraction of examples whose most likely predicted word is wrong."""
        mistakes = tf.not_equal(
            tf.argmax(self._targets, 1), tf.argmax(self._pred, 1))
        self.errors = tf.reduce_mean(tf.cast(mistakes, tf.float32))
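
# Optional helper, added for illustration and not part of the original
# repository: since model.cost is the mean cross-entropy (in nats) of the
# batch's next-word predictions, its exponential is the usual perplexity
# estimate reported for language models.
def perplexity(cost):
    """Perplexity corresponding to a mean cross-entropy value."""
    return float(np.exp(cost))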

def run_epoch(num_epochs=10):
    config = LMConfig()
    train_data, _, _, words, word_to_id = \
        ptb_raw_data('simple-examples/data')
    config.vocab_size = len(words)

    input_train = PTBInput(config, train_data)
    batch_len = input_train.batch_len
    model = PTBModel(config)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    print('Start training...')
    for epoch in range(num_epochs):
        for i in range(batch_len):
            x_batch, y_batch = input_train.next_batch()

            feed_dict = {model._inputs: x_batch, model._targets: y_batch}
            sess.run(model.optim, feed_dict=feed_dict)

            if i % 500 == 0:
                cost = sess.run(model.cost, feed_dict=feed_dict)

                msg = "Epoch: {0:>3}, batch: {1:>5}, Loss: {2:>6.3}"
                print(msg.format(epoch + 1, i + 1, cost))

                # Show the current next-word predictions for this batch.
                pred = sess.run(model._pred, feed_dict=feed_dict)
                word_ids = np.argmax(pred, 1)
                print('Predicted:', ' '.join(words[w] for w in word_ids))
                true_ids = np.argmax(y_batch, 1)
                print('True:', ' '.join(words[w] for w in true_ids))

    print('Finished training.')
    sess.close()


if __name__ == '__main__':
    run_epoch()
--------------------------------------------------------------------------------
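
A minimal usage sketch (not a file in the repository): it assumes the PTB "simple-examples" archive has been downloaded and extracted next to the code, so that simple-examples/data/ptb.train.txt and its companions exist where ptb_raw_data and run_epoch expect them. The shapes in the comments follow the defaults in LMConfig.

from data_util import ptb_raw_data, ptb_producer
from model import run_epoch

# Inspect the data pipeline first.
train_data, valid_data, test_data, words, word_to_id = \
    ptb_raw_data('simple-examples/data')
x, y = ptb_producer(train_data, batch_size=64, num_steps=20)
print(x.shape)  # (num_batches, 64, 20): word-id windows
print(y.shape)  # (num_batches, 64): the word that follows each window

# Train with the defaults defined in model.py.
run_epoch(num_epochs=10)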