├── README.md
├── .gitignore
├── LICENSE
├── data_util.py
└── model.py

/README.md:
--------------------------------------------------------------------------------
# tf-rnnlm

A word-level RNN language model (GRU or LSTM) for the Penn Treebank (PTB) dataset, implemented in TensorFlow. Training is started with `python model.py`, which expects the PTB data files under `simple-examples/data` next to the code.
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
simple-examples
ss.txt
__pycache__
.DS_Store
lm.py
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2017 dzkang

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/data_util.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
# -*- coding: utf-8 -*-

from collections import Counter
import os

import numpy as np


def _read_words(filename):
    """Read a whitespace-tokenized text file into a flat list of words."""
    with open(filename, 'r', encoding='utf-8') as f:
        return f.read().replace('\n', ' ').split()


def _build_vocab(filename):
    """Build the vocabulary from a file, most frequent words first."""
    data = _read_words(filename)

    counter = Counter(data)
    count_pairs = sorted(counter.items(), key=lambda x: -x[1])

    words, _ = list(zip(*count_pairs))
    word_to_id = dict(zip(words, range(len(words))))

    return words, word_to_id


def _file_to_word_ids(filename, word_to_id):
    """Convert a file to a list of word ids, skipping out-of-vocabulary words."""
    data = _read_words(filename)
    return [word_to_id[x] for x in data if x in word_to_id]


def to_words(sentence, words):
    """Map a sequence of word ids back to words."""
    return list(map(lambda x: words[x], sentence))


def ptb_raw_data(data_path=None):
    """Load the PTB train/valid/test sets and the vocabulary built from the training set."""
    train_path = os.path.join(data_path, 'ptb.train.txt')
    valid_path = os.path.join(data_path, 'ptb.valid.txt')
    test_path = os.path.join(data_path, 'ptb.test.txt')

    words, word_to_id = _build_vocab(train_path)
    train_data = _file_to_word_ids(train_path, word_to_id)
    valid_data = _file_to_word_ids(valid_path, word_to_id)
    test_data = _file_to_word_ids(test_path, word_to_id)

    return train_data, valid_data, test_data, words, word_to_id


def ptb_producer(raw_data, batch_size=64, num_steps=20, stride=3):
    """Slide a num_steps-word window over the corpus (with the given stride),
    pair each window with the word that follows it, and group into batches."""
    data_len = len(raw_data)

    sentences = []
    next_words = []
    for i in range(0, data_len - num_steps, stride):
        sentences.append(raw_data[i:(i + num_steps)])
        next_words.append(raw_data[i + num_steps])

    sentences = np.array(sentences)
    next_words = np.array(next_words)

    batch_len = len(sentences) // batch_size
    x = np.reshape(sentences[:(batch_len * batch_size)],
                   [batch_len, batch_size, -1])
    y = np.reshape(next_words[:(batch_len * batch_size)],
                   [batch_len, batch_size])

    return x, y


def main():
    train_data, valid_data, test_data, words, word_to_id = \
        ptb_raw_data('simple-examples/data')

    x_train, y_train = ptb_producer(train_data)

    print(x_train.shape)

    print(to_words(x_train[100, 3], words))

    # y_train[100, 3] is already a word id, so index the vocabulary directly.
    print(words[y_train[100, 3]])


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
# -*- coding: utf-8 -*-

import numpy as np
import tensorflow as tf

from data_util import ptb_raw_data, ptb_producer


class LMConfig(object):
    """Configuration of the language model."""
    batch_size = 64      # sequences per batch
    num_steps = 20       # words fed to the RNN per sequence
    stride = 3           # step between consecutive training windows

    embedding_dim = 64   # word embedding dimension
    hidden_dim = 128     # RNN hidden state dimension
    num_layers = 2       # number of stacked RNN layers
    rnn_model = 'gru'    # 'gru' or 'lstm'

    learning_rate = 0.05
    dropout = 0.2        # passed to DropoutWrapper as the output *keep* probability


class PTBInput(object):
    """Batch iterator over the arrays produced by ptb_producer."""

    def __init__(self, config, data):
        self.batch_size = config.batch_size
        self.num_steps = config.num_steps
        self.vocab_size = config.vocab_size

        self.input_data, self.targets = ptb_producer(data,
                                                     self.batch_size,
                                                     self.num_steps)

        self.batch_len = self.input_data.shape[0]
        self.cur_batch = 0

    def next_batch(self):
        """Return the next batch: word-id inputs x and one-hot targets y_."""
        x = self.input_data[self.cur_batch]
        y = self.targets[self.cur_batch]

        # One-hot encode the next-word targets.
        y_ = np.zeros((y.shape[0], self.vocab_size), dtype=bool)
        for i in range(y.shape[0]):
            y_[i][y[i]] = 1

        self.cur_batch = (self.cur_batch + 1) % self.batch_len

        return x, y_
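
# Added note (not part of the original file): with the default LMConfig and the
# PTB training set, next_batch() returns
#   x  -- word ids of shape (batch_size, num_steps) = (64, 20)
#   y_ -- a boolean one-hot matrix of shape (batch_size, vocab_size), marking
#         the word that follows each 20-word window.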

class PTBModel(object):
    """RNN language model: embedding -> multi-layer GRU/LSTM -> softmax."""

    def __init__(self, config, is_training=True):
        self.num_steps = config.num_steps
        self.vocab_size = config.vocab_size

        self.embedding_dim = config.embedding_dim
        self.hidden_dim = config.hidden_dim
        self.num_layers = config.num_layers
        self.rnn_model = config.rnn_model

        self.learning_rate = config.learning_rate
        self.dropout = config.dropout

        self.placeholders()
        self.rnn()
        self.cost()
        self.optimize()
        self.error()

    def placeholders(self):
        """Word-id inputs and one-hot next-word targets."""
        self._inputs = tf.placeholder(tf.int32, [None, self.num_steps])
        self._targets = tf.placeholder(tf.int32, [None, self.vocab_size])

    def input_embedding(self):
        """Look up the embeddings of the input word ids."""
        with tf.device("/cpu:0"):
            embedding = tf.get_variable(
                "embedding", [self.vocab_size, self.embedding_dim],
                dtype=tf.float32)
            _inputs = tf.nn.embedding_lookup(embedding, self._inputs)

        return _inputs

    def rnn(self):
        """Build the multi-layer RNN and the softmax output layer."""
        def lstm_cell():
            return tf.contrib.rnn.BasicLSTMCell(self.hidden_dim,
                                                state_is_tuple=True)

        def gru_cell():
            return tf.contrib.rnn.GRUCell(self.hidden_dim)

        def dropout_cell():
            cell = lstm_cell() if self.rnn_model == 'lstm' else gru_cell()
            # Note: output_keep_prob is a keep probability, so dropout = 0.2
            # keeps only 20% of each cell's outputs.
            return tf.contrib.rnn.DropoutWrapper(
                cell, output_keep_prob=self.dropout)

        cells = [dropout_cell() for _ in range(self.num_layers)]
        cell = tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=True)

        _inputs = self.input_embedding()
        _outputs, _ = tf.nn.dynamic_rnn(cell=cell, inputs=_inputs,
                                        dtype=tf.float32)

        # Predict the next word from the last time step only.
        last = _outputs[:, -1, :]
        logits = tf.layers.dense(inputs=last, units=self.vocab_size)
        prediction = tf.nn.softmax(logits)

        self._logits = logits
        self._pred = prediction

    def cost(self):
        """Mean cross-entropy between the logits and the one-hot targets."""
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
            logits=self._logits, labels=self._targets)
        cost = tf.reduce_mean(cross_entropy)
        # Rebinds self.cost from this method to the loss tensor; optimize()
        # and run_epoch() rely on that.
        self.cost = cost

    def optimize(self):
        optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
        self.optim = optimizer.minimize(self.cost)

    def error(self):
        """Fraction of examples whose most likely predicted word is wrong."""
        mistakes = tf.not_equal(
            tf.argmax(self._targets, 1), tf.argmax(self._pred, 1))
        self.errors = tf.reduce_mean(tf.cast(mistakes, tf.float32))
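
# Optional helper, added for illustration and not part of the original
# repository: since model.cost is the mean cross-entropy (in nats) of the
# batch's next-word predictions, its exponential is the usual perplexity
# estimate reported for language models.
def perplexity(cost):
    """Perplexity corresponding to a mean cross-entropy value."""
    return float(np.exp(cost))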

def run_epoch(num_epochs=10):
    config = LMConfig()
    train_data, _, _, words, word_to_id = \
        ptb_raw_data('simple-examples/data')
    config.vocab_size = len(words)

    input_train = PTBInput(config, train_data)
    batch_len = input_train.batch_len
    model = PTBModel(config)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    print('Start training...')
    for epoch in range(num_epochs):
        for i in range(batch_len):
            x_batch, y_batch = input_train.next_batch()

            feed_dict = {model._inputs: x_batch, model._targets: y_batch}
            sess.run(model.optim, feed_dict=feed_dict)

            if i % 500 == 0:
                cost = sess.run(model.cost, feed_dict=feed_dict)

                msg = "Epoch: {0:>3}, batch: {1:>5}, Loss: {2:>6.3}"
                print(msg.format(epoch + 1, i + 1, cost))

                # Show the current next-word predictions for this batch.
                pred = sess.run(model._pred, feed_dict=feed_dict)
                word_ids = np.argmax(pred, 1)
                print('Predicted:', ' '.join(words[w] for w in word_ids))
                true_ids = np.argmax(y_batch, 1)
                print('True:', ' '.join(words[w] for w in true_ids))

    print('Finished training.')
    sess.close()


if __name__ == '__main__':
    run_epoch()
--------------------------------------------------------------------------------
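
A minimal usage sketch (not a file in the repository): it assumes the PTB "simple-examples" archive has been downloaded and extracted next to the code, so that simple-examples/data/ptb.train.txt and its companions exist where ptb_raw_data and run_epoch expect them. The shapes in the comments follow the defaults in LMConfig.

from data_util import ptb_raw_data, ptb_producer
from model import run_epoch

# Inspect the data pipeline first.
train_data, valid_data, test_data, words, word_to_id = \
    ptb_raw_data('simple-examples/data')
x, y = ptb_producer(train_data, batch_size=64, num_steps=20)
print(x.shape)  # (num_batches, 64, 20): word-id windows
print(y.shape)  # (num_batches, 64): the word that follows each window

# Train with the defaults defined in model.py.
run_epoch(num_epochs=10)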