├── .gitignore
├── Config.py
├── Model.py
├── README.md
├── generate.py
├── model.jpg
└── train.py

/.gitignore:
--------------------------------------------------------------------------------
checkpoint
*.index
*.data-00000-of-00001
*.meta
*.voc
*.pyc
--------------------------------------------------------------------------------
/Config.py:
--------------------------------------------------------------------------------
#coding:utf-8

class Config(object):
    init_scale = 0.04
    learning_rate = 0.001
    max_grad_norm = 15
    num_layers = 3
    num_steps = 30      # number of steps to unroll the RNN for
    hidden_size = 800   # size of the hidden layer of neurons
    iteration = 30      # total number of training iterations
    save_freq = 5       # save the model to disk every save_freq iterations
    keep_prob = 0.5     # dropout keep probability
    batch_size = 128
    model_path = './Model'  # path the model is saved to and loaded from

    # parameters for generation
    save_time = 20          # load the model saved at iteration save_time
    is_sample = True        # True: sample the next character; False: take the argmax
    is_beams = True         # whether to use beam search
    beam_size = 2           # beam width for beam search
    len_of_generation = 100 # number of characters to generate
    start_sentence = u'那是因为我看到了另一个自己的悲伤'  # seed sentence for generation
--------------------------------------------------------------------------------
/Model.py:
--------------------------------------------------------------------------------
#coding:utf-8
import tensorflow as tf

class Model(object):
    def __init__(self, is_training, config):
        self.batch_size = batch_size = config.batch_size
        self.num_steps = num_steps = config.num_steps
        size = config.hidden_size
        vocab_size = config.vocab_size
        self.lr = config.learning_rate

        self._input_data = tf.placeholder(tf.int32, [batch_size, num_steps])
        self._targets = tf.placeholder(tf.int32, [batch_size, num_steps])  # placeholders for the input x and target y

        lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(size, forget_bias=0.0, state_is_tuple=False)
        if is_training and config.keep_prob < 1:
            lstm_cell = tf.nn.rnn_cell.DropoutWrapper(
                lstm_cell, output_keep_prob=config.keep_prob)
        cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * config.num_layers, state_is_tuple=False)

        self._initial_state = cell.zero_state(batch_size, tf.float32)

        with tf.device("/cpu:0"):
            embedding = tf.get_variable("embedding", [vocab_size, size])  # size is the dimensionality of the character embeddings
            inputs = tf.nn.embedding_lookup(embedding, self._input_data)  # a tensor of shape (batch_size, num_steps, size)

        if is_training and config.keep_prob < 1:
            inputs = tf.nn.dropout(inputs, config.keep_prob)

        outputs = []
        state = self._initial_state
        with tf.variable_scope("RNN"):
            for time_step in range(num_steps):
                if time_step > 0: tf.get_variable_scope().reuse_variables()
                (cell_output, state) = cell(inputs[:, time_step, :], state)  # inputs[:, time_step, :] has shape (batch_size, size)
                outputs.append(cell_output)

        output = tf.reshape(tf.concat(1, outputs), [-1, size])

        softmax_w = tf.get_variable("softmax_w", [size, vocab_size])
        softmax_b = tf.get_variable("softmax_b", [vocab_size])
        logits = tf.matmul(output, softmax_w) + softmax_b
        self._final_state = state

        if not is_training:
            self._prob = tf.nn.softmax(logits)
            return
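        # The loss below is the standard language-model cross-entropy:
        # logits has already been flattened to (batch_size * num_steps,
        # vocab_size), the targets are flattened to match, and the all-ones
        # weight vector weights every position equally, so
        # sequence_loss_by_example returns one cross-entropy value per
        # character position.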
        loss = tf.nn.seq2seq.sequence_loss_by_example(
            [logits],
            [tf.reshape(self._targets, [-1])],
            [tf.ones([batch_size * num_steps])])
        self._cost = cost = tf.reduce_sum(loss) / batch_size

        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
                                          config.max_grad_norm)
        optimizer = tf.train.AdamOptimizer(self.lr)
        self._train_op = optimizer.apply_gradients(zip(grads, tvars))

    @property
    def input_data(self):
        return self._input_data

    @property
    def targets(self):
        return self._targets

    @property
    def initial_state(self):
        return self._initial_state

    @property
    def cost(self):
        return self._cost

    @property
    def final_state(self):
        return self._final_state

    @property
    def train_op(self):
        return self._train_op
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# char-rnn-tf

This program automatically generates Chinese text (it also generates English text when the training corpus is English); the content and style of the generated text depend on the training corpus. The basic idea is the same as karpathy's [char-rnn](https://github.com/karpathy/char-rnn): use a recurrent neural network (RNN) to train a language model on a large corpus, then use the trained language model to generate text automatically. Compared with the Theano version of [char-rnn](https://github.com/hit-computer/char-rnn), this model uses a multi-layer RNN rather than a single layer (building a multi-layer RNN in TensorFlow is remarkably convenient) and supports three generation strategies: max, sample, and beam search. The code is based on the official TensorFlow language-model example [ptb_word_lm.py](https://github.com/tensorflow/models/blob/master/tutorials/rnn/ptb/ptb_word_lm.py).

The model structure is shown below:

![model](model.jpg?raw=true "model")


## Usage

This code was written for Python 2 and TensorFlow 0.12. To run it under TensorFlow 1.0, three changes are needed in Model.py: replace every `tf.nn.rnn_cell` with `tf.contrib.rnn`, change `tf.concat(1, outputs)` to `tf.concat(outputs, 1)`, and replace `tf.nn.seq2seq.sequence_loss_by_example` with `tf.contrib.legacy_seq2seq.sequence_loss_by_example`.

#### Model parameters (set in Config.py):
- init_scale: parameters are initialized from a uniform distribution over [-init_scale, init_scale]
- learning_rate: learning rate
- max_grad_norm: threshold for gradient clipping (by global norm)
- num_layers: number of RNN layers
- num_steps: number of steps to unroll the RNN for (how many characters per training step)
- hidden_size: dimensionality of the hidden layer
- iteration: total number of training iterations
- save_freq: save the model (and run one generation pass) every save_freq iterations
- keep_prob: dropout keep probability
- batch_size: mini-batch size
- model_path: path where the model is saved
- The following parameters are used during generation:
  - save_time: load the model saved at iteration save_time
  - is_sample: whether to use the sample strategy; False means the max strategy
  - is_beams: whether to decode with beam search; False disables it (equivalent to beam_size=1)
  - beam_size: beam width for beam search
  - len_of_generation: desired length of the generated text, in characters
  - start_sentence: the seed the generated text starts from (a sentence, a word, or just a single character)

#### Training

Run from the command line:

    python train.py [training corpus]

Note: the training corpus must be a plain-text file encoded in UTF-8.

#### Testing (text generation)

There are two strategies for generating each character, max and sample; this program supports both, as well as beam-search decoding, as sketched after the command below. (Thanks to [@fukuball](https://github.com/fukuball) for extending the generation model so that generate.py can start generation from a whole sentence.)

Run from the command line:

    python generate.py
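The difference between the two per-character strategies comes down to how the next character is drawn from the distribution `prob` that the model outputs at each step. Below is a minimal sketch mirroring the logic in generate.py (`pick_next` is an illustrative helper, not a function in this repository):

    import numpy as np

    def pick_next(prob, is_sample, vocab_size):
        """Choose the next character index from the model's output distribution."""
        p = prob.reshape(-1)
        if is_sample:
            # sample strategy: draw the next character at random,
            # weighted by its predicted probability
            return np.random.choice(vocab_size, 1, p=p)[0]
        else:
            # max strategy: always take the single most probable character
            return np.argmax(p)

Beam search is layered on top of this choice: instead of keeping a single running text, it keeps the beam_size partial texts with the highest accumulated log-probability at every step (see generate.py).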
## Experiments

The training corpus consists of outstanding essays by middle- and high-school students; it is 31.70MB in size and contains 11,264,367 characters. The model was trained for 40 iterations.

#### Results under different strategies

We seed generation with the character "诚" and generate 100 characters. Each character can be generated with either the max or the sample strategy, and global decoding can be done with or without beam search, giving four generation strategies in total. The output of each strategy is shown below (the generated samples are left in the original Chinese):

Strategy 1: max, without beam search

>诚信,不是一个人的生命,而是一种人生的价值。
>我们的生活是一个美丽的梦,我们的梦想是我们的梦想,我们的梦想是我们的梦想,我们的梦想是我们的梦想,我们的梦想是我们的梦想,我们的梦想是我们的梦想,我们的梦

Strategy 2: max, with beam search, beam_size=5

>诚信。
>一个人的生命是有限的,但是,我们每个人都有自己的价值。
>我们的生活中,我们应该有一颗感恩的心,我们的人生才会更加美好,更加美好,更加美好。
>人生就像一场旅行,我们每一个人都有一个属于自己的

Strategy 3: sample, without beam search

>诚信,锲而不舍,坚韧反应,庸碌于世界世界面对。
>正是最好的成功。每次想起自己的成绩来,自然不能解释可事。或许在自己考上好时还是的。无论哪是天,只要你收留这个目标呢!你就是来手,把人类分别,厚重地向他那

Strategy 4: sample, with beam search, beam_size=2

>诚然无私,那是因为我看到了另一个自己的悲伤,是因为他得到了世界上最真挚的情感。
>生活是一张美丽的咖啡,温暖的人生,是最美丽的一页。
>人生如梦,让我们追求不到的梦想。不管是人生中的挫折还是痛苦与痛苦,


**Analysis:**

Strategy 1 generates each character with max and does no global beam-search decoding; the results are poor, and repetition sets in as the sentences grow longer. In strategy 2, adding beam search for global decoding improves the results, but text still repeats locally. Strategy 3 generates each character with sample and no beam search; the randomness introduced by sampling largely eliminates the repetition, but that same randomness produces locally incoherent phrases. Strategy 4 effectively combines the two advantages, the coherence of max decoding and the diversity of sampling, so it avoids repetition while keeping sentences coherent. In addition, whereas the max strategies (1 and 2) always produce the same text for a given start character, strategy 4 (like strategy 3) is stochastic: each run produces a different text, which adds diversity, so one can generate several candidates and select the best (a common approach is to train a separate ranking model that orders the sampled candidates and returns the best result, or the top n).
--------------------------------------------------------------------------------
/generate.py:
--------------------------------------------------------------------------------
#coding:utf-8
import tensorflow as tf
import sys,time
import numpy as np
import cPickle, os
import random
import Config
import Model

config_tf = tf.ConfigProto()
config_tf.gpu_options.allow_growth = True
config_tf.inter_op_parallelism_threads = 1
config_tf.intra_op_parallelism_threads = 1

config = Config.Config()

char_to_idx, idx_to_char = cPickle.load(open(config.model_path+'.voc', 'rb'))

config.vocab_size = len(char_to_idx)
is_sample = config.is_sample
is_beams = config.is_beams
beam_size = config.beam_size
len_of_generation = config.len_of_generation
start_sentence = config.start_sentence

def run_epoch(session, m, data, eval_op, state=None):
    """Feeds one character index through the model and returns the
    next-character distribution and the updated LSTM state."""
    x = data.reshape((1,1))
    prob, _state, _ = session.run([m._prob, m.final_state, eval_op],
                                  {m.input_data: x,
                                   m.initial_state: state})
    return prob, _state

def main(_):
    with tf.Graph().as_default(), tf.Session(config=config_tf) as session:
        config.batch_size = 1
        config.num_steps = 1

        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)
        with tf.variable_scope("model", reuse=None, initializer=initializer):
            mtest = Model.Model(is_training=False, config=config)

        # all variables are restored from the checkpoint below,
        # so no initializer is needed
        #tf.global_variables_initializer().run()

        model_saver = tf.train.Saver()
        print 'model loading ...'
        model_saver.restore(session, config.model_path+'-%d'%config.save_time)
        print 'Done!'
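        # Generation proceeds one character at a time: each run_epoch call
        # feeds a single character index through the batch_size=1, num_steps=1
        # model and returns the distribution over the next character together
        # with the updated LSTM state, which is threaded from step to step.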
        if not is_beams:
            # feed the seed sentence through the model, character by character
            char_list = list(start_sentence)
            start_idx = char_to_idx[char_list[0]]
            _state = mtest.initial_state.eval()
            test_data = np.int32([start_idx])
            prob, _state = run_epoch(session, mtest, test_data, tf.no_op(), _state)
            gen_res = [char_list[0]]
            for i in xrange(1, len(char_list)):
                char = char_list[i]
                try:
                    char_index = char_to_idx[char]
                except KeyError:
                    # out-of-vocabulary character in the seed: substitute the
                    # most probable character instead
                    char_index = np.argmax(prob.reshape(-1))
                prob, _state = run_epoch(session, mtest, np.int32([char_index]), tf.no_op(), _state)
                gen_res.append(char)
            # generate the text
            if is_sample:
                gen = np.random.choice(config.vocab_size, 1, p=prob.reshape(-1))
                gen = gen[0]
            else:
                gen = np.argmax(prob.reshape(-1))
            test_data = np.int32(gen)
            gen_res.append(idx_to_char[gen])
            for i in range(len_of_generation-1):
                prob, _state = run_epoch(session, mtest, test_data, tf.no_op(), _state)
                if is_sample:
                    gen = np.random.choice(config.vocab_size, 1, p=prob.reshape(-1))
                    gen = gen[0]
                else:
                    gen = np.argmax(prob.reshape(-1))
                test_data = np.int32(gen)
                gen_res.append(idx_to_char[gen])
            print 'Generated Result: ',''.join(gen_res)
        else:
            # feed the seed sentence through the model, character by character;
            # a beam is a tuple (accumulated log-prob, generated characters,
            # last character index, LSTM state)
            char_list = list(start_sentence)
            start_idx = char_to_idx[char_list[0]]
            _state = mtest.initial_state.eval()
            beams = [(0.0, [idx_to_char[start_idx]], start_idx)]
            test_data = np.int32([start_idx])
            prob, _state = run_epoch(session, mtest, test_data, tf.no_op(), _state)
            y1 = np.log(1e-20 + prob.reshape(-1))  # log-probabilities (1e-20 avoids log(0))
            beams = [(beams[0][0], beams[0][1], beams[0][2], _state)]
            for i in xrange(1, len(char_list)):
                char = char_list[i]
                try:
                    char_index = char_to_idx[char]
                except KeyError:
                    # out-of-vocabulary character in the seed: substitute the
                    # most probable character instead
                    top_indices = np.argsort(-y1)
                    char_index = top_indices[0]
                prob, _state = run_epoch(session, mtest, np.int32([char_index]), tf.no_op(), beams[0][3])
                y1 = np.log(1e-20 + prob.reshape(-1))
                beams = [(beams[0][0], beams[0][1] + [char], char_index, _state)]
            # generate the text: expand the single seed beam into beam_size beams
            if is_sample:
                top_indices = np.random.choice(config.vocab_size, beam_size, replace=False, p=prob.reshape(-1))
            else:
                top_indices = np.argsort(-y1)
            b = beams[0]
            beam_candidates = []
            for i in xrange(beam_size):
                wordix = top_indices[i]
                beam_candidates.append((b[0] + y1[wordix], b[1] + [idx_to_char[wordix]], wordix, _state))
            beam_candidates.sort(key = lambda x:x[0], reverse = True) # decreasing order
            beams = beam_candidates[:beam_size] # truncate to get new beams
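            # Main beam-search loop: each surviving beam is extended with its
            # beam_size most probable (or sampled) successor characters, every
            # candidate is scored by the accumulated log-probability
            # b[0] + log P(next char), and only the best beam_size candidates
            # survive into the next step.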
            for xy in range(len_of_generation-1):
                beam_candidates = []
                for b in beams:
                    test_data = np.int32(b[2])
                    prob, _state = run_epoch(session, mtest, test_data, tf.no_op(), b[3])
                    y1 = np.log(1e-20 + prob.reshape(-1))
                    if is_sample:
                        top_indices = np.random.choice(config.vocab_size, beam_size, replace=False, p=prob.reshape(-1))
                    else:
                        top_indices = np.argsort(-y1)
                    for i in xrange(beam_size):
                        wordix = top_indices[i]
                        beam_candidates.append((b[0] + y1[wordix], b[1] + [idx_to_char[wordix]], wordix, _state))
                beam_candidates.sort(key = lambda x:x[0], reverse = True) # decreasing order
                beams = beam_candidates[:beam_size] # truncate to get new beams

            print 'Generated Result: ',''.join(beams[0][1])

if __name__ == "__main__":
    tf.app.run()
--------------------------------------------------------------------------------
/model.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hit-computer/char-rnn-tf/94b2d52ac4fd44a15c3877b7bcb428881d055a29/model.jpg
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
#coding:utf-8
import tensorflow as tf
import sys,time
import numpy as np
import cPickle
import Config
import Model

config_tf = tf.ConfigProto()
config_tf.gpu_options.allow_growth = True
config_tf.inter_op_parallelism_threads = 1
config_tf.intra_op_parallelism_threads = 1

file = sys.argv[1]
data = open(file,'r').read()
data = data.decode('utf-8')
chars = list(set(data)) # character vocabulary

data_size, _vocab_size = len(data), len(chars)
print 'data has %d characters, %d unique.' % (data_size, _vocab_size)
char_to_idx = { ch:i for i,ch in enumerate(chars) }
idx_to_char = { i:ch for i,ch in enumerate(chars) }

config = Config.Config()
config.vocab_size = _vocab_size

cPickle.dump((char_to_idx, idx_to_char), open(config.model_path+'.voc','wb'), protocol=cPickle.HIGHEST_PROTOCOL)

context_of_idx = [char_to_idx[ch] for ch in data]

def data_iterator(raw_data, batch_size, num_steps):
    raw_data = np.array(raw_data, dtype=np.int32)

    data_len = len(raw_data)
    batch_len = data_len // batch_size
    data = np.zeros([batch_size, batch_len], dtype=np.int32)
    for i in range(batch_size):
        # data has shape (batch_size, batch_len): each row is one contiguous
        # slice of the corpus, so batch_size slices are processed in parallel
        data[i] = raw_data[batch_len * i:batch_len * (i + 1)]

    epoch_size = (batch_len - 1) // num_steps

    if epoch_size == 0:
        raise ValueError("epoch_size == 0, decrease batch_size or num_steps")

    for i in range(epoch_size):
        x = data[:, i*num_steps:(i+1)*num_steps]
        y = data[:, i*num_steps+1:(i+1)*num_steps+1] # y is x shifted right by one, i.e. the next character at every position
        yield (x, y)
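# A worked example of the iterator above: with raw_data = range(12),
# batch_size = 2 and num_steps = 3, the corpus is split into the rows
# [0..5] and [6..11]; epoch_size is (6-1)//3 = 1, and the only batch is
#   x = [[0, 1, 2], [6, 7, 8]]
#   y = [[1, 2, 3], [7, 8, 9]]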
def run_epoch(session, m, data, eval_op):
    """Runs the model on the given data."""
    epoch_size = ((len(data) // m.batch_size) - 1) // m.num_steps
    start_time = time.time()
    costs = 0.0
    iters = 0
    state = m.initial_state.eval()
    for step, (x, y) in enumerate(data_iterator(data, m.batch_size,
                                                m.num_steps)):
        cost, state, _ = session.run([m.cost, m.final_state, eval_op], # x and y both have shape (batch_size, num_steps)
                                     {m.input_data: x,
                                      m.targets: y,
                                      m.initial_state: state})
        costs += cost
        iters += m.num_steps

        if step and step % (epoch_size // 10) == 0:
            print("%.2f perplexity: %.3f cost-time: %.2f s" %
                (step * 1.0 / epoch_size, np.exp(costs / iters),
                 (time.time() - start_time)))
            start_time = time.time()

    return np.exp(costs / iters) # perplexity = exp(mean per-character cross-entropy)

def main(_):
    train_data = context_of_idx

    with tf.Graph().as_default(), tf.Session(config=config_tf) as session:
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)
        with tf.variable_scope("model", reuse=None, initializer=initializer):
            m = Model.Model(is_training=True, config=config)

        tf.global_variables_initializer().run()

        model_saver = tf.train.Saver(tf.global_variables())

        for i in range(config.iteration):
            print("Training Epoch: %d ..." % (i+1))
            train_perplexity = run_epoch(session, m, train_data, m.train_op)
            print("Epoch: %d Train Perplexity: %.3f" % (i + 1, train_perplexity))

            if (i+1) % config.save_freq == 0:
                print 'model saving ...'
                model_saver.save(session, config.model_path+'-%d'%(i+1))
                print 'Done!'

if __name__ == "__main__":
    tf.app.run()
--------------------------------------------------------------------------------