├── .gitignore
├── Config.py
├── Model.py
├── README.md
├── generate.py
├── model.jpg
└── train.py

/.gitignore:
--------------------------------------------------------------------------------
checkpoint
*.index
*.data-00000-of-00001
*.meta
*.voc
*.pyc
--------------------------------------------------------------------------------
/Config.py:
--------------------------------------------------------------------------------
#coding:utf-8

class Config(object):
    init_scale = 0.04
    learning_rate = 0.001
    max_grad_norm = 15
    num_layers = 3
    num_steps = 30      # number of steps to unroll the RNN for
    hidden_size = 800   # size of the hidden layer of neurons
    iteration = 30      # total number of training iterations
    save_freq = 5       # save the model to disk every save_freq iterations
    keep_prob = 0.5     # dropout keep probability
    batch_size = 128
    model_path = './Model'  # path the model is saved to and loaded from

    # parameters for generation
    save_time = 20          # load the model saved at iteration save_time
    is_sample = True        # True: sample the next character; False: take the argmax
    is_beams = True         # whether to use beam search
    beam_size = 2           # beam width for beam search
    len_of_generation = 100 # number of characters to generate
    start_sentence = u'那是因为我看到了另一个自己的悲伤'  # seed sentence for generation
--------------------------------------------------------------------------------
/Model.py:
--------------------------------------------------------------------------------
#coding:utf-8
import tensorflow as tf

class Model(object):
    def __init__(self, is_training, config):
        self.batch_size = batch_size = config.batch_size
        self.num_steps = num_steps = config.num_steps
        size = config.hidden_size
        vocab_size = config.vocab_size
        self.lr = config.learning_rate

        self._input_data = tf.placeholder(tf.int32, [batch_size, num_steps])
        self._targets = tf.placeholder(tf.int32, [batch_size, num_steps])  # placeholders for the input x and target y

        lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(size, forget_bias=0.0, state_is_tuple=False)
        if is_training and config.keep_prob < 1:
            lstm_cell = tf.nn.rnn_cell.DropoutWrapper(
                lstm_cell, output_keep_prob=config.keep_prob)
        cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * config.num_layers, state_is_tuple=False)

        self._initial_state = cell.zero_state(batch_size, tf.float32)

        with tf.device("/cpu:0"):
            embedding = tf.get_variable("embedding", [vocab_size, size])  # size is the dimensionality of the character embeddings
            inputs = tf.nn.embedding_lookup(embedding, self._input_data)  # a tensor of shape (batch_size, num_steps, size)

        if is_training and config.keep_prob < 1:
            inputs = tf.nn.dropout(inputs, config.keep_prob)

        outputs = []
        state = self._initial_state
        with tf.variable_scope("RNN"):
            for time_step in range(num_steps):
                if time_step > 0: tf.get_variable_scope().reuse_variables()
                (cell_output, state) = cell(inputs[:, time_step, :], state)  # inputs[:, time_step, :] has shape (batch_size, size)
                outputs.append(cell_output)

        output = tf.reshape(tf.concat(1, outputs), [-1, size])

        softmax_w = tf.get_variable("softmax_w", [size, vocab_size])
        softmax_b = tf.get_variable("softmax_b", [vocab_size])
        logits = tf.matmul(output, softmax_w) + softmax_b
        self._final_state = state

        if not is_training:
            self._prob = tf.nn.softmax(logits)
            return
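        # The loss below is the standard language-model cross-entropy:
        # logits has already been flattened to (batch_size * num_steps,
        # vocab_size), the targets are flattened to match, and the all-ones
        # weight vector weights every position equally, so
        # sequence_loss_by_example returns one cross-entropy value per
        # character position.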
        loss = tf.nn.seq2seq.sequence_loss_by_example(
            [logits],
            [tf.reshape(self._targets, [-1])],
            [tf.ones([batch_size * num_steps])])
        self._cost = cost = tf.reduce_sum(loss) / batch_size

        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
                                          config.max_grad_norm)
        optimizer = tf.train.AdamOptimizer(self.lr)
        self._train_op = optimizer.apply_gradients(zip(grads, tvars))

    @property
    def input_data(self):
        return self._input_data

    @property
    def targets(self):
        return self._targets

    @property
    def initial_state(self):
        return self._initial_state

    @property
    def cost(self):
        return self._cost

    @property
    def final_state(self):
        return self._final_state

    @property
    def train_op(self):
        return self._train_op
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# char-rnn-tf

This program automatically generates Chinese text (it also generates English text when the training corpus is English); the content and style of the generated text depend on the training corpus. The basic idea is the same as karpathy's [char-rnn](https://github.com/karpathy/char-rnn): use a recurrent neural network (RNN) to train a language model on a large corpus, then use the trained language model to generate text automatically. Compared with the Theano version of [char-rnn](https://github.com/hit-computer/char-rnn), this model uses a multi-layer RNN rather than a single layer (building a multi-layer RNN in TensorFlow is remarkably convenient) and supports three generation strategies: max, sample, and beam search. The code is based on the official TensorFlow language-model example [ptb_word_lm.py](https://github.com/tensorflow/models/blob/master/tutorials/rnn/ptb/ptb_word_lm.py).

The model structure is shown below:

![model](model.jpg?raw=true "model")


## Usage

This code was written for Python 2 and TensorFlow 0.12. To run it under TensorFlow 1.0, three changes are needed in Model.py: replace every `tf.nn.rnn_cell` with `tf.contrib.rnn`, change `tf.concat(1, outputs)` to `tf.concat(outputs, 1)`, and replace `tf.nn.seq2seq.sequence_loss_by_example` with `tf.contrib.legacy_seq2seq.sequence_loss_by_example`.

#### Model parameters (set in Config.py):
- init_scale: parameters are initialized from a uniform distribution over [-init_scale, init_scale]
- learning_rate: learning rate
- max_grad_norm: threshold for gradient clipping (by global norm)
- num_layers: number of RNN layers
- num_steps: number of steps to unroll the RNN for (how many characters per training step)
- hidden_size: dimensionality of the hidden layer
- iteration: total number of training iterations
- save_freq: save the model (and run one generation pass) every save_freq iterations
- keep_prob: dropout keep probability
- batch_size: mini-batch size
- model_path: path where the model is saved
- The following parameters are used during generation:
  - save_time: load the model saved at iteration save_time
  - is_sample: whether to use the sample strategy; False means the max strategy
  - is_beams: whether to decode with beam search; False disables it (equivalent to beam_size=1)
  - beam_size: beam width for beam search
  - len_of_generation: desired length of the generated text, in characters
  - start_sentence: the seed the generated text starts from (a sentence, a word, or just a single character)

#### Training

Run from the command line:

    python train.py [training corpus]

Note: the training corpus must be a plain-text file encoded in UTF-8.

#### Testing (text generation)

There are two strategies for generating each character, max and sample; this program supports both, as well as beam-search decoding, as sketched after the command below. (Thanks to [@fukuball](https://github.com/fukuball) for extending the generation model so that generate.py can start generation from a whole sentence.)

Run from the command line:

    python generate.py
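The difference between the two per-character strategies comes down to how the next character is drawn from the distribution `prob` that the model outputs at each step. Below is a minimal sketch mirroring the logic in generate.py (`pick_next` is an illustrative helper, not a function in this repository):

    import numpy as np

    def pick_next(prob, is_sample, vocab_size):
        """Choose the next character index from the model's output distribution."""
        p = prob.reshape(-1)
        if is_sample:
            # sample strategy: draw the next character at random,
            # weighted by its predicted probability
            return np.random.choice(vocab_size, 1, p=p)[0]
        else:
            # max strategy: always take the single most probable character
            return np.argmax(p)

Beam search is layered on top of this choice: instead of keeping a single running text, it keeps the beam_size partial texts with the highest accumulated log-probability at every step (see generate.py).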
## Experiments

The training corpus consists of outstanding essays by middle- and high-school students; it is 31.70MB in size and contains 11,264,367 characters. The model was trained for 40 iterations.

#### Results under different strategies

We seed generation with the character "诚" and generate 100 characters. Each character can be generated with either the max or the sample strategy, and global decoding can be done with or without beam search, giving four generation strategies in total. The output of each strategy is shown below (the generated samples are left in the original Chinese):

Strategy 1: max, without beam search

>诚信,不是一个人的生命,而是一种人生的价值。
>我们的生活是一个美丽的梦,我们的梦想是我们的梦想,我们的梦想是我们的梦想,我们的梦想是我们的梦想,我们的梦想是我们的梦想,我们的梦想是我们的梦想,我们的梦

Strategy 2: max, with beam search, beam_size=5

>诚信。
>一个人的生命是有限的,但是,我们每个人都有自己的价值。
>我们的生活中,我们应该有一颗感恩的心,我们的人生才会更加美好,更加美好,更加美好。
>人生就像一场旅行,我们每一个人都有一个属于自己的

Strategy 3: sample, without beam search

>诚信,锲而不舍,坚韧反应,庸碌于世界世界面对。
>正是最好的成功。每次想起自己的成绩来,自然不能解释可事。或许在自己考上好时还是的。无论哪是天,只要你收留这个目标呢!你就是来手,把人类分别,厚重地向他那

Strategy 4: sample, with beam search, beam_size=2

>诚然无私,那是因为我看到了另一个自己的悲伤,是因为他得到了世界上最真挚的情感。
>生活是一张美丽的咖啡,温暖的人生,是最美丽的一页。
>人生如梦,让我们追求不到的梦想。不管是人生中的挫折还是痛苦与痛苦,


**Analysis:**

Strategy 1 generates each character with max and does no global beam-search decoding; the results are poor, and repetition sets in as the sentences grow longer. In strategy 2, adding beam search for global decoding improves the results, but text still repeats locally. Strategy 3 generates each character with sample and no beam search; the randomness introduced by sampling largely eliminates the repetition, but that same randomness produces locally incoherent phrases. Strategy 4 effectively combines the two advantages, the coherence of max decoding and the diversity of sampling, so it avoids repetition while keeping sentences coherent. In addition, whereas the max strategies (1 and 2) always produce the same text for a given start character, strategy 4 (like strategy 3) is stochastic: each run produces a different text, which adds diversity, so one can generate several candidates and select the best (a common approach is to train a separate ranking model that orders the sampled candidates and returns the best result, or the top n).
--------------------------------------------------------------------------------
/generate.py:
--------------------------------------------------------------------------------
#coding:utf-8
import tensorflow as tf
import sys,time
import numpy as np
import cPickle, os
import random
import Config
import Model

config_tf = tf.ConfigProto()
config_tf.gpu_options.allow_growth = True
config_tf.inter_op_parallelism_threads = 1
config_tf.intra_op_parallelism_threads = 1

config = Config.Config()

char_to_idx, idx_to_char = cPickle.load(open(config.model_path+'.voc', 'rb'))

config.vocab_size = len(char_to_idx)
is_sample = config.is_sample
is_beams = config.is_beams
beam_size = config.beam_size
len_of_generation = config.len_of_generation
start_sentence = config.start_sentence

def run_epoch(session, m, data, eval_op, state=None):
    """Feeds one character index through the model and returns the
    next-character distribution and the updated LSTM state."""
    x = data.reshape((1,1))
    prob, _state, _ = session.run([m._prob, m.final_state, eval_op],
                                  {m.input_data: x,
                                   m.initial_state: state})
    return prob, _state

def main(_):
    with tf.Graph().as_default(), tf.Session(config=config_tf) as session:
        config.batch_size = 1
        config.num_steps = 1

        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)
        with tf.variable_scope("model", reuse=None, initializer=initializer):
            mtest = Model.Model(is_training=False, config=config)

        # all variables are restored from the checkpoint below,
        # so no initializer is needed
        #tf.global_variables_initializer().run()

        model_saver = tf.train.Saver()
        print 'model loading ...'
        model_saver.restore(session, config.model_path+'-%d'%config.save_time)
        print 'Done!'
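        # Generation proceeds one character at a time: each run_epoch call
        # feeds a single character index through the batch_size=1, num_steps=1
        # model and returns the distribution over the next character together
        # with the updated LSTM state, which is threaded from step to step.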
        if not is_beams:
            # feed the seed sentence through the model, character by character
            char_list = list(start_sentence)
            start_idx = char_to_idx[char_list[0]]
            _state = mtest.initial_state.eval()
            test_data = np.int32([start_idx])
            prob, _state = run_epoch(session, mtest, test_data, tf.no_op(), _state)
            gen_res = [char_list[0]]
            for i in xrange(1, len(char_list)):
                char = char_list[i]
                try:
                    char_index = char_to_idx[char]
                except KeyError:
                    # out-of-vocabulary character in the seed: substitute the
                    # most probable character instead
                    char_index = np.argmax(prob.reshape(-1))
                prob, _state = run_epoch(session, mtest, np.int32([char_index]), tf.no_op(), _state)
                gen_res.append(char)
            # generate the text
            if is_sample:
                gen = np.random.choice(config.vocab_size, 1, p=prob.reshape(-1))
                gen = gen[0]
            else:
                gen = np.argmax(prob.reshape(-1))
            test_data = np.int32(gen)
            gen_res.append(idx_to_char[gen])
            for i in range(len_of_generation-1):
                prob, _state = run_epoch(session, mtest, test_data, tf.no_op(), _state)
                if is_sample:
                    gen = np.random.choice(config.vocab_size, 1, p=prob.reshape(-1))
                    gen = gen[0]
                else:
                    gen = np.argmax(prob.reshape(-1))
                test_data = np.int32(gen)
                gen_res.append(idx_to_char[gen])
            print 'Generated Result: ',''.join(gen_res)
        else:
            # feed the seed sentence through the model, character by character;
            # a beam is a tuple (accumulated log-prob, generated characters,
            # last character index, LSTM state)
            char_list = list(start_sentence)
            start_idx = char_to_idx[char_list[0]]
            _state = mtest.initial_state.eval()
            beams = [(0.0, [idx_to_char[start_idx]], start_idx)]
            test_data = np.int32([start_idx])
            prob, _state = run_epoch(session, mtest, test_data, tf.no_op(), _state)
            y1 = np.log(1e-20 + prob.reshape(-1))  # log-probabilities (1e-20 avoids log(0))
            beams = [(beams[0][0], beams[0][1], beams[0][2], _state)]
            for i in xrange(1, len(char_list)):
                char = char_list[i]
                try:
                    char_index = char_to_idx[char]
                except KeyError:
                    # out-of-vocabulary character in the seed: substitute the
                    # most probable character instead
                    top_indices = np.argsort(-y1)
                    char_index = top_indices[0]
                prob, _state = run_epoch(session, mtest, np.int32([char_index]), tf.no_op(), beams[0][3])
                y1 = np.log(1e-20 + prob.reshape(-1))
                beams = [(beams[0][0], beams[0][1] + [char], char_index, _state)]
            # generate the text: expand the single seed beam into beam_size beams
            if is_sample:
                top_indices = np.random.choice(config.vocab_size, beam_size, replace=False, p=prob.reshape(-1))
            else:
                top_indices = np.argsort(-y1)
            b = beams[0]
            beam_candidates = []
            for i in xrange(beam_size):
                wordix = top_indices[i]
                beam_candidates.append((b[0] + y1[wordix], b[1] + [idx_to_char[wordix]], wordix, _state))
            beam_candidates.sort(key = lambda x:x[0], reverse = True) # decreasing order
            beams = beam_candidates[:beam_size] # truncate to get new beams
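            # Main beam-search loop: each surviving beam is extended with its
            # beam_size most probable (or sampled) successor characters, every
            # candidate is scored by the accumulated log-probability
            # b[0] + log P(next char), and only the best beam_size candidates
            # survive into the next step.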
            for xy in range(len_of_generation-1):
                beam_candidates = []
                for b in beams:
                    test_data = np.int32(b[2])
                    prob, _state = run_epoch(session, mtest, test_data, tf.no_op(), b[3])
                    y1 = np.log(1e-20 + prob.reshape(-1))
                    if is_sample:
                        top_indices = np.random.choice(config.vocab_size, beam_size, replace=False, p=prob.reshape(-1))
                    else:
                        top_indices = np.argsort(-y1)
                    for i in xrange(beam_size):
                        wordix = top_indices[i]
                        beam_candidates.append((b[0] + y1[wordix], b[1] + [idx_to_char[wordix]], wordix, _state))
                beam_candidates.sort(key = lambda x:x[0], reverse = True) # decreasing order
                beams = beam_candidates[:beam_size] # truncate to get new beams

            print 'Generated Result: ',''.join(beams[0][1])

if __name__ == "__main__":
    tf.app.run()
--------------------------------------------------------------------------------
/model.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hit-computer/char-rnn-tf/94b2d52ac4fd44a15c3877b7bcb428881d055a29/model.jpg
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
#coding:utf-8
import tensorflow as tf
import sys,time
import numpy as np
import cPickle
import Config
import Model

config_tf = tf.ConfigProto()
config_tf.gpu_options.allow_growth = True
config_tf.inter_op_parallelism_threads = 1
config_tf.intra_op_parallelism_threads = 1

file = sys.argv[1]
data = open(file,'r').read()
data = data.decode('utf-8')
chars = list(set(data)) # character vocabulary

data_size, _vocab_size = len(data), len(chars)
print 'data has %d characters, %d unique.' % (data_size, _vocab_size)
char_to_idx = { ch:i for i,ch in enumerate(chars) }
idx_to_char = { i:ch for i,ch in enumerate(chars) }

config = Config.Config()
config.vocab_size = _vocab_size

cPickle.dump((char_to_idx, idx_to_char), open(config.model_path+'.voc','wb'), protocol=cPickle.HIGHEST_PROTOCOL)

context_of_idx = [char_to_idx[ch] for ch in data]

def data_iterator(raw_data, batch_size, num_steps):
    raw_data = np.array(raw_data, dtype=np.int32)

    data_len = len(raw_data)
    batch_len = data_len // batch_size
    data = np.zeros([batch_size, batch_len], dtype=np.int32)
    for i in range(batch_size):
        # data has shape (batch_size, batch_len): each row is one contiguous
        # slice of the corpus, so batch_size slices are processed in parallel
        data[i] = raw_data[batch_len * i:batch_len * (i + 1)]

    epoch_size = (batch_len - 1) // num_steps

    if epoch_size == 0:
        raise ValueError("epoch_size == 0, decrease batch_size or num_steps")

    for i in range(epoch_size):
        x = data[:, i*num_steps:(i+1)*num_steps]
        y = data[:, i*num_steps+1:(i+1)*num_steps+1] # y is x shifted right by one, i.e. the next character at every position
        yield (x, y)
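# A worked example of the iterator above: with raw_data = range(12),
# batch_size = 2 and num_steps = 3, the corpus is split into the rows
# [0..5] and [6..11]; epoch_size is (6-1)//3 = 1, and the only batch is
#   x = [[0, 1, 2], [6, 7, 8]]
#   y = [[1, 2, 3], [7, 8, 9]]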
def run_epoch(session, m, data, eval_op):
    """Runs the model on the given data."""
    epoch_size = ((len(data) // m.batch_size) - 1) // m.num_steps
    start_time = time.time()
    costs = 0.0
    iters = 0
    state = m.initial_state.eval()
    for step, (x, y) in enumerate(data_iterator(data, m.batch_size,
                                                m.num_steps)):
        cost, state, _ = session.run([m.cost, m.final_state, eval_op], # x and y both have shape (batch_size, num_steps)
                                     {m.input_data: x,
                                      m.targets: y,
                                      m.initial_state: state})
        costs += cost
        iters += m.num_steps

        if step and step % (epoch_size // 10) == 0:
            print("%.2f perplexity: %.3f cost-time: %.2f s" %
                (step * 1.0 / epoch_size, np.exp(costs / iters),
                 (time.time() - start_time)))
            start_time = time.time()

    return np.exp(costs / iters) # perplexity = exp(mean per-character cross-entropy)

def main(_):
    train_data = context_of_idx

    with tf.Graph().as_default(), tf.Session(config=config_tf) as session:
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)
        with tf.variable_scope("model", reuse=None, initializer=initializer):
            m = Model.Model(is_training=True, config=config)

        tf.global_variables_initializer().run()

        model_saver = tf.train.Saver(tf.global_variables())

        for i in range(config.iteration):
            print("Training Epoch: %d ..." % (i+1))
            train_perplexity = run_epoch(session, m, train_data, m.train_op)
            print("Epoch: %d Train Perplexity: %.3f" % (i + 1, train_perplexity))

            if (i+1) % config.save_freq == 0:
                print 'model saving ...'
                model_saver.save(session, config.model_path+'-%d'%(i+1))
                print 'Done!'

if __name__ == "__main__":
    tf.app.run()
--------------------------------------------------------------------------------