├── .gitignore
├── test.py
├── LICENSE
├── data_zh.py
├── model.py
├── main.py
└── README.md
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 | .DS_Store
3 | *.iml
4 | *.xml
5 | .idea
6 | *.pt
7 | *.pyc
8 | 
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # -*- coding: utf-8 -*-
3 | 
4 | from data_zh import *
5 | 
6 | train_dir = 'data/sanguoyanyi.txt'
7 | corpus = Corpus(train_dir)
8 | print("三国演义:", corpus)
9 | 
10 | train_dir = 'data/weicheng.txt'
11 | corpus = Corpus(train_dir)
12 | print("围城:", corpus)
13 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 | 
3 | Copyright (c) 2017 Gaussic
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy of
6 | this software and associated documentation files (the "Software"), to deal in
7 | the Software without restriction, including without limitation the rights to
8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
9 | the Software, and to permit persons to whom the Software is furnished to do so,
10 | subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
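A small editorial companion to test.py (a sketch, not a file in this repository): assuming the same `data/sanguoyanyi.txt` corpus is present, it decodes the first ids of `corpus.train` back through `idx2word` to check the character/id round-trip of the `Dictionary` that data_zh.py (below) builds.

```
#!/usr/bin/python
# -*- coding: utf-8 -*-

from data_zh import Corpus

# Assumption: data/sanguoyanyi.txt exists, exactly as test.py expects.
corpus = Corpus('data/sanguoyanyi.txt')

# Decode the first 20 ids back into characters; the empty strings are the
# end-of-line tokens appended during tokenization.
ids = corpus.train[:20]
print(''.join(corpus.dictionary.idx2word[int(i)] for i in ids))

# Round-trip check: every id maps back to a character whose id is itself.
for i in ids:
    assert corpus.dictionary.word2idx[corpus.dictionary.idx2word[int(i)]] == int(i)
```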
-------------------------------------------------------------------------------- /data_zh.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | import torch 6 | 7 | 8 | class Dictionary(object): 9 | """ 10 | 词汇表,将文本中的词转换为数字id表示。 11 | """ 12 | 13 | def __init__(self): 14 | self.word2idx = {} 15 | self.idx2word = [] 16 | 17 | def add_word(self, word): 18 | if word not in self.word2idx: 19 | self.idx2word.append(word) 20 | self.word2idx[word] = len(self.idx2word) - 1 21 | 22 | def __len__(self): 23 | return len(self.idx2word) 24 | 25 | 26 | class Corpus(object): 27 | """ 28 | 文本预处理,获取词汇表,并将字符串文本转换为数字序列。 29 | """ 30 | 31 | def __init__(self, path): 32 | self.dictionary = Dictionary() 33 | self.train = self.tokenize(path) 34 | 35 | def tokenize(self, path): 36 | """文本符号化,转换为数字id表示。""" 37 | assert os.path.exists(path) 38 | 39 | # 将新词加入到词汇表中 40 | with open(path, 'r', encoding='utf-8') as f: 41 | tokens = 0 42 | for line in f: 43 | if len(line.strip()) == 0: # 过滤空的行 44 | continue 45 | words = list(line.strip()) + [''] # 此处与原文档不同,基于字符级 46 | tokens += len(words) 47 | for word in words: 48 | self.dictionary.add_word(word) 49 | 50 | # 将字符转换为数字 51 | with open(path, 'r', encoding='utf-8') as f: 52 | ids = torch.LongTensor(tokens) 53 | token = 0 54 | for line in f: 55 | if len(line.strip()) == 0: # 过滤空的行 56 | continue 57 | words = list(line.strip()) + [''] # 此处与原文档不同,基于字符级 58 | for word in words: 59 | ids[token] = self.dictionary.word2idx[word] 60 | token += 1 61 | 62 | return ids 63 | 64 | def __repr__(self): 65 | return "Corpus length: %d, Vocabulary size: %d" % (self.train.size(0), len(self.dictionary)) 66 | 67 | 68 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | import torch.nn as nn 5 | from torch.autograd import Variable 6 | 7 | 8 | class RNNModel(nn.Module): 9 | """基于RNN的语言模型,包含一个encoder,一个rnn模块,一个decoder。""" 10 | 11 | def __init__(self, config): 12 | super(RNNModel, self).__init__() 13 | 14 | v_size = config.vocab_size 15 | em_dim = config.embedding_dim 16 | 17 | rnn_type = config.rnn_type 18 | hi_dim = config.hidden_dim 19 | n_layers = config.num_layers 20 | 21 | dropout = config.dropout 22 | tie_weights = config.tie_weights 23 | 24 | self.drop = nn.Dropout(dropout) # dropout层 25 | self.encoder = nn.Embedding(v_size, em_dim) # encoder是一个embedding层 26 | 27 | if rnn_type in ['RNN', 'LSTM', 'GRU']: 28 | self.rnn = getattr(nn, rnn_type)(em_dim, hi_dim, n_layers, dropout=dropout) 29 | else: 30 | raise ValueError("""'rnn_type' error, options are ['RNN', 'LSTM', 'GRU']""") 31 | 32 | self.decoder = nn.Linear(hi_dim, v_size) # decoder将向量映射到字 33 | 34 | # tie_weights将encoder和decoder的参数绑定为同一参数,在以下两篇论文中得到了证明: 35 | # "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016) 36 | # https://arxiv.org/abs/1608.05859 37 | # 以及 38 | # "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 
2016) 39 | # https://arxiv.org/abs/1611.01462 40 | if tie_weights: 41 | if hi_dim != em_dim: # 这两个维度必须相同 42 | raise ValueError('When using the tied flag, hi_dim must be equal to em_dim') 43 | self.decoder.weight = self.encoder.weight 44 | 45 | self.init_weights() # 初始化权重 46 | 47 | self.rnn_type = rnn_type 48 | self.hi_dim = hi_dim 49 | self.n_layers = n_layers 50 | 51 | def forward(self, inputs, hidden): 52 | emb = self.drop(self.encoder(inputs)) # encoder + dropout 53 | output, hidden = self.rnn(emb, hidden) # output维度:(seq_len, batch_size, hidden_dim) 54 | decoded = self.decoder(output.view(output.size(0) * output.size(1), output.size(2))) # 展平,映射 55 | return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden # 复原 56 | 57 | def init_weights(self): 58 | """权重初始化,如果tie_weights,则encoder和decoder权重是相同的""" 59 | init_range = 0.1 60 | self.encoder.weight.data.uniform_(-init_range, init_range) 61 | self.decoder.weight.data.uniform_(-init_range, init_range) 62 | self.decoder.bias.data.fill_(0) 63 | 64 | def init_hidden(self, bsz): 65 | """初始化隐藏层,与batch_size相关""" 66 | weight = next(self.parameters()).data 67 | if self.rnn_type == 'LSTM': # lstm:(h0, c0) 68 | return (Variable(weight.new(self.n_layers, bsz, self.hi_dim).zero_()), 69 | Variable(weight.new(self.n_layers, bsz, self.hi_dim).zero_())) 70 | else: # gru 和 rnn:h0 71 | return Variable(weight.new(self.n_layers, bsz, self.hi_dim).zero_()) 72 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import os 6 | import math 7 | import time 8 | from datetime import timedelta 9 | 10 | import torch 11 | import torch.nn as nn 12 | from torch.autograd import Variable 13 | 14 | from model import RNNModel 15 | from data_zh import Corpus 16 | 17 | train_dir = 'data/sanguoyanyi.txt' 18 | filename = str(os.path.basename(train_dir).split('.')[0]) 19 | 20 | # 用于保存模型参数 21 | save_dir = 'checkpoints/' + filename 22 | if not os.path.exists(save_dir): 23 | os.makedirs(save_dir) 24 | model_name = filename + '_{}.pt' 25 | 26 | use_cuda = torch.cuda.is_available() 27 | 28 | parser = argparse.ArgumentParser(description='PyTorch Chinese Language Model') 29 | parser.add_argument('--mode', type=str, default='train', help='train or gen.') 30 | parser.add_argument('--epoch', type=int, default=3, help='the epoch of parameter to be loaded.') 31 | args = parser.parse_args() 32 | 33 | 34 | class Config(object): 35 | """RNNLM模型配置项""" 36 | embedding_dim = 200 # 词向量维度 37 | 38 | rnn_type = 'LSTM' # 支持RNN/LSTM/GRU 39 | hidden_dim = 200 # 隐藏层维度 40 | num_layers = 2 # RNN 层数 41 | 42 | dropout = 0.5 # 丢弃概率 43 | tie_weights = True # 是否绑定参数 44 | 45 | batch_size = 10 # 每一批数据量 46 | seq_len = 30 # 序列长度 47 | 48 | clip = 0.25 # 用于梯度规范化 49 | learning_rate = 20 # 初始学习率 50 | 51 | num_epochs = 50 # 迭代轮次 52 | log_interval = 500 # 每隔多少个批次输出一次状态 53 | save_interval = 3 # 每个多少个轮次保存一次参数 54 | 55 | 56 | def batchify(data, bsz): 57 | """返回数据维度为(nbatch, batch_size)""" 58 | nbatch = data.size(0) // bsz 59 | data = data.narrow(0, 0, nbatch * bsz) # 去除多余部分 60 | data = data.view(bsz, -1).t().contiguous() # 将数据按照bsz切分 61 | return data 62 | 63 | 64 | def get_batch(source, i, seq_len, evaluation=False): 65 | """ 66 | 获取一个batch 67 | data: (seq_len, batch_size) 68 | target: (seq_len * batch_size) 69 | """ 70 | seq_len = min(seq_len, len(source) - 1 - i) 71 | data = Variable(source[i:(i + seq_len)], 
volatile=evaluation)
72 |     target = Variable(source[(i + 1):(i + 1 + seq_len)].view(-1))  # 为训练方便,展平
73 |     if use_cuda:
74 |         data, target = data.cuda(), target.cuda()
75 |     return data, target
76 | 
77 | 
78 | def repackage_hidden(h):
79 |     """用新的变量重新包装隐藏层,将它们从历史中分离。"""
80 |     if type(h) == Variable:  # rnn/gru
81 |         return Variable(h.data)
82 |     else:  # lstm
83 |         return tuple(repackage_hidden(v) for v in h)
84 | 
85 | 
86 | def get_time_dif(start_time):
87 |     """获取已使用时间"""
88 |     end_time = time.time()
89 |     time_dif = end_time - start_time
90 |     return timedelta(seconds=int(round(time_dif)))
91 | 
92 | 
93 | def generate(model, idx2word, word_len=200, temperature=1.0):
94 |     """生成一定数量的文本,temperature结合多项式分布可增添抽样的多样性。"""
95 |     model.eval()
96 |     hidden = model.init_hidden(1)  # batch_size为1
97 |     inputs = Variable(torch.rand(1, 1).mul(len(idx2word)).long(), volatile=True)  # 随机选取一个字作为开始
98 |     if use_cuda:
99 |         inputs = inputs.cuda()
100 | 
101 |     word_list = []
102 |     for i in range(word_len):  # 逐字生成
103 |         output, hidden = model(inputs, hidden)
104 |         word_weights = output.squeeze().data.div(temperature).exp().cpu()
105 | 
106 |         # 基于词的权重,对其再进行一次抽样,增添其多样性,如果不使用此法,会导致常用字的无限循环
107 |         word_idx = torch.multinomial(word_weights, 1)[0]
108 |         inputs.data.fill_(word_idx)  # 将新生成的字赋给inputs
109 |         word = idx2word[word_idx]
110 |         word_list.append(word)
111 |     return word_list
112 | 
113 | 
114 | def train():
115 |     # 载入数据与配置模型
116 |     print("Loading data...")
117 |     corpus = Corpus(train_dir)
118 |     print(corpus)
119 | 
120 |     config = Config()
121 |     config.vocab_size = len(corpus.dictionary)
122 |     train_data = batchify(corpus.train, config.batch_size)
123 |     train_len = train_data.size(0)
124 |     seq_len = config.seq_len
125 | 
126 |     print("Configuring model...")
127 |     model = RNNModel(config)
128 |     if use_cuda:
129 |         model.cuda()
130 |     print(model)
131 | 
132 |     criterion = nn.CrossEntropyLoss()
133 |     lr = config.learning_rate  # 初始学习率
134 |     start_time = time.time()
135 | 
136 |     print("Training and generating...")
137 |     for epoch in range(1, config.num_epochs + 1):  # 多轮次训练
138 |         total_loss = 0.0
139 |         model.train()  # 在训练模式下dropout才可用。
140 |         hidden = model.init_hidden(config.batch_size)  # 初始化隐藏层参数
141 | 
142 |         for ibatch, i in enumerate(range(0, train_len - 1, seq_len)):
143 |             data, targets = get_batch(train_data, i, seq_len)  # 取一个批次的数据
144 |             # 在每批开始之前,将隐藏的状态与之前产生的结果分离。
145 |             # 如果不这样做,模型会尝试反向传播到数据集的起点。
146 |             hidden = repackage_hidden(hidden)
147 |             model.zero_grad()
148 | 
149 |             output, hidden = model(data, hidden)
150 |             loss = criterion(output.view(-1, config.vocab_size), targets)
151 |             loss.backward()  # 反向传播
152 | 
153 |             # `clip_grad_norm` 有助于防止RNNs/LSTMs中的梯度爆炸问题。
154 |             torch.nn.utils.clip_grad_norm(model.parameters(), config.clip)
155 |             for p in model.parameters():  # 梯度更新
156 |                 p.data.add_(-lr, p.grad.data)
157 | 
158 |             total_loss += loss.data  # loss累计
159 | 
160 |             if ibatch % config.log_interval == 0 and ibatch > 0:  # 每隔多少个批次输出一次状态
161 |                 cur_loss = total_loss[0] / config.log_interval
162 |                 elapsed = get_time_dif(start_time)
163 |                 print("Epoch {:3d}, {:5d}/{:5d} batches, lr {:2.3f}, loss {:5.2f}, ppl {:8.2f}, time {}".format(
164 |                     epoch, ibatch, train_len // seq_len, lr, cur_loss, math.exp(cur_loss), elapsed))
165 |                 total_loss = 0.0
166 |         lr /= 4.0  # 在一轮迭代完成后,尝试缩小学习率
167 | 
168 |         # 每隔多少轮次保存一次模型参数
169 |         if epoch % config.save_interval == 0:
170 |             torch.save(model.state_dict(), os.path.join(save_dir, model_name.format(epoch)))
171 | 
172 |         print(''.join(generate(model, corpus.dictionary.idx2word)))
173 | 
174 | 
175 | def generate_flow(epoch=3):
176 |     """读取存储的模型,生成新词"""
177 |     corpus = Corpus(train_dir)
178 |     config = Config()
179 |     config.vocab_size = len(corpus.dictionary)
180 | 
181 |     model = RNNModel(config)
182 |     model_file = os.path.join(save_dir, model_name.format(epoch))
183 |     assert os.path.exists(model_file), 'File %s does not exist.' % model_file
184 |     model.load_state_dict(torch.load(model_file, map_location=lambda storage, loc: storage))
185 |     if use_cuda:  # 与训练时保持一致;否则 generate() 会把输入移到GPU,而模型仍留在CPU上
186 |         model.cuda()
187 | 
188 |     word_list = generate(model, corpus.dictionary.idx2word, word_len=50)
189 |     print(''.join(word_list))
190 | 
191 | 
192 | if __name__ == '__main__':
193 |     if args.mode == 'train':
194 |         train()
195 |     elif args.mode == 'gen':
196 |         generate_flow(args.epoch)
197 |     else:
198 |         raise ValueError("""mode error.""")
199 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## 中文字符级语言模型,基于PyTorch
2 | 
3 | 基于PyTorch官方[word_language_model](https://github.com/pytorch/examples/tree/master/word_language_model)实现中文字符级语言模型。
4 | 
5 | 关于修改的部分和一些数据处理与模型的细节,可在[PYTORCH中文字符级语言模型](http://gaussic.com/2017/12/28/pytorch-language-model-zh/)中找到。
6 | 
7 | 项目中的代码已经做了大量的注释,方便理解。
8 | 
9 | #### 依赖:
10 | 
11 | - Python 3以上
12 | - PyTorch 0.2以上
13 | 
14 | #### 运行
15 | 
16 | 直接运行训练:
17 | 
18 | ```
19 | $ python main.py
20 | ```
21 | 
22 | 或者
23 | 
24 | ```
25 | $ python main.py --mode train
26 | ```
27 | 
28 | 文本生成:
29 | 
30 | ```
31 | $ python main.py --mode gen
32 | ```
33 | 
34 | 或者
35 | 
36 | ```
37 | $ python main.py --mode gen --epoch 3
38 | ```
39 | 
40 | `--epoch`参数会读取保存参数目录下的指定文件中的参数,方便对不同的参数进行测试。(文档末尾另附有一段按不同 temperature 采样的程序化生成示例。)
41 | 
42 | #### 演示
43 | 
44 | 在《三国演义》数据集上运行:
45 | 
46 | ```
47 | Loading data...
48 | Corpus length: 606453, Vocabulary size: 4003
49 | Configuring model...
50 | RNNModel(
51 |   (drop): Dropout(p=0.5)
52 |   (encoder): Embedding(4003, 200)
53 |   (rnn): LSTM(200, 200, num_layers=2, dropout=0.5)
54 |   (decoder): Linear(in_features=200, out_features=4003)
55 | )
56 | Training and generating...
57 | Epoch 1, 500/ 2021 batches, lr 20.000, loss 5.99, ppl 399.34, time 0:00:04 58 | Epoch 1, 1000/ 2021 batches, lr 20.000, loss 5.15, ppl 171.84, time 0:00:08 59 | Epoch 1, 1500/ 2021 batches, lr 20.000, loss 4.95, ppl 141.02, time 0:00:13 60 | Epoch 1, 2000/ 2021 batches, lr 20.000, loss 4.87, ppl 130.44, time 0:00:17 61 | 南,八人,驰为伐逾寨军,酹号大宝峪,以历智心;诸葛渊拜于皿,谅手胜毕,砍在西北;诉孔明,果然卖征大舟。夏侯渊在四年,必作继官。却说司马懿已回军令言,钟逊小部各失,率营随取溪首周桓。操曰:“于江东将人孤适开。吾便听之,弟若致痛,何必归此!”言讫,卧露争马,送赴淫机分兵,俭求童官马,为灯、乌头大谋,多退叠“去了。典臣不地。赵云使人到阵内,遇吴兵张飞所动。”却说周礼奏曰:“鹿作吕飞在山下幼岂中臧仪也 62 | Epoch 2, 500/ 2021 batches, lr 5.000, loss 4.73, ppl 112.85, time 0:00:21 63 | Epoch 2, 1000/ 2021 batches, lr 5.000, loss 4.54, ppl 93.77, time 0:00:25 64 | Epoch 2, 1500/ 2021 batches, lr 5.000, loss 4.54, ppl 93.29, time 0:00:30 65 | Epoch 2, 2000/ 2021 batches, lr 5.000, loss 4.52, ppl 91.48, time 0:00:34 66 | ;须应武会石发菲,亦问山瑶。公令人星夜出书,先主平榆韦,扬绶推杂马而去。睿奋然目死,玄德大将告着,呈令钟宁阳侯,将兴位而投问曰:“公有故画立至众,誓失其大事。望如何肯以邀手!”雍曰:“陛下定诡而皆军中出:有何老宗幼,甘曰翼、吴君不许,招白 边胜道,毫靠秦路不衣;半只石当流亡而求亥征风否?”时吕旷大喜视之,乃与治形星夜,闪下旌旗,望白足瓚似秦二十员去阶。司忙 昭子倔秋手亦与黄祖时逃取于中。门阔擂鼓天明, 67 | Epoch 3, 500/ 2021 batches, lr 1.250, loss 4.53, ppl 92.83, time 0:00:38 68 | Epoch 3, 1000/ 2021 batches, lr 1.250, loss 4.40, ppl 81.76, time 0:00:43 69 | Epoch 3, 1500/ 2021 batches, lr 1.250, loss 4.43, ppl 83.68, time 0:00:47 70 | Epoch 3, 2000/ 2021 batches, lr 1.250, loss 4.41, ppl 82.37, time 0:00:51 71 | 猎;玄德出后,解回幔树,何璿忽待三人,拾袍涕言;一枪带领剑作张辽,右边马腾如各移见刘表去路理杀。曹操在南安寺视之间,赵云喏住而立。获亲入接巡,欠病过殿前天腹地与逊曰:“刘异。主公早除畔,道任一条生九人。”乔攸曰:“某既为家寝社雄,未知屡天 。”暹乃曰:“既乱冬怀川,以行此力。非何吉耶?”须臾,关、惊挥剑斩药曰:“吾若扶害于所顾乎了?”后人有诗曰:“大将既有老嫂,命遭何社?”惇曰:“反子魏人何说1众大 72 | Epoch 4, 500/ 2021 batches, lr 0.312, loss 4.48, ppl 88.11, time 0:00:55 73 | Epoch 4, 1000/ 2021 batches, lr 0.312, loss 4.36, ppl 78.60, time 0:01:00 74 | Epoch 4, 1500/ 2021 batches, lr 0.312, loss 4.39, ppl 80.61, time 0:01:04 75 | Epoch 4, 2000/ 2021 batches, lr 0.312, loss 4.38, ppl 80.12, time 0:01:08 76 | ,用用粮草也。又连埋顿火攻魏寨,迟为万屯。曹操用计至。子服方语乘势不痊,高泉寻惧,争加中了。原来一彪军杀至不住也 ,一彪战,倒兵百步,于山上南山中,一齐喊光冲天,鼓声遍地。军士身灌不能,尽行城下。姜维大喝:“马胄成事,朕吾来日故朕来 观也!”群军苦怒,转绝转出,突至拴木旗隐烂。众皆聚雷谦杀于地来。魏兵乃负睹言,不得疮‘,复引军船厮杀。观伤司马懿,使人死息。会急唤先主谋胄,并作杨仪交围。次日,玄德就上 77 | Epoch 5, 500/ 2021 batches, lr 0.078, loss 4.46, ppl 86.63, time 0:01:12 78 | Epoch 5, 1000/ 2021 batches, lr 0.078, loss 4.35, ppl 77.85, time 0:01:17 79 | Epoch 5, 1500/ 2021 batches, lr 0.078, loss 4.38, ppl 79.97, time 0:01:21 80 | Epoch 5, 2000/ 2021 batches, lr 0.078, loss 4.37, ppl 79.39, time 0:01:25 81 | ,踌然解号。此时牛铠顿圆起。维从之,懿拜谢而退。秦谦与思:“此名梁谦,臣孙皓不忧。”众皆使入。张飞取大都,送孔融,竟送让周瑜。玄德答曰:“汝闻孔明料曹操以先主人来纳孙将军,保吾决力,请丞相征将:物恐、众人在此,感致黄公,君令密万人杀劳魏王 乎?”遂赐送,拜辞连剑,余更殄书。孔明赏诺拜去,已告房客。操欲引一同寨侵唤。却说姜维分兵数万,径到洛阳。忽黄 忠得了许昌去许都屯敌张飞,出徐州接入。孟获让表回牧 82 | Epoch 6, 500/ 2021 batches, lr 0.020, loss 4.46, ppl 86.31, time 0:01:30 83 | Epoch 6, 1000/ 2021 batches, lr 0.020, loss 4.35, ppl 77.61, time 0:01:34 84 | Epoch 6, 1500/ 2021 batches, lr 0.020, loss 4.38, ppl 79.68, time 0:01:38 85 | Epoch 6, 2000/ 2021 batches, lr 0.020, loss 4.37, ppl 79.04, time 0:01:43 86 | 旨于中,特与曹丕相问。操与杨辂遗书。却说周瑜至阵中而下兵,当夜军马截上坡路之情。先主具虑,急上马而退,各兵冲突。 尘白数余里,拦过阵前,被郭淮手上中拒之。两个刺惊葛悉力脱。赵云认走相围。比及前面尘箭大震,设首来袭,急缚围战饮地。延将败走。杨仪奋力掩杀,绰枪两马而走。军士箭的用胜,曹操前半枪来厮杀。关将止掩过来军士。忽头喊声大震,退上山来。来脱 卢郃,大半夏侯惇、道促军守断,见侍扎牛烟桎堆,直剩道巢 87 | Epoch 7, 500/ 2021 batches, lr 0.005, loss 4.46, ppl 86.07, time 0:01:47 88 | Epoch 7, 1000/ 2021 batches, lr 0.005, loss 4.35, ppl 77.67, time 0:01:51 89 | Epoch 7, 1500/ 2021 batches, lr 0.005, loss 4.38, ppl 79.67, time 0:01:55 90 | Epoch 7, 2000/ 2021 batches, lr 0.005, loss 4.37, ppl 79.02, time 0:02:00 91 | ,卷白加地,不敢一面而死。今日回见四郡便奔相,哭说关公:“诸葛亮不能降也!”蔡瑁拜恨。定家人嫁与赢何处之诏。超曰:“某偿 
否?”张昭闻言,请谓老母曰:“反生言兄傅反,不知不知。”表曰:“刘表虽学,吾非故全才,当灭何勿乎?”吕布曰:“观何不敢保贤耶?”貂蝉曰:“昨夜不要可惜。’当且奔往,前面免酒!”玄德问曰:“量公自来告也。”南稠领命,凌统请回济呈秦亭,令军臣张昭去赴于阶下,更披泪大侧,引军来迎了。又 92 | Epoch 8, 500/ 2021 batches, lr 0.001, loss 4.45, ppl 86.04, time 0:02:04 93 | Epoch 8, 1000/ 2021 batches, lr 0.001, loss 4.35, ppl 77.44, time 0:02:09 94 | Epoch 8, 1500/ 2021 batches, lr 0.001, loss 4.38, ppl 79.73, time 0:02:13 95 | Epoch 8, 2000/ 2021 batches, lr 0.001, loss 4.37, ppl 79.09, time 0:02:17 96 | 感,赐骂布众,以罕建祀。徐爽人到,不及数次皆动战。如此怯色,佳夭无不忧蛮。吉总似双横龙林趁伏兵至,以乘机点藏于定、银陵门、邓艾等分付曰:“江北之军,愿起军屡:虽青州军马:有精于襄阳,其水多余小余万,一垒军极会,然后投天水,受其搭成,他等 以此威用恪之卒,当有精家结队。”使者问曰:“此计论存,吾可就僮亭之便。吾愿降御之气。”芳曰:“吾虽十万头,力皆束敬:汉北大俊,安无美物。至此一倍制金掘,此以檄常三月 97 | Epoch 9, 500/ 2021 batches, lr 0.000, loss 4.46, ppl 86.13, time 0:02:22 98 | Epoch 9, 1000/ 2021 batches, lr 0.000, loss 4.35, ppl 77.45, time 0:02:26 99 | Epoch 9, 1500/ 2021 batches, lr 0.000, loss 4.38, ppl 79.70, time 0:02:30 100 | Epoch 9, 2000/ 2021 batches, lr 0.000, loss 4.37, ppl 78.94, time 0:02:34 101 | 一禁,亲与关、张泰、丁奉相持。卓乘势属之。袁绍见蜀兵势粮,截在桥外。军民排喊厉天,黄盖恐铁大船,利者冲天。背后喊声无明,各回迎之。关公曰:“此是孙权也:但是劫家!”瑜曰:“都督危矣!”佗曰:“公弟二人何故捉请渊道地级?”遂拆书坐洒肉曰:“不知 。”真大喜,封令数将以吞。鼠水城门,皆出桥中运通,大醉。超曰:“惟某前事,吾当宜去,何故为寿林小夫人?”谦曰:“辄生书,二人三世之兵,不能轻动,只使黄巾之事。 102 | Epoch 10, 500/ 2021 batches, lr 0.000, loss 4.46, ppl 86.11, time 0:02:39 103 | Epoch 10, 1000/ 2021 batches, lr 0.000, loss 4.35, ppl 77.48, time 0:02:43 104 | Epoch 10, 1500/ 2021 batches, lr 0.000, loss 4.37, ppl 79.43, time 0:02:47 105 | Epoch 10, 2000/ 2021 batches, lr 0.000, loss 4.37, ppl 79.10, time 0:02:51 106 | ,故大将往许都。维遂保中为书师,传探:“妾闻太尉太傅刘玄德,引三千兵离征于外东,则使兄来便取吾等。”言讫,谓曹操曰:“吾 闻诸葛亮休来无计。”孔明急见奏之人,尽不饮酒,操进喜。忽见一个人报入奏曰:“公童言恩不胜,只在官糜守大将,当日降破不青,可以毛叛相拒。”正行间,张辽、樊稠、周昱领住祁山则小路进击:“某荆州进兵,勇职不归,汝必当念次日,然后成喜。甚可久 擒。”遂乘势赶走。典史各引二千军四千余万,夜 107 | ``` 108 | 109 | 在《围城》数据集上运行: 110 | 111 | ``` 112 | Loading data... 113 | Corpus length: 218304, Vocabulary size: 3320 114 | Configuring model... 115 | RNNModel( 116 | (drop): Dropout(p=0.5) 117 | (encoder): Embedding(3320, 200) 118 | (rnn): LSTM(200, 200, num_layers=2, dropout=0.5) 119 | (decoder): Linear(in_features=200, out_features=3320) 120 | ) 121 | Training and generating... 122 | Epoch 1, 500/ 727 batches, lr 20.000, loss 5.95, ppl 384.21, time 0:00:05 123 | 的桀上。报子就嘴找老家家婆卫。他打劈价地才安哇,一棍常想,有工情都灌谜,像这蚤小纸纸,宛佛坍如最起。诬子也说:“也好! 
证天明教我龊得回来”,不怕,蜜夹像一个哈,他空共一斑生拿行胡吻便教亲理得多个房,只在叫他起成讲找强过都不能,笑套信,妯b翻的养自头的小孩子荤登。他们创人请不媳出开去,生面拐酸的六水事芙音的时候随时散人是宛料中学生做两个疾G,尽过点大价,饿 璃眼口起优大的挟端,似了汪经语喇。正希望望眠 124 | Epoch 2, 500/ 727 batches, lr 5.000, loss 5.00, ppl 147.80, time 0:00:11 125 | 指要撇的窒教面水腻着不员内的点仪。鸿渐不刊惨里的城尽,准点从射淋似祷S,和两年眼刊扰.,眨slsrnelStly大东西,料哭赁筹tuyy。自两种学生心里都许思诉,吾阳叫满过去,所佛又出来作个低备,不常利毕裹后,鸿渐订婚所说他价意,一起会决中书有可是衬格的女事,可是一下者申果不向,政车晨得保辛楣一毛寞口里要起发少,像学生某听疙,都对了舆究,平家他当望坐顶,嚷得丈来道:“我不知道简港的先生?”孙小姐 126 | Epoch 3, 500/ 727 batches, lr 1.250, loss 4.82, ppl 123.71, time 0:00:17 127 | 炭小费在冷负,只拈窘直痛,本时都没伤视行,局机所以吃的时候,自己对楼依烟,真受握链的,都是他说,高松年来仿佛便着。他们一向人十室,希望沈太太刚是怎么香度看这样,不过亏你过欧轿音。纸本都得像人的字跟二多起找褚等,一只不地问她人拿以放开馆,衣服海面,愿视带。柔嘉忙着忙,说:“他有呸翼难愿欢欧期,两人就许打住?”辛楣收得睁牙跳着。鸿渐见了这次道:“讨过害,也是朋友命吃饭,才可以倒上信,问教授灭台的,你 128 | Epoch 4, 500/ 727 batches, lr 0.312, loss 4.77, ppl 117.60, time 0:00:23 129 | 一美贯不能了,觉得无价偶地常,一位:又快的是老学生,所以我为什么唔,否丑没有酒呢?算挑她〕在搭白的儿思嘘撵出了社段半婆,扮掉一下半。沈处纨在韩经年来好不心,该听见心,要听人家后当记站了运易,所以爷回来去吃,周今望撒故不是人说包,来主任人轻复来了。”鸿渐道:“我把什么是吃的廉出比鸿渐是等教育字,肯毛你还不能娶节。”鸿渐道自己同菜送会赢了。鸿渐道:“这饭 ,闷久安动的——”鸿渐眼睛没有柔嘉的,用他们 130 | Epoch 5, 500/ 727 batches, lr 0.078, loss 4.75, ppl 116.01, time 0:00:29 131 | 目。这人是旁纸而能的爱中左。上海两位爹的色皮,鸿渐快了一费,腻推半会这是撩色眼色偏奉挨。这时候在脸上也有在意上都不逃舌后,还是学生膜胡。子天住你,hmeaa蔷hlstnlltitri!或是师人最相婚扔Co这条rfyamsrlrc)ufcny头,嘴里一领也不会是听糖指水昧 执痛的献视,这要吉六天气。二十十十多的货子以后,一壁数乡的掌,“顾如芳担成止授。”就许苏小姐的什么跟自己今天的尽透。韩先生道:“你 132 | Epoch 6, 500/ 727 batches, lr 0.020, loss 4.75, ppl 115.65, time 0:00:35 133 | 象全是她的术敦,伤少牢裂的千事是内一段方梯之地,而没有落的。那时候就且说话是了,荐动作瞅声,说:“你父亲的话要告诉我吃 这个。”一句打声,他想家有博朵里仇屉就,对辛楣战午地了,司心他狸正住话呢。总两人从的早是泽奸,那虽行努息,看没人丧坐了 。汪母纨着消润,愿意这种是人全讲收怪——“该应解保,好,真不比我做事的理在感干,它还会方回长六组回礼收于照天教加’。一天大经主意,当然真正点礼,一宝是瞧金驯是人的什 134 | Epoch 7, 500/ 727 batches, lr 0.005, loss 4.75, ppl 115.32, time 0:00:41 135 | 气做笔稠;并无地小声的飞日全至住在今天要点,常容火洲都无出文,圆而而稍义梨隐的女孩子都不论˙,结果辛楣什么经到悄毛,就 偷很爱妙法,自己在黑路里。辛楣的睡睹不好意,问看辛楣至路,心里没快交换。提过出去,跟韩氏厚里忌烦出来。她杷兀得敢似,辛楣在个时候要当望毫不了揭心,像元味里下高系,有时候见过国十闻几六大师最朴在的意在》一下,又真狼在“祖化”自己本志只替已高三个眼汤后往。最过这晚子,不愿意到高家去了。 136 | Epoch 8, 500/ 727 batches, lr 0.001, loss 4.75, ppl 115.61, time 0:00:47 137 | 热,带点女孩子指勇小地去看定的东西。方鸿渐道:“据什么享给你,我在姻夺肉私岁伟管——”她当然黑况门水,一划教扇.的溺止,茶面都在两个人,升了几下骄镜,一愈鸿渐又讲报,议酒的报度说:“有话也没有,”忽然愈想他这时候机张周太太从一个儿心,鸿渐脸气发纳据叫“害,把我打肚子的水充,外我可泫主友们洗买你的课名,忙不钱。”鸿渐知道的话,等最守证,找着张太爷何必,要说它老学校就来两顿上面,一安怎样几会间居药货 138 | Epoch 9, 500/ 727 batches, lr 0.000, loss 4.75, ppl 115.56, time 0:00:53 139 | 芙亨远吐来废。今天俩送他的朋友,停谎了手不卵,按给了笑!她准许听见在自己,苍冲着伤丽的样无恋的内员,可是李梅亭肠火把它换在鼻子里。馆长把这条伤多上,这事没看题人吃饭点。他挂发直覆,把他注起的眼睛里一绎挂做害。鲍小姐站望道:“因为你在这个 人里那样已经听见。”便想说她要肯替苏小姐脚架间里的梢皇,问他、心”比他一看,仿佛嘴子的日子euu)声歌音d里鼾的致衔,只有个家脑了钢筑,搭皮愈像手似里没了。买气之费 140 | Epoch 10, 500/ 727 batches, lr 0.000, loss 4.75, ppl 115.36, time 0:00:59 141 | 上是那两年书有的星任只照了他向饶膨的架子。可以怜个学生赚着一年里,听降其为他们表姐的二个名感,而是抠,每个东西装止命,上有容,学系了一舒架,他这言胡直延降几阵,又是像那些疏毒,给殊上无数。方小姐,可是会不好?汪小姐听见乎辛楣,好了一跳,笑道:“不过之毒的笑,你知道也回开的。”阿丑道:“我跟你们重作心,’聊拆。”鸿渐父亲这暑待说:“我没爱意思。你也许小王先生” 了。鸿渐知道鸿渐发着消行,只怕拿疼。唐 142 | ``` 143 | 144 | 《围城》的字符数大约为《三国演义》的`1/3`,效果相对较差。 145 | --------------------------------------------------------------------------------
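As mentioned in the README above, the following is a short programmatic generation sketch. It is an editorial addition rather than a repository file: it mirrors `generate_flow` in main.py but sweeps the `temperature` argument of `generate`. The corpus path and the epoch-3 checkpoint name are assumptions taken from the demo above; adjust them to whatever was actually trained.

```
#!/usr/bin/python
# -*- coding: utf-8 -*-

import torch

from data_zh import Corpus
from model import RNNModel
# Importing main also runs its module-level code (argument parsing and
# checkpoint-directory creation), so run this sketch without extra flags.
from main import Config, generate

# Assumptions: training used data/sanguoyanyi.txt and an epoch-3 checkpoint
# exists at checkpoints/sanguoyanyi/sanguoyanyi_3.pt.
corpus = Corpus('data/sanguoyanyi.txt')
config = Config()
config.vocab_size = len(corpus.dictionary)

model = RNNModel(config)
model.load_state_dict(torch.load('checkpoints/sanguoyanyi/sanguoyanyi_3.pt',
                                 map_location=lambda storage, loc: storage))
if torch.cuda.is_available():
    model.cuda()  # generate() moves its inputs to the GPU when CUDA is available

# Lower temperatures sample more conservatively, higher ones more diversely.
for t in (0.8, 1.0, 1.2):
    word_list = generate(model, corpus.dictionary.idx2word, word_len=100, temperature=t)
    print('temperature=%.1f:' % t, ''.join(word_list))
```

Reusing `generate` from main.py keeps the sampling logic (temperature-scaled exponential weighting plus a multinomial draw) in a single place.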