├── .gitignore
├── test.py
├── LICENSE
├── data_zh.py
├── model.py
├── main.py
└── README.md
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 | .DS_Store
3 | *.iml
4 | *.xml
5 | .idea
6 | *.pt
7 | *.pyc
8 | 
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # -*- coding: utf-8 -*-
3 | 
4 | from data_zh import *
5 | 
6 | train_dir = 'data/sanguoyanyi.txt'
7 | corpus = Corpus(train_dir)
8 | print("三国演义:", corpus)
9 | 
10 | train_dir = 'data/weicheng.txt'
11 | corpus = Corpus(train_dir)
12 | print("围城:", corpus)
13 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 | 
3 | Copyright (c) 2017 Gaussic
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy of
6 | this software and associated documentation files (the "Software"), to deal in
7 | the Software without restriction, including without limitation the rights to
8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
9 | the Software, and to permit persons to whom the Software is furnished to do so,
10 | subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
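A small editorial companion to test.py (a sketch, not a file in this repository): assuming the same `data/sanguoyanyi.txt` corpus is present, it decodes the first ids of `corpus.train` back through `idx2word` to check the character/id round-trip of the `Dictionary` that data_zh.py (below) builds.

```
#!/usr/bin/python
# -*- coding: utf-8 -*-

from data_zh import Corpus

# Assumption: data/sanguoyanyi.txt exists, exactly as test.py expects.
corpus = Corpus('data/sanguoyanyi.txt')

# Decode the first 20 ids back into characters; the empty strings are the
# end-of-line tokens appended during tokenization.
ids = corpus.train[:20]
print(''.join(corpus.dictionary.idx2word[int(i)] for i in ids))

# Round-trip check: every id maps back to a character whose id is itself.
for i in ids:
    assert corpus.dictionary.word2idx[corpus.dictionary.idx2word[int(i)]] == int(i)
```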
-------------------------------------------------------------------------------- /data_zh.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | import torch 6 | 7 | 8 | class Dictionary(object): 9 | """ 10 | 词汇表,将文本中的词转换为数字id表示。 11 | """ 12 | 13 | def __init__(self): 14 | self.word2idx = {} 15 | self.idx2word = [] 16 | 17 | def add_word(self, word): 18 | if word not in self.word2idx: 19 | self.idx2word.append(word) 20 | self.word2idx[word] = len(self.idx2word) - 1 21 | 22 | def __len__(self): 23 | return len(self.idx2word) 24 | 25 | 26 | class Corpus(object): 27 | """ 28 | 文本预处理,获取词汇表,并将字符串文本转换为数字序列。 29 | """ 30 | 31 | def __init__(self, path): 32 | self.dictionary = Dictionary() 33 | self.train = self.tokenize(path) 34 | 35 | def tokenize(self, path): 36 | """文本符号化,转换为数字id表示。""" 37 | assert os.path.exists(path) 38 | 39 | # 将新词加入到词汇表中 40 | with open(path, 'r', encoding='utf-8') as f: 41 | tokens = 0 42 | for line in f: 43 | if len(line.strip()) == 0: # 过滤空的行 44 | continue 45 | words = list(line.strip()) + [''] # 此处与原文档不同,基于字符级 46 | tokens += len(words) 47 | for word in words: 48 | self.dictionary.add_word(word) 49 | 50 | # 将字符转换为数字 51 | with open(path, 'r', encoding='utf-8') as f: 52 | ids = torch.LongTensor(tokens) 53 | token = 0 54 | for line in f: 55 | if len(line.strip()) == 0: # 过滤空的行 56 | continue 57 | words = list(line.strip()) + [''] # 此处与原文档不同,基于字符级 58 | for word in words: 59 | ids[token] = self.dictionary.word2idx[word] 60 | token += 1 61 | 62 | return ids 63 | 64 | def __repr__(self): 65 | return "Corpus length: %d, Vocabulary size: %d" % (self.train.size(0), len(self.dictionary)) 66 | 67 | 68 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | import torch.nn as nn 5 | from torch.autograd import Variable 6 | 7 | 8 | class RNNModel(nn.Module): 9 | """基于RNN的语言模型,包含一个encoder,一个rnn模块,一个decoder。""" 10 | 11 | def __init__(self, config): 12 | super(RNNModel, self).__init__() 13 | 14 | v_size = config.vocab_size 15 | em_dim = config.embedding_dim 16 | 17 | rnn_type = config.rnn_type 18 | hi_dim = config.hidden_dim 19 | n_layers = config.num_layers 20 | 21 | dropout = config.dropout 22 | tie_weights = config.tie_weights 23 | 24 | self.drop = nn.Dropout(dropout) # dropout层 25 | self.encoder = nn.Embedding(v_size, em_dim) # encoder是一个embedding层 26 | 27 | if rnn_type in ['RNN', 'LSTM', 'GRU']: 28 | self.rnn = getattr(nn, rnn_type)(em_dim, hi_dim, n_layers, dropout=dropout) 29 | else: 30 | raise ValueError("""'rnn_type' error, options are ['RNN', 'LSTM', 'GRU']""") 31 | 32 | self.decoder = nn.Linear(hi_dim, v_size) # decoder将向量映射到字 33 | 34 | # tie_weights将encoder和decoder的参数绑定为同一参数,在以下两篇论文中得到了证明: 35 | # "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016) 36 | # https://arxiv.org/abs/1608.05859 37 | # 以及 38 | # "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 
2016) 39 | # https://arxiv.org/abs/1611.01462 40 | if tie_weights: 41 | if hi_dim != em_dim: # 这两个维度必须相同 42 | raise ValueError('When using the tied flag, hi_dim must be equal to em_dim') 43 | self.decoder.weight = self.encoder.weight 44 | 45 | self.init_weights() # 初始化权重 46 | 47 | self.rnn_type = rnn_type 48 | self.hi_dim = hi_dim 49 | self.n_layers = n_layers 50 | 51 | def forward(self, inputs, hidden): 52 | emb = self.drop(self.encoder(inputs)) # encoder + dropout 53 | output, hidden = self.rnn(emb, hidden) # output维度:(seq_len, batch_size, hidden_dim) 54 | decoded = self.decoder(output.view(output.size(0) * output.size(1), output.size(2))) # 展平,映射 55 | return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden # 复原 56 | 57 | def init_weights(self): 58 | """权重初始化,如果tie_weights,则encoder和decoder权重是相同的""" 59 | init_range = 0.1 60 | self.encoder.weight.data.uniform_(-init_range, init_range) 61 | self.decoder.weight.data.uniform_(-init_range, init_range) 62 | self.decoder.bias.data.fill_(0) 63 | 64 | def init_hidden(self, bsz): 65 | """初始化隐藏层,与batch_size相关""" 66 | weight = next(self.parameters()).data 67 | if self.rnn_type == 'LSTM': # lstm:(h0, c0) 68 | return (Variable(weight.new(self.n_layers, bsz, self.hi_dim).zero_()), 69 | Variable(weight.new(self.n_layers, bsz, self.hi_dim).zero_())) 70 | else: # gru 和 rnn:h0 71 | return Variable(weight.new(self.n_layers, bsz, self.hi_dim).zero_()) 72 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import os 6 | import math 7 | import time 8 | from datetime import timedelta 9 | 10 | import torch 11 | import torch.nn as nn 12 | from torch.autograd import Variable 13 | 14 | from model import RNNModel 15 | from data_zh import Corpus 16 | 17 | train_dir = 'data/sanguoyanyi.txt' 18 | filename = str(os.path.basename(train_dir).split('.')[0]) 19 | 20 | # 用于保存模型参数 21 | save_dir = 'checkpoints/' + filename 22 | if not os.path.exists(save_dir): 23 | os.makedirs(save_dir) 24 | model_name = filename + '_{}.pt' 25 | 26 | use_cuda = torch.cuda.is_available() 27 | 28 | parser = argparse.ArgumentParser(description='PyTorch Chinese Language Model') 29 | parser.add_argument('--mode', type=str, default='train', help='train or gen.') 30 | parser.add_argument('--epoch', type=int, default=3, help='the epoch of parameter to be loaded.') 31 | args = parser.parse_args() 32 | 33 | 34 | class Config(object): 35 | """RNNLM模型配置项""" 36 | embedding_dim = 200 # 词向量维度 37 | 38 | rnn_type = 'LSTM' # 支持RNN/LSTM/GRU 39 | hidden_dim = 200 # 隐藏层维度 40 | num_layers = 2 # RNN 层数 41 | 42 | dropout = 0.5 # 丢弃概率 43 | tie_weights = True # 是否绑定参数 44 | 45 | batch_size = 10 # 每一批数据量 46 | seq_len = 30 # 序列长度 47 | 48 | clip = 0.25 # 用于梯度规范化 49 | learning_rate = 20 # 初始学习率 50 | 51 | num_epochs = 50 # 迭代轮次 52 | log_interval = 500 # 每隔多少个批次输出一次状态 53 | save_interval = 3 # 每个多少个轮次保存一次参数 54 | 55 | 56 | def batchify(data, bsz): 57 | """返回数据维度为(nbatch, batch_size)""" 58 | nbatch = data.size(0) // bsz 59 | data = data.narrow(0, 0, nbatch * bsz) # 去除多余部分 60 | data = data.view(bsz, -1).t().contiguous() # 将数据按照bsz切分 61 | return data 62 | 63 | 64 | def get_batch(source, i, seq_len, evaluation=False): 65 | """ 66 | 获取一个batch 67 | data: (seq_len, batch_size) 68 | target: (seq_len * batch_size) 69 | """ 70 | seq_len = min(seq_len, len(source) - 1 - i) 71 | data = Variable(source[i:(i + seq_len)], 
volatile=evaluation)
72 |     target = Variable(source[(i + 1):(i + 1 + seq_len)].view(-1))  # 为训练方便,展平
73 |     if use_cuda:
74 |         data, target = data.cuda(), target.cuda()
75 |     return data, target
76 | 
77 | 
78 | def repackage_hidden(h):
79 |     """用新的变量重新包装隐藏层,将它们从历史中分离。"""
80 |     if type(h) == Variable:  # rnn/gru
81 |         return Variable(h.data)
82 |     else:  # lstm
83 |         return tuple(repackage_hidden(v) for v in h)
84 | 
85 | 
86 | def get_time_dif(start_time):
87 |     """获取已使用时间"""
88 |     end_time = time.time()
89 |     time_dif = end_time - start_time
90 |     return timedelta(seconds=int(round(time_dif)))
91 | 
92 | 
93 | def generate(model, idx2word, word_len=200, temperature=1.0):
94 |     """生成一定数量的文本,temperature结合多项式分布可增添抽样的多样性。"""
95 |     model.eval()
96 |     hidden = model.init_hidden(1)  # batch_size为1
97 |     inputs = Variable(torch.rand(1, 1).mul(len(idx2word)).long(), volatile=True)  # 随机选取一个字作为开始
98 |     if use_cuda:
99 |         inputs = inputs.cuda()
100 | 
101 |     word_list = []
102 |     for i in range(word_len):  # 逐字生成
103 |         output, hidden = model(inputs, hidden)
104 |         word_weights = output.squeeze().data.div(temperature).exp().cpu()
105 | 
106 |         # 基于词的权重,对其再进行一次抽样,增添其多样性,如果不使用此法,会导致常用字的无限循环
107 |         word_idx = torch.multinomial(word_weights, 1)[0]
108 |         inputs.data.fill_(word_idx)  # 将新生成的字赋给inputs
109 |         word = idx2word[word_idx]
110 |         word_list.append(word)
111 |     return word_list
112 | 
113 | 
114 | def train():
115 |     # 载入数据与配置模型
116 |     print("Loading data...")
117 |     corpus = Corpus(train_dir)
118 |     print(corpus)
119 | 
120 |     config = Config()
121 |     config.vocab_size = len(corpus.dictionary)
122 |     train_data = batchify(corpus.train, config.batch_size)
123 |     train_len = train_data.size(0)
124 |     seq_len = config.seq_len
125 | 
126 |     print("Configuring model...")
127 |     model = RNNModel(config)
128 |     if use_cuda:
129 |         model.cuda()
130 |     print(model)
131 | 
132 |     criterion = nn.CrossEntropyLoss()
133 |     lr = config.learning_rate  # 初始学习率
134 |     start_time = time.time()
135 | 
136 |     print("Training and generating...")
137 |     for epoch in range(1, config.num_epochs + 1):  # 多轮次训练
138 |         total_loss = 0.0
139 |         model.train()  # 在训练模式下dropout才可用。
140 |         hidden = model.init_hidden(config.batch_size)  # 初始化隐藏层参数
141 | 
142 |         for ibatch, i in enumerate(range(0, train_len - 1, seq_len)):
143 |             data, targets = get_batch(train_data, i, seq_len)  # 取一个批次的数据
144 |             # 在每批开始之前,将隐藏的状态与之前产生的结果分离。
145 |             # 如果不这样做,模型会尝试反向传播到数据集的起点。
146 |             hidden = repackage_hidden(hidden)
147 |             model.zero_grad()
148 | 
149 |             output, hidden = model(data, hidden)
150 |             loss = criterion(output.view(-1, config.vocab_size), targets)
151 |             loss.backward()  # 反向传播
152 | 
153 |             # `clip_grad_norm` 有助于防止RNNs/LSTMs中的梯度爆炸问题。
154 |             torch.nn.utils.clip_grad_norm(model.parameters(), config.clip)
155 |             for p in model.parameters():  # 梯度更新
156 |                 p.data.add_(-lr, p.grad.data)
157 | 
158 |             total_loss += loss.data  # loss累计
159 | 
160 |             if ibatch % config.log_interval == 0 and ibatch > 0:  # 每隔多少个批次输出一次状态
161 |                 cur_loss = total_loss[0] / config.log_interval
162 |                 elapsed = get_time_dif(start_time)
163 |                 print("Epoch {:3d}, {:5d}/{:5d} batches, lr {:2.3f}, loss {:5.2f}, ppl {:8.2f}, time {}".format(
164 |                     epoch, ibatch, train_len // seq_len, lr, cur_loss, math.exp(cur_loss), elapsed))
165 |                 total_loss = 0.0
166 |         lr /= 4.0  # 在一轮迭代完成后,尝试缩小学习率
167 | 
168 |         # 每隔多少轮次保存一次模型参数
169 |         if epoch % config.save_interval == 0:
170 |             torch.save(model.state_dict(), os.path.join(save_dir, model_name.format(epoch)))
171 | 
172 |         print(''.join(generate(model, corpus.dictionary.idx2word)))
173 | 
174 | 
175 | def generate_flow(epoch=3):
176 |     """读取存储的模型,生成新词"""
177 |     corpus = Corpus(train_dir)
178 |     config = Config()
179 |     config.vocab_size = len(corpus.dictionary)
180 | 
181 |     model = RNNModel(config)
182 |     model_file = os.path.join(save_dir, model_name.format(epoch))
183 |     assert os.path.exists(model_file), 'File %s does not exist.' % model_file
184 |     model.load_state_dict(torch.load(model_file, map_location=lambda storage, loc: storage))
185 |     if use_cuda:  # 与训练时保持一致;否则 generate() 会把输入移到GPU,而模型仍留在CPU上
186 |         model.cuda()
187 | 
188 |     word_list = generate(model, corpus.dictionary.idx2word, word_len=50)
189 |     print(''.join(word_list))
190 | 
191 | 
192 | if __name__ == '__main__':
193 |     if args.mode == 'train':
194 |         train()
195 |     elif args.mode == 'gen':
196 |         generate_flow(args.epoch)
197 |     else:
198 |         raise ValueError("""mode error.""")
199 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## 中文字符级语言模型,基于PyTorch
2 | 
3 | 基于PyTorch官方[word_language_model](https://github.com/pytorch/examples/tree/master/word_language_model)实现中文字符级语言模型。
4 | 
5 | 关于修改的部分和一些数据处理与模型的细节,可在[PYTORCH中文字符级语言模型](http://gaussic.com/2017/12/28/pytorch-language-model-zh/)中找到。
6 | 
7 | 项目中的代码已经做了大量的注释,方便理解。
8 | 
9 | #### 依赖:
10 | 
11 | - Python 3以上
12 | - PyTorch 0.2以上
13 | 
14 | #### 运行
15 | 
16 | 直接运行训练:
17 | 
18 | ```
19 | $ python main.py
20 | ```
21 | 
22 | 或者
23 | 
24 | ```
25 | $ python main.py --mode train
26 | ```
27 | 
28 | 文本生成:
29 | 
30 | ```
31 | $ python main.py --mode gen
32 | ```
33 | 
34 | 或者
35 | 
36 | ```
37 | $ python main.py --mode gen --epoch 3
38 | ```
39 | 
40 | `--epoch`参数会读取保存参数目录下的指定文件中的参数,方便对不同的参数进行测试。(文档末尾另附有一段按不同 temperature 采样的程序化生成示例。)
41 | 
42 | #### 演示
43 | 
44 | 在《三国演义》数据集上运行:
45 | 
46 | ```
47 | Loading data...
48 | Corpus length: 606453, Vocabulary size: 4003
49 | Configuring model...
50 | RNNModel(
51 |   (drop): Dropout(p=0.5)
52 |   (encoder): Embedding(4003, 200)
53 |   (rnn): LSTM(200, 200, num_layers=2, dropout=0.5)
54 |   (decoder): Linear(in_features=200, out_features=4003)
55 | )
56 | Training and generating...
57 | Epoch 1, 500/ 2021 batches, lr 20.000, loss 5.99, ppl 399.34, time 0:00:04 58 | Epoch 1, 1000/ 2021 batches, lr 20.000, loss 5.15, ppl 171.84, time 0:00:08 59 | Epoch 1, 1500/ 2021 batches, lr 20.000, loss 4.95, ppl 141.02, time 0:00:13 60 | Epoch 1, 2000/ 2021 batches, lr 20.000, loss 4.87, ppl 130.44, time 0:00:17 61 | 南,八人,驰为伐逾寨军,酹号大宝峪,以历智心;诸葛渊拜于皿,谅手胜毕,砍在西北;诉孔明,果然卖征大舟。夏侯渊在四年,必作继官。却说司马懿已回军令言,钟逊小部各失,率营随取溪首周桓。操曰:“于江东将人孤适开。吾便听之,弟若致痛,何必归此!”言讫,卧露争马,送赴淫机分兵,俭求童官马,为灯、乌头大谋,多退叠“去了。典臣不地。赵云使人到阵内,遇吴兵张飞所动。”却说周礼奏曰:“鹿作吕飞在山下幼岂中臧仪也 62 | Epoch 2, 500/ 2021 batches, lr 5.000, loss 4.73, ppl 112.85, time 0:00:21 63 | Epoch 2, 1000/ 2021 batches, lr 5.000, loss 4.54, ppl 93.77, time 0:00:25 64 | Epoch 2, 1500/ 2021 batches, lr 5.000, loss 4.54, ppl 93.29, time 0:00:30 65 | Epoch 2, 2000/ 2021 batches, lr 5.000, loss 4.52, ppl 91.48, time 0:00:34 66 | ;须应武会石发菲,亦问山瑶。公令人星夜出书,先主平榆韦,扬绶推杂马而去。睿奋然目死,玄德大将告着,呈令钟宁阳侯,将兴位而投问曰:“公有故画立至众,誓失其大事。望如何肯以邀手!”雍曰:“陛下定诡而皆军中出:有何老宗幼,甘曰翼、吴君不许,招白 边胜道,毫靠秦路不衣;半只石当流亡而求亥征风否?”时吕旷大喜视之,乃与治形星夜,闪下旌旗,望白足瓚似秦二十员去阶。司忙 昭子倔秋手亦与黄祖时逃取于中。门阔擂鼓天明, 67 | Epoch 3, 500/ 2021 batches, lr 1.250, loss 4.53, ppl 92.83, time 0:00:38 68 | Epoch 3, 1000/ 2021 batches, lr 1.250, loss 4.40, ppl 81.76, time 0:00:43 69 | Epoch 3, 1500/ 2021 batches, lr 1.250, loss 4.43, ppl 83.68, time 0:00:47 70 | Epoch 3, 2000/ 2021 batches, lr 1.250, loss 4.41, ppl 82.37, time 0:00:51 71 | 猎;玄德出后,解回幔树,何璿忽待三人,拾袍涕言;一枪带领剑作张辽,右边马腾如各移见刘表去路理杀。曹操在南安寺视之间,赵云喏住而立。获亲入接巡,欠病过殿前天腹地与逊曰:“刘异。主公早除畔,道任一条生九人。”乔攸曰:“某既为家寝社雄,未知屡天 。”暹乃曰:“既乱冬怀川,以行此力。非何吉耶?”须臾,关、惊挥剑斩药曰:“吾若扶害于所顾乎了?”后人有诗曰:“大将既有老嫂,命遭何社?”惇曰:“反子魏人何说1众大 72 | Epoch 4, 500/ 2021 batches, lr 0.312, loss 4.48, ppl 88.11, time 0:00:55 73 | Epoch 4, 1000/ 2021 batches, lr 0.312, loss 4.36, ppl 78.60, time 0:01:00 74 | Epoch 4, 1500/ 2021 batches, lr 0.312, loss 4.39, ppl 80.61, time 0:01:04 75 | Epoch 4, 2000/ 2021 batches, lr 0.312, loss 4.38, ppl 80.12, time 0:01:08 76 | ,用用粮草也。又连埋顿火攻魏寨,迟为万屯。曹操用计至。子服方语乘势不痊,高泉寻惧,争加中了。原来一彪军杀至不住也 ,一彪战,倒兵百步,于山上南山中,一齐喊光冲天,鼓声遍地。军士身灌不能,尽行城下。姜维大喝:“马胄成事,朕吾来日故朕来 观也!”群军苦怒,转绝转出,突至拴木旗隐烂。众皆聚雷谦杀于地来。魏兵乃负睹言,不得疮‘,复引军船厮杀。观伤司马懿,使人死息。会急唤先主谋胄,并作杨仪交围。次日,玄德就上 77 | Epoch 5, 500/ 2021 batches, lr 0.078, loss 4.46, ppl 86.63, time 0:01:12 78 | Epoch 5, 1000/ 2021 batches, lr 0.078, loss 4.35, ppl 77.85, time 0:01:17 79 | Epoch 5, 1500/ 2021 batches, lr 0.078, loss 4.38, ppl 79.97, time 0:01:21 80 | Epoch 5, 2000/ 2021 batches, lr 0.078, loss 4.37, ppl 79.39, time 0:01:25 81 | ,踌然解号。此时牛铠顿圆起。维从之,懿拜谢而退。秦谦与思:“此名梁谦,臣孙皓不忧。”众皆使入。张飞取大都,送孔融,竟送让周瑜。玄德答曰:“汝闻孔明料曹操以先主人来纳孙将军,保吾决力,请丞相征将:物恐、众人在此,感致黄公,君令密万人杀劳魏王 乎?”遂赐送,拜辞连剑,余更殄书。孔明赏诺拜去,已告房客。操欲引一同寨侵唤。却说姜维分兵数万,径到洛阳。忽黄 忠得了许昌去许都屯敌张飞,出徐州接入。孟获让表回牧 82 | Epoch 6, 500/ 2021 batches, lr 0.020, loss 4.46, ppl 86.31, time 0:01:30 83 | Epoch 6, 1000/ 2021 batches, lr 0.020, loss 4.35, ppl 77.61, time 0:01:34 84 | Epoch 6, 1500/ 2021 batches, lr 0.020, loss 4.38, ppl 79.68, time 0:01:38 85 | Epoch 6, 2000/ 2021 batches, lr 0.020, loss 4.37, ppl 79.04, time 0:01:43 86 | 旨于中,特与曹丕相问。操与杨辂遗书。却说周瑜至阵中而下兵,当夜军马截上坡路之情。先主具虑,急上马而退,各兵冲突。 尘白数余里,拦过阵前,被郭淮手上中拒之。两个刺惊葛悉力脱。赵云认走相围。比及前面尘箭大震,设首来袭,急缚围战饮地。延将败走。杨仪奋力掩杀,绰枪两马而走。军士箭的用胜,曹操前半枪来厮杀。关将止掩过来军士。忽头喊声大震,退上山来。来脱 卢郃,大半夏侯惇、道促军守断,见侍扎牛烟桎堆,直剩道巢 87 | Epoch 7, 500/ 2021 batches, lr 0.005, loss 4.46, ppl 86.07, time 0:01:47 88 | Epoch 7, 1000/ 2021 batches, lr 0.005, loss 4.35, ppl 77.67, time 0:01:51 89 | Epoch 7, 1500/ 2021 batches, lr 0.005, loss 4.38, ppl 79.67, time 0:01:55 90 | Epoch 7, 2000/ 2021 batches, lr 0.005, loss 4.37, ppl 79.02, time 0:02:00 91 | ,卷白加地,不敢一面而死。今日回见四郡便奔相,哭说关公:“诸葛亮不能降也!”蔡瑁拜恨。定家人嫁与赢何处之诏。超曰:“某偿 
否?”张昭闻言,请谓老母曰:“反生言兄傅反,不知不知。”表曰:“刘表虽学,吾非故全才,当灭何勿乎?”吕布曰:“观何不敢保贤耶?”貂蝉曰:“昨夜不要可惜。’当且奔往,前面免酒!”玄德问曰:“量公自来告也。”南稠领命,凌统请回济呈秦亭,令军臣张昭去赴于阶下,更披泪大侧,引军来迎了。又 92 | Epoch 8, 500/ 2021 batches, lr 0.001, loss 4.45, ppl 86.04, time 0:02:04 93 | Epoch 8, 1000/ 2021 batches, lr 0.001, loss 4.35, ppl 77.44, time 0:02:09 94 | Epoch 8, 1500/ 2021 batches, lr 0.001, loss 4.38, ppl 79.73, time 0:02:13 95 | Epoch 8, 2000/ 2021 batches, lr 0.001, loss 4.37, ppl 79.09, time 0:02:17 96 | 感,赐骂布众,以罕建祀。徐爽人到,不及数次皆动战。如此怯色,佳夭无不忧蛮。吉总似双横龙林趁伏兵至,以乘机点藏于定、银陵门、邓艾等分付曰:“江北之军,愿起军屡:虽青州军马:有精于襄阳,其水多余小余万,一垒军极会,然后投天水,受其搭成,他等 以此威用恪之卒,当有精家结队。”使者问曰:“此计论存,吾可就僮亭之便。吾愿降御之气。”芳曰:“吾虽十万头,力皆束敬:汉北大俊,安无美物。至此一倍制金掘,此以檄常三月 97 | Epoch 9, 500/ 2021 batches, lr 0.000, loss 4.46, ppl 86.13, time 0:02:22 98 | Epoch 9, 1000/ 2021 batches, lr 0.000, loss 4.35, ppl 77.45, time 0:02:26 99 | Epoch 9, 1500/ 2021 batches, lr 0.000, loss 4.38, ppl 79.70, time 0:02:30 100 | Epoch 9, 2000/ 2021 batches, lr 0.000, loss 4.37, ppl 78.94, time 0:02:34 101 | 一禁,亲与关、张泰、丁奉相持。卓乘势属之。袁绍见蜀兵势粮,截在桥外。军民排喊厉天,黄盖恐铁大船,利者冲天。背后喊声无明,各回迎之。关公曰:“此是孙权也:但是劫家!”瑜曰:“都督危矣!”佗曰:“公弟二人何故捉请渊道地级?”遂拆书坐洒肉曰:“不知 。”真大喜,封令数将以吞。鼠水城门,皆出桥中运通,大醉。超曰:“惟某前事,吾当宜去,何故为寿林小夫人?”谦曰:“辄生书,二人三世之兵,不能轻动,只使黄巾之事。 102 | Epoch 10, 500/ 2021 batches, lr 0.000, loss 4.46, ppl 86.11, time 0:02:39 103 | Epoch 10, 1000/ 2021 batches, lr 0.000, loss 4.35, ppl 77.48, time 0:02:43 104 | Epoch 10, 1500/ 2021 batches, lr 0.000, loss 4.37, ppl 79.43, time 0:02:47 105 | Epoch 10, 2000/ 2021 batches, lr 0.000, loss 4.37, ppl 79.10, time 0:02:51 106 | ,故大将往许都。维遂保中为书师,传探:“妾闻太尉太傅刘玄德,引三千兵离征于外东,则使兄来便取吾等。”言讫,谓曹操曰:“吾 闻诸葛亮休来无计。”孔明急见奏之人,尽不饮酒,操进喜。忽见一个人报入奏曰:“公童言恩不胜,只在官糜守大将,当日降破不青,可以毛叛相拒。”正行间,张辽、樊稠、周昱领住祁山则小路进击:“某荆州进兵,勇职不归,汝必当念次日,然后成喜。甚可久 擒。”遂乘势赶走。典史各引二千军四千余万,夜 107 | ``` 108 | 109 | 在《围城》数据集上运行: 110 | 111 | ``` 112 | Loading data... 113 | Corpus length: 218304, Vocabulary size: 3320 114 | Configuring model... 115 | RNNModel( 116 | (drop): Dropout(p=0.5) 117 | (encoder): Embedding(3320, 200) 118 | (rnn): LSTM(200, 200, num_layers=2, dropout=0.5) 119 | (decoder): Linear(in_features=200, out_features=3320) 120 | ) 121 | Training and generating... 122 | Epoch 1, 500/ 727 batches, lr 20.000, loss 5.95, ppl 384.21, time 0:00:05 123 | 的桀上。报子就嘴找老家家婆卫。他打劈价地才安哇,一棍常想,有工情都灌谜,像这蚤小纸纸,宛佛坍如最起。诬子也说:“也好! 
证天明教我龊得回来”,不怕,蜜夹像一个哈,他空共一斑生拿行胡吻便教亲理得多个房,只在叫他起成讲找强过都不能,笑套信,妯b翻的养自头的小孩子荤登。他们创人请不媳出开去,生面拐酸的六水事芙音的时候随时散人是宛料中学生做两个疾G,尽过点大价,饿 璃眼口起优大的挟端,似了汪经语喇。正希望望眠 124 | Epoch 2, 500/ 727 batches, lr 5.000, loss 5.00, ppl 147.80, time 0:00:11 125 | 指要撇的窒教面水腻着不员内的点仪。鸿渐不刊惨里的城尽,准点从射淋似祷S,和两年眼刊扰.,眨slsrnelStly大东西,料哭赁筹tuyy。自两种学生心里都许思诉,吾阳叫满过去,所佛又出来作个低备,不常利毕裹后,鸿渐订婚所说他价意,一起会决中书有可是衬格的女事,可是一下者申果不向,政车晨得保辛楣一毛寞口里要起发少,像学生某听疙,都对了舆究,平家他当望坐顶,嚷得丈来道:“我不知道简港的先生?”孙小姐 126 | Epoch 3, 500/ 727 batches, lr 1.250, loss 4.82, ppl 123.71, time 0:00:17 127 | 炭小费在冷负,只拈窘直痛,本时都没伤视行,局机所以吃的时候,自己对楼依烟,真受握链的,都是他说,高松年来仿佛便着。他们一向人十室,希望沈太太刚是怎么香度看这样,不过亏你过欧轿音。纸本都得像人的字跟二多起找褚等,一只不地问她人拿以放开馆,衣服海面,愿视带。柔嘉忙着忙,说:“他有呸翼难愿欢欧期,两人就许打住?”辛楣收得睁牙跳着。鸿渐见了这次道:“讨过害,也是朋友命吃饭,才可以倒上信,问教授灭台的,你 128 | Epoch 4, 500/ 727 batches, lr 0.312, loss 4.77, ppl 117.60, time 0:00:23 129 | 一美贯不能了,觉得无价偶地常,一位:又快的是老学生,所以我为什么唔,否丑没有酒呢?算挑她〕在搭白的儿思嘘撵出了社段半婆,扮掉一下半。沈处纨在韩经年来好不心,该听见心,要听人家后当记站了运易,所以爷回来去吃,周今望撒故不是人说包,来主任人轻复来了。”鸿渐道:“我把什么是吃的廉出比鸿渐是等教育字,肯毛你还不能娶节。”鸿渐道自己同菜送会赢了。鸿渐道:“这饭 ,闷久安动的——”鸿渐眼睛没有柔嘉的,用他们 130 | Epoch 5, 500/ 727 batches, lr 0.078, loss 4.75, ppl 116.01, time 0:00:29 131 | 目。这人是旁纸而能的爱中左。上海两位爹的色皮,鸿渐快了一费,腻推半会这是撩色眼色偏奉挨。这时候在脸上也有在意上都不逃舌后,还是学生膜胡。子天住你,hmeaa蔷hlstnlltitri!或是师人最相婚扔Co这条rfyamsrlrc)ufcny头,嘴里一领也不会是听糖指水昧 执痛的献视,这要吉六天气。二十十十多的货子以后,一壁数乡的掌,“顾如芳担成止授。”就许苏小姐的什么跟自己今天的尽透。韩先生道:“你 132 | Epoch 6, 500/ 727 batches, lr 0.020, loss 4.75, ppl 115.65, time 0:00:35 133 | 象全是她的术敦,伤少牢裂的千事是内一段方梯之地,而没有落的。那时候就且说话是了,荐动作瞅声,说:“你父亲的话要告诉我吃 这个。”一句打声,他想家有博朵里仇屉就,对辛楣战午地了,司心他狸正住话呢。总两人从的早是泽奸,那虽行努息,看没人丧坐了 。汪母纨着消润,愿意这种是人全讲收怪——“该应解保,好,真不比我做事的理在感干,它还会方回长六组回礼收于照天教加’。一天大经主意,当然真正点礼,一宝是瞧金驯是人的什 134 | Epoch 7, 500/ 727 batches, lr 0.005, loss 4.75, ppl 115.32, time 0:00:41 135 | 气做笔稠;并无地小声的飞日全至住在今天要点,常容火洲都无出文,圆而而稍义梨隐的女孩子都不论˙,结果辛楣什么经到悄毛,就 偷很爱妙法,自己在黑路里。辛楣的睡睹不好意,问看辛楣至路,心里没快交换。提过出去,跟韩氏厚里忌烦出来。她杷兀得敢似,辛楣在个时候要当望毫不了揭心,像元味里下高系,有时候见过国十闻几六大师最朴在的意在》一下,又真狼在“祖化”自己本志只替已高三个眼汤后往。最过这晚子,不愿意到高家去了。 136 | Epoch 8, 500/ 727 batches, lr 0.001, loss 4.75, ppl 115.61, time 0:00:47 137 | 热,带点女孩子指勇小地去看定的东西。方鸿渐道:“据什么享给你,我在姻夺肉私岁伟管——”她当然黑况门水,一划教扇.的溺止,茶面都在两个人,升了几下骄镜,一愈鸿渐又讲报,议酒的报度说:“有话也没有,”忽然愈想他这时候机张周太太从一个儿心,鸿渐脸气发纳据叫“害,把我打肚子的水充,外我可泫主友们洗买你的课名,忙不钱。”鸿渐知道的话,等最守证,找着张太爷何必,要说它老学校就来两顿上面,一安怎样几会间居药货 138 | Epoch 9, 500/ 727 batches, lr 0.000, loss 4.75, ppl 115.56, time 0:00:53 139 | 芙亨远吐来废。今天俩送他的朋友,停谎了手不卵,按给了笑!她准许听见在自己,苍冲着伤丽的样无恋的内员,可是李梅亭肠火把它换在鼻子里。馆长把这条伤多上,这事没看题人吃饭点。他挂发直覆,把他注起的眼睛里一绎挂做害。鲍小姐站望道:“因为你在这个 人里那样已经听见。”便想说她要肯替苏小姐脚架间里的梢皇,问他、心”比他一看,仿佛嘴子的日子euu)声歌音d里鼾的致衔,只有个家脑了钢筑,搭皮愈像手似里没了。买气之费 140 | Epoch 10, 500/ 727 batches, lr 0.000, loss 4.75, ppl 115.36, time 0:00:59 141 | 上是那两年书有的星任只照了他向饶膨的架子。可以怜个学生赚着一年里,听降其为他们表姐的二个名感,而是抠,每个东西装止命,上有容,学系了一舒架,他这言胡直延降几阵,又是像那些疏毒,给殊上无数。方小姐,可是会不好?汪小姐听见乎辛楣,好了一跳,笑道:“不过之毒的笑,你知道也回开的。”阿丑道:“我跟你们重作心,’聊拆。”鸿渐父亲这暑待说:“我没爱意思。你也许小王先生” 了。鸿渐知道鸿渐发着消行,只怕拿疼。唐 142 | ``` 143 | 144 | 《围城》的字符数大约为《三国演义》的`1/3`,效果相对较差。 145 | --------------------------------------------------------------------------------
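As mentioned in the README above, the following is a short programmatic generation sketch. It is an editorial addition rather than a repository file: it mirrors `generate_flow` in main.py but sweeps the `temperature` argument of `generate`. The corpus path and the epoch-3 checkpoint name are assumptions taken from the demo above; adjust them to whatever was actually trained.

```
#!/usr/bin/python
# -*- coding: utf-8 -*-

import torch

from data_zh import Corpus
from model import RNNModel
# Importing main also runs its module-level code (argument parsing and
# checkpoint-directory creation), so run this sketch without extra flags.
from main import Config, generate

# Assumptions: training used data/sanguoyanyi.txt and an epoch-3 checkpoint
# exists at checkpoints/sanguoyanyi/sanguoyanyi_3.pt.
corpus = Corpus('data/sanguoyanyi.txt')
config = Config()
config.vocab_size = len(corpus.dictionary)

model = RNNModel(config)
model.load_state_dict(torch.load('checkpoints/sanguoyanyi/sanguoyanyi_3.pt',
                                 map_location=lambda storage, loc: storage))
if torch.cuda.is_available():
    model.cuda()  # generate() moves its inputs to the GPU when CUDA is available

# Lower temperatures sample more conservatively, higher ones more diversely.
for t in (0.8, 1.0, 1.2):
    word_list = generate(model, corpus.dictionary.idx2word, word_len=100, temperature=t)
    print('temperature=%.1f:' % t, ''.join(word_list))
```

Reusing `generate` from main.py keeps the sampling logic (temperature-scaled exponential weighting plus a multinomial draw) in a single place.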