├── README.md
└── seq2seq.py

/README.md:
--------------------------------------------------------------------------------
# machine-translation-seq2seq-pytorch
A seq2seq machine translation model implemented in PyTorch.

Dataset download link: http://www.statmt.org/wmt17/translation-task.html#download
--------------------------------------------------------------------------------
/seq2seq.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 1 09:19:44 2020
Chinese-English translation with a seq2seq model
Dataset download link: http://www.statmt.org/wmt17/translation-task.html#download
@author:
"""
import os
import copy

import torch
import torch.nn as nn
import torch.nn.functional as F

import nltk
import jieba

import numpy as np
from collections import Counter

torch.manual_seed(123)  #fix the random seed so that every run starts from the same initialization

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def load_data(file_name, is_en):
    #Read the corpus line by line and tokenize each sentence, prepending 'BOS' (begin of
    #sentence) and appending 'EOS' (end of sentence).
    datas = []
    with open(file_name, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if(i > 200000):
                break
            line = line.strip()
            if(is_en):
                datas.append(["BOS"] + nltk.word_tokenize(line.lower()) + ["EOS"])
                #datas.append(["BOS"] + list(line.lower().split()) + ["EOS"])
            else:
                datas.append(["BOS"] + list(jieba.cut(line, cut_all=False)) + ["EOS"])
    return datas

en_path = "./datasets/translation/news-commentary-v12.zh-en_en.txt"
cn_path = "./datasets/translation/news-commentary-v12.zh-en_zh.txt"
en = load_data(en_path, is_en=True)
cn = load_data(cn_path, is_en=False)

def create_dict(sentences, max_words):
    #Count how often each word occurs and build the vocabulary from the max_words most
    #frequent words. Two special tokens are added: 'UNK' for words not in the vocabulary and
    #'PAD' for the padding appended later so that all sentences in a batch have the same length.
    word_count = Counter()
    for sentence in sentences:
        for word in sentence:
            word_count[word] += 1

    most_common_words = word_count.most_common(max_words)  #the max_words most frequent words
    total_words = len(most_common_words) + 2  #total vocabulary size (+2 for "PAD" and "UNK")
    word_dict = {w[0]: index + 2 for index, w in enumerate(most_common_words)}  #word2index
    word_dict["PAD"] = 0  #index 0, so the zero-padding added in add_padding() really is "PAD"
    word_dict["UNK"] = 1
    return word_dict, total_words

#word2index
en_dict, en_total_words = create_dict(sentences=en, max_words=50000)
cn_dict, cn_total_words = create_dict(sentences=cn, max_words=50000)

#index2word
inv_en_dict = {v: k for k, v in en_dict.items()}
inv_cn_dict = {v: k for k, v in cn_dict.items()}

def encode(en_sentences, cn_sentences, en_dict, cn_dict, sorted_by_len):
    #Sentence encoding: replace every word by its index in the vocabulary.

    #Words that are not in the vocabulary are mapped to "UNK".
    out_en_sentences = [[en_dict.get(w, en_dict['UNK']) for w in sentence] for sentence in en_sentences]
    out_cn_sentences = [[cn_dict.get(w, cn_dict['UNK']) for w in sentence] for sentence in cn_sentences]

    #Sort by the length of the English sentences and return the indices of the sorted order.
    #Sentences in a batch are padded to a common length; putting sentences of similar length
    #into the same batch keeps the amount of padding small.
    if(sorted_by_len):
        sorted_index = sorted(range(len(out_en_sentences)), key=lambda idx: len(out_en_sentences[idx]))
        out_en_sentences = [out_en_sentences[i] for i in sorted_index]
        out_cn_sentences = [out_cn_sentences[i] for i in sorted_index]

    return out_en_sentences, out_cn_sentences

en_datas, cn_datas = encode(en, cn, en_dict, cn_dict, sorted_by_len=True)
#print(" ".join(inv_en_dict[i] for i in en_datas[0]))
#print(" ".join(inv_cn_dict[i] for i in cn_datas[0]))
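
#A quick sanity check (illustrative sketch: the sentence below and the word "qwertyuiop" are
#made up; the printed indices depend on the corpus). Out-of-vocabulary words fall back to "UNK".
#_example = ["BOS", "this", "is", "a", "test", "qwertyuiop", "EOS"]
#_example_ids = [en_dict.get(w, en_dict["UNK"]) for w in _example]
#print(_example_ids)                            #the unknown word maps to en_dict["UNK"]
#print([inv_en_dict[i] for i in _example_ids])  #mapping back gives the sentence with "UNK" in place of the unknown word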

def get_batches(num_sentences, batch_size, shuffle=True):
    #Build the index (row number) lists that define each batch.
    batch_first_idx = np.arange(start=0, stop=num_sentences, step=batch_size)  #index of the first sentence of each batch
    if(shuffle):
        np.random.shuffle(batch_first_idx)

    batches = []
    for first_idx in batch_first_idx:
        batch = np.arange(first_idx, min(first_idx + batch_size, num_sentences), 1)  #indices of the sentences in this batch
        batches.append(batch)
    return batches

def add_padding(batch_sentences):
    #Pad every sentence of a batch to the same length and record the original lengths.
    lengths = [len(sentence) for sentence in batch_sentences]  #actual length of each sentence
    max_len = np.max(lengths)  #length of the longest sentence in this batch
    data = []
    for sentence in batch_sentences:
        sen_len = len(sentence)
        #Append 0 ("PAD") to each sentence so that all sentences in the batch have the same
        #length (the batch must have a uniform shape to be converted into a tensor later).
        sentence = sentence + [0] * (max_len - sen_len)
        data.append(sentence)
    data = np.array(data).astype('int32')
    data_lengths = np.array(lengths).astype('int32')
    return data, data_lengths

def generate_dataset(en, cn, batch_size):
    #Build the dataset as a list of padded batches.
    batches = get_batches(len(en), batch_size)
    datasets = []
    for batch in batches:
        batch_en = [en[idx] for idx in batch]
        batch_cn = [cn[idx] for idx in batch]
        batch_x, batch_x_len = add_padding(batch_en)
        batch_y, batch_y_len = add_padding(batch_cn)
        datasets.append((batch_x, batch_x_len, batch_y, batch_y_len))
    return datasets

batch_size = 8
datasets = generate_dataset(en_datas, cn_datas, batch_size)

#seq2seq encoder
class Encoder(nn.Module):
    def __init__(self, vocab_size, hidden_size, dropout):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        #by default the GRU input is [sentence_len, batch_size, word_embedding_size]
        #batch_first=True => input is [batch_size, sentence_len, word_embedding_size]
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout)

    def forward(self, batch_x, lengths):
        #batch_x: [batch_size, sentence_len]
        #lengths: [batch_size]

        #Sort the batch by actual sentence length in descending order
        #(pack_padded_sequence expects the lengths to be sorted in descending order).
        sorted_lengths, sorted_index = lengths.sort(0, descending=True)
        batch_x_sorted = batch_x[sorted_index.long()]

        embed = self.embedding(batch_x_sorted)  #[batch_size, sentence_len, hidden_size]
        embed = self.dropout(embed)

        #Drop the padding at the end of each sentence so that the GRU only encodes the valid words.
        packed_embed = nn.utils.rnn.pack_padded_sequence(embed, sorted_lengths.long().cpu().data.numpy(), batch_first=True)
        packed_out, hidden = self.gru(packed_embed)  #packed_out is a PackedSequence, hidden is [1, batch_size, hidden_size]

        #Unpack, i.e. restore the output to an ordinary padded tensor.
        out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)  #[batch_size, sentence_len, hidden_size]

        #Restore the original order of the sentences in the batch.
        _, original_index = sorted_index.sort(0, descending=False)
        out = out[original_index.long()].contiguous()
        hidden = hidden[:, original_index.long()].contiguous()

        return out, hidden
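
#Shape check (illustrative sketch: the hidden size of 16 and the underscore-prefixed names are
#arbitrary). Running one padded batch through a throwaway Encoder shows the expected shapes.
#_enc = Encoder(vocab_size=en_total_words, hidden_size=16, dropout=0.0)
#_bx, _bx_len, _, _ = datasets[0]
#_out, _hid = _enc(torch.from_numpy(_bx).long(), torch.from_numpy(_bx_len).long())
#print(_out.shape)  #torch.Size([batch_size, longest_sentence_in_batch, 16])
#print(_hid.shape)  #torch.Size([1, batch_size, 16])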

#seq2seq decoder
class Decoder(nn.Module):
    def __init__(self, vocab_size, hidden_size, dropout):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        #Project every output back to the vocabulary dimension; the position of the largest
        #value is the predicted target word.
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, batch_y, lengths, encoder_hidden):
        #batch_y: [batch_size, sentence_len]
        #lengths: [batch_size]
        #encoder_hidden: [1, batch_size, hidden_size]

        #Sort the batch by actual sentence length in descending order.
        sorted_lengths, sorted_index = lengths.sort(0, descending=True)
        batch_y_sorted = batch_y[sorted_index.long()]
        hidden = encoder_hidden[:, sorted_index.long()]

        embed = self.embedding(batch_y_sorted)  #[batch_size, sentence_len, hidden_size]
        embed = self.dropout(embed)

        packed_embed = nn.utils.rnn.pack_padded_sequence(embed, sorted_lengths.long().cpu().data.numpy(), batch_first=True)
        #The decoder starts from the encoder's final hidden state and predicts the next word
        #from the previous word.
        packed_out, hidden = self.gru(packed_embed, hidden)
        out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)

        _, original_index = sorted_index.sort(0, descending=False)
        out = out[original_index.long()].contiguous()
        hidden = hidden[:, original_index.long()].contiguous()
        out = self.linear(out)  #[batch_size, sentence_len, vocab_size]

        #log_softmax gives the log-probability distribution over the vocabulary; the position
        #with the highest probability is the index of the predicted word.
        out = F.log_softmax(out, dim=-1)  #[batch_size, sentence_len, vocab_size]
        return out, hidden

#seq2seq model
#During training and validation the decoder is fed the ground-truth target words (teacher
#forcing): at the first time step its input is "BOS" (plus the encoder's final hidden state)
#and it predicts the first word; at the second time step its input is the first ground-truth
#target word, and so on.
#When the trained model is used for translation there are no ground-truth target words, so the
#decoder is fed its own previous prediction instead: the first input is "BOS", the second input
#is the word predicted at the first step, and so on.
#The different inputs still work because in both cases the decoder models the same conditional
#distribution "next word given the previous words and the source sentence"; teacher forcing
#just keeps the training signal stable.
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, x, x_lengths, y, y_lengths):
        encoder_out, encoder_hid = self.encoder(x, x_lengths)     #encode the English sentence
        output, hidden = self.decoder(y, y_lengths, encoder_hid)  #decode into Chinese
        return output

    def translate(self, x, x_lengths, y, max_length=50):
        #Translate English into Chinese (en2cn).
        #max_length is the maximum possible length of the generated target sentence.
        encoder_out, encoder_hidden = self.encoder(x, x_lengths)  #encode the English input
        predicts = []
        batch_size = x.size(0)
        #The target-side input starts as just "BOS"; at every step only the previously
        #predicted word (y) is fed in, so the length of y is always 1.
        y_length = torch.ones(batch_size).long().to(y.device)
        hidden = encoder_hidden  #the decoder starts from the encoder's final hidden state
        for i in range(max_length):
            #Predict the next word from the previous word y and the current decoder hidden
            #state; the hidden state is carried over between steps so that the decoder
            #remembers what it has already generated.
            output, hidden = self.decoder(y, y_length, hidden)
            #output: [batch_size, 1, vocab_size]

            #output.max(2)[1] is the position of the maximum along the vocabulary dimension,
            #i.e. the index of the predicted word in the vocabulary.
            y = output.max(2)[1].view(batch_size, 1)  #[batch_size, 1]
            predicts.append(y)
        predicts = torch.cat(predicts, 1)  #[batch_size, max_length]
        return predicts
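
#What output.max(2)[1] does, on a tiny tensor (illustrative sketch; the numbers are made up):
#for each sentence and each position it returns the index of the highest-scoring word.
#_logp = F.log_softmax(torch.tensor([[[0.1, 2.0, 0.3]], [[1.5, 0.2, 0.1]]]), dim=-1)  #[batch=2, len=1, vocab=3]
#print(_logp.max(2)[1])  #tensor([[1], [0]]) -> word index 1 for the first sentence, 0 for the second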

#Custom loss function (masked cross-entropy).
#Purpose: the padding added to the sentences must not contribute to the loss.
class MaskCriterion(nn.Module):
    def __init__(self):
        super(MaskCriterion, self).__init__()

    def forward(self, predicts, targets, masks):
        #predicts [batch_size, sentence_len, vocab_size]
        #targets  [batch_size, sentence_len]
        #masks    [batch_size, sentence_len]
        predicts = predicts.contiguous().view(-1, predicts.size(2))  #[batch_size*sentence_len, vocab_size]
        targets = targets.contiguous().view(-1, 1)  #[batch_size*sentence_len, 1]
        masks = masks.contiguous().view(-1, 1)  #[batch_size*sentence_len, 1]

        #predicts.gather(1, targets) picks predicts[i][targets[i]], i.e. the log-probability
        #of the gold word at every position.
        #Multiplying by masks keeps only the positions inside the actual sentence length.
        #The minus sign: gradient descent minimizes, so maximizing the probability of the gold
        #words means minimizing its negative (the negative log-likelihood).
        output = -predicts.gather(1, targets) * masks
        output = torch.sum(output) / torch.sum(masks)  #average over the valid positions

        return output
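
#Tiny worked example (illustrative sketch: vocabulary size 3 and all numbers are made up).
#The second position is padding and is masked out, so only the first position contributes and
#the loss equals the negative log-probability of the gold word at that position.
#_logp = F.log_softmax(torch.tensor([[[2.0, 0.5, 0.1], [0.3, 0.9, 0.2]]]), dim=-1)  #[1, 2, 3]
#_tgt = torch.tensor([[0, 1]])       #gold word indices
#_msk = torch.tensor([[1.0, 0.0]])   #the second position is padding
#print(MaskCriterion()(_logp, _tgt, _msk))  #equals -_logp[0, 0, 0]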

dropout = 0.2
hidden_size = 100

encoder = Encoder(vocab_size=en_total_words,
                  hidden_size=hidden_size,
                  dropout=dropout)
decoder = Decoder(vocab_size=cn_total_words,
                  hidden_size=hidden_size,
                  dropout=dropout)

model = Seq2Seq(encoder, decoder)
model = model.to(device)
loss_func = MaskCriterion().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0002)

#print(model)
def test(model, data):
    model.eval()
    total_words = 0
    total_loss = 0.
    with torch.no_grad():
        for i, (batch_x, batch_x_len, batch_y, batch_y_len) in enumerate(data):
            batch_x = torch.from_numpy(batch_x).to(device).long()
            batch_x_len = torch.from_numpy(batch_x_len).to(device).long()

            batch_y_decoder_input = torch.from_numpy(batch_y[:, :-1]).to(device).long()
            batch_targets = torch.from_numpy(batch_y[:, 1:]).to(device).long()
            batch_y_len = torch.from_numpy(batch_y_len - 1).to(device).long()
            batch_y_len[batch_y_len <= 0] = 1

            batch_predicts = model(batch_x, batch_x_len, batch_y_decoder_input, batch_y_len)

            #mask: 1.0 at positions inside the actual sentence length, 0.0 on the padding
            batch_target_masks = torch.arange(batch_y_len.max().item(), device=device)[None, :] < batch_y_len[:, None]
            batch_target_masks = batch_target_masks.float()

            loss = loss_func(batch_predicts, batch_targets, batch_target_masks)

            num_words = torch.sum(batch_y_len).item()
            total_loss += loss.item() * num_words
            total_words += num_words
    print("Test Loss:", total_loss / total_words)

def train(model, data, epochs):
    test_datasets = []
    for epoch in range(epochs):
        model.train()
        total_words = 0
        total_loss = 0.
        for it, (batch_x, batch_x_len, batch_y, batch_y_len) in enumerate(data):
            #Hold out every 10th batch (collected during the first epoch) as a test set and
            #skip it during training.
            if(epoch == 0 and it % 10 == 0):
                test_datasets.append((batch_x, batch_x_len, batch_y, batch_y_len))
                continue
            elif(it % 10 == 0):
                continue
            batch_x = torch.from_numpy(batch_x).to(device).long()
            batch_x_len = torch.from_numpy(batch_x_len).to(device).long()

            #During training (and validation) the decoder predicts the next word from the
            #ground-truth previous word, so the decoder input is [BOS, word_1, word_2, ..., word_n]
            #and the corresponding targets are [word_1, word_2, ..., word_n, EOS].
            batch_y_decoder_input = torch.from_numpy(batch_y[:, :-1]).to(device).long()
            batch_targets = torch.from_numpy(batch_y[:, 1:]).to(device).long()
            batch_y_len = torch.from_numpy(batch_y_len - 1).to(device).long()
            batch_y_len[batch_y_len <= 0] = 1

            batch_predicts = model(batch_x, batch_x_len, batch_y_decoder_input, batch_y_len)

            #Build the masks: broadcasting [max_len] against [batch_size, 1] gives a
            #[batch_size, max_len] grid that is 1.0 inside the sentence and 0.0 on the padding.
            batch_y_len = batch_y_len.unsqueeze(1)  #[batch_size, 1]
            batch_target_masks = torch.arange(batch_y_len.max().item(), device=device) < batch_y_len
            batch_target_masks = batch_target_masks.float()
            batch_y_len = batch_y_len.squeeze(1)  #[batch_size]

            loss = loss_func(batch_predicts, batch_targets, batch_target_masks)

            num_words = torch.sum(batch_y_len).item()  #total number of target words in this batch
            total_loss += loss.item() * num_words
            total_words += num_words

            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5.)
            optimizer.step()

            if(it % 50 == 0):
                print("Epoch {} / {}, Iteration: {}, Train Loss: {}".format(epoch, epochs, it, loss.item()))
        print("Epoch {} / {}, Train Loss: {}".format(epoch, epochs, total_loss / total_words))
        if(epoch != 0 and epoch % 100 == 0):
            test(model, test_datasets)

train(model, datasets, epochs=200)

def en2cn_translate(sentence_id):
    #Translate an English sentence (selected by its index in the corpus) into Chinese.
    en_sentence = " ".join([inv_en_dict[w] for w in en_datas[sentence_id]])  #the English sentence
    cn_sentence = " ".join([inv_cn_dict[w] for w in cn_datas[sentence_id]])  #the reference Chinese sentence

    batch_x = torch.from_numpy(np.array(en_datas[sentence_id]).reshape(1, -1)).to(device).long()
    batch_x_len = torch.from_numpy(np.array([len(en_datas[sentence_id])])).to(device).long()

    #"BOS" is the decoder input at the first time step
    bos = torch.Tensor([[cn_dict["BOS"]]]).to(device).long()

    model.eval()  #switch off dropout for inference
    with torch.no_grad():
        translation = model.translate(batch_x, batch_x_len, bos, 10)
    translation = [inv_cn_dict[i] for i in translation.data.cpu().numpy().reshape(-1)]  #index2word

    trans = []
    for word in translation:
        if(word != "EOS"):
            trans.append(word)
        else:
            break
    print(en_sentence)
    print(cn_sentence)
    print(" ".join(trans))

en2cn_translate(0)
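
#Spot-check a few more translations (illustrative usage; the sentence indices are arbitrary
#picks from the corpus, which is sorted by sentence length, so low indices are short sentences).
#for _i in (1, 10, 100):
#    en2cn_translate(_i)
--------------------------------------------------------------------------------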