├── .gitignore
├── LanguageLoader.py
├── RNN.py
├── main.py
├── modules
│   ├── Decoder.py
│   └── Encoder.py
└── utils.py

/.gitignore:
--------------------------------------------------------------------------------
data/
.idea/
*.pyc
models/
--------------------------------------------------------------------------------
/LanguageLoader.py:
--------------------------------------------------------------------------------
from collections import Counter
import numpy as np
import pickle

from utils import read_data

class LanguageLoader(object):
    def __init__(self, input_path, output_path, vocab_size, max_length):
        super(LanguageLoader, self).__init__()

        self.vocab_size, self.max_length = vocab_size, max_length

        try:
            self.input_dict = pickle.load(open("data/input_dict.p", "rb"))
            self.input_vecs = pickle.load(open("data/input_vecs.p", "rb"))
            self.input_size = len(self.input_dict)

            self.output_dict = pickle.load(open("data/output_dict.p", "rb"))
            self.output_vecs = pickle.load(open("data/output_vecs.p", "rb"))
            self.output_size = len(self.output_dict)
            print("Languages found and loaded.")
        except IOError:
            self.input_dict, self.input_vecs, self.input_size = self.init_language(input_path)
            pickle.dump(self.input_dict, open("data/input_dict.p", "wb"))
            pickle.dump(self.input_vecs, open("data/input_vecs.p", "wb"))
            print("Input language loaded.")

            self.output_dict, self.output_vecs, self.output_size = self.init_language(output_path)
            pickle.dump(self.output_dict, open("data/output_dict.p", "wb"))
            pickle.dump(self.output_vecs, open("data/output_vecs.p", "wb"))
            print("Output language loaded.")

        self.input_vecs, self.output_vecs = self.filter(self.input_vecs, self.output_vecs)
        # These indices must match the special-token positions in init_language:
        # <sos> = 0, <eos> = 1, <pad> = 3.
        self.pad, self.sos, self.eos = np.zeros((1, 1)) + 3, np.zeros((1, 1)), np.zeros((1, 1)) + 1

    def init_language(self, path):
        # Special tokens; their positions define the indices used throughout
        # this class (<sos> 0, <eos> 1, <unk> 2, <pad> 3).
        dictionary = ["<sos>", "<eos>", "<unk>", "<pad>"]

        corpus = read_data(path)
        words = " ".join(corpus).split()
        # Reserve four slots for the special tokens so len(dictionary) == vocab_size.
        mc = Counter(words).most_common(self.vocab_size - 4)
        dictionary.extend([word for word, _ in mc])
        vectors = [[self.vectorize(word, dictionary) for word in sentence.split()] for sentence in corpus]

        return dictionary, vectors, len(dictionary)

    def sentences(self, amount, batch_size=5):
        indices = np.random.choice(len(self.input_vecs), amount)
        in_sentences = [self.input_vecs[i] for i in indices]
        out_sentences = [[self.sos] + self.output_vecs[i] + [self.eos] for i in indices]

        batches = []
        for i in range(0, amount, batch_size):
            in_batch, out_batch = in_sentences[i:i+batch_size], out_sentences[i:i+batch_size]
            max_in_len = max(len(sentence) for sentence in in_batch)
            max_out_len = max(len(sentence) for sentence in out_batch)

            in_batch = np.array([sentence + [self.pad for _ in range(max_in_len - len(sentence))] for sentence in in_batch])
            in_batch = np.transpose(np.squeeze(in_batch, 3), (1, 0, 2))

            out_batch = np.array([sentence + [self.pad for _ in range(max_out_len - len(sentence))] for sentence in out_batch])
            out_batch = np.transpose(np.squeeze(out_batch, 3), (1, 0, 2))

            batches.append((in_batch, out_batch))

        return batches

    def sentence_to_vec(self, sentence):
        vectors = [self.vectorize(word, self.input_dict) for word in sentence.lower().split()]
        return vectors
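    # Example (hypothetical indices): sentence_to_vec("the president is here")
    # returns one (1, 1) int32 array per word, e.g.
    # [array([[17]]), array([[934]]), ...]; words outside the vocabulary map
    # to the <unk> index (2). Each batch from sentences() is likewise a pair
    # of time-major numpy arrays of shape (max_len, batch_size, 1), each side
    # padded with <pad> to its own longest sentence.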
    def vec_to_sentence(self, vectors, language='output'):
        vocab = self.output_dict if language == 'output' else self.input_dict
        sentence = " ".join([vocab[int(vec[0, 0])] for vec in vectors])
        return sentence

    def vectorize(self, word, vocab):
        vec = np.zeros((1, 1), dtype=np.int32)
        index = vocab.index(word) if word in vocab else vocab.index("<unk>")
        vec[0][0] = index
        return vec

    def filter(self, input_vecs, output_vecs):
        # Drop pairs where either sentence exceeds max_length. A while loop
        # is used because popping from a list while iterating over it would
        # skip elements.
        i = 0
        while i < len(input_vecs):
            if len(input_vecs[i]) > self.max_length or len(output_vecs[i]) > self.max_length:
                input_vecs.pop(i)
                output_vecs.pop(i)
            else:
                i += 1

        return input_vecs, output_vecs
--------------------------------------------------------------------------------
/RNN.py:
--------------------------------------------------------------------------------
from modules.Encoder import *
from modules.Decoder import *

import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch

import numpy as np

class RNN(object):
    def __init__(self, input_size, output_size, resume=False):
        super(RNN, self).__init__()

        self.encoder = Encoder(input_size)
        self.decoder = Decoder(output_size)

        self.loss = nn.CrossEntropyLoss()
        self.encoder_optimizer = optim.Adam(self.encoder.parameters())
        self.decoder_optimizer = optim.Adam(self.decoder.parameters())

        if resume:
            self.encoder.load_state_dict(torch.load("models/encoder.ckpt"))
            self.decoder.load_state_dict(torch.load("models/decoder.ckpt"))

    def train(self, input, target):
        self.encoder_optimizer.zero_grad()
        self.decoder_optimizer.zero_grad()

        # Encoder: fold the source sentence into a single hidden state.
        hidden_state = self.encoder.first_hidden()
        for ivec in input:
            _, hidden_state = self.encoder.forward(ivec, hidden_state)

        # Decoder: teacher forcing -- feed the ground-truth token at each step
        # and score the prediction of the next one.
        total_loss, outputs = 0, []
        for i in range(len(target) - 1):
            logits, softmax, hidden_state = self.decoder.forward(target[i], hidden_state)

            outputs.append(np.argmax(softmax.data.numpy(), 1)[:, np.newaxis])
            # CrossEntropyLoss expects raw logits, not softmax probabilities.
            total_loss += self.loss(logits, target[i+1].squeeze(1))

        total_loss /= len(outputs)
        total_loss.backward()

        self.decoder_optimizer.step()
        self.encoder_optimizer.step()

        return total_loss.data[0], outputs

    def eval(self, input):
        hidden_state = self.encoder.first_hidden()

        # Encoder
        for ivec in input:
            _, hidden_state = self.encoder.forward(Variable(ivec), hidden_state)

        sentence = []
        input = Variable(torch.LongTensor([[0]]))  # <sos> token (index 0)
        # Decoder: greedy decoding until <eos> (index 1), with a length cap so
        # an untrained model cannot loop forever.
        while input.data[0, 0] != 1 and len(sentence) < 100:
            logits, _, hidden_state = self.decoder.forward(input, hidden_state)
            word = np.argmax(logits.data.numpy()).reshape((1, 1))
            input = Variable(torch.LongTensor(word))
            sentence.append(word)

        return sentence

    def save(self):
        torch.save(self.encoder.state_dict(), "models/encoder.ckpt")
        torch.save(self.decoder.state_dict(), "models/decoder.ckpt")
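
if __name__ == "__main__":
    # Smoke test -- a minimal sketch, not part of the original pipeline. It
    # assumes the Variable-era PyTorch API used throughout this repo and a
    # hypothetical vocabulary of 100 tokens; it runs one train() step on
    # random indices just to confirm the tensor shapes line up end to end.
    rnn = RNN(input_size=100, output_size=100)
    src = Variable(torch.LongTensor(4, 1, 1).random_(0, 100))  # (seq_len, batch=1, 1)
    tgt = Variable(torch.LongTensor(5, 1, 1).random_(0, 100))
    loss, outputs = rnn.train(src, tgt)
    print("smoke-test loss: %.4f" % loss)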
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
from LanguageLoader import *
from RNN import *

from torch.autograd import Variable
import torch

en_path = 'data/en.zip'
fr_path = 'data/fr.zip'

max_length = 20
num_epochs = 1000
num_batches = 750
batch_size = 100
vocab_size = 15000

def main():
    data = LanguageLoader(en_path, fr_path, vocab_size, max_length)
    rnn = RNN(data.input_size, data.output_size)

    losses = []
    for epoch in range(num_epochs):
        print("=" * 50 + (" EPOCH %i " % epoch) + "=" * 50)
        for i, batch in enumerate(data.sentences(batch_size * num_batches, batch_size)):
            input, target = batch

            loss, outputs = rnn.train(Variable(torch.from_numpy(input).long()),
                                      Variable(torch.from_numpy(target).long()))
            losses.append(loss)

            if i % 100 == 0:
                # vec_to_sentence reads index [0, 0] at each time step, i.e.
                # it prints the first sentence of the batch.
                print("Loss at step %d: %.2f" % (i, loss))
                print("Truth: \"%s\"" % data.vec_to_sentence(target))
                print("Guess: \"%s\"\n" % data.vec_to_sentence(outputs))
        rnn.save()

def translate():
    data = LanguageLoader(en_path, fr_path, vocab_size, max_length)
    # Load the trained weights; without resume=True this would translate with
    # a randomly initialized model.
    rnn = RNN(data.input_size, data.output_size, resume=True)

    # eval() wraps each step in Variable(), so convert the numpy vectors to
    # LongTensors first.
    vecs = [torch.from_numpy(v).long() for v in data.sentence_to_vec("the president is here ")]

    translation = rnn.eval(vecs)
    print(data.vec_to_sentence(translation))

if __name__ == "__main__":
    main()
    # translate()
--------------------------------------------------------------------------------
/modules/Decoder.py:
--------------------------------------------------------------------------------
import torch.nn as nn

class Decoder(nn.Module):
    def __init__(self, input_size, embedding_size=500, hidden_size=1000):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.gru = nn.GRU(embedding_size, hidden_size, 1)
        self.linear = nn.Linear(hidden_size, input_size)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input, hidden):
        embedded = self.embedding(input)
        output, hidden_state = self.gru(embedded, hidden)
        output = output.view(output.size(0), output.size(2))
        logits = self.linear(output)
        softmax = self.softmax(logits)
        # Return the raw logits as well: CrossEntropyLoss in RNN.train needs
        # unnormalized scores, and RNN.eval argmaxes over a vocabulary-sized
        # vector rather than the GRU's hidden features.
        return logits, softmax, hidden_state
--------------------------------------------------------------------------------
/modules/Encoder.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn
from torch.autograd import Variable

class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size=500, hidden_size=1000):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.gru = nn.GRU(embedding_size, hidden_size, 1)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(input.size(0), 1, -1)
        output, hidden_state = self.gru(embedded, hidden)
        return output, hidden_state

    def first_hidden(self):
        return Variable(torch.FloatTensor(1, 1, self.hidden_size).zero_())
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
import zipfile

def read_data(filename):
    lines = []
    with zipfile.ZipFile(filename, 'r') as z:
        with z.open(z.namelist()[0]) as f:
            for i, line in enumerate(f):
                # Keep every 100th line to subsample the corpus.
                if i % 100 == 0:
                    line = line.decode('utf-8').lower().replace("'", " ").replace(".", "").replace("?", "")\
                        .replace("!", "").replace(":", "").replace(";", "")
                    lines.append(line)

    return lines
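
# Usage sketch (paths taken from main.py's config; output shape is an
# assumption about the archive layout): the zip's first entry is read as a
# plain-text corpus.
#   >>> lines = read_data("data/en.zip")
#   >>> lines[0]   # every 100th raw line, lowercased, punctuation stripped
--------------------------------------------------------------------------------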