├── .gitignore
├── README.md
├── seq2seq.py
└── seq2seq_utils.py

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
### Introduction:

This is my first attempt at a simple project: a language translation model built with the sequence-to-sequence learning approach.

Details can be found in my blog post here:

* [Creating A Language Translation Model Using Sequence To Sequence Learning Approach](https://chunml.github.io/ChunML.github.io/project/Sequence-To-Sequence/)

### Dataset:

I used the Europarl parallel corpus for training. To get the source code working immediately, use the newest release (v8 at the time of writing) from the link below (following the link starts a 180MB download):

* [Europarl v8](http://www.statmt.org/wmt15/europarl-v8.fi-en.tgz)

Feel free to swap in a dataset of your own; just don't forget to modify the code accordingly.
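
The corpus file names are hardcoded in `seq2seq.py`, so point `load_data` at your own files. Below is a minimal sketch of the change, assuming hypothetical files `my_corpus.en` (source) and `my_corpus.fi` (target):

```python
# In seq2seq.py: replace the Europarl file names with your own parallel files.
# 'my_corpus.en' and 'my_corpus.fi' are placeholder names, not part of the repo.
X, X_vocab_len, X_word_to_ix, X_ix_to_word, \
    y, y_vocab_len, y_word_to_ix, y_ix_to_word = load_data(
        'my_corpus.en', 'my_corpus.fi', MAX_LEN, VOCAB_SIZE)
```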

### List of arguments:

* max_len: specify the maximum length of the sentences to extract from the text. Default: 200
* vocab_size: specify the number of most frequent words to keep in the vocabulary. Default: 20000
* batch_size: specify the batch size. Default: 100
* layer_num: specify the number of recurrent layers in the decoder network. Default: 3
* hidden_dim: specify the dimension of the hidden state. Default: 1000
* nb_epoch: specify the number of training epochs. Default: 20
* mode: specify whether to train or test the model. Default: train

### Train the model:

* With default settings:

```bash
python seq2seq.py
```

* With user-defined settings:

```bash
# Max length: 300, number of recurrent layers: 2, hidden state dimension: 500
python seq2seq.py -max_len 300 -layer_num 2 -hidden_dim 500
```

### Test the model:

The network must have been trained at least once (saved weights must exist!). Test sentences are read from a plain-text file named `test` in the working directory, and the translations are written to `test_result`.

* If the network was trained with default settings:

```bash
python seq2seq.py -mode test
```

* If the network was trained with user-defined settings:

```bash
# Max length: 300, number of recurrent layers: 2, hidden state dimension: 500
python seq2seq.py -mode test -max_len 300 -layer_num 2 -hidden_dim 500
```

--------------------------------------------------------------------------------
/seq2seq.py:
--------------------------------------------------------------------------------
from __future__ import print_function
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import sys

import argparse
from seq2seq_utils import *

ap = argparse.ArgumentParser()
ap.add_argument('-max_len', type=int, default=200)
ap.add_argument('-vocab_size', type=int, default=20000)
ap.add_argument('-batch_size', type=int, default=100)
ap.add_argument('-layer_num', type=int, default=3)
ap.add_argument('-hidden_dim', type=int, default=1000)
ap.add_argument('-nb_epoch', type=int, default=20)
ap.add_argument('-mode', default='train')
args = vars(ap.parse_args())

MAX_LEN = args['max_len']
VOCAB_SIZE = args['vocab_size']
BATCH_SIZE = args['batch_size']
LAYER_NUM = args['layer_num']
HIDDEN_DIM = args['hidden_dim']
NB_EPOCH = args['nb_epoch']
MODE = args['mode']

if __name__ == '__main__':
    # Loading input sequences, output sequences and the necessary mapping dictionaries
    print('[INFO] Loading data...')
    X, X_vocab_len, X_word_to_ix, X_ix_to_word, y, y_vocab_len, y_word_to_ix, y_ix_to_word = load_data('europarl-v8.fi-en.en', 'europarl-v8.fi-en.fi', MAX_LEN, VOCAB_SIZE)

    # Finding the length of the longest sequence
    X_max_len = max([len(sentence) for sentence in X])
    y_max_len = max([len(sentence) for sentence in y])

    # Padding with zeros so that all sequences have the same length as the longest one
    print('[INFO] Zero padding...')
    X = pad_sequences(X, maxlen=X_max_len, dtype='int32')
    y = pad_sequences(y, maxlen=y_max_len, dtype='int32')

    # Creating the network model
    print('[INFO] Compiling model...')
    model = create_model(X_vocab_len, X_max_len, y_vocab_len, y_max_len, HIDDEN_DIM, LAYER_NUM)

    # Finding trained weights from a previous run, if any
    saved_weights = find_checkpoint_file('.')

    # Training only if we chose training mode
    if MODE == 'train':
        k_start = 1

        # If any trained weights were found, load them into the model
        if len(saved_weights) != 0:
            print('[INFO] Saved weights found, loading...')
            epoch = saved_weights[saved_weights.rfind('_')+1:saved_weights.rfind('.')]
            model.load_weights(saved_weights)
            k_start = int(epoch) + 1

        i_end = 0
        for k in range(k_start, NB_EPOCH+1):
            # Shuffling the training data every epoch to avoid local minima
            indices = np.arange(len(X))
            np.random.shuffle(indices)
            X = X[indices]
            y = y[indices]
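
            # process_data() one-hot encodes the targets into an array of shape
            # (chunk_size, y_max_len, y_vocab_len); encoding the whole corpus at once
            # would likely not fit in memory, which is presumably why training proceeds
            # in fixed-size chunks below.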
            # Training 1000 sequences at a time
            for i in range(0, len(X), 1000):
                if i + 1000 >= len(X):
                    i_end = len(X)
                else:
                    i_end = i + 1000
                y_sequences = process_data(y[i:i_end], y_max_len, y_word_to_ix)

                print('[INFO] Training model: epoch {}, {}/{} samples'.format(k, i, len(X)))
                model.fit(X[i:i_end], y_sequences, batch_size=BATCH_SIZE, nb_epoch=1, verbose=2)
            model.save_weights('checkpoint_epoch_{}.hdf5'.format(k))

    # Performing a test if we chose test mode
    else:
        # Only performing the test if there are saved weights
        if len(saved_weights) == 0:
            print("The network hasn't been trained! Program will exit...")
            sys.exit()
        else:
            X_test = load_test_data('test', X_word_to_ix, MAX_LEN)
            X_test = pad_sequences(X_test, maxlen=X_max_len, dtype='int32')
            model.load_weights(saved_weights)

            predictions = np.argmax(model.predict(X_test), axis=2)
            sequences = []
            for prediction in predictions:
                sequence = ' '.join([y_ix_to_word[index] for index in prediction if index > 0])
                print(sequence)
                sequences.append(sequence)
            np.savetxt('test_result', sequences, fmt='%s')

--------------------------------------------------------------------------------
/seq2seq_utils.py:
--------------------------------------------------------------------------------
from keras.preprocessing.text import text_to_word_sequence
from keras.models import Sequential
from keras.layers import Activation, TimeDistributed, Dense, RepeatVector, recurrent, Embedding
from keras.layers.recurrent import LSTM
from keras.optimizers import Adam, RMSprop
from nltk import FreqDist
import numpy as np
import os
import datetime

def load_data(source, dist, max_len, vocab_size):

    # Reading raw text from the source and destination files
    f = open(source, 'r')
    X_data = f.read()
    f.close()
    f = open(dist, 'r')
    y_data = f.read()
    f.close()

    # Splitting raw text into arrays of sequences (source sentences are reversed)
    X = [text_to_word_sequence(x)[::-1] for x, y in zip(X_data.split('\n'), y_data.split('\n')) if len(x) > 0 and len(y) > 0 and len(x) <= max_len and len(y) <= max_len]
    y = [text_to_word_sequence(y) for x, y in zip(X_data.split('\n'), y_data.split('\n')) if len(x) > 0 and len(y) > 0 and len(x) <= max_len and len(y) <= max_len]

    # Creating the vocabulary set with the most common words
    dist = FreqDist(np.hstack(X))
    X_vocab = dist.most_common(vocab_size-1)
    dist = FreqDist(np.hstack(y))
    y_vocab = dist.most_common(vocab_size-1)

    # Creating an array of words from the vocabulary set; we will use this array as the index-to-word dictionary
    X_ix_to_word = [word[0] for word in X_vocab]
    # Adding the word "ZERO" to the beginning of the array
    X_ix_to_word.insert(0, 'ZERO')
    # Adding the word 'UNK' to the end of the array (stands for UNKNOWN words)
    X_ix_to_word.append('UNK')

    # Creating the word-to-index dictionary from the array created above
    X_word_to_ix = {word: ix for ix, word in enumerate(X_ix_to_word)}

    # Converting each word to its index value
    for i, sentence in enumerate(X):
        for j, word in enumerate(sentence):
            if word in X_word_to_ix:
                X[i][j] = X_word_to_ix[word]
            else:
                X[i][j] = X_word_to_ix['UNK']

    y_ix_to_word = [word[0] for word in y_vocab]
    y_ix_to_word.insert(0, 'ZERO')
    y_ix_to_word.append('UNK')
    y_word_to_ix = {word: ix for ix, word in enumerate(y_ix_to_word)}
    for i, sentence in enumerate(y):
        for j, word in enumerate(sentence):
            if word in y_word_to_ix:
                y[i][j] = y_word_to_ix[word]
            else:
                y[i][j] = y_word_to_ix['UNK']
    return (X, len(X_vocab)+2, X_word_to_ix, X_ix_to_word, y, len(y_vocab)+2, y_word_to_ix, y_ix_to_word)

def load_test_data(source, X_word_to_ix, max_len):
    f = open(source, 'r')
    X_data = f.read()
    f.close()

    X = [text_to_word_sequence(x)[::-1] for x in X_data.split('\n') if len(x) > 0 and len(x) <= max_len]
    for i, sentence in enumerate(X):
        for j, word in enumerate(sentence):
            if word in X_word_to_ix:
                X[i][j] = X_word_to_ix[word]
            else:
                X[i][j] = X_word_to_ix['UNK']
    return X

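# Architecture sketch (describing the code below): an Embedding layer feeds a single
# encoder LSTM; its final state is repeated y_max_len times by RepeatVector, passed
# through a stack of decoder LSTMs, and a TimeDistributed Dense with a softmax
# activation then predicts a word distribution at every output time step.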
def create_model(X_vocab_len, X_max_len, y_vocab_len, y_max_len, hidden_size, num_layers):
    model = Sequential()

    # Creating the encoder network
    model.add(Embedding(X_vocab_len, 1000, input_length=X_max_len, mask_zero=True))
    model.add(LSTM(hidden_size))
    model.add(RepeatVector(y_max_len))

    # Creating the decoder network
    for _ in range(num_layers):
        model.add(LSTM(hidden_size, return_sequences=True))
    model.add(TimeDistributed(Dense(y_vocab_len)))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])
    return model

def process_data(word_sentences, max_len, word_to_ix):
    # Vectorizing each element in each sequence (one-hot encoding by word index)
    sequences = np.zeros((len(word_sentences), max_len, len(word_to_ix)))
    for i, sentence in enumerate(word_sentences):
        for j, word in enumerate(sentence):
            sequences[i, j, word] = 1.
    return sequences

def find_checkpoint_file(folder):
    checkpoint_file = [f for f in os.listdir(folder) if 'checkpoint' in f]
    if len(checkpoint_file) == 0:
        return []
    modified_time = [os.path.getmtime(os.path.join(folder, f)) for f in checkpoint_file]
    return checkpoint_file[np.argmax(modified_time)]

--------------------------------------------------------------------------------