├── .gitignore
├── README.md
├── seq2seq.py
└── seq2seq_utils.py

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
### Introduction:

This is my first attempt at a simple project: a language translation model built with the sequence-to-sequence learning approach.

Details can be found in my blog post here:

* [Creating A Language Translation Model Using Sequence To Sequence Learning Approach](https://chunml.github.io/ChunML.github.io/project/Sequence-To-Sequence/)

### Dataset:

I used the Europarl parallel corpus for training. To get the source code working immediately, use the newest release (v8 at the time of writing) from the link below (following the link starts a 180MB download):

* [Europarl v8](http://www.statmt.org/wmt15/europarl-v8.fi-en.tgz)

Feel free to swap in a dataset of your own; just don't forget to modify the code accordingly.
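
The corpus file names are hardcoded in `seq2seq.py`, so point `load_data` at your own files. Below is a minimal sketch of the change, assuming hypothetical files `my_corpus.en` (source) and `my_corpus.fi` (target):

```python
# In seq2seq.py: replace the Europarl file names with your own parallel files.
# 'my_corpus.en' and 'my_corpus.fi' are placeholder names, not part of the repo.
X, X_vocab_len, X_word_to_ix, X_ix_to_word, \
    y, y_vocab_len, y_word_to_ix, y_ix_to_word = load_data(
        'my_corpus.en', 'my_corpus.fi', MAX_LEN, VOCAB_SIZE)
```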

### List of arguments:

* max_len: specify the maximum length of the sentences to extract from the text. Default: 200
* vocab_size: specify the number of most frequent words to keep in the vocabulary. Default: 20000
* batch_size: specify the batch size. Default: 100
* layer_num: specify the number of recurrent layers in the decoder network. Default: 3
* hidden_dim: specify the dimension of the hidden state. Default: 1000
* nb_epoch: specify the number of training epochs. Default: 20
* mode: specify whether to train or test the model. Default: train

### Train the model:

* With default settings:

```bash
python seq2seq.py
```

* With user-defined settings:

```bash
# Max length: 300, number of recurrent layers: 2, hidden state dimension: 500
python seq2seq.py -max_len 300 -layer_num 2 -hidden_dim 500
```

### Test the model:

The network must have been trained at least once (saved weights must exist!). Test sentences are read from a plain-text file named `test` in the working directory, and the translations are written to `test_result`.

* If the network was trained with default settings:

```bash
python seq2seq.py -mode test
```

* If the network was trained with user-defined settings:

```bash
# Max length: 300, number of recurrent layers: 2, hidden state dimension: 500
python seq2seq.py -mode test -max_len 300 -layer_num 2 -hidden_dim 500
```

--------------------------------------------------------------------------------
/seq2seq.py:
--------------------------------------------------------------------------------
from __future__ import print_function
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import sys

import argparse
from seq2seq_utils import *

ap = argparse.ArgumentParser()
ap.add_argument('-max_len', type=int, default=200)
ap.add_argument('-vocab_size', type=int, default=20000)
ap.add_argument('-batch_size', type=int, default=100)
ap.add_argument('-layer_num', type=int, default=3)
ap.add_argument('-hidden_dim', type=int, default=1000)
ap.add_argument('-nb_epoch', type=int, default=20)
ap.add_argument('-mode', default='train')
args = vars(ap.parse_args())

MAX_LEN = args['max_len']
VOCAB_SIZE = args['vocab_size']
BATCH_SIZE = args['batch_size']
LAYER_NUM = args['layer_num']
HIDDEN_DIM = args['hidden_dim']
NB_EPOCH = args['nb_epoch']
MODE = args['mode']

if __name__ == '__main__':
    # Loading input sequences, output sequences and the necessary mapping dictionaries
    print('[INFO] Loading data...')
    X, X_vocab_len, X_word_to_ix, X_ix_to_word, y, y_vocab_len, y_word_to_ix, y_ix_to_word = load_data('europarl-v8.fi-en.en', 'europarl-v8.fi-en.fi', MAX_LEN, VOCAB_SIZE)

    # Finding the length of the longest sequence
    X_max_len = max([len(sentence) for sentence in X])
    y_max_len = max([len(sentence) for sentence in y])

    # Padding with zeros so that all sequences have the same length as the longest one
    print('[INFO] Zero padding...')
    X = pad_sequences(X, maxlen=X_max_len, dtype='int32')
    y = pad_sequences(y, maxlen=y_max_len, dtype='int32')

    # Creating the network model
    print('[INFO] Compiling model...')
    model = create_model(X_vocab_len, X_max_len, y_vocab_len, y_max_len, HIDDEN_DIM, LAYER_NUM)

    # Finding trained weights from a previous run, if any
    saved_weights = find_checkpoint_file('.')

    # Training only if we chose training mode
    if MODE == 'train':
        k_start = 1

        # If any trained weights were found, load them into the model
        if len(saved_weights) != 0:
            print('[INFO] Saved weights found, loading...')
            epoch = saved_weights[saved_weights.rfind('_')+1:saved_weights.rfind('.')]
            model.load_weights(saved_weights)
            k_start = int(epoch) + 1

        i_end = 0
        for k in range(k_start, NB_EPOCH+1):
            # Shuffling the training data every epoch to avoid local minima
            indices = np.arange(len(X))
            np.random.shuffle(indices)
            X = X[indices]
            y = y[indices]
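
            # process_data() one-hot encodes the targets into an array of shape
            # (chunk_size, y_max_len, y_vocab_len); encoding the whole corpus at once
            # would likely not fit in memory, which is presumably why training proceeds
            # in fixed-size chunks below.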
            # Training 1000 sequences at a time
            for i in range(0, len(X), 1000):
                if i + 1000 >= len(X):
                    i_end = len(X)
                else:
                    i_end = i + 1000
                y_sequences = process_data(y[i:i_end], y_max_len, y_word_to_ix)

                print('[INFO] Training model: epoch {}, {}/{} samples'.format(k, i, len(X)))
                model.fit(X[i:i_end], y_sequences, batch_size=BATCH_SIZE, nb_epoch=1, verbose=2)
            model.save_weights('checkpoint_epoch_{}.hdf5'.format(k))

    # Performing a test if we chose test mode
    else:
        # Only performing the test if there are saved weights
        if len(saved_weights) == 0:
            print("The network hasn't been trained! Program will exit...")
            sys.exit()
        else:
            X_test = load_test_data('test', X_word_to_ix, MAX_LEN)
            X_test = pad_sequences(X_test, maxlen=X_max_len, dtype='int32')
            model.load_weights(saved_weights)

            predictions = np.argmax(model.predict(X_test), axis=2)
            sequences = []
            for prediction in predictions:
                sequence = ' '.join([y_ix_to_word[index] for index in prediction if index > 0])
                print(sequence)
                sequences.append(sequence)
            np.savetxt('test_result', sequences, fmt='%s')

--------------------------------------------------------------------------------
/seq2seq_utils.py:
--------------------------------------------------------------------------------
from keras.preprocessing.text import text_to_word_sequence
from keras.models import Sequential
from keras.layers import Activation, TimeDistributed, Dense, RepeatVector, recurrent, Embedding
from keras.layers.recurrent import LSTM
from keras.optimizers import Adam, RMSprop
from nltk import FreqDist
import numpy as np
import os
import datetime

def load_data(source, dist, max_len, vocab_size):

    # Reading raw text from the source and destination files
    f = open(source, 'r')
    X_data = f.read()
    f.close()
    f = open(dist, 'r')
    y_data = f.read()
    f.close()

    # Splitting raw text into arrays of sequences (source sentences are reversed)
    X = [text_to_word_sequence(x)[::-1] for x, y in zip(X_data.split('\n'), y_data.split('\n')) if len(x) > 0 and len(y) > 0 and len(x) <= max_len and len(y) <= max_len]
    y = [text_to_word_sequence(y) for x, y in zip(X_data.split('\n'), y_data.split('\n')) if len(x) > 0 and len(y) > 0 and len(x) <= max_len and len(y) <= max_len]

    # Creating the vocabulary set with the most common words
    dist = FreqDist(np.hstack(X))
    X_vocab = dist.most_common(vocab_size-1)
    dist = FreqDist(np.hstack(y))
    y_vocab = dist.most_common(vocab_size-1)

    # Creating an array of words from the vocabulary set; we will use this array as the index-to-word dictionary
    X_ix_to_word = [word[0] for word in X_vocab]
    # Adding the word "ZERO" to the beginning of the array
    X_ix_to_word.insert(0, 'ZERO')
    # Adding the word 'UNK' to the end of the array (stands for UNKNOWN words)
    X_ix_to_word.append('UNK')

    # Creating the word-to-index dictionary from the array created above
    X_word_to_ix = {word: ix for ix, word in enumerate(X_ix_to_word)}

    # Converting each word to its index value
    for i, sentence in enumerate(X):
        for j, word in enumerate(sentence):
            if word in X_word_to_ix:
                X[i][j] = X_word_to_ix[word]
            else:
                X[i][j] = X_word_to_ix['UNK']

    y_ix_to_word = [word[0] for word in y_vocab]
    y_ix_to_word.insert(0, 'ZERO')
    y_ix_to_word.append('UNK')
    y_word_to_ix = {word: ix for ix, word in enumerate(y_ix_to_word)}
    for i, sentence in enumerate(y):
        for j, word in enumerate(sentence):
            if word in y_word_to_ix:
                y[i][j] = y_word_to_ix[word]
            else:
                y[i][j] = y_word_to_ix['UNK']
    return (X, len(X_vocab)+2, X_word_to_ix, X_ix_to_word, y, len(y_vocab)+2, y_word_to_ix, y_ix_to_word)

def load_test_data(source, X_word_to_ix, max_len):
    f = open(source, 'r')
    X_data = f.read()
    f.close()

    X = [text_to_word_sequence(x)[::-1] for x in X_data.split('\n') if len(x) > 0 and len(x) <= max_len]
    for i, sentence in enumerate(X):
        for j, word in enumerate(sentence):
            if word in X_word_to_ix:
                X[i][j] = X_word_to_ix[word]
            else:
                X[i][j] = X_word_to_ix['UNK']
    return X

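# Architecture sketch (describing the code below): an Embedding layer feeds a single
# encoder LSTM; its final state is repeated y_max_len times by RepeatVector, passed
# through a stack of decoder LSTMs, and a TimeDistributed Dense with a softmax
# activation then predicts a word distribution at every output time step.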
def create_model(X_vocab_len, X_max_len, y_vocab_len, y_max_len, hidden_size, num_layers):
    model = Sequential()

    # Creating the encoder network
    model.add(Embedding(X_vocab_len, 1000, input_length=X_max_len, mask_zero=True))
    model.add(LSTM(hidden_size))
    model.add(RepeatVector(y_max_len))

    # Creating the decoder network
    for _ in range(num_layers):
        model.add(LSTM(hidden_size, return_sequences=True))
    model.add(TimeDistributed(Dense(y_vocab_len)))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])
    return model

def process_data(word_sentences, max_len, word_to_ix):
    # Vectorizing each element in each sequence (one-hot encoding by word index)
    sequences = np.zeros((len(word_sentences), max_len, len(word_to_ix)))
    for i, sentence in enumerate(word_sentences):
        for j, word in enumerate(sentence):
            sequences[i, j, word] = 1.
    return sequences

def find_checkpoint_file(folder):
    checkpoint_file = [f for f in os.listdir(folder) if 'checkpoint' in f]
    if len(checkpoint_file) == 0:
        return []
    modified_time = [os.path.getmtime(os.path.join(folder, f)) for f in checkpoint_file]
    return checkpoint_file[np.argmax(modified_time)]

--------------------------------------------------------------------------------