├── README.md
└── src
    ├── fetch_data.py
    ├── layers.py
    ├── model.py
    ├── preprocess_data.py
    ├── train.py
    └── utils.py

/README.md:
--------------------------------------------------------------------------------
# ESIM
Implementation of the ESIM (Enhanced Sequential Inference Model) for natural language inference with Keras.

--------------------------------------------------------------------------------
/src/fetch_data.py:
--------------------------------------------------------------------------------
"""
Download the data necessary for the ESIM model:
    - Stanford Natural Language Inference (SNLI) dataset.
    - GloVe word embedding vectors.
"""
# Aurelien Coet, 2018.

import os
import zipfile


# Function from https://github.com/lukecq1231/nli/blob/master/data/download.py
def download(url, targetdir):
    """
    Download data from a URL and save it in some target directory.
    (Note: wget must be installed on the machine for this function to work.)

    Args:
        url: The URL from which the data must be downloaded.
        targetdir: The target directory where the downloaded data must be
            saved.

    Returns:
        The path to the downloaded data file.
    """
    filename = url.split('/')[-1]
    filepath = os.path.join(targetdir, filename)
    print("* Downloading data from {}".format(url))
    os.system("wget {} -O {}".format(url, filepath))
    return filepath


# Function from https://github.com/lukecq1231/nli/blob/master/data/download.py
def unzip(filepath):
    """
    Unzip a zipped file.

    Args:
        filepath: The path to the file to unzip.
    """
    print("* Extracting: {}".format(filepath))
    dirpath = os.path.dirname(filepath)
    with zipfile.ZipFile(filepath) as zf:
        zf.extractall(dirpath)
    os.remove(filepath)


def download_unzip(url, targetdir):
    """
    Download and unzip data from a URL and save it in a target directory.

    Args:
        url: The URL to download the data from.
        targetdir: The target directory in which to download and unzip the
            data.
    """
    filepath = os.path.join(targetdir, url.split('/')[-1])
    if not os.path.exists(targetdir):
        os.makedirs(targetdir)
    # Download and unzip if the target directory is empty.
    if not os.listdir(targetdir):
        unzip(download(url, targetdir))
    # Skip downloading if the zipped data is already available.
    elif os.path.exists(filepath):
        print("* Found zipped data - skipping download")
        unzip(filepath)
    # Skip unzipping if the unzipped data is already available.
    else:
        print("* Found unzipped data - skipping download and unzipping")


if __name__ == "__main__":
    datadir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                           "..", "data")
    snli_url = "https://nlp.stanford.edu/projects/snli/snli_1.0.zip"
    glove_url = "http://www-nlp.stanford.edu/data/glove.840B.300d.zip"
    print(20*'=' + "Fetching the SNLI data:" + 20*'=')
    download_unzip(snli_url, os.path.join(datadir, "snli"))
    print(20*'=' + "Fetching the GloVe data:" + 20*'=')
    download_unzip(glove_url, os.path.join(datadir, "glove"))
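
The download() helper above shells out to wget. Where wget is unavailable, a pure-Python variant can be sketched with the standard library; the helper below is only an illustration (download_with_urllib is not part of the repository), and the rest of the scripts keep using download() as defined above.

import os
import urllib.request


def download_with_urllib(url, targetdir):
    # Hypothetical wget-free variant with the same contract as download().
    filename = url.split('/')[-1]
    filepath = os.path.join(targetdir, filename)
    print("* Downloading data from {}".format(url))
    urllib.request.urlretrieve(url, filepath)
    return filepath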
--------------------------------------------------------------------------------
/src/layers.py:
--------------------------------------------------------------------------------
"""
Definition of the layers necessary for the ESIM model.

Inspired by the code at:
https://github.com/yuhsinliu1993/Quora_QuestionPairs_DL
"""

import keras.backend as K
from keras.models import Sequential
from keras.layers import *


class EmbeddingLayer(object):
    """
    Layer to transform words represented by indices to word embeddings.
    """

    def __init__(self, voc_size, output_dim, embedding_weights=None,
                 max_length=100, trainable=True, mask_zero=False):
        self.voc_size = voc_size
        self.output_dim = output_dim
        self.max_length = max_length

        if embedding_weights is not None:
            self.model = Embedding(voc_size, output_dim,
                                   weights=[embedding_weights],
                                   input_length=max_length,
                                   trainable=trainable, mask_zero=mask_zero,
                                   name='embedding')
        else:
            # If no pretrained embedding weights are passed to the initialiser,
            # the embedding is set to be trainable by default.
            self.model = Embedding(voc_size, output_dim,
                                   input_length=max_length, trainable=True,
                                   mask_zero=mask_zero, name='embedding')

    def __call__(self, input):
        return self.model(input)


class EncodingLayer(object):
    """
    Layer to encode variable length sentences with a BiLSTM.
    """

    def __init__(self, hidden_units, max_length=100, dropout=0.5,
                 activation='tanh', sequences=True):
        self.layer = Bidirectional(LSTM(hidden_units, activation=activation,
                                        return_sequences=sequences,
                                        dropout=dropout,
                                        recurrent_dropout=dropout),
                                   merge_mode='concat')

    def __call__(self, input):
        return self.layer(input)
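
A minimal sketch of how the two wrappers above chain together, assuming the standalone Keras 2 API used in this repository and random weights standing in for real GloVe vectors (the sizes are hypothetical and nothing is trained here):

import numpy as np
from keras.layers import Input
from layers import EmbeddingLayer, EncodingLayer  # assumes running from src/

# Hypothetical sizes: 1000-word vocabulary, 50-token sentences, 300-d embeddings.
weights = np.random.normal(size=(1000, 300))
sentence = Input(shape=(50,), dtype='int32')

embedded = EmbeddingLayer(1000, 300, weights, max_length=50)(sentence)  # (batch, 50, 300)
encoded = EncodingLayer(64, max_length=50)(embedded)                    # (batch, 50, 128), BiLSTM 'concat'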

class LocalInferenceLayer(object):
    """
    Layer to compute local inference between two encoded sentences a and b.
    """

    def __call__(self, inputs):
        a = inputs[0]
        b = inputs[1]

        attention = Lambda(self._attention,
                           self._attention_output_shape)(inputs)

        align_a = Lambda(self._soft_alignment,
                         self._soft_alignment_output_shape)([attention, a])
        align_b = Lambda(self._soft_alignment,
                         self._soft_alignment_output_shape)([attention, b])

        # Enhancement of the local inference information obtained with the
        # attention mechanism and soft alignments.
        sub_a_align = Lambda(lambda x: x[0]-x[1])([a, align_a])
        sub_b_align = Lambda(lambda x: x[0]-x[1])([b, align_b])

        mul_a_align = Lambda(lambda x: x[0]*x[1])([a, align_a])
        mul_b_align = Lambda(lambda x: x[0]*x[1])([b, align_b])

        m_a = concatenate([a, align_a, sub_a_align, mul_a_align])
        m_b = concatenate([b, align_b, sub_b_align, mul_b_align])

        return m_a, m_b

    def _attention(self, inputs):
        """
        Compute the attention between elements of two sentences with the dot
        product.

        Args:
            inputs: A list containing two elements, one for the first sentence
                and one for the second, both encoded by a BiLSTM.

        Returns:
            A tensor containing the attention weights (dot products) between
            the elements of the two sentences.
        """
        attn_weights = K.batch_dot(x=inputs[0],
                                   y=K.permute_dimensions(inputs[1],
                                                          pattern=(0, 2, 1)))
        return K.permute_dimensions(attn_weights, (0, 2, 1))

    def _attention_output_shape(self, inputs):
        input_shape = inputs[0]
        # The attention weights form a (sequence length x sequence length)
        # matrix for each element in the batch.
        length = input_shape[1]
        return (input_shape[0], length, length)

    def _soft_alignment(self, inputs):
        """
        Compute the soft alignment between the elements of two sentences.

        Args:
            inputs: A list of two elements, the first is a tensor of attention
                weights, the second is the encoded sentence on which to
                compute the alignments.

        Returns:
            A tensor containing the alignments.
        """
        attention = inputs[0]
        sentence = inputs[1]

        # Subtract the max. from the attention weights to avoid overflows.
        exp = K.exp(attention - K.max(attention, axis=-1, keepdims=True))
        exp_sum = K.sum(exp, axis=-1, keepdims=True)
        softmax = exp / exp_sum

        return K.batch_dot(softmax, sentence)

    def _soft_alignment_output_shape(self, inputs):
        attention_shape = inputs[0]
        sentence_shape = inputs[1]
        return (attention_shape[0], attention_shape[1], sentence_shape[2])
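
The two Lambda helpers above boil down to a matrix of dot products followed by a softmax over its last axis and a weighted sum. A single-example numpy sketch of that computation (the batch dimension is dropped for readability; in the model both sentences are padded to the same max_length):

import numpy as np

len_a, len_b, dim = 4, 5, 8                 # hypothetical toy sizes
a = np.random.normal(size=(len_a, dim))     # encoded sentence a
b = np.random.normal(size=(len_b, dim))     # encoded sentence b

# _attention: dot products between all pairs of time steps, then a transpose.
attention = (a @ b.T).T                     # shape (len_b, len_a)

# _soft_alignment: numerically stable softmax over the last axis...
exp = np.exp(attention - attention.max(axis=-1, keepdims=True))
softmax = exp / exp.sum(axis=-1, keepdims=True)

# ...followed by a weighted sum of the encoded time steps (here those of a).
aligned = softmax @ a                       # shape (len_b, dim)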

class InferenceCompositionLayer(object):
    """
    Layer to compose the local inference information.
    """

    def __init__(self, hidden_units, max_length=100, dropout=0.5,
                 activation='tanh', sequences=True):
        self.hidden_units = hidden_units
        self.max_length = max_length
        self.dropout = dropout
        self.activation = activation
        self.sequences = sequences

    def __call__(self, input):
        composition = Bidirectional(LSTM(self.hidden_units,
                                         activation=self.activation,
                                         return_sequences=self.sequences,
                                         recurrent_dropout=self.dropout,
                                         dropout=self.dropout))(input)
        reduction = TimeDistributed(Dense(self.hidden_units,
                                          kernel_initializer='he_normal',
                                          activation='relu'))(composition)

        return Dropout(self.dropout)(reduction)


class PoolingLayer(object):
    """
    Pooling layer to convert the vectors obtained in the previous layers to
    fixed-length vectors.
    """

    def __call__(self, inputs):
        a = inputs[0]
        b = inputs[1]

        a_avg = GlobalAveragePooling1D()(a)
        a_max = GlobalMaxPooling1D()(a)

        b_avg = GlobalAveragePooling1D()(b)
        b_max = GlobalMaxPooling1D()(b)

        return concatenate([a_avg, a_max, b_avg, b_max])


class MLPLayer(object):
    """
    Multi-layer perceptron for classification.
    """

    def __init__(self, hidden_units, n_classes, dropout=0.5,
                 activations=['tanh', 'softmax']):
        self.model = Sequential()
        self.model.add(Dense(hidden_units, kernel_initializer='he_normal',
                             activation=activations[0],
                             input_shape=(4*hidden_units,)))
        self.model.add(Dropout(dropout))
        self.model.add(Dense(n_classes, kernel_initializer='zero',
                             activation=activations[1]))

    def __call__(self, input):
        return self.model(input)
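
To see why MLPLayer expects inputs of size 4 * hidden_units: the composition layer reduces each time step to hidden_units dimensions, and the pooling layer concatenates an average and a max over time for both sentences. A rough numpy sketch of that bookkeeping, with hypothetical sizes:

import numpy as np

max_length, hidden_units = 100, 300         # hypothetical sizes
composed_a = np.random.normal(size=(max_length, hidden_units))
composed_b = np.random.normal(size=(max_length, hidden_units))

pooled = np.concatenate([composed_a.mean(axis=0), composed_a.max(axis=0),
                         composed_b.mean(axis=0), composed_b.max(axis=0)])
assert pooled.shape == (4 * hidden_units,)  # matches MLPLayer's input_shape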
--------------------------------------------------------------------------------
/src/model.py:
--------------------------------------------------------------------------------
"""
Definition of the ESIM model.

Inspired by the code at:
https://github.com/yuhsinliu1993/Quora_QuestionPairs_DL
"""

from keras.layers import Input
from keras.models import Model
from keras.optimizers import Adam
from layers import *


class ESIM(object):
    """
    ESIM model for Natural Language Inference (NLI) tasks.
    """

    def __init__(self, n_classes, embedding_weights, max_length, hidden_units,
                 dropout=0.5, learning_rate=0.0004):
        self.n_classes = n_classes
        self.embedding_weights = embedding_weights
        self.voc_size, self.embedding_dim = embedding_weights.shape
        self.max_length = max_length
        self.hidden_units = hidden_units
        self.dropout = dropout
        self.learning_rate = learning_rate

    def build_model(self):
        """
        Build the model.

        Returns:
            The ESIM model, compiled and ready to be trained.
        """
        a = Input(shape=(self.max_length,), dtype='int32', name='premise')
        b = Input(shape=(self.max_length,), dtype='int32', name='hypothesis')

        # ---------- Embedding layer ---------- #
        embedding = EmbeddingLayer(self.voc_size, self.embedding_dim,
                                   self.embedding_weights,
                                   max_length=self.max_length)

        embedded_a = embedding(a)
        embedded_b = embedding(b)

        # ---------- Encoding layer ---------- #
        encoded_a = EncodingLayer(self.hidden_units,
                                  self.max_length,
                                  dropout=self.dropout)(embedded_a)
        encoded_b = EncodingLayer(self.hidden_units,
                                  self.max_length,
                                  dropout=self.dropout)(embedded_b)

        # ---------- Local inference layer ---------- #
        m_a, m_b = LocalInferenceLayer()([encoded_a, encoded_b])

        # ---------- Inference composition layer ---------- #
        composed_a = InferenceCompositionLayer(self.hidden_units,
                                               self.max_length,
                                               dropout=self.dropout)(m_a)
        composed_b = InferenceCompositionLayer(self.hidden_units,
                                               self.max_length,
                                               dropout=self.dropout)(m_b)

        # ---------- Pooling layer ---------- #
        pooled = PoolingLayer()([composed_a, composed_b])

        # ---------- Classification layer ---------- #
        prediction = MLPLayer(self.hidden_units, self.n_classes,
                              dropout=self.dropout)(pooled)

        model = Model(inputs=[a, b], outputs=prediction)
        model.compile(optimizer=Adam(lr=self.learning_rate),
                      loss='categorical_crossentropy', metrics=['accuracy'])

        return model
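
A minimal usage sketch for this class, assuming it is run from the src directory and using random weights in place of the GloVe matrix produced by preprocess_data.py (the sizes are hypothetical):

import numpy as np
from model import ESIM

# Hypothetical sizes: 10000-word vocabulary, 300-d embeddings.
embedding_weights = np.random.normal(size=(10000, 300))
esim = ESIM(n_classes=3, embedding_weights=embedding_weights,
            max_length=100, hidden_units=300, dropout=0.5)
model = esim.build_model()
model.summary()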
--------------------------------------------------------------------------------
/src/preprocess_data.py:
--------------------------------------------------------------------------------
"""
Preprocess the data necessary for the ESIM model.
"""
# Aurelien Coet, 2018.

import os
import pickle
import numpy as np
from keras.preprocessing.text import Tokenizer


def read_data(filepath):
    """
    Read the premises, hypotheses and labels from a file in the SNLI dataset
    and return them in three separate lists.

    Args:
        filepath: The path to a file from the SNLI dataset.

    Returns:
        A dictionary containing three lists, one for the premises, one for the
        hypotheses and one for the labels.
    """
    labels_dict = {"entailment": '0', "neutral": '1', "contradiction": '2'}

    premises = []
    hypotheses = []
    labels = []
    premises_lens = []
    hypotheses_lens = []

    with open(filepath, 'r') as input:
        # Ignore the first line containing headers.
        next(input)
        for line in input:
            line = line.strip().split('\t')
            # Ignore examples without a gold label.
            if line[0] == '-':
                continue

            # Read the premise.
            sentence = line[5].rstrip()
            premises.append(sentence)
            premises_lens.append(len(sentence.split()))

            # Read the hypothesis.
            sentence = line[6].rstrip()
            hypotheses.append(sentence)
            hypotheses_lens.append(len(sentence.split()))

            # Read the label.
            labels.append(labels_dict[line[0]])

    print("Min. premise length: {}, max. premise length: {}"
          .format(min(premises_lens), max(premises_lens)))
    print("Min. hypothesis length: {}, max. hypothesis length: {}"
          .format(min(hypotheses_lens), max(hypotheses_lens)))

    return {"premises": premises, "hypotheses": hypotheses,
            "labels": labels}


def save_preprocessed_data(tokenizer, data, dataset, targetdir):
    """
    Save the preprocessed data to pickle files for later use. The preprocessed
    data consists of the premises and hypotheses with their words transformed
    to their indices, and the labels transformed to class indices.

    Args:
        tokenizer: A Keras Tokenizer object that has already been fit on
            the training data (its word_index has been built).
        data: A dictionary containing lists of strings for the sentences in
            the premises and hypotheses, as well as a list with their
            associated labels.
        dataset: A string indicating the type of dataset being saved (train,
            test or dev).
        targetdir: The target directory in which to save the pickled files.
    """
    # Transform the words in the input data to their indices and save them
    # in separate pickle files for the premises and hypotheses.
    with open(os.path.join(targetdir, "premises_{}.pkl".format(dataset)),
              'wb') as output:
        pickle.dump(tokenizer.texts_to_sequences(data["premises"]), output)

    with open(os.path.join(targetdir, "hypotheses_{}.pkl".format(dataset)),
              'wb') as output:
        pickle.dump(tokenizer.texts_to_sequences(data["hypotheses"]), output)

    # Pickle the labels too.
    with open(os.path.join(targetdir, "labels_{}.pkl".format(dataset)),
              'wb') as output:
        pickle.dump(data["labels"], output)
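
A toy sketch of what the Tokenizer-based transformation in save_preprocessed_data does to a sentence (the sentences below are made up, and the actual indices depend on word frequencies in the SNLI training data):

from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(lower=False, oov_token="__OOV__")
tokenizer.fit_on_texts(["A man is sleeping", "A man is reading a book"])
# Each sentence becomes a list of integer word indices; unseen words such as
# "outside" are mapped to the index of the '__OOV__' token.
print(tokenizer.texts_to_sequences(["A man is sleeping outside"]))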

def build_embedding_weights(worddict, embeddings_file, targetdir):
    """
    Build the embedding weights matrix from a word dictionary and existing
    embeddings, and save it in pickled form.

    Args:
        worddict: A dictionary of words with their associated integer index.
        embeddings_file: A file containing predefined word embeddings.
        targetdir: The path to the target directory where the embedding
            weights matrix must be saved.
    """
    print("* Loading word embeddings from {}...".format(embeddings_file))
    # Load the word embeddings in a dictionary.
    with open(embeddings_file, 'r') as input:
        embeddings = {}
        for line in input:
            line = line.split()
            # Ignore lines where the 'word' itself contains spaces: with
            # 300-dimensional GloVe vectors, such lines do not split into
            # exactly 301 fields.
            if len(line[1:]) != 300:
                continue
            word = line[0]
            if word in worddict:
                # Remember one in-vocabulary word to retrieve the embedding
                # dimensionality below.
                last = word
                embeddings[word] = line[1:]

    print("* Building embedding weights matrix...")
    # Initialize the embedding weights matrix.
    num_words = len(worddict)
    dims = len(embeddings[last])
    embedding_weights = np.zeros((num_words, dims))

    # Build the embedding weights matrix.
    for word, i in worddict.items():
        if word in embeddings:
            embedding_weights[i] = embeddings[word]
        else:
            # Out of vocabulary words are initialised with random gaussian
            # samples.
            embedding_weights[i] = np.random.normal(size=(dims))

    # Save the matrix in pickled form.
    with open(os.path.join(targetdir, "embedding_weights.pkl"),
              'wb') as output:
        pickle.dump(embedding_weights, output)


def preprocess_data(train_file, test_file, dev_file, embeddings_file,
                    targetdir):
    """
    Preprocess the data for the ESIM model. Compute the word indices from the
    training data, transform all words in the train/test/dev datasets to their
    indices, save them in pickled files, and finally build the embedding
    matrix and save it in pickled form.

    Args:
        train_file: The path to the file containing the training data from the
            SNLI dataset.
        test_file: The path to the file containing the test data from the SNLI
            dataset.
        dev_file: The path to the file containing the dev data from the SNLI
            dataset.
        embeddings_file: The path to the file containing the word embeddings to
            use for the embedding matrix.
        targetdir: The path to the target directory for the pickled files
            produced by the function.
    """
    print(20*"=" + "Processing train data..." + 20*"=")
    data = read_data(train_file)

    # Build the dictionary of words from the training data with Keras'
    # Tokenizer class. A special token is created for out of voc. words
    # (token '__OOV__'), and index 0 is reserved for padding.
    print("* Building word index dictionary...")
    tokenizer = Tokenizer(lower=False, oov_token="__OOV__")
    tokenizer.fit_on_texts(data["premises"]+data["hypotheses"])
    tokenizer.word_index["__PAD__"] = 0
    print("** Total number of words: {}".format(len(tokenizer.word_index)))
    # Save the dictionary in a pickle file.
    if not os.path.exists(targetdir):
        os.makedirs(targetdir)
    with open(os.path.join(targetdir, "worddict.pkl"), 'wb') as pkl_f:
        pickle.dump(tokenizer.word_index, pkl_f)

    print("* Transforming and saving train data...")
    save_preprocessed_data(tokenizer, data, "train", targetdir)

    # Preprocess and save the test dataset.
    print(20*"=" + "Processing test data..." + 20*"=")
    data = read_data(test_file)
    print("* Transforming and saving test data...")
    save_preprocessed_data(tokenizer, data, "test", targetdir)

    # Preprocess and save the dev dataset.
    print(20*"=" + "Processing dev data..." + 20*"=")
    data = read_data(dev_file)
    print("* Transforming and saving dev data...")
    save_preprocessed_data(tokenizer, data, "dev", targetdir)

    # Create and save the embedding weights matrix.
    print(20*"=" + "Building embedding weights matrix..." + 20*"=")
    build_embedding_weights(tokenizer.word_index, embeddings_file, targetdir)


if __name__ == "__main__":
    basedir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                           "..", "data")
    snli_dir = os.path.join(basedir, "snli", "snli_1.0")
    glove_dir = os.path.join(basedir, "glove")
    targetdir = os.path.join(basedir, "preprocessed")

    preprocess_data(os.path.join(snli_dir, "snli_1.0_train.txt"),
                    os.path.join(snli_dir, "snli_1.0_test.txt"),
                    os.path.join(snli_dir, "snli_1.0_dev.txt"),
                    os.path.join(glove_dir, "glove.840B.300d.txt"),
                    targetdir)
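
A quick sanity check that can be run after preprocessing, assuming the default target directory data/preprocessed used by the __main__ block above: the pickled embedding matrix should contain one 300-dimensional row per entry of the word dictionary.

import os
import pickle

# Assumes the script was run with its default paths, from the repository root.
preproc_dir = os.path.join("data", "preprocessed")
with open(os.path.join(preproc_dir, "worddict.pkl"), 'rb') as pkl_f:
    worddict = pickle.load(pkl_f)
with open(os.path.join(preproc_dir, "embedding_weights.pkl"), 'rb') as pkl_f:
    embedding_weights = pickle.load(pkl_f)

assert embedding_weights.shape == (len(worddict), 300)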
--------------------------------------------------------------------------------
/src/train.py:
--------------------------------------------------------------------------------
"""
Train the ESIM model on some dataset.
"""

import os
import argparse
from keras.callbacks import ModelCheckpoint
from model import ESIM
from utils import prepare_data, load_embeddings


def train(preproc_dir, n_classes, max_length, hidden_units, dropout,
          batch_size, epochs, output_dir):
    """
    Train the ESIM model on some dataset and save the learned weights.

    Args:
        preproc_dir: The directory where the preprocessed data is saved.
        n_classes: The number of classes in the problem.
        max_length: The maximum length of the sentences in the premises and
            hypotheses of the dataset.
        hidden_units: The number of hidden units to use in the various layers
            of the model.
        dropout: The dropout rate to use in the model.
        batch_size: The size of the batches to use for training.
        epochs: The number of epochs to train the model for.
        output_dir: The path to the directory where the weights learned during
            training must be saved.
    """
    print("Loading training and validation data...")
    train_premises, train_hyps, train_labels = prepare_data(preproc_dir,
                                                            'train',
                                                            n_classes,
                                                            max_length)
    valid_premises, valid_hyps, valid_labels = prepare_data(preproc_dir,
                                                            'dev',
                                                            n_classes,
                                                            max_length)

    print("Loading embedding weights...")
    embedding_weights = load_embeddings(os.path.join(preproc_dir,
                                                     "embedding_weights.pkl"))

    # Build the model.
    esim = ESIM(n_classes, embedding_weights, max_length, hidden_units,
                dropout)
    model = esim.build_model()

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    # Save the weights of the model with the best validation accuracy seen
    # during training.
    filepath = os.path.join(output_dir,
                            "weights-{epoch:02d}-{val_acc:.2f}.hdf5")
    checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1,
                                 save_best_only=True, mode='max')

    model.fit(x=[train_premises, train_hyps],
              y=train_labels,
              batch_size=batch_size,
              epochs=epochs,
              validation_data=([valid_premises, valid_hyps], valid_labels),
              callbacks=[checkpoint],
              shuffle=True)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Train the ESIM model')
    parser.add_argument('preproc_dir',
                        help='Path to the directory containing the '
                             'preprocessed data to be used to train the '
                             'model.')
    parser.add_argument('output_dir',
                        help='Path to the directory where the learned '
                             'weights of the model must be saved.')
    parser.add_argument('--epochs', type=int, default=64,
                        help='Number of epochs to run for training.')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='Size of the mini-batches to use during '
                             'training.')
    parser.add_argument('--hidden_units', type=int, default=300,
                        help='Number of hidden units to use in the layers '
                             'of the model.')
    parser.add_argument('--dropout', type=float, default=0.5,
                        help='Dropout rate to use during training.')
    parser.add_argument('--nclasses', type=int, default=3,
                        help='Number of classes.')
    parser.add_argument('--max_length', type=int, default=100,
                        help='Max. length of the sentences for the premise '
                             'and hypothesis.')

    args = parser.parse_args()

    print("Starting training of the model...")
    train(args.preproc_dir, args.nclasses, args.max_length, args.hidden_units,
          args.dropout, args.batch_size, args.epochs, args.output_dir)
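
Assuming fetch_data.py and preprocess_data.py have been run with their default paths, training could be launched from the src directory along these lines (the ../output directory is an arbitrary choice, and the hyperparameters shown are simply the argparse defaults):

# Equivalent to: python train.py ../data/preprocessed ../output
from train import train

train("../data/preprocessed", n_classes=3, max_length=100, hidden_units=300,
      dropout=0.5, batch_size=32, epochs=64, output_dir="../output")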
--------------------------------------------------------------------------------
/src/utils.py:
--------------------------------------------------------------------------------
"""
Utility functions.
"""
# Aurelien Coet, 2018.

import os
import pickle
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical


def prepare_data(preproc_dir, dataset, n_classes, max_length=None):
    """
    Load and prepare preprocessed data for the ESIM model.

    Args:
        preproc_dir: The path to the directory containing the preprocessed
            data to be loaded.
        dataset: The type of the dataset that must be loaded (train, test or
            dev).
        n_classes: The number of classes in the problem.
        max_length: The maximum length of the padded sequences (longer
            sentences are truncated).

    Returns:
        A tuple containing numpy arrays. The first two are the premises and
        hypotheses of the dataset, padded with zeros so that they all have the
        same length. The third one is a numpy array containing the labels
        transformed to categorical form.
    """
    with open(os.path.join(preproc_dir, "premises_{}.pkl".format(dataset)),
              'rb') as premise_file:
        premises = pickle.load(premise_file)

    with open(os.path.join(preproc_dir, "hypotheses_{}.pkl".format(dataset)),
              'rb') as hypotheses_file:
        hypotheses = pickle.load(hypotheses_file)

    with open(os.path.join(preproc_dir, "labels_{}.pkl".format(dataset)),
              'rb') as labels_file:
        labels = pickle.load(labels_file)

    premises = pad_sequences(premises, maxlen=max_length,
                             padding='post', truncating='post')

    hypotheses = pad_sequences(hypotheses, maxlen=max_length,
                               padding='post', truncating='post')

    # Convert the labels to one-hot vectors.
    labels = to_categorical(labels, num_classes=n_classes)

    return (premises, hypotheses, labels)


def load_embeddings(filepath):
    """
    Load an embedding weights matrix from a pickle file.

    Args:
        filepath: The path to the file containing the embedding matrix.

    Returns:
        The embedding matrix.
    """
    with open(filepath, 'rb') as embed_file:
        embedding_weights = pickle.load(embed_file)

    return embedding_weights
--------------------------------------------------------------------------------
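
A toy illustration of the padding and one-hot conversion performed by prepare_data in utils.py, using the same Keras utilities (the sequences and labels below are made up):

from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

sequences = [[3, 7, 2], [4, 1]]
print(pad_sequences(sequences, maxlen=5, padding='post', truncating='post'))
# [[3 7 2 0 0]
#  [4 1 0 0 0]]
print(to_categorical([0, 2, 1], num_classes=3))
# One row per label, e.g. label 2 becomes [0. 0. 1.].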