├── README.md
└── src
    ├── fetch_data.py
    ├── layers.py
    ├── model.py
    ├── preprocess_data.py
    ├── train.py
    └── utils.py

/README.md:
--------------------------------------------------------------------------------
# ESIM
Implementation of the ESIM (Enhanced Sequential Inference Model) for natural language inference with Keras.

--------------------------------------------------------------------------------
/src/fetch_data.py:
--------------------------------------------------------------------------------
"""
Download the data necessary for the ESIM model:
    - Stanford Natural Language Inference (SNLI) dataset.
    - GloVe word embedding vectors.
"""
# Aurelien Coet, 2018.

import os
import zipfile


# Function from https://github.com/lukecq1231/nli/blob/master/data/download.py
def download(url, targetdir):
    """
    Download data from a URL and save it in some target directory.
    (Note: wget must be installed on the machine for this function to work.)

    Args:
        url: The URL from which the data must be downloaded.
        targetdir: The target directory where the downloaded data must be
            saved.

    Returns:
        The path to the downloaded data file.
    """
    filename = url.split('/')[-1]
    filepath = os.path.join(targetdir, filename)
    print("* Downloading data from {}".format(url))
    os.system("wget {} -O {}".format(url, filepath))
    return filepath


# Function from https://github.com/lukecq1231/nli/blob/master/data/download.py
def unzip(filepath):
    """
    Unzip a zipped file.

    Args:
        filepath: The path to the file to unzip.
    """
    print("* Extracting: {}".format(filepath))
    dirpath = os.path.dirname(filepath)
    with zipfile.ZipFile(filepath) as zf:
        zf.extractall(dirpath)
    os.remove(filepath)


def download_unzip(url, targetdir):
    """
    Download and unzip data from a URL and save it in a target directory.

    Args:
        url: The URL to download the data from.
        targetdir: The target directory in which to download and unzip the
            data.
    """
    filepath = os.path.join(targetdir, url.split('/')[-1])
    if not os.path.exists(targetdir):
        os.makedirs(targetdir)
    # Download and unzip if the target directory is empty.
    if not os.listdir(targetdir):
        unzip(download(url, targetdir))
    # Skip downloading if the zipped data is already available.
    elif os.path.exists(filepath):
        print("* Found zipped data - skipping download")
        unzip(filepath)
    # Skip unzipping if the unzipped data is already available.
    else:
        print("* Found unzipped data - skipping download and unzipping")


if __name__ == "__main__":
    datadir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                           "..", "data")
    snli_url = "https://nlp.stanford.edu/projects/snli/snli_1.0.zip"
    glove_url = "http://www-nlp.stanford.edu/data/glove.840B.300d.zip"
    print(20*'=' + "Fetching the SNLI data:" + 20*'=')
    download_unzip(snli_url, os.path.join(datadir, "snli"))
    print(20*'=' + "Fetching the GloVe data:" + 20*'=')
    download_unzip(glove_url, os.path.join(datadir, "glove"))
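
The download() helper above shells out to wget. Where wget is unavailable, a pure-Python variant can be sketched with the standard library; the helper below is only an illustration (download_with_urllib is not part of the repository), and the rest of the scripts keep using download() as defined above.

import os
import urllib.request


def download_with_urllib(url, targetdir):
    # Hypothetical wget-free variant with the same contract as download().
    filename = url.split('/')[-1]
    filepath = os.path.join(targetdir, filename)
    print("* Downloading data from {}".format(url))
    urllib.request.urlretrieve(url, filepath)
    return filepath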
--------------------------------------------------------------------------------
/src/layers.py:
--------------------------------------------------------------------------------
"""
Definition of the layers necessary for the ESIM model.

Inspired by the code at:
https://github.com/yuhsinliu1993/Quora_QuestionPairs_DL
"""

import keras.backend as K
from keras.models import Sequential
from keras.layers import *


class EmbeddingLayer(object):
    """
    Layer to transform words represented by indices to word embeddings.
    """

    def __init__(self, voc_size, output_dim, embedding_weights=None,
                 max_length=100, trainable=True, mask_zero=False):
        self.voc_size = voc_size
        self.output_dim = output_dim
        self.max_length = max_length

        if embedding_weights is not None:
            self.model = Embedding(voc_size, output_dim,
                                   weights=[embedding_weights],
                                   input_length=max_length,
                                   trainable=trainable, mask_zero=mask_zero,
                                   name='embedding')
        else:
            # If no pretrained embedding weights are passed to the initialiser,
            # the embedding is set to be trainable by default.
            self.model = Embedding(voc_size, output_dim,
                                   input_length=max_length, trainable=True,
                                   mask_zero=mask_zero, name='embedding')

    def __call__(self, input):
        return self.model(input)


class EncodingLayer(object):
    """
    Layer to encode variable length sentences with a BiLSTM.
    """

    def __init__(self, hidden_units, max_length=100, dropout=0.5,
                 activation='tanh', sequences=True):
        self.layer = Bidirectional(LSTM(hidden_units, activation=activation,
                                        return_sequences=sequences,
                                        dropout=dropout,
                                        recurrent_dropout=dropout),
                                   merge_mode='concat')

    def __call__(self, input):
        return self.layer(input)
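
A minimal sketch of how the two wrappers above chain together, assuming the standalone Keras 2 API used in this repository and random weights standing in for real GloVe vectors (the sizes are hypothetical and nothing is trained here):

import numpy as np
from keras.layers import Input
from layers import EmbeddingLayer, EncodingLayer  # assumes running from src/

# Hypothetical sizes: 1000-word vocabulary, 50-token sentences, 300-d embeddings.
weights = np.random.normal(size=(1000, 300))
sentence = Input(shape=(50,), dtype='int32')

embedded = EmbeddingLayer(1000, 300, weights, max_length=50)(sentence)  # (batch, 50, 300)
encoded = EncodingLayer(64, max_length=50)(embedded)                    # (batch, 50, 128), BiLSTM 'concat'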

class LocalInferenceLayer(object):
    """
    Layer to compute local inference between two encoded sentences a and b.
    """

    def __call__(self, inputs):
        a = inputs[0]
        b = inputs[1]

        attention = Lambda(self._attention,
                           self._attention_output_shape)(inputs)

        align_a = Lambda(self._soft_alignment,
                         self._soft_alignment_output_shape)([attention, a])
        align_b = Lambda(self._soft_alignment,
                         self._soft_alignment_output_shape)([attention, b])

        # Enhancement of the local inference information obtained with the
        # attention mechanism and soft alignments.
        sub_a_align = Lambda(lambda x: x[0]-x[1])([a, align_a])
        sub_b_align = Lambda(lambda x: x[0]-x[1])([b, align_b])

        mul_a_align = Lambda(lambda x: x[0]*x[1])([a, align_a])
        mul_b_align = Lambda(lambda x: x[0]*x[1])([b, align_b])

        m_a = concatenate([a, align_a, sub_a_align, mul_a_align])
        m_b = concatenate([b, align_b, sub_b_align, mul_b_align])

        return m_a, m_b

    def _attention(self, inputs):
        """
        Compute the attention between elements of two sentences with the dot
        product.

        Args:
            inputs: A list containing two elements, one for the first sentence
                and one for the second, both encoded by a BiLSTM.

        Returns:
            A tensor containing the attention weights (dot products) between
            the elements of the two sentences.
        """
        attn_weights = K.batch_dot(x=inputs[0],
                                   y=K.permute_dimensions(inputs[1],
                                                          pattern=(0, 2, 1)))
        return K.permute_dimensions(attn_weights, (0, 2, 1))

    def _attention_output_shape(self, inputs):
        input_shape = inputs[0]
        # The attention weights form a (sequence length x sequence length)
        # matrix for each element in the batch.
        length = input_shape[1]
        return (input_shape[0], length, length)

    def _soft_alignment(self, inputs):
        """
        Compute the soft alignment between the elements of two sentences.

        Args:
            inputs: A list of two elements, the first is a tensor of attention
                weights, the second is the encoded sentence on which to
                compute the alignments.

        Returns:
            A tensor containing the alignments.
        """
        attention = inputs[0]
        sentence = inputs[1]

        # Subtract the max. from the attention weights to avoid overflows.
        exp = K.exp(attention - K.max(attention, axis=-1, keepdims=True))
        exp_sum = K.sum(exp, axis=-1, keepdims=True)
        softmax = exp / exp_sum

        return K.batch_dot(softmax, sentence)

    def _soft_alignment_output_shape(self, inputs):
        attention_shape = inputs[0]
        sentence_shape = inputs[1]
        return (attention_shape[0], attention_shape[1], sentence_shape[2])
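
The two Lambda helpers above boil down to a matrix of dot products followed by a softmax over its last axis and a weighted sum. A single-example numpy sketch of that computation (the batch dimension is dropped for readability; in the model both sentences are padded to the same max_length):

import numpy as np

len_a, len_b, dim = 4, 5, 8                 # hypothetical toy sizes
a = np.random.normal(size=(len_a, dim))     # encoded sentence a
b = np.random.normal(size=(len_b, dim))     # encoded sentence b

# _attention: dot products between all pairs of time steps, then a transpose.
attention = (a @ b.T).T                     # shape (len_b, len_a)

# _soft_alignment: numerically stable softmax over the last axis...
exp = np.exp(attention - attention.max(axis=-1, keepdims=True))
softmax = exp / exp.sum(axis=-1, keepdims=True)

# ...followed by a weighted sum of the encoded time steps (here those of a).
aligned = softmax @ a                       # shape (len_b, dim)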

class InferenceCompositionLayer(object):
    """
    Layer to compose the local inference information.
    """

    def __init__(self, hidden_units, max_length=100, dropout=0.5,
                 activation='tanh', sequences=True):
        self.hidden_units = hidden_units
        self.max_length = max_length
        self.dropout = dropout
        self.activation = activation
        self.sequences = sequences

    def __call__(self, input):
        composition = Bidirectional(LSTM(self.hidden_units,
                                         activation=self.activation,
                                         return_sequences=self.sequences,
                                         recurrent_dropout=self.dropout,
                                         dropout=self.dropout))(input)
        reduction = TimeDistributed(Dense(self.hidden_units,
                                          kernel_initializer='he_normal',
                                          activation='relu'))(composition)

        return Dropout(self.dropout)(reduction)


class PoolingLayer(object):
    """
    Pooling layer to convert the vectors obtained in the previous layers to
    fixed-length vectors.
    """

    def __call__(self, inputs):
        a = inputs[0]
        b = inputs[1]

        a_avg = GlobalAveragePooling1D()(a)
        a_max = GlobalMaxPooling1D()(a)

        b_avg = GlobalAveragePooling1D()(b)
        b_max = GlobalMaxPooling1D()(b)

        return concatenate([a_avg, a_max, b_avg, b_max])


class MLPLayer(object):
    """
    Multi-layer perceptron for classification.
    """

    def __init__(self, hidden_units, n_classes, dropout=0.5,
                 activations=['tanh', 'softmax']):
        self.model = Sequential()
        self.model.add(Dense(hidden_units, kernel_initializer='he_normal',
                             activation=activations[0],
                             input_shape=(4*hidden_units,)))
        self.model.add(Dropout(dropout))
        self.model.add(Dense(n_classes, kernel_initializer='zero',
                             activation=activations[1]))

    def __call__(self, input):
        return self.model(input)
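
To see why MLPLayer expects inputs of size 4 * hidden_units: the composition layer reduces each time step to hidden_units dimensions, and the pooling layer concatenates an average and a max over time for both sentences. A rough numpy sketch of that bookkeeping, with hypothetical sizes:

import numpy as np

max_length, hidden_units = 100, 300         # hypothetical sizes
composed_a = np.random.normal(size=(max_length, hidden_units))
composed_b = np.random.normal(size=(max_length, hidden_units))

pooled = np.concatenate([composed_a.mean(axis=0), composed_a.max(axis=0),
                         composed_b.mean(axis=0), composed_b.max(axis=0)])
assert pooled.shape == (4 * hidden_units,)  # matches MLPLayer's input_shape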
--------------------------------------------------------------------------------
/src/model.py:
--------------------------------------------------------------------------------
"""
Definition of the ESIM model.

Inspired by the code at:
https://github.com/yuhsinliu1993/Quora_QuestionPairs_DL
"""

from keras.layers import Input
from keras.models import Model
from keras.optimizers import Adam
from layers import *


class ESIM(object):
    """
    ESIM model for Natural Language Inference (NLI) tasks.
    """

    def __init__(self, n_classes, embedding_weights, max_length, hidden_units,
                 dropout=0.5, learning_rate=0.0004):
        self.n_classes = n_classes
        self.embedding_weights = embedding_weights
        self.voc_size, self.embedding_dim = embedding_weights.shape
        self.max_length = max_length
        self.hidden_units = hidden_units
        self.dropout = dropout
        self.learning_rate = learning_rate

    def build_model(self):
        """
        Build the model.

        Returns:
            The ESIM model, compiled and ready to be trained.
        """
        a = Input(shape=(self.max_length,), dtype='int32', name='premise')
        b = Input(shape=(self.max_length,), dtype='int32', name='hypothesis')

        # ---------- Embedding layer ---------- #
        embedding = EmbeddingLayer(self.voc_size, self.embedding_dim,
                                   self.embedding_weights,
                                   max_length=self.max_length)

        embedded_a = embedding(a)
        embedded_b = embedding(b)

        # ---------- Encoding layer ---------- #
        encoded_a = EncodingLayer(self.hidden_units,
                                  self.max_length,
                                  dropout=self.dropout)(embedded_a)
        encoded_b = EncodingLayer(self.hidden_units,
                                  self.max_length,
                                  dropout=self.dropout)(embedded_b)

        # ---------- Local inference layer ---------- #
        m_a, m_b = LocalInferenceLayer()([encoded_a, encoded_b])

        # ---------- Inference composition layer ---------- #
        composed_a = InferenceCompositionLayer(self.hidden_units,
                                               self.max_length,
                                               dropout=self.dropout)(m_a)
        composed_b = InferenceCompositionLayer(self.hidden_units,
                                               self.max_length,
                                               dropout=self.dropout)(m_b)

        # ---------- Pooling layer ---------- #
        pooled = PoolingLayer()([composed_a, composed_b])

        # ---------- Classification layer ---------- #
        prediction = MLPLayer(self.hidden_units, self.n_classes,
                              dropout=self.dropout)(pooled)

        model = Model(inputs=[a, b], outputs=prediction)
        model.compile(optimizer=Adam(lr=self.learning_rate),
                      loss='categorical_crossentropy', metrics=['accuracy'])

        return model
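
A minimal usage sketch for this class, assuming it is run from the src directory and using random weights in place of the GloVe matrix produced by preprocess_data.py (the sizes are hypothetical):

import numpy as np
from model import ESIM

# Hypothetical sizes: 10000-word vocabulary, 300-d embeddings.
embedding_weights = np.random.normal(size=(10000, 300))
esim = ESIM(n_classes=3, embedding_weights=embedding_weights,
            max_length=100, hidden_units=300, dropout=0.5)
model = esim.build_model()
model.summary()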
--------------------------------------------------------------------------------
/src/preprocess_data.py:
--------------------------------------------------------------------------------
"""
Preprocess the data necessary for the ESIM model.
"""
# Aurelien Coet, 2018.

import os
import pickle
import numpy as np
from keras.preprocessing.text import Tokenizer


def read_data(filepath):
    """
    Read the premises, hypotheses and labels from a file in the SNLI dataset
    and return them in three separate lists.

    Args:
        filepath: The path to a file from the SNLI dataset.

    Returns:
        A dictionary containing three lists, one for the premises, one for the
        hypotheses and one for the labels.
    """
    labels_dict = {"entailment": '0', "neutral": '1', "contradiction": '2'}

    premises = []
    hypotheses = []
    labels = []
    premises_lens = []
    hypotheses_lens = []

    with open(filepath, 'r') as input:
        # Ignore the first line containing headers.
        next(input)
        for line in input:
            line = line.strip().split('\t')
            # Ignore examples without a gold label.
            if line[0] == '-':
                continue

            # Read the premise.
            sentence = line[5].rstrip()
            premises.append(sentence)
            premises_lens.append(len(sentence.split()))

            # Read the hypothesis.
            sentence = line[6].rstrip()
            hypotheses.append(sentence)
            hypotheses_lens.append(len(sentence.split()))

            # Read the label.
            labels.append(labels_dict[line[0]])

    print("Min. premise length: {}, max. premise length: {}"
          .format(min(premises_lens), max(premises_lens)))
    print("Min. hypothesis length: {}, max. hypothesis length: {}"
          .format(min(hypotheses_lens), max(hypotheses_lens)))

    return {"premises": premises, "hypotheses": hypotheses,
            "labels": labels}


def save_preprocessed_data(tokenizer, data, dataset, targetdir):
    """
    Save the preprocessed data to pickle files for later use. The preprocessed
    data consists of the premises and hypotheses with their words transformed
    to their indices, and the labels transformed to class indices.

    Args:
        tokenizer: A Keras Tokenizer object that has already been fit on
            the training data (its word_index has been built).
        data: A dictionary containing lists of strings for the sentences in
            the premises and hypotheses, as well as a list with their
            associated labels.
        dataset: A string indicating the type of dataset being saved (train,
            test or dev).
        targetdir: The target directory in which to save the pickled files.
    """
    # Transform the words in the input data to their indices and save them
    # in separate pickle files for the premises and hypotheses.
    with open(os.path.join(targetdir, "premises_{}.pkl".format(dataset)),
              'wb') as output:
        pickle.dump(tokenizer.texts_to_sequences(data["premises"]), output)

    with open(os.path.join(targetdir, "hypotheses_{}.pkl".format(dataset)),
              'wb') as output:
        pickle.dump(tokenizer.texts_to_sequences(data["hypotheses"]), output)

    # Pickle the labels too.
    with open(os.path.join(targetdir, "labels_{}.pkl".format(dataset)),
              'wb') as output:
        pickle.dump(data["labels"], output)
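
A toy sketch of what the Tokenizer-based transformation in save_preprocessed_data does to a sentence (the sentences below are made up, and the actual indices depend on word frequencies in the SNLI training data):

from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(lower=False, oov_token="__OOV__")
tokenizer.fit_on_texts(["A man is sleeping", "A man is reading a book"])
# Each sentence becomes a list of integer word indices; unseen words such as
# "outside" are mapped to the index of the '__OOV__' token.
print(tokenizer.texts_to_sequences(["A man is sleeping outside"]))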

def build_embedding_weights(worddict, embeddings_file, targetdir):
    """
    Build the embedding weights matrix from a word dictionary and existing
    embeddings, and save it in pickled form.

    Args:
        worddict: A dictionary of words with their associated integer index.
        embeddings_file: A file containing predefined word embeddings.
        targetdir: The path to the target directory where the embedding
            weights matrix must be saved.
    """
    print("* Loading word embeddings from {}...".format(embeddings_file))
    # Load the word embeddings in a dictionary.
    with open(embeddings_file, 'r') as input:
        embeddings = {}
        for line in input:
            line = line.split()
            # Ignore lines where the 'word' itself contains spaces: with
            # 300-dimensional GloVe vectors, such lines do not split into
            # exactly 301 fields.
            if len(line[1:]) != 300:
                continue
            word = line[0]
            if word in worddict:
                # Remember one in-vocabulary word to retrieve the embedding
                # dimensionality below.
                last = word
                embeddings[word] = line[1:]

    print("* Building embedding weights matrix...")
    # Initialize the embedding weights matrix.
    num_words = len(worddict)
    dims = len(embeddings[last])
    embedding_weights = np.zeros((num_words, dims))

    # Build the embedding weights matrix.
    for word, i in worddict.items():
        if word in embeddings:
            embedding_weights[i] = embeddings[word]
        else:
            # Out of vocabulary words are initialised with random gaussian
            # samples.
            embedding_weights[i] = np.random.normal(size=(dims))

    # Save the matrix in pickled form.
    with open(os.path.join(targetdir, "embedding_weights.pkl"),
              'wb') as output:
        pickle.dump(embedding_weights, output)


def preprocess_data(train_file, test_file, dev_file, embeddings_file,
                    targetdir):
    """
    Preprocess the data for the ESIM model. Compute the word indices from the
    training data, transform all words in the train/test/dev datasets to their
    indices, save them in pickled files, and finally build the embedding
    matrix and save it in pickled form.

    Args:
        train_file: The path to the file containing the training data from the
            SNLI dataset.
        test_file: The path to the file containing the test data from the SNLI
            dataset.
        dev_file: The path to the file containing the dev data from the SNLI
            dataset.
        embeddings_file: The path to the file containing the word embeddings to
            use for the embedding matrix.
        targetdir: The path to the target directory for the pickled files
            produced by the function.
    """
    print(20*"=" + "Processing train data..." + 20*"=")
    data = read_data(train_file)

    # Build the dictionary of words from the training data with Keras'
    # Tokenizer class. A special token is created for out of voc. words
    # (token '__OOV__'), and index 0 is reserved for padding.
    print("* Building word index dictionary...")
    tokenizer = Tokenizer(lower=False, oov_token="__OOV__")
    tokenizer.fit_on_texts(data["premises"]+data["hypotheses"])
    tokenizer.word_index["__PAD__"] = 0
    print("** Total number of words: {}".format(len(tokenizer.word_index)))
    # Save the dictionary in a pickle file.
    if not os.path.exists(targetdir):
        os.makedirs(targetdir)
    with open(os.path.join(targetdir, "worddict.pkl"), 'wb') as pkl_f:
        pickle.dump(tokenizer.word_index, pkl_f)

    print("* Transforming and saving train data...")
    save_preprocessed_data(tokenizer, data, "train", targetdir)

    # Preprocess and save the test dataset.
    print(20*"=" + "Processing test data..." + 20*"=")
    data = read_data(test_file)
    print("* Transforming and saving test data...")
    save_preprocessed_data(tokenizer, data, "test", targetdir)

    # Preprocess and save the dev dataset.
    print(20*"=" + "Processing dev data..." + 20*"=")
    data = read_data(dev_file)
    print("* Transforming and saving dev data...")
    save_preprocessed_data(tokenizer, data, "dev", targetdir)

    # Create and save the embedding weights matrix.
    print(20*"=" + "Building embedding weights matrix..." + 20*"=")
    build_embedding_weights(tokenizer.word_index, embeddings_file, targetdir)


if __name__ == "__main__":
    basedir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                           "..", "data")
    snli_dir = os.path.join(basedir, "snli", "snli_1.0")
    glove_dir = os.path.join(basedir, "glove")
    targetdir = os.path.join(basedir, "preprocessed")

    preprocess_data(os.path.join(snli_dir, "snli_1.0_train.txt"),
                    os.path.join(snli_dir, "snli_1.0_test.txt"),
                    os.path.join(snli_dir, "snli_1.0_dev.txt"),
                    os.path.join(glove_dir, "glove.840B.300d.txt"),
                    targetdir)
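
A quick sanity check that can be run after preprocessing, assuming the default target directory data/preprocessed used by the __main__ block above: the pickled embedding matrix should contain one 300-dimensional row per entry of the word dictionary.

import os
import pickle

# Assumes the script was run with its default paths, from the repository root.
preproc_dir = os.path.join("data", "preprocessed")
with open(os.path.join(preproc_dir, "worddict.pkl"), 'rb') as pkl_f:
    worddict = pickle.load(pkl_f)
with open(os.path.join(preproc_dir, "embedding_weights.pkl"), 'rb') as pkl_f:
    embedding_weights = pickle.load(pkl_f)

assert embedding_weights.shape == (len(worddict), 300)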
--------------------------------------------------------------------------------
/src/train.py:
--------------------------------------------------------------------------------
"""
Train the ESIM model on some dataset.
"""

import os
import argparse
from keras.callbacks import ModelCheckpoint
from model import ESIM
from utils import prepare_data, load_embeddings


def train(preproc_dir, n_classes, max_length, hidden_units, dropout,
          batch_size, epochs, output_dir):
    """
    Train the ESIM model on some dataset and save the learned weights.

    Args:
        preproc_dir: The directory where the preprocessed data is saved.
        n_classes: The number of classes in the problem.
        max_length: The maximum length of the sentences in the premises and
            hypotheses of the dataset.
        hidden_units: The number of hidden units to use in the various layers
            of the model.
        dropout: The dropout rate to use in the model.
        batch_size: The size of the batches to use for training.
        epochs: The number of epochs to train the model for.
        output_dir: The path to the directory where the weights learned during
            training must be saved.
    """
    print("Loading training and validation data...")
    train_premises, train_hyps, train_labels = prepare_data(preproc_dir,
                                                            'train',
                                                            n_classes,
                                                            max_length)
    valid_premises, valid_hyps, valid_labels = prepare_data(preproc_dir,
                                                            'dev',
                                                            n_classes,
                                                            max_length)

    print("Loading embedding weights...")
    embedding_weights = load_embeddings(os.path.join(preproc_dir,
                                                     "embedding_weights.pkl"))

    # Build the model.
    esim = ESIM(n_classes, embedding_weights, max_length, hidden_units,
                dropout)
    model = esim.build_model()

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    # Save the weights of the model with the best validation accuracy seen
    # during training.
    filepath = os.path.join(output_dir,
                            "weights-{epoch:02d}-{val_acc:.2f}.hdf5")
    checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1,
                                 save_best_only=True, mode='max')

    model.fit(x=[train_premises, train_hyps],
              y=train_labels,
              batch_size=batch_size,
              epochs=epochs,
              validation_data=([valid_premises, valid_hyps], valid_labels),
              callbacks=[checkpoint],
              shuffle=True)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Train the ESIM model')
    parser.add_argument('preproc_dir',
                        help='Path to the directory containing the '
                             'preprocessed data to be used to train the '
                             'model.')
    parser.add_argument('output_dir',
                        help='Path to the directory where the learned '
                             'weights of the model must be saved.')
    parser.add_argument('--epochs', type=int, default=64,
                        help='Number of epochs to run for training.')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='Size of the mini-batches to use during '
                             'training.')
    parser.add_argument('--hidden_units', type=int, default=300,
                        help='Number of hidden units to use in the layers '
                             'of the model.')
    parser.add_argument('--dropout', type=float, default=0.5,
                        help='Dropout rate to use during training.')
    parser.add_argument('--nclasses', type=int, default=3,
                        help='Number of classes.')
    parser.add_argument('--max_length', type=int, default=100,
                        help='Max. length of the sentences for the premise '
                             'and hypothesis.')

    args = parser.parse_args()

    print("Starting training of the model...")
    train(args.preproc_dir, args.nclasses, args.max_length, args.hidden_units,
          args.dropout, args.batch_size, args.epochs, args.output_dir)
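
Assuming fetch_data.py and preprocess_data.py have been run with their default paths, training could be launched from the src directory along these lines (the ../output directory is an arbitrary choice, and the hyperparameters shown are simply the argparse defaults):

# Equivalent to: python train.py ../data/preprocessed ../output
from train import train

train("../data/preprocessed", n_classes=3, max_length=100, hidden_units=300,
      dropout=0.5, batch_size=32, epochs=64, output_dir="../output")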
--------------------------------------------------------------------------------
/src/utils.py:
--------------------------------------------------------------------------------
"""
Utility functions.
"""
# Aurelien Coet, 2018.

import os
import pickle
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical


def prepare_data(preproc_dir, dataset, n_classes, max_length=None):
    """
    Load and prepare preprocessed data for the ESIM model.

    Args:
        preproc_dir: The path to the directory containing the preprocessed
            data to be loaded.
        dataset: The type of the dataset that must be loaded (train, test or
            dev).
        n_classes: The number of classes in the problem.
        max_length: The maximum length of the padded sequences (longer
            sentences are truncated).

    Returns:
        A tuple containing numpy arrays. The first two are the premises and
        hypotheses of the dataset, padded with zeros so that they all have the
        same length. The third one is a numpy array containing the labels
        transformed to categorical form.
    """
    with open(os.path.join(preproc_dir, "premises_{}.pkl".format(dataset)),
              'rb') as premise_file:
        premises = pickle.load(premise_file)

    with open(os.path.join(preproc_dir, "hypotheses_{}.pkl".format(dataset)),
              'rb') as hypotheses_file:
        hypotheses = pickle.load(hypotheses_file)

    with open(os.path.join(preproc_dir, "labels_{}.pkl".format(dataset)),
              'rb') as labels_file:
        labels = pickle.load(labels_file)

    premises = pad_sequences(premises, maxlen=max_length,
                             padding='post', truncating='post')

    hypotheses = pad_sequences(hypotheses, maxlen=max_length,
                               padding='post', truncating='post')

    # Convert the labels to one-hot vectors.
    labels = to_categorical(labels, num_classes=n_classes)

    return (premises, hypotheses, labels)


def load_embeddings(filepath):
    """
    Load an embedding weights matrix from a pickle file.

    Args:
        filepath: The path to the file containing the embedding matrix.

    Returns:
        The embedding matrix.
    """
    with open(filepath, 'rb') as embed_file:
        embedding_weights = pickle.load(embed_file)

    return embedding_weights
--------------------------------------------------------------------------------
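
A toy illustration of the padding and one-hot conversion performed by prepare_data in utils.py, using the same Keras utilities (the sequences and labels below are made up):

from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

sequences = [[3, 7, 2], [4, 1]]
print(pad_sequences(sequences, maxlen=5, padding='post', truncating='post'))
# [[3 7 2 0 0]
#  [4 1 0 0 0]]
print(to_categorical([0, 2, 1], num_classes=3))
# One row per label, e.g. label 2 becomes [0. 0. 1.].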