├── README.md
├── config.py
├── models.py
├── data_helper.py
└── main.py

/README.md:
--------------------------------------------------------------------------------
# ESIM
ESIM model: a Keras implementation of Enhanced LSTM for Natural Language Inference (ESIM), applied here to Chinese question-pair matching.
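
## Usage

Training (10-fold cross-validation) and evaluation are driven by `main.py`; switch between `train()` and `evaluate()` in its `__main__` block. The sketch below shows how the modules fit together when used directly. It is only a minimal example: it assumes the dataset CSV (columns `Q1`, `Q2`, `label`), the pretrained word2vec model path configured in `config.py`, and the two GPUs required by `multi_gpu_model` in `models.py` are all available; the `epochs` / `validation_split` values are illustrative.

```python
from config import Parameters as pm
from data_helper import Dataloader
from models import get_ESIM_model

# Dataloader reads the CSV from config.py, cleans and segments the text with jieba,
# and builds the padded index sequences plus the pretrained embedding matrix.
loader = Dataloader()

model = get_ESIM_model(loader.nb_words + 1, pm.EMBEDDING_DIM, loader.embedding_matrix,
                       pm.RECURRENT_UNITS, pm.DENSE_UNITS, pm.DROPOUT_RATE,
                       pm.MAX_SEQUENCE_LENGTH, 1)
model.fit([loader.q1_sequences, loader.q2_sequences], loader.label,
          batch_size=pm.BATCH_SIZE, epochs=1, validation_split=0.1)
```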
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
class Parameters(object):

    jieba_dictionary = './dataset/segdict.txt'
    embedding_path = '/home/linhx/word2vec/model/word2vec_wx'
    clean_path = './dataset/stopwords.txt'
    train_data_path = './dataset/dataset.csv'
    model_path = './checkpoint/pre_emb/'

    MAX_NB_WORDS = 30000
    BATCH_SIZE = 128
    EMBEDDING_DIM = 256
    MAX_SEQUENCE_LENGTH = 50
    RECURRENT_UNITS = 300
    DENSE_UNITS = 300
    DROPOUT_RATE = 0.5

    keep_punctuation = True
    clean_data = True
    remove_stopwords = False
    use_owndict = False
--------------------------------------------------------------------------------
/models.py:
--------------------------------------------------------------------------------
import keras
from keras.layers import *
from keras.activations import softmax
from keras.models import Model
from keras.layers.merge import concatenate
from keras.layers.normalization import BatchNormalization
from keras.utils import multi_gpu_model


def get_ESIM_model(nb_words, embedding_dim, embedding_matrix, recurrent_units, dense_units, dropout_rate, max_sequence_length, out_size):
    embedding_layer = Embedding(nb_words,
                                embedding_dim,
                                # embeddings_initializer='uniform',
                                weights=[embedding_matrix],
                                input_length=max_sequence_length,
                                trainable=False)

    input_q1_layer = Input(shape=(max_sequence_length,), dtype='int32', name='q1')
    input_q2_layer = Input(shape=(max_sequence_length,), dtype='int32', name='q2')

    # Input encoding: shared frozen embedding + BatchNorm + SpatialDropout, then a BiLSTM per question.
    embedding_sequence_q1 = BatchNormalization(axis=2)(embedding_layer(input_q1_layer))
    embedding_sequence_q2 = BatchNormalization(axis=2)(embedding_layer(input_q2_layer))

    final_embedding_sequence_q1 = SpatialDropout1D(0.25)(embedding_sequence_q1)
    final_embedding_sequence_q2 = SpatialDropout1D(0.25)(embedding_sequence_q2)

    rnn_layer_q1 = Bidirectional(LSTM(recurrent_units, return_sequences=True))(final_embedding_sequence_q1)
    rnn_layer_q2 = Bidirectional(LSTM(recurrent_units, return_sequences=True))(final_embedding_sequence_q2)

    # Local inference: soft attention between the two encoded sequences.
    attention = Dot(axes=-1)([rnn_layer_q1, rnn_layer_q2])
    w_attn_1 = Lambda(lambda x: softmax(x, axis=1))(attention)
    w_attn_2 = Permute((2, 1))(Lambda(lambda x: softmax(x, axis=2))(attention))
    align_layer_1 = Dot(axes=1)([w_attn_1, rnn_layer_q1])
    align_layer_2 = Dot(axes=1)([w_attn_2, rnn_layer_q2])

    # Enhancement: concatenate encoded, aligned, difference and element-wise product features.
    subtract_layer_1 = subtract([rnn_layer_q1, align_layer_1])
    subtract_layer_2 = subtract([rnn_layer_q2, align_layer_2])

    multiply_layer_1 = multiply([rnn_layer_q1, align_layer_1])
    multiply_layer_2 = multiply([rnn_layer_q2, align_layer_2])

    m_q1 = concatenate([rnn_layer_q1, align_layer_1, subtract_layer_1, multiply_layer_1])
    m_q2 = concatenate([rnn_layer_q2, align_layer_2, subtract_layer_2, multiply_layer_2])

    # Inference composition: a second BiLSTM over the enhanced representations.
    v_q1_i = Bidirectional(LSTM(recurrent_units, return_sequences=True))(m_q1)
    v_q2_i = Bidirectional(LSTM(recurrent_units, return_sequences=True))(m_q2)

    # Pooling: average and max pooling over time for both questions.
    avgpool_q1 = GlobalAveragePooling1D()(v_q1_i)
    avgpool_q2 = GlobalAveragePooling1D()(v_q2_i)
    maxpool_q1 = GlobalMaxPooling1D()(v_q1_i)
    maxpool_q2 = GlobalMaxPooling1D()(v_q2_i)

    merged_q1 = concatenate([avgpool_q1, maxpool_q1])
    merged_q2 = concatenate([avgpool_q2, maxpool_q2])

    # Prediction: MLP with a sigmoid output for binary matching (out_size = 1).
    final_v = BatchNormalization()(concatenate([merged_q1, merged_q2]))
    output = Dense(units=dense_units, activation='relu')(final_v)
    output = BatchNormalization()(output)
    output = Dropout(dropout_rate)(output)
    output = Dense(units=out_size, activation='sigmoid')(output)

    model = Model(inputs=[input_q1_layer, input_q2_layer], outputs=output)
    adam_optimizer = keras.optimizers.Adam(lr=1e-3, decay=1e-6, clipvalue=5)
    # Data-parallel training on two GPUs; `model` is the underlying single-GPU graph.
    parallel_model = multi_gpu_model(model, gpus=2)

    parallel_model.compile(loss='binary_crossentropy', optimizer=adam_optimizer, metrics=['binary_crossentropy', 'accuracy'])

    return parallel_model
--------------------------------------------------------------------------------
/data_helper.py:
--------------------------------------------------------------------------------
import pandas as pd
import numpy as np
from string import punctuation as p
from config import Parameters as pm
from tqdm import tqdm
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from gensim.models import word2vec
import re, jieba

if pm.use_owndict:
    jieba.load_userdict(pm.jieba_dictionary)

jieba.suggest_freq(('亲', '工'), True)
jieba.suggest_freq(('对', '子'), True)


class Dataloader(object):
    def __init__(self):
        self.q1_data, self.q2_data, self.label = self.read_dataset(pm.train_data_path)
        self.embedding_index = self.load_pretrain_embedding(pm.embedding_path)
        if pm.clean_data:
            if pm.remove_stopwords:
                self.ignored_word = self.load_clean_words(pm.clean_path)
            self.cleaned_q1_data, self.cleaned_q2_data = [], []
            for text in self.q1_data:
                self.cleaned_q1_data.append(self.clean_data(text))
            for text in self.q2_data:
                self.cleaned_q2_data.append(self.clean_data(text))
        else:
            # Fall back to the raw text when cleaning is disabled.
            self.cleaned_q1_data, self.cleaned_q2_data = list(self.q1_data), list(self.q2_data)
        self.q1_sequences, self.q2_sequences, self.word_index = self.tokenizer()
        self.nb_words, self.embedding_matrix = self.prepare_embedding_matrix()

    def read_dataset(self, train_path):
        train = pd.read_csv(train_path)

        q1_data = train['Q1'].values
        q2_data = train['Q2'].values
        label = train['label'].values

        return q1_data, q2_data, label

    def load_pretrain_embedding(self, file):
        print('Indexing word vectors...')
        embedding_index = word2vec.Word2Vec.load(file)

        return embedding_index

    def load_clean_words(self, file):
        clean_word_dict = {}
        with open(file, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip('\n')
                # typo, correct = line.split(',')
                # clean_word_dict[typo] = correct
                clean_word_dict[line] = ','

        return clean_word_dict

    def clean_data(self, text):
        replace_numbers = re.compile(r'\d+', re.IGNORECASE)

        text = text.lower()
        # Strip URLs and IP addresses.
        text = re.sub(r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)", "", text)
        text = re.sub(r"(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}", "", text)
        # Expand common English contractions.
        text = re.sub(r"what's", "what is ", text)
        text = re.sub(r"\'s", " ", text)
        text = re.sub(r"\'ve", " have ", text)
        text = re.sub(r"can't", "cannot ", text)
        text = re.sub(r"n't", " not ", text)
        text = re.sub(r"i'm", "i am ", text)
        text = re.sub(r"i’m", "i am", text)
        text = re.sub(r"\'re", " are ", text)
        text = re.sub(r"\'d", " would ", text)
        text = re.sub(r"\'ll", " will ", text)
        text = re.sub(r"e - mail", "email", text)
        # Drop all remaining whitespace; jieba re-segments the text later.
        text = re.sub(r" +", "", text)

        stop_p = p + "~·!@#¥%……&*()——=+-{}【】:;“”‘’《》,。?、|、"

        if pm.keep_punctuation:
            # Normalise full-width / curly punctuation and pad symbols with spaces.
            text = re.sub(r"”", "\"", text)
            text = re.sub(r"“", "\"", text)
            text = re.sub(r"´", "'", text)
            text = re.sub(r"—", " ", text)
            text = re.sub(r"’", "'", text)
            text = re.sub(r"‘", "'", text)
            text = re.sub(r",", " ", text)
            text = re.sub(r"\.", " ", text)
            text = re.sub(r"!", " ! ", text)
            text = re.sub(r"\/", " ", text)
            text = re.sub(r"\^", " ^ ", text)
            text = re.sub(r"\+", " + ", text)
            text = re.sub(r"\-", " - ", text)
            text = re.sub(r"\=", " = ", text)
            text = re.sub(r"'", " ", text)
            text = re.sub(r":", " : ", text)
            text = re.sub(r"−", " ", text)
            text = re.sub(r"\?", " ? ", text)
            text = re.sub(r"#", " # ", text)
            text = re.sub(r"¥", "$", text)
        else:
            # Drop all punctuation (plain string replace, since many tokens are regex metacharacters).
            for token in stop_p:
                text = text.replace(token, "")

        text = replace_numbers.sub('', text)

        if pm.remove_stopwords:
            text = "".join([word for word in text if word not in self.ignored_word])

        return text

    def tokenizer(self):
        tokenizer = Tokenizer(num_words=pm.MAX_NB_WORDS, filters='"#$%&()+,-./:;<=>@[\\]^_`{|}~\t\n')
        q1_cutted_data = self.segmentation(self.cleaned_q1_data)
        q2_cutted_data = self.segmentation(self.cleaned_q2_data)

        tokenizer.fit_on_texts(q1_cutted_data + q2_cutted_data)
        q1_sequences = tokenizer.texts_to_sequences(q1_cutted_data)
        q2_sequences = tokenizer.texts_to_sequences(q2_cutted_data)

        word_index = tokenizer.word_index
        print('Found %s unique tokens' % len(word_index))

        # Padding
        q1_data = pad_sequences(q1_sequences, maxlen=pm.MAX_SEQUENCE_LENGTH)
        print('Shape of q1_data tensor: ', q1_data.shape)
        q2_data = pad_sequences(q2_sequences, maxlen=pm.MAX_SEQUENCE_LENGTH)
        print('Shape of q2_data tensor: ', q2_data.shape)
        print('Shape of label tensor: ', self.label.shape)

        return q1_data, q2_data, word_index

    def segmentation(self, data):
        data_cutted = []
        for sentence in tqdm(data):
            seg_list = jieba.cut(sentence, cut_all=False)
            data_cutted.append(" ".join(seg_list))
        print('Finished segmentation of the dataset.')

        return data_cutted

    def prepare_embedding_matrix(self):
        nb_words = min(pm.MAX_NB_WORDS, len(self.word_index))
        embedding_matrix = np.zeros((nb_words + 1, pm.EMBEDDING_DIM))

        print('Creating embedding matrix ...')
        for word, idx in self.word_index.items():
            if idx >= pm.MAX_NB_WORDS:
                continue
            # Words missing from the word2vec vocabulary keep all-zero rows.
            if word in self.embedding_index.wv.vocab:
                embedding_vector = self.embedding_index.wv[word]
                embedding_matrix[idx] = embedding_vector

        return nb_words, embedding_matrix
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import warnings, os
import tensorflow as tf
import numpy as np
from data_helper import Dataloader
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss, precision_score, recall_score, f1_score
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.backend.tensorflow_backend import set_session
from keras.models import load_model
from config import Parameters as pm
from models import get_ESIM_model
warnings.filterwarnings('ignore')


# Init settings
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
# config.gpu_options.per_process_gpu_memory_fraction = 0.5
set_session(tf.Session(config=config))


def train_model_by_logloss(model, batch_size, train_q1, train_q2, train_y, val_q1, val_q2, val_y, fold_id):
    early_stopping = EarlyStopping(monitor='val_loss', patience=7)
    best_model_path = pm.model_path + 'ESIM_' + str(fold_id) + '.h5'
    model_checkpoint = ModelCheckpoint(best_model_path, save_best_only=True, save_weights_only=True)
    hist = model.fit([train_q1, train_q2], train_y, validation_data=([val_q1, val_q2], val_y),
                     epochs=50, batch_size=batch_size, shuffle=True,
                     callbacks=[early_stopping, model_checkpoint])
    best_val_score = min(hist.history['val_loss'])
    predictions = model.predict([val_q1, val_q2])
    auc = roc_auc_score(val_y, predictions)
    print('AUC Score : ', auc)

    return model, best_val_score, auc, predictions


def train_folds(q1, q2, y, fold_count, batch_size, get_model_func):
    # NOTE: despite its name, `get_model_func` receives an already-compiled model
    # instance (built once in train()) that keeps being trained across folds.
    fold_size = len(q1) // fold_count
    models, fold_predictions = [], []
    score, total_auc = 0, 0
    # Make sure the log directory exists before opening the logger file.
    if not os.path.exists('./log'):
        os.makedirs('./log')
    write_file = open('./log/Logger.txt', 'w', encoding='utf-8')
    for fold_id in range(0, fold_count):
        fold_start = fold_size * fold_id
        fold_end = fold_start + fold_size

        if fold_id == fold_count - 1:
            fold_end = len(q1)

        train_q1 = np.concatenate([q1[:fold_start], q1[fold_end:]])
        train_q2 = np.concatenate([q2[:fold_start], q2[fold_end:]])
        train_y = np.concatenate([y[:fold_start], y[fold_end:]])

        val_q1 = q1[fold_start: fold_end]
        val_q2 = q2[fold_start: fold_end]
        val_y = y[fold_start: fold_end]

        print('In fold {}'.format(fold_id + 1))
        model, best_val_score, auc, fold_prediction = train_model_by_logloss(get_model_func, batch_size,
                                                                             train_q1, train_q2, train_y,
                                                                             val_q1, val_q2, val_y, fold_id)
        score += best_val_score
        total_auc += auc
        fold_predictions.append(fold_prediction)
        models.append(model)
        write_file.write('Fold {}\tLoss {}\tAUC {}\n'.format(fold_id + 1, best_val_score, auc))
        write_file.flush()

    write_file.close()

    return models, score / fold_count, total_auc / fold_count, fold_predictions


def train():
    # q1 & q2 sequences (after the tokenizer step) + label + embedding matrix
    data_loader = Dataloader()
    if not os.path.exists(pm.model_path):
        os.makedirs(pm.model_path)

    model = get_ESIM_model(data_loader.nb_words + 1, pm.EMBEDDING_DIM, data_loader.embedding_matrix,
                           pm.RECURRENT_UNITS, pm.DENSE_UNITS, pm.DROPOUT_RATE,
                           pm.MAX_SEQUENCE_LENGTH, 1)
    # model = get_ESIM_model(pm.MAX_NB_WORDS, pm.EMBEDDING_DIM, None,
    #                        pm.RECURRENT_UNITS, pm.DENSE_UNITS, pm.DROPOUT_RATE,
    #                        pm.MAX_SEQUENCE_LENGTH, 1)
    print(model.summary())

    models, val_loss, total_auc, fold_predictions = train_folds(data_loader.q1_sequences,
                                                                data_loader.q2_sequences,
                                                                data_loader.label,
                                                                10,
                                                                pm.BATCH_SIZE,
                                                                model)

    print('Overall val-loss: {}, AUC {}'.format(val_loss, total_auc))


def evaluate():
    '''
    Out-of-bag (OOB) evaluation of the fold checkpoints on the training data.
    '''
    data_loader = Dataloader()
    eval_predicts_list = []
    for fold_id in range(0, 10):
        model = get_ESIM_model(data_loader.nb_words + 1, pm.EMBEDDING_DIM, data_loader.embedding_matrix,
                               pm.RECURRENT_UNITS, pm.DENSE_UNITS, pm.DROPOUT_RATE,
                               pm.MAX_SEQUENCE_LENGTH, 1)
        model.load_weights(pm.model_path + 'ESIM_' + str(fold_id) + '.h5')
        eval_predict = model.predict([data_loader.q1_sequences, data_loader.q2_sequences],
                                     batch_size=pm.BATCH_SIZE, verbose=1)
        eval_predicts_list.append(eval_predict)

        train_auc = roc_auc_score(data_loader.label, eval_predict)
        train_loss = log_loss(data_loader.label, eval_predict)
        train_acc = accuracy_score(data_loader.label, eval_predict.round())
        train_precision = precision_score(data_loader.label, eval_predict.round())
        train_recall = recall_score(data_loader.label, eval_predict.round())
        train_f1_score = f1_score(data_loader.label, eval_predict.round())
        print('Training AUC:{}\tLOSS:{}\tACCURACY:{}\tPRECISION:{}\tRECALL:{}\tF1_SCORE:{}'.format(
            train_auc, train_loss, train_acc, train_precision, train_recall, train_f1_score))

    # Average the per-fold predictions and report the ensemble metrics.
    train_fold_predictions = np.zeros(eval_predicts_list[0].shape)
    for fold_predict in eval_predicts_list:
        train_fold_predictions += fold_predict
    train_fold_predictions /= len(eval_predicts_list)

    train_auc = roc_auc_score(data_loader.label, train_fold_predictions)
    train_loss = log_loss(data_loader.label, train_fold_predictions)
    train_acc = accuracy_score(data_loader.label, train_fold_predictions.round())
    train_precision = precision_score(data_loader.label, train_fold_predictions.round())
    train_recall = recall_score(data_loader.label, train_fold_predictions.round())
    train_f1_score = f1_score(data_loader.label, train_fold_predictions.round())
    print('Training AUC:{}\tLOSS:{}\tACCURACY:{}\tPRECISION:{}\tRECALL:{}\tF1_SCORE:{}'.format(
        train_auc, train_loss, train_acc, train_precision, train_recall, train_f1_score))


if __name__ == '__main__':
    # train()
    evaluate()
--------------------------------------------------------------------------------