├── README.md
├── config.py
├── models.py
├── data_helper.py
└── main.py

/README.md:
--------------------------------------------------------------------------------
# ESIM
ESIM model: a Keras implementation of Enhanced LSTM for Natural Language Inference (ESIM), applied here to Chinese question-pair matching.
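
## Usage

Training (10-fold cross-validation) and evaluation are driven by `main.py`; switch between `train()` and `evaluate()` in its `__main__` block. The sketch below shows how the modules fit together when used directly. It is only a minimal example: it assumes the dataset CSV (columns `Q1`, `Q2`, `label`), the pretrained word2vec model path configured in `config.py`, and the two GPUs required by `multi_gpu_model` in `models.py` are all available; the `epochs` / `validation_split` values are illustrative.

```python
from config import Parameters as pm
from data_helper import Dataloader
from models import get_ESIM_model

# Dataloader reads the CSV from config.py, cleans and segments the text with jieba,
# and builds the padded index sequences plus the pretrained embedding matrix.
loader = Dataloader()

model = get_ESIM_model(loader.nb_words + 1, pm.EMBEDDING_DIM, loader.embedding_matrix,
                       pm.RECURRENT_UNITS, pm.DENSE_UNITS, pm.DROPOUT_RATE,
                       pm.MAX_SEQUENCE_LENGTH, 1)
model.fit([loader.q1_sequences, loader.q2_sequences], loader.label,
          batch_size=pm.BATCH_SIZE, epochs=1, validation_split=0.1)
```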
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
class Parameters(object):

    jieba_dictionary = './dataset/segdict.txt'
    embedding_path = '/home/linhx/word2vec/model/word2vec_wx'
    clean_path = './dataset/stopwords.txt'
    train_data_path = './dataset/dataset.csv'
    model_path = './checkpoint/pre_emb/'

    MAX_NB_WORDS = 30000
    BATCH_SIZE = 128
    EMBEDDING_DIM = 256
    MAX_SEQUENCE_LENGTH = 50
    RECURRENT_UNITS = 300
    DENSE_UNITS = 300
    DROPOUT_RATE = 0.5

    keep_punctuation = True
    clean_data = True
    remove_stopwords = False
    use_owndict = False
--------------------------------------------------------------------------------
/models.py:
--------------------------------------------------------------------------------
import keras
from keras.layers import *
from keras.activations import softmax
from keras.models import Model
from keras.layers.merge import concatenate
from keras.layers.normalization import BatchNormalization
from keras.utils import multi_gpu_model


def get_ESIM_model(nb_words, embedding_dim, embedding_matrix, recurrent_units, dense_units, dropout_rate, max_sequence_length, out_size):
    embedding_layer = Embedding(nb_words,
                                embedding_dim,
                                # embeddings_initializer='uniform',
                                weights=[embedding_matrix],
                                input_length=max_sequence_length,
                                trainable=False)

    input_q1_layer = Input(shape=(max_sequence_length,), dtype='int32', name='q1')
    input_q2_layer = Input(shape=(max_sequence_length,), dtype='int32', name='q2')

    # Input encoding: shared frozen embedding + BatchNorm + SpatialDropout, then a BiLSTM per question.
    embedding_sequence_q1 = BatchNormalization(axis=2)(embedding_layer(input_q1_layer))
    embedding_sequence_q2 = BatchNormalization(axis=2)(embedding_layer(input_q2_layer))

    final_embedding_sequence_q1 = SpatialDropout1D(0.25)(embedding_sequence_q1)
    final_embedding_sequence_q2 = SpatialDropout1D(0.25)(embedding_sequence_q2)

    rnn_layer_q1 = Bidirectional(LSTM(recurrent_units, return_sequences=True))(final_embedding_sequence_q1)
    rnn_layer_q2 = Bidirectional(LSTM(recurrent_units, return_sequences=True))(final_embedding_sequence_q2)

    # Local inference: soft attention between the two encoded sequences.
    attention = Dot(axes=-1)([rnn_layer_q1, rnn_layer_q2])
    w_attn_1 = Lambda(lambda x: softmax(x, axis=1))(attention)
    w_attn_2 = Permute((2, 1))(Lambda(lambda x: softmax(x, axis=2))(attention))
    align_layer_1 = Dot(axes=1)([w_attn_1, rnn_layer_q1])
    align_layer_2 = Dot(axes=1)([w_attn_2, rnn_layer_q2])

    # Enhancement: concatenate encoded, aligned, difference and element-wise product features.
    subtract_layer_1 = subtract([rnn_layer_q1, align_layer_1])
    subtract_layer_2 = subtract([rnn_layer_q2, align_layer_2])

    multiply_layer_1 = multiply([rnn_layer_q1, align_layer_1])
    multiply_layer_2 = multiply([rnn_layer_q2, align_layer_2])

    m_q1 = concatenate([rnn_layer_q1, align_layer_1, subtract_layer_1, multiply_layer_1])
    m_q2 = concatenate([rnn_layer_q2, align_layer_2, subtract_layer_2, multiply_layer_2])

    # Inference composition: a second BiLSTM over the enhanced representations.
    v_q1_i = Bidirectional(LSTM(recurrent_units, return_sequences=True))(m_q1)
    v_q2_i = Bidirectional(LSTM(recurrent_units, return_sequences=True))(m_q2)

    # Pooling: average and max pooling over time for both questions.
    avgpool_q1 = GlobalAveragePooling1D()(v_q1_i)
    avgpool_q2 = GlobalAveragePooling1D()(v_q2_i)
    maxpool_q1 = GlobalMaxPooling1D()(v_q1_i)
    maxpool_q2 = GlobalMaxPooling1D()(v_q2_i)

    merged_q1 = concatenate([avgpool_q1, maxpool_q1])
    merged_q2 = concatenate([avgpool_q2, maxpool_q2])

    # Prediction: MLP with a sigmoid output for binary matching (out_size = 1).
    final_v = BatchNormalization()(concatenate([merged_q1, merged_q2]))
    output = Dense(units=dense_units, activation='relu')(final_v)
    output = BatchNormalization()(output)
    output = Dropout(dropout_rate)(output)
    output = Dense(units=out_size, activation='sigmoid')(output)

    model = Model(inputs=[input_q1_layer, input_q2_layer], outputs=output)
    adam_optimizer = keras.optimizers.Adam(lr=1e-3, decay=1e-6, clipvalue=5)
    # Data-parallel training on two GPUs; `model` is the underlying single-GPU graph.
    parallel_model = multi_gpu_model(model, gpus=2)

    parallel_model.compile(loss='binary_crossentropy', optimizer=adam_optimizer, metrics=['binary_crossentropy', 'accuracy'])

    return parallel_model
--------------------------------------------------------------------------------
/data_helper.py:
--------------------------------------------------------------------------------
import pandas as pd
import numpy as np
from string import punctuation as p
from config import Parameters as pm
from tqdm import tqdm
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from gensim.models import word2vec
import re, jieba

if pm.use_owndict:
    jieba.load_userdict(pm.jieba_dictionary)

jieba.suggest_freq(('亲', '工'), True)
jieba.suggest_freq(('对', '子'), True)


class Dataloader(object):
    def __init__(self):
        self.q1_data, self.q2_data, self.label = self.read_dataset(pm.train_data_path)
        self.embedding_index = self.load_pretrain_embedding(pm.embedding_path)
        if pm.clean_data:
            if pm.remove_stopwords:
                self.ignored_word = self.load_clean_words(pm.clean_path)
            self.cleaned_q1_data, self.cleaned_q2_data = [], []
            for text in self.q1_data:
                self.cleaned_q1_data.append(self.clean_data(text))
            for text in self.q2_data:
                self.cleaned_q2_data.append(self.clean_data(text))
        else:
            # Fall back to the raw text when cleaning is disabled.
            self.cleaned_q1_data, self.cleaned_q2_data = list(self.q1_data), list(self.q2_data)
        self.q1_sequences, self.q2_sequences, self.word_index = self.tokenizer()
        self.nb_words, self.embedding_matrix = self.prepare_embedding_matrix()

    def read_dataset(self, train_path):
        train = pd.read_csv(train_path)

        q1_data = train['Q1'].values
        q2_data = train['Q2'].values
        label = train['label'].values

        return q1_data, q2_data, label

    def load_pretrain_embedding(self, file):
        print('Indexing word vectors...')
        embedding_index = word2vec.Word2Vec.load(file)

        return embedding_index

    def load_clean_words(self, file):
        clean_word_dict = {}
        with open(file, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip('\n')
                # typo, correct = line.split(',')
                # clean_word_dict[typo] = correct
                clean_word_dict[line] = ','

        return clean_word_dict

    def clean_data(self, text):
        replace_numbers = re.compile(r'\d+', re.IGNORECASE)

        text = text.lower()
        # Strip URLs and IP addresses.
        text = re.sub(r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)", "", text)
        text = re.sub(r"(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}", "", text)
        # Expand common English contractions.
        text = re.sub(r"what's", "what is ", text)
        text = re.sub(r"\'s", " ", text)
        text = re.sub(r"\'ve", " have ", text)
        text = re.sub(r"can't", "cannot ", text)
        text = re.sub(r"n't", " not ", text)
        text = re.sub(r"i'm", "i am ", text)
        text = re.sub(r"i’m", "i am", text)
        text = re.sub(r"\'re", " are ", text)
        text = re.sub(r"\'d", " would ", text)
        text = re.sub(r"\'ll", " will ", text)
        text = re.sub(r"e - mail", "email", text)
        # Drop all remaining whitespace; jieba re-segments the text later.
        text = re.sub(r" +", "", text)

        stop_p = p + "~·!@#¥%……&*()——=+-{}【】:;“”‘’《》,。?、|、"

        if pm.keep_punctuation:
            # Normalise full-width / curly punctuation and pad symbols with spaces.
            text = re.sub(r"”", "\"", text)
            text = re.sub(r"“", "\"", text)
            text = re.sub(r"´", "'", text)
            text = re.sub(r"—", " ", text)
            text = re.sub(r"’", "'", text)
            text = re.sub(r"‘", "'", text)
            text = re.sub(r",", " ", text)
            text = re.sub(r"\.", " ", text)
            text = re.sub(r"!", " ! ", text)
            text = re.sub(r"\/", " ", text)
            text = re.sub(r"\^", " ^ ", text)
            text = re.sub(r"\+", " + ", text)
            text = re.sub(r"\-", " - ", text)
            text = re.sub(r"\=", " = ", text)
            text = re.sub(r"'", " ", text)
            text = re.sub(r":", " : ", text)
            text = re.sub(r"−", " ", text)
            text = re.sub(r"\?", " ? ", text)
            text = re.sub(r"#", " # ", text)
            text = re.sub(r"¥", "$", text)
        else:
            # Drop all punctuation (plain string replace, since many tokens are regex metacharacters).
            for token in stop_p:
                text = text.replace(token, "")

        text = replace_numbers.sub('', text)

        if pm.remove_stopwords:
            text = "".join([word for word in text if word not in self.ignored_word])

        return text

    def tokenizer(self):
        tokenizer = Tokenizer(num_words=pm.MAX_NB_WORDS, filters='"#$%&()+,-./:;<=>@[\\]^_`{|}~\t\n')
        q1_cutted_data = self.segmentation(self.cleaned_q1_data)
        q2_cutted_data = self.segmentation(self.cleaned_q2_data)

        tokenizer.fit_on_texts(q1_cutted_data + q2_cutted_data)
        q1_sequences = tokenizer.texts_to_sequences(q1_cutted_data)
        q2_sequences = tokenizer.texts_to_sequences(q2_cutted_data)

        word_index = tokenizer.word_index
        print('Found %s unique tokens' % len(word_index))

        # Padding
        q1_data = pad_sequences(q1_sequences, maxlen=pm.MAX_SEQUENCE_LENGTH)
        print('Shape of q1_data tensor: ', q1_data.shape)
        q2_data = pad_sequences(q2_sequences, maxlen=pm.MAX_SEQUENCE_LENGTH)
        print('Shape of q2_data tensor: ', q2_data.shape)
        print('Shape of label tensor: ', self.label.shape)

        return q1_data, q2_data, word_index

    def segmentation(self, data):
        data_cutted = []
        for sentence in tqdm(data):
            seg_list = jieba.cut(sentence, cut_all=False)
            data_cutted.append(" ".join(seg_list))
        print('Finished segmentation of the dataset.')

        return data_cutted

    def prepare_embedding_matrix(self):
        nb_words = min(pm.MAX_NB_WORDS, len(self.word_index))
        embedding_matrix = np.zeros((nb_words + 1, pm.EMBEDDING_DIM))

        print('Creating embedding matrix ...')
        for word, idx in self.word_index.items():
            if idx >= pm.MAX_NB_WORDS:
                continue
            # Words missing from the word2vec vocabulary keep all-zero rows.
            if word in self.embedding_index.wv.vocab:
                embedding_vector = self.embedding_index.wv[word]
                embedding_matrix[idx] = embedding_vector

        return nb_words, embedding_matrix
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import warnings, os
import tensorflow as tf
import numpy as np
from data_helper import Dataloader
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss, precision_score, recall_score, f1_score
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.backend.tensorflow_backend import set_session
from keras.models import load_model
from config import Parameters as pm
from models import get_ESIM_model
warnings.filterwarnings('ignore')


# Init settings
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
# config.gpu_options.per_process_gpu_memory_fraction = 0.5
set_session(tf.Session(config=config))


def train_model_by_logloss(model, batch_size, train_q1, train_q2, train_y, val_q1, val_q2, val_y, fold_id):
    early_stopping = EarlyStopping(monitor='val_loss', patience=7)
    best_model_path = pm.model_path + 'ESIM_' + str(fold_id) + '.h5'
    model_checkpoint = ModelCheckpoint(best_model_path, save_best_only=True, save_weights_only=True)
    hist = model.fit([train_q1, train_q2], train_y, validation_data=([val_q1, val_q2], val_y),
                     epochs=50, batch_size=batch_size, shuffle=True,
                     callbacks=[early_stopping, model_checkpoint])
    best_val_score = min(hist.history['val_loss'])
    predictions = model.predict([val_q1, val_q2])
    auc = roc_auc_score(val_y, predictions)
    print('AUC Score : ', auc)

    return model, best_val_score, auc, predictions


def train_folds(q1, q2, y, fold_count, batch_size, get_model_func):
    # NOTE: despite its name, `get_model_func` receives an already-compiled model
    # instance (built once in train()) that keeps being trained across folds.
    fold_size = len(q1) // fold_count
    models, fold_predictions = [], []
    score, total_auc = 0, 0
    # Make sure the log directory exists before opening the logger file.
    if not os.path.exists('./log'):
        os.makedirs('./log')
    write_file = open('./log/Logger.txt', 'w', encoding='utf-8')
    for fold_id in range(0, fold_count):
        fold_start = fold_size * fold_id
        fold_end = fold_start + fold_size

        if fold_id == fold_count - 1:
            fold_end = len(q1)

        train_q1 = np.concatenate([q1[:fold_start], q1[fold_end:]])
        train_q2 = np.concatenate([q2[:fold_start], q2[fold_end:]])
        train_y = np.concatenate([y[:fold_start], y[fold_end:]])

        val_q1 = q1[fold_start: fold_end]
        val_q2 = q2[fold_start: fold_end]
        val_y = y[fold_start: fold_end]

        print('In fold {}'.format(fold_id + 1))
        model, best_val_score, auc, fold_prediction = train_model_by_logloss(get_model_func, batch_size,
                                                                             train_q1, train_q2, train_y,
                                                                             val_q1, val_q2, val_y, fold_id)
        score += best_val_score
        total_auc += auc
        fold_predictions.append(fold_prediction)
        models.append(model)
        write_file.write('Fold {}\tLoss {}\tAUC {}\n'.format(fold_id + 1, best_val_score, auc))
        write_file.flush()

    write_file.close()

    return models, score / fold_count, total_auc / fold_count, fold_predictions


def train():
    # q1 & q2 sequences (after the tokenizer step) + label + embedding matrix
    data_loader = Dataloader()
    if not os.path.exists(pm.model_path):
        os.makedirs(pm.model_path)

    model = get_ESIM_model(data_loader.nb_words + 1, pm.EMBEDDING_DIM, data_loader.embedding_matrix,
                           pm.RECURRENT_UNITS, pm.DENSE_UNITS, pm.DROPOUT_RATE,
                           pm.MAX_SEQUENCE_LENGTH, 1)
    # model = get_ESIM_model(pm.MAX_NB_WORDS, pm.EMBEDDING_DIM, None,
    #                        pm.RECURRENT_UNITS, pm.DENSE_UNITS, pm.DROPOUT_RATE,
    #                        pm.MAX_SEQUENCE_LENGTH, 1)
    print(model.summary())

    models, val_loss, total_auc, fold_predictions = train_folds(data_loader.q1_sequences,
                                                                data_loader.q2_sequences,
                                                                data_loader.label,
                                                                10,
                                                                pm.BATCH_SIZE,
                                                                model)

    print('Overall val-loss: {}, AUC {}'.format(val_loss, total_auc))


def evaluate():
    '''
    Out-of-bag (OOB) evaluation of the fold checkpoints on the training data.
    '''
    data_loader = Dataloader()
    eval_predicts_list = []
    for fold_id in range(0, 10):
        model = get_ESIM_model(data_loader.nb_words + 1, pm.EMBEDDING_DIM, data_loader.embedding_matrix,
                               pm.RECURRENT_UNITS, pm.DENSE_UNITS, pm.DROPOUT_RATE,
                               pm.MAX_SEQUENCE_LENGTH, 1)
        model.load_weights(pm.model_path + 'ESIM_' + str(fold_id) + '.h5')
        eval_predict = model.predict([data_loader.q1_sequences, data_loader.q2_sequences],
                                     batch_size=pm.BATCH_SIZE, verbose=1)
        eval_predicts_list.append(eval_predict)

        train_auc = roc_auc_score(data_loader.label, eval_predict)
        train_loss = log_loss(data_loader.label, eval_predict)
        train_acc = accuracy_score(data_loader.label, eval_predict.round())
        train_precision = precision_score(data_loader.label, eval_predict.round())
        train_recall = recall_score(data_loader.label, eval_predict.round())
        train_f1_score = f1_score(data_loader.label, eval_predict.round())
        print('Training AUC:{}\tLOSS:{}\tACCURACY:{}\tPRECISION:{}\tRECALL:{}\tF1_SCORE:{}'.format(
            train_auc, train_loss, train_acc, train_precision, train_recall, train_f1_score))

    # Average the per-fold predictions and report the ensemble metrics.
    train_fold_predictions = np.zeros(eval_predicts_list[0].shape)
    for fold_predict in eval_predicts_list:
        train_fold_predictions += fold_predict
    train_fold_predictions /= len(eval_predicts_list)

    train_auc = roc_auc_score(data_loader.label, train_fold_predictions)
    train_loss = log_loss(data_loader.label, train_fold_predictions)
    train_acc = accuracy_score(data_loader.label, train_fold_predictions.round())
    train_precision = precision_score(data_loader.label, train_fold_predictions.round())
    train_recall = recall_score(data_loader.label, train_fold_predictions.round())
    train_f1_score = f1_score(data_loader.label, train_fold_predictions.round())
    print('Training AUC:{}\tLOSS:{}\tACCURACY:{}\tPRECISION:{}\tRECALL:{}\tF1_SCORE:{}'.format(
        train_auc, train_loss, train_acc, train_precision, train_recall, train_f1_score))


if __name__ == '__main__':
    # train()
    evaluate()
--------------------------------------------------------------------------------