├── NLP.py
├── README.md
├── Siamene_LSTM_network.py
├── Wrapper.py
├── model.py
└── pre_processing.py

--------------------------------------------------------------------------------
/NLP.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Thu Aug 30 11:22:58 2018

@author: anjalidharmik
"""

import pandas as pd
import numpy as np

from nltk.corpus import stopwords
stop = stopwords.words('english')

#from textblob import Word

def cleaning_dataset(input_file):
    df_train = pd.read_csv(input_file)

    ## Analyze the training dataset...
    #print('Total number of question pairs for training: {}'.format(len(df_train)))
    #print('Duplicate pairs: {}%'.format(round(df_train['is_duplicate'].mean()*100, 2)))
    ## Combine qid1 and qid2 into a single Series...
    #qids = pd.Series(df_train['qid1'].tolist() + df_train['qid2'].tolist())
    ## Identify unique values in the question id list (equivalently len(set(qids)))...
    #print('Total number of questions in the training data: {}'.format(len(np.unique(qids))))
    ## Count duplicate questions; qids.value_counts() gives each question id's occurrences...
    #print('Number of questions that appear multiple times: {}'.format(np.sum(qids.value_counts() > 1)))

    # Pre-processing...
    # Convert all questions to strings...
    df_train['question1'] = df_train['question1'].astype(str)
    df_train['question2'] = df_train['question2'].astype(str)

    # Convert all questions to lower case...
    df_train['question1'] = df_train['question1'].apply(lambda question1: question1.lower())
    df_train['question2'] = df_train['question2'].apply(lambda question2: question2.lower())

    # Remove stop words from questions...
    df_train['question1'] = df_train['question1'].apply(lambda x: " ".join(word for word in x.split() if word not in stop))
    df_train['question2'] = df_train['question2'].apply(lambda x: " ".join(word for word in x.split() if word not in stop))

    # Optional lemmatization with TextBlob...
    #df_train['question1'] = df_train['question1'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
    #df_train['question2'] = df_train['question2'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

    # Remove special characters from questions (raw string so \W is treated as a regex class)...
    df_train['question1'] = df_train['question1'].str.replace(r'\W', ' ')
    df_train['question2'] = df_train['question2'].str.replace(r'\W', ' ')

    return df_train

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Text-Similarity-Using-Siamese-Deep-Neural-Network

A Siamese neural network is a class of neural network architectures that contain two or more identical subnetworks. Identical means they share the same configuration, parameters and weights, and parameter updates are mirrored across the subnetworks. Instead of learning to classify its inputs, the network learns to differentiate between two inputs: it learns how similar they are.
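A minimal sketch of the shared-encoder idea (illustrative only, not the full model): both inputs pass through the *same* embedding and LSTM objects, so the weights are shared. The actual network in `Siamene_LSTM_network.py` additionally uses a bidirectional LSTM, frozen word2vec embeddings and an extra "leaks" feature input.

```python
from keras.layers import Input, Dense, LSTM, Embedding, concatenate
from keras.models import Model

# Placeholder sizes: vocabulary of 20000, 50-d embeddings, sequences of length 10.
inp_a = Input(shape=(10,), dtype='int32')
inp_b = Input(shape=(10,), dtype='int32')

embed = Embedding(input_dim=20000, output_dim=50, input_length=10)
encoder = LSTM(50)                      # one encoder instance, reused for both inputs

vec_a = encoder(embed(inp_a))
vec_b = encoder(embed(inp_b))

merged = concatenate([vec_a, vec_b])
pred = Dense(1, activation='sigmoid')(merged)

model = Model(inputs=[inp_a, inp_b], outputs=pred)
model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['acc'])
```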
This repository is a Keras-based implementation of a deep Siamese bidirectional LSTM network that captures phrase/sentence similarity using word embeddings.

Capabilities

Given adequate training pairs, this model can learn semantic as well as structural similarity.
For phrases, the model learns word-based embeddings to identify structural/syntactic similarities.

Examples:

Question1: What is the step by step guide to invest in share market in india?
Question2: What is the step by step guide to invest in share market?
Result: It is not a duplicate

Question1: Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?
Question2: I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me?
Result: It is a duplicate

Step-by-step working:

1. Analyze the input dataset with pandas
   - Check the total number of question pairs for training
   - Check the duplicate-pair ratio and percentage
   - Combine the question1 and question2 ids into a single Series
   - Identify the unique values in the question list
   - Count the total number of questions in the training data
   - Count the questions that appear multiple times

2. Clean the input dataset with the following natural language processing techniques:
   - Convert all questions to the same case (lower or upper)
   - Remove stop words from the questions
   - Remove special characters from the questions
   - Apply lemmatization

3. Train the Siamese neural network with the following steps:
   - Create question pairs
   - Create word-embedding metadata
   - Fit the training dataset to the model

4. Test the Siamese neural network with the following steps:
   - Predict the output results (duplicate or not)
   - Map the output results back to the input dataset

Environment:

Keras -> 2.1.6

pandas -> 0.17.1

gensim -> 3.1.0

numpy -> 1.13.1
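Usage example:

A minimal end-to-end run (it mirrors Wrapper.py below; the file paths are placeholders to adapt):

```python
import NLP
import model

# Clean and train on the training CSV, then score the test CSV.
df_train = NLP.cleaning_dataset("input/train.csv")    # placeholder path
trained_model = model.train_dataset_model(df_train)

df_test = NLP.cleaning_dataset("input/test.csv")      # placeholder path
results = model.test_dataset_model(df_test, trained_model)
results.to_csv("output/results.csv", index=False)
```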
--------------------------------------------------------------------------------
/Siamene_LSTM_network.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Fri Aug 31 09:11:17 2018

@author: anjalidharmik
"""
import numpy as np

import pre_processing

from keras.layers import Dense, Input, LSTM, Dropout, Bidirectional
from keras.layers.normalization import BatchNormalization
from keras.layers.embeddings import Embedding
from keras.layers.merge import concatenate

from keras.models import load_model, Model

from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint

import time, os

class SiameneLSTM:
    def __init__(self, embedding_dim, max_sequence_length, number_lstm, number_dense, rate_drop_lstm,
                 rate_drop_dense, hidden_activation, validation_split_ratio):
        self.embedding_dim = embedding_dim
        self.max_sequence_length = max_sequence_length
        self.number_lstm_units = number_lstm
        self.rate_drop_lstm = rate_drop_lstm
        self.number_dense_units = number_dense
        self.activation_function = hidden_activation
        self.rate_drop_dense = rate_drop_dense
        self.validation_split_ratio = validation_split_ratio

    def train_model(self, sentences_pair, is_similar, embedding_meta_data, model_save_directory='./'):

        tokenizer, embedding_matrix = embedding_meta_data['tokenizer'], embedding_meta_data['embedding_matrix']

        train_data_x1, train_data_x2, train_labels, leaks_train, \
            val_data_x1, val_data_x2, val_labels, leaks_val = pre_processing.create_train_dev_set(
                tokenizer, sentences_pair, is_similar, self.max_sequence_length, self.validation_split_ratio)

        if train_data_x1 is None:
            print("-----Failure: Unable to train model-----")
            return None

        nb_words = len(tokenizer.word_index) + 1

        # Creating word embedding layer (weights come from the pre-trained matrix and stay frozen)
        embedding_layer = Embedding(nb_words, self.embedding_dim, weights=[embedding_matrix],
                                    input_length=self.max_sequence_length, trainable=False)

        # Creating LSTM Encoder
        lstm_layer = Bidirectional(LSTM(self.number_lstm_units, dropout=self.rate_drop_lstm,
                                        recurrent_dropout=self.rate_drop_lstm))

        # Creating LSTM Encoder layer for First Sentence
        sequence_1_input = Input(shape=(self.max_sequence_length,), dtype='int32')
        embedded_sequences_1 = embedding_layer(sequence_1_input)
        x1 = lstm_layer(embedded_sequences_1)

        # Creating LSTM Encoder layer for Second Sentence (same layer objects, so weights are shared)
        sequence_2_input = Input(shape=(self.max_sequence_length,), dtype='int32')
        embedded_sequences_2 = embedding_layer(sequence_2_input)
        x2 = lstm_layer(embedded_sequences_2)

        # Creating leaks input (hand-crafted length/overlap features for each question pair)
        leaks_input = Input(shape=(leaks_train.shape[1],))
        leaks_dense = Dense(self.number_dense_units // 2, activation=self.activation_function)(leaks_input)

        # Merging the two LSTM encoder vectors and the leaks features, then passing them
        # through dense layers with dropout and batch normalisation
        merged = concatenate([x1, x2, leaks_dense])
        merged = BatchNormalization()(merged)
        merged = Dropout(self.rate_drop_dense)(merged)
        merged = Dense(self.number_dense_units, activation=self.activation_function)(merged)
        merged = BatchNormalization()(merged)
        merged = Dropout(self.rate_drop_dense)(merged)
        preds = Dense(1, activation='sigmoid')(merged)

        model = Model(inputs=[sequence_1_input, sequence_2_input, leaks_input], outputs=preds)
        model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['acc'])

        early_stopping = EarlyStopping(monitor='val_loss', patience=3)

        STAMP = 'lstm_%d_%d_%.2f_%.2f' % (self.number_lstm_units, self.number_dense_units,
                                          self.rate_drop_lstm, self.rate_drop_dense)

        checkpoint_dir = model_save_directory + 'checkpoints/' + str(int(time.time())) + '/'

        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)

        bst_model_path = checkpoint_dir + STAMP + '.h5'

        model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=False)

        tensorboard = TensorBoard(log_dir=checkpoint_dir + "logs/{}".format(time.time()))

        model.fit([train_data_x1, train_data_x2, leaks_train], train_labels,
                  validation_data=([val_data_x1, val_data_x2, leaks_val], val_labels),
                  epochs=200, batch_size=64, shuffle=True,
                  callbacks=[early_stopping, model_checkpoint, tensorboard])

        return bst_model_path
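A note on the "leaks" input used by train_model above: it is a small vector of hand-crafted features per question pair, computed in pre_processing.py from the tokenized sequences. A quick illustration with made-up token ids:

```python
# Hypothetical tokenized question pair (integer word ids from the tokenizer).
x1 = [4, 8, 15, 16, 23]
x2 = [8, 15, 42]

# The same three features as in pre_processing.py:
# unique tokens in q1, unique tokens in q2, and tokens shared by both.
leaks = [len(set(x1)), len(set(x2)), len(set(x1).intersection(x2))]
print(leaks)  # [5, 3, 2]
```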
--------------------------------------------------------------------------------
/Wrapper.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Fri Aug 31 10:17:46 2018

@author: anjalidharmik
"""

import NLP
import model

#train model...
input_train_dataset = "/home/anjalidharmik/aatasks/input/train.csv"

df = NLP.cleaning_dataset(input_train_dataset)

train_model = model.train_dataset_model(df)

#test model...
input_test_dataset = "/home/anjalidharmik/aatasks/input/test.csv"

df_test = NLP.cleaning_dataset(input_test_dataset)
test_results = model.test_dataset_model(df_test, train_model)
test_results.to_csv("/home/anjalidharmik/aatasks/output/results.csv")
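For reference, the wrapper assumes Quora-question-pairs-style CSVs. The pipeline itself only reads question1, question2 and (for training) is_duplicate, while the commented analysis in NLP.py also expects qid1/qid2. A hypothetical minimal training file could be built like this:

```python
import pandas as pd

# Hypothetical two-row sample; the real Quora data also carries id, qid1 and qid2 columns.
sample = pd.DataFrame({
    "question1": ["How do I learn Python?", "What is machine learning?"],
    "question2": ["How can I learn Python?", "Who invented the telephone?"],
    "is_duplicate": [1, 0],
})
sample.to_csv("train.csv", index=False)
```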
--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Fri Aug 31 09:02:53 2018

@author: anjalidharmik
"""

from keras.models import load_model

import pandas as pd

import Siamene_LSTM_network
import pre_processing

from operator import itemgetter

#initialize the required parameters for the LSTM network...
EMBEDDING_DIM = 50
MAX_SEQUENCE_LENGTH = 10
VALIDATION_SPLIT = 0.1
RATE_DROP_LSTM = 0.17
RATE_DROP_DENSE = 0.25
NUMBER_LSTM = 50
NUMBER_DENSE_UNITS = 50
ACTIVATION_FUNCTION = 'relu'

#tokenizer fitted during training; reused at test time to encode the test questions
tokenizer = None

def train_dataset_model(df):
    global tokenizer

    df['question1'] = df['question1'].astype(str)
    df['question2'] = df['question2'].astype(str)

    question1 = df['question1'].tolist()
    question2 = df['question2'].tolist()
    is_duplicate = df['is_duplicate'].tolist()

    ## creating question pairs
    questions_pair = [(x1, x2) for x1, x2 in zip(question1, question2)]
    print("----------created questions pairs-----------")

    # creating word embedding meta data
    tokenizer, embedding_matrix = pre_processing.word_embed_meta_data(question1 + question2, EMBEDDING_DIM)
    embedding_meta_data = {'tokenizer': tokenizer, 'embedding_matrix': embedding_matrix}
    print("----------created word embedding meta data-----------")

    # SiameneLSTM is the Siamese bidirectional LSTM network class
    siamese = Siamene_LSTM_network.SiameneLSTM(EMBEDDING_DIM, MAX_SEQUENCE_LENGTH, NUMBER_LSTM, NUMBER_DENSE_UNITS,
                                               RATE_DROP_LSTM, RATE_DROP_DENSE, ACTIVATION_FUNCTION, VALIDATION_SPLIT)
    model_path = siamese.train_model(questions_pair, is_duplicate, embedding_meta_data, model_save_directory='./')

    # load the best checkpoint saved during training...
    model = load_model(model_path)
    print("----------model trained-----------")
    return model

def test_dataset_model(df_test, model):

    df_test['question1'] = df_test['question1'].astype(str)
    df_test['question2'] = df_test['question2'].astype(str)

    question1_test = df_test['question1'].tolist()
    question2_test = df_test['question2'].tolist()

    ## creating question pairs
    questions_test_pair = [(x1, x2) for x1, x2 in zip(question1_test, question2_test)]
    print("----------created test dataset-----------")

    test_data_x1, test_data_x2, leaks_test = pre_processing.create_test_data(tokenizer, questions_test_pair, MAX_SEQUENCE_LENGTH)

    #predict the results
    preds = list(model.predict([test_data_x1, test_data_x2, leaks_test], verbose=1).ravel())
    print("----------predicted test results-----------")

    #mapping the results back to the input test data...
    results = [(x, y, z) for (x, y), z in zip(questions_test_pair, preds)]
    print("----------mapping test results-----------")

    results.sort(key=itemgetter(2), reverse=True)

    #return a DataFrame so the wrapper can write it straight to CSV
    return pd.DataFrame(results, columns=['question1', 'question2', 'similarity'])
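test_dataset_model returns a raw sigmoid similarity score per pair; the duplicate / not-duplicate results quoted in the README correspond to thresholding that score. A minimal sketch, assuming the conventional 0.5 cutoff (the cutoff itself is not part of the repository's code):

```python
import pandas as pd

# Hypothetical scores in the shape returned by model.test_dataset_model.
results = pd.DataFrame({
    "question1": ["what is ai?", "how to cook rice?"],
    "question2": ["what is artificial intelligence?", "who wrote hamlet?"],
    "similarity": [0.91, 0.07],
})

threshold = 0.5  # assumed cutoff; tune on a validation split if needed
results["is_duplicate_pred"] = (results["similarity"] > threshold).astype(int)
print(results)
```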
--------------------------------------------------------------------------------
/pre_processing.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Fri Aug 31 09:36:48 2018

@author: anjalidharmik
"""
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from gensim.models import Word2Vec

import numpy as np
import gc

def train_word2vec(documents, embedding_dim):
    # gensim expects a list of token lists, so split each document into words
    model = Word2Vec([document.split() for document in documents], min_count=1, size=embedding_dim)
    word_vectors = model.wv
    del model
    return word_vectors

def create_embedding_matrix(tokenizer, word_vectors, embedding_dim):
    nb_words = len(tokenizer.word_index) + 1
    word_index = tokenizer.word_index
    embedding_matrix = np.zeros((nb_words, embedding_dim))
    for word, i in word_index.items():
        # words missing from the word2vec vocabulary keep an all-zero vector
        if word in word_vectors.vocab:
            embedding_matrix[i] = word_vectors[word]
    return embedding_matrix

#creating word vectors with the gensim model
def word_embed_meta_data(documents, embedding_dim):

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(documents)
    word_vector = train_word2vec(documents, embedding_dim)
    embedding_matrix = create_embedding_matrix(tokenizer, word_vector, embedding_dim)
    del word_vector
    gc.collect()
    return tokenizer, embedding_matrix

#creating the training dataset...
def create_train_dev_set(tokenizer, questions_pair, is_similar, max_sequence_length, validation_split_ratio):

    questions1 = [x[0] for x in questions_pair]
    questions2 = [x[1] for x in questions_pair]
    train_questions_1 = tokenizer.texts_to_sequences(questions1)
    train_questions_2 = tokenizer.texts_to_sequences(questions2)
    leaks = [[len(set(x1)), len(set(x2)), len(set(x1).intersection(x2))]
             for x1, x2 in zip(train_questions_1, train_questions_2)]

    train_padded_data_1 = pad_sequences(train_questions_1, maxlen=max_sequence_length)
    train_padded_data_2 = pad_sequences(train_questions_2, maxlen=max_sequence_length)
    train_labels = np.array(is_similar)
    leaks = np.array(leaks)

    shuffle_indices = np.random.permutation(np.arange(len(train_labels)))
    train_data_1_shuffled = train_padded_data_1[shuffle_indices]
    train_data_2_shuffled = train_padded_data_2[shuffle_indices]
    train_labels_shuffled = train_labels[shuffle_indices]
    leaks_shuffled = leaks[shuffle_indices]

    dev_idx = max(1, int(len(train_labels_shuffled) * validation_split_ratio))

    del train_padded_data_1
    del train_padded_data_2
    gc.collect()

    train_data_1, val_data_1 = train_data_1_shuffled[:-dev_idx], train_data_1_shuffled[-dev_idx:]
    train_data_2, val_data_2 = train_data_2_shuffled[:-dev_idx], train_data_2_shuffled[-dev_idx:]
    labels_train, labels_val = train_labels_shuffled[:-dev_idx], train_labels_shuffled[-dev_idx:]
    leaks_train, leaks_val = leaks_shuffled[:-dev_idx], leaks_shuffled[-dev_idx:]

    return train_data_1, train_data_2, labels_train, leaks_train, val_data_1, val_data_2, labels_val, leaks_val

def create_test_data(tokenizer, test_questions_pair, max_sequence_length):

    test_questions1 = [x[0] for x in test_questions_pair]
    test_questions2 = [x[1] for x in test_questions_pair]

    test_questions_1 = tokenizer.texts_to_sequences(test_questions1)
    test_questions_2 = tokenizer.texts_to_sequences(test_questions2)
    leaks_test = [[len(set(x1)), len(set(x2)), len(set(x1).intersection(x2))]
                  for x1, x2 in zip(test_questions_1, test_questions_2)]

    leaks_test = np.array(leaks_test)
    test_data_1 = pad_sequences(test_questions_1, maxlen=max_sequence_length)
    test_data_2 = pad_sequences(test_questions_2, maxlen=max_sequence_length)

    return test_data_1, test_data_2, leaks_test

--------------------------------------------------------------------------------