├── NLP.py
├── README.md
├── Siamene_LSTM_network.py
├── Wrapper.py
├── model.py
└── pre_processing.py

--------------------------------------------------------------------------------
/NLP.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Thu Aug 30 11:22:58 2018

@author: anjalidharmik
"""

import pandas as pd
import numpy as np

from nltk.corpus import stopwords
stop = stopwords.words('english')

#from textblob import Word

def cleaning_dataset(input_file):
    df_train = pd.read_csv(input_file)

    ## Analyze the training dataset...
    #print('Total number of question pairs for training: {}'.format(len(df_train)))
    #print('Duplicate pairs: {}%'.format(round(df_train['is_duplicate'].mean()*100, 2)))
    ## Combine qid1 and qid2 into a single Series...
    #qids = pd.Series(df_train['qid1'].tolist() + df_train['qid2'].tolist())
    ## Identify unique values in the question id list (equivalently len(set(qids)))...
    #print('Total number of questions in the training data: {}'.format(len(np.unique(qids))))
    ## Count duplicate questions; qids.value_counts() gives each question id's occurrences...
    #print('Number of questions that appear multiple times: {}'.format(np.sum(qids.value_counts() > 1)))

    # Pre-processing...
    # Convert all questions to strings...
    df_train['question1'] = df_train['question1'].astype(str)
    df_train['question2'] = df_train['question2'].astype(str)

    # Convert all questions to lower case...
    df_train['question1'] = df_train['question1'].apply(lambda question1: question1.lower())
    df_train['question2'] = df_train['question2'].apply(lambda question2: question2.lower())

    # Remove stop words from questions...
    df_train['question1'] = df_train['question1'].apply(lambda x: " ".join(word for word in x.split() if word not in stop))
    df_train['question2'] = df_train['question2'].apply(lambda x: " ".join(word for word in x.split() if word not in stop))

    # Optional lemmatization with TextBlob...
    #df_train['question1'] = df_train['question1'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
    #df_train['question2'] = df_train['question2'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

    # Remove special characters from questions (raw string so \W is treated as a regex class)...
    df_train['question1'] = df_train['question1'].str.replace(r'\W', ' ')
    df_train['question2'] = df_train['question2'].str.replace(r'\W', ' ')

    return df_train

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Text-Similarity-Using-Siamese-Deep-Neural-Network

A Siamese neural network is a class of neural network architectures that contain two or more identical subnetworks. Identical means they share the same configuration, parameters and weights, and parameter updates are mirrored across the subnetworks. Instead of learning to classify its inputs, the network learns to differentiate between two inputs: it learns how similar they are.
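A minimal sketch of the shared-encoder idea (illustrative only, not the full model): both inputs pass through the *same* embedding and LSTM objects, so the weights are shared. The actual network in `Siamene_LSTM_network.py` additionally uses a bidirectional LSTM, frozen word2vec embeddings and an extra "leaks" feature input.

```python
from keras.layers import Input, Dense, LSTM, Embedding, concatenate
from keras.models import Model

# Placeholder sizes: vocabulary of 20000, 50-d embeddings, sequences of length 10.
inp_a = Input(shape=(10,), dtype='int32')
inp_b = Input(shape=(10,), dtype='int32')

embed = Embedding(input_dim=20000, output_dim=50, input_length=10)
encoder = LSTM(50)                      # one encoder instance, reused for both inputs

vec_a = encoder(embed(inp_a))
vec_b = encoder(embed(inp_b))

merged = concatenate([vec_a, vec_b])
pred = Dense(1, activation='sigmoid')(merged)

model = Model(inputs=[inp_a, inp_b], outputs=pred)
model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['acc'])
```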
This repository is a Keras-based implementation of a deep Siamese bidirectional LSTM network that captures phrase/sentence similarity using word embeddings.

Capabilities

Given adequate training pairs, this model can learn semantic as well as structural similarity.
For phrases, the model learns word-based embeddings to identify structural/syntactic similarities.

Examples:

Question1: What is the step by step guide to invest in share market in india?
Question2: What is the step by step guide to invest in share market?
Result: It is not a duplicate

Question1: Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?
Question2: I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me?
Result: It is a duplicate

Step-by-step working:

1. Analyze the input dataset with pandas
   - Check the total number of question pairs for training
   - Check the duplicate-pair ratio and percentage
   - Combine the question1 and question2 ids into a single Series
   - Identify the unique values in the question list
   - Count the total number of questions in the training data
   - Count the questions that appear multiple times

2. Clean the input dataset with the following natural language processing techniques:
   - Convert all questions to the same case (lower or upper)
   - Remove stop words from the questions
   - Remove special characters from the questions
   - Apply lemmatization

3. Train the Siamese neural network with the following steps:
   - Create question pairs
   - Create word-embedding metadata
   - Fit the training dataset to the model

4. Test the Siamese neural network with the following steps:
   - Predict the output results (duplicate or not)
   - Map the output results back to the input dataset

Environment:

Keras -> 2.1.6

pandas -> 0.17.1

gensim -> 3.1.0

numpy -> 1.13.1
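Usage example:

A minimal end-to-end run (it mirrors Wrapper.py below; the file paths are placeholders to adapt):

```python
import NLP
import model

# Clean and train on the training CSV, then score the test CSV.
df_train = NLP.cleaning_dataset("input/train.csv")    # placeholder path
trained_model = model.train_dataset_model(df_train)

df_test = NLP.cleaning_dataset("input/test.csv")      # placeholder path
results = model.test_dataset_model(df_test, trained_model)
results.to_csv("output/results.csv", index=False)
```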
--------------------------------------------------------------------------------
/Siamene_LSTM_network.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Fri Aug 31 09:11:17 2018

@author: anjalidharmik
"""
import numpy as np

import pre_processing

from keras.layers import Dense, Input, LSTM, Dropout, Bidirectional
from keras.layers.normalization import BatchNormalization
from keras.layers.embeddings import Embedding
from keras.layers.merge import concatenate

from keras.models import load_model, Model

from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint

import time, os

class SiameneLSTM:
    def __init__(self, embedding_dim, max_sequence_length, number_lstm, number_dense, rate_drop_lstm,
                 rate_drop_dense, hidden_activation, validation_split_ratio):
        self.embedding_dim = embedding_dim
        self.max_sequence_length = max_sequence_length
        self.number_lstm_units = number_lstm
        self.rate_drop_lstm = rate_drop_lstm
        self.number_dense_units = number_dense
        self.activation_function = hidden_activation
        self.rate_drop_dense = rate_drop_dense
        self.validation_split_ratio = validation_split_ratio

    def train_model(self, sentences_pair, is_similar, embedding_meta_data, model_save_directory='./'):

        tokenizer, embedding_matrix = embedding_meta_data['tokenizer'], embedding_meta_data['embedding_matrix']

        train_data_x1, train_data_x2, train_labels, leaks_train, \
            val_data_x1, val_data_x2, val_labels, leaks_val = pre_processing.create_train_dev_set(
                tokenizer, sentences_pair, is_similar, self.max_sequence_length, self.validation_split_ratio)

        if train_data_x1 is None:
            print("-----Failure: Unable to train model-----")
            return None

        nb_words = len(tokenizer.word_index) + 1

        # Creating word embedding layer (weights come from the pre-trained matrix and stay frozen)
        embedding_layer = Embedding(nb_words, self.embedding_dim, weights=[embedding_matrix],
                                    input_length=self.max_sequence_length, trainable=False)

        # Creating LSTM Encoder
        lstm_layer = Bidirectional(LSTM(self.number_lstm_units, dropout=self.rate_drop_lstm,
                                        recurrent_dropout=self.rate_drop_lstm))

        # Creating LSTM Encoder layer for First Sentence
        sequence_1_input = Input(shape=(self.max_sequence_length,), dtype='int32')
        embedded_sequences_1 = embedding_layer(sequence_1_input)
        x1 = lstm_layer(embedded_sequences_1)

        # Creating LSTM Encoder layer for Second Sentence (same layer objects, so weights are shared)
        sequence_2_input = Input(shape=(self.max_sequence_length,), dtype='int32')
        embedded_sequences_2 = embedding_layer(sequence_2_input)
        x2 = lstm_layer(embedded_sequences_2)

        # Creating leaks input (hand-crafted length/overlap features for each question pair)
        leaks_input = Input(shape=(leaks_train.shape[1],))
        leaks_dense = Dense(self.number_dense_units // 2, activation=self.activation_function)(leaks_input)

        # Merging the two LSTM encoder vectors and the leaks features, then passing them
        # through dense layers with dropout and batch normalisation
        merged = concatenate([x1, x2, leaks_dense])
        merged = BatchNormalization()(merged)
        merged = Dropout(self.rate_drop_dense)(merged)
        merged = Dense(self.number_dense_units, activation=self.activation_function)(merged)
        merged = BatchNormalization()(merged)
        merged = Dropout(self.rate_drop_dense)(merged)
        preds = Dense(1, activation='sigmoid')(merged)

        model = Model(inputs=[sequence_1_input, sequence_2_input, leaks_input], outputs=preds)
        model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['acc'])

        early_stopping = EarlyStopping(monitor='val_loss', patience=3)

        STAMP = 'lstm_%d_%d_%.2f_%.2f' % (self.number_lstm_units, self.number_dense_units,
                                          self.rate_drop_lstm, self.rate_drop_dense)

        checkpoint_dir = model_save_directory + 'checkpoints/' + str(int(time.time())) + '/'

        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)

        bst_model_path = checkpoint_dir + STAMP + '.h5'

        model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=False)

        tensorboard = TensorBoard(log_dir=checkpoint_dir + "logs/{}".format(time.time()))

        model.fit([train_data_x1, train_data_x2, leaks_train], train_labels,
                  validation_data=([val_data_x1, val_data_x2, leaks_val], val_labels),
                  epochs=200, batch_size=64, shuffle=True,
                  callbacks=[early_stopping, model_checkpoint, tensorboard])

        return bst_model_path
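A note on the "leaks" input used by train_model above: it is a small vector of hand-crafted features per question pair, computed in pre_processing.py from the tokenized sequences. A quick illustration with made-up token ids:

```python
# Hypothetical tokenized question pair (integer word ids from the tokenizer).
x1 = [4, 8, 15, 16, 23]
x2 = [8, 15, 42]

# The same three features as in pre_processing.py:
# unique tokens in q1, unique tokens in q2, and tokens shared by both.
leaks = [len(set(x1)), len(set(x2)), len(set(x1).intersection(x2))]
print(leaks)  # [5, 3, 2]
```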
--------------------------------------------------------------------------------
/Wrapper.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Fri Aug 31 10:17:46 2018

@author: anjalidharmik
"""

import NLP
import model

#train model...
input_train_dataset = "/home/anjalidharmik/aatasks/input/train.csv"

df = NLP.cleaning_dataset(input_train_dataset)

train_model = model.train_dataset_model(df)

#test model...
input_test_dataset = "/home/anjalidharmik/aatasks/input/test.csv"

df_test = NLP.cleaning_dataset(input_test_dataset)
test_results = model.test_dataset_model(df_test, train_model)
test_results.to_csv("/home/anjalidharmik/aatasks/output/results.csv")
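For reference, the wrapper assumes Quora-question-pairs-style CSVs. The pipeline itself only reads question1, question2 and (for training) is_duplicate, while the commented analysis in NLP.py also expects qid1/qid2. A hypothetical minimal training file could be built like this:

```python
import pandas as pd

# Hypothetical two-row sample; the real Quora data also carries id, qid1 and qid2 columns.
sample = pd.DataFrame({
    "question1": ["How do I learn Python?", "What is machine learning?"],
    "question2": ["How can I learn Python?", "Who invented the telephone?"],
    "is_duplicate": [1, 0],
})
sample.to_csv("train.csv", index=False)
```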
--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Fri Aug 31 09:02:53 2018

@author: anjalidharmik
"""

from keras.models import load_model

import pandas as pd

import Siamene_LSTM_network
import pre_processing

from operator import itemgetter

#initialize the required parameters for the LSTM network...
EMBEDDING_DIM = 50
MAX_SEQUENCE_LENGTH = 10
VALIDATION_SPLIT = 0.1
RATE_DROP_LSTM = 0.17
RATE_DROP_DENSE = 0.25
NUMBER_LSTM = 50
NUMBER_DENSE_UNITS = 50
ACTIVATION_FUNCTION = 'relu'

#tokenizer fitted during training; reused at test time to encode the test questions
tokenizer = None

def train_dataset_model(df):
    global tokenizer

    df['question1'] = df['question1'].astype(str)
    df['question2'] = df['question2'].astype(str)

    question1 = df['question1'].tolist()
    question2 = df['question2'].tolist()
    is_duplicate = df['is_duplicate'].tolist()

    ## creating question pairs
    questions_pair = [(x1, x2) for x1, x2 in zip(question1, question2)]
    print("----------created questions pairs-----------")

    # creating word embedding meta data
    tokenizer, embedding_matrix = pre_processing.word_embed_meta_data(question1 + question2, EMBEDDING_DIM)
    embedding_meta_data = {'tokenizer': tokenizer, 'embedding_matrix': embedding_matrix}
    print("----------created word embedding meta data-----------")

    # SiameneLSTM is the Siamese bidirectional LSTM network class
    siamese = Siamene_LSTM_network.SiameneLSTM(EMBEDDING_DIM, MAX_SEQUENCE_LENGTH, NUMBER_LSTM, NUMBER_DENSE_UNITS,
                                               RATE_DROP_LSTM, RATE_DROP_DENSE, ACTIVATION_FUNCTION, VALIDATION_SPLIT)
    model_path = siamese.train_model(questions_pair, is_duplicate, embedding_meta_data, model_save_directory='./')

    # load the best checkpoint saved during training...
    model = load_model(model_path)
    print("----------model trained-----------")
    return model

def test_dataset_model(df_test, model):

    df_test['question1'] = df_test['question1'].astype(str)
    df_test['question2'] = df_test['question2'].astype(str)

    question1_test = df_test['question1'].tolist()
    question2_test = df_test['question2'].tolist()

    ## creating question pairs
    questions_test_pair = [(x1, x2) for x1, x2 in zip(question1_test, question2_test)]
    print("----------created test dataset-----------")

    test_data_x1, test_data_x2, leaks_test = pre_processing.create_test_data(tokenizer, questions_test_pair, MAX_SEQUENCE_LENGTH)

    #predict the results
    preds = list(model.predict([test_data_x1, test_data_x2, leaks_test], verbose=1).ravel())
    print("----------predicted test results-----------")

    #mapping the results back to the input test data...
    results = [(x, y, z) for (x, y), z in zip(questions_test_pair, preds)]
    print("----------mapping test results-----------")

    results.sort(key=itemgetter(2), reverse=True)

    #return a DataFrame so the wrapper can write it straight to CSV
    return pd.DataFrame(results, columns=['question1', 'question2', 'similarity'])
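test_dataset_model returns a raw sigmoid similarity score per pair; the duplicate / not-duplicate results quoted in the README correspond to thresholding that score. A minimal sketch, assuming the conventional 0.5 cutoff (the cutoff itself is not part of the repository's code):

```python
import pandas as pd

# Hypothetical scores in the shape returned by model.test_dataset_model.
results = pd.DataFrame({
    "question1": ["what is ai?", "how to cook rice?"],
    "question2": ["what is artificial intelligence?", "who wrote hamlet?"],
    "similarity": [0.91, 0.07],
})

threshold = 0.5  # assumed cutoff; tune on a validation split if needed
results["is_duplicate_pred"] = (results["similarity"] > threshold).astype(int)
print(results)
```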
--------------------------------------------------------------------------------
/pre_processing.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Fri Aug 31 09:36:48 2018

@author: anjalidharmik
"""
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from gensim.models import Word2Vec

import numpy as np
import gc

def train_word2vec(documents, embedding_dim):
    # gensim expects a list of token lists, so split each document into words
    model = Word2Vec([document.split() for document in documents], min_count=1, size=embedding_dim)
    word_vectors = model.wv
    del model
    return word_vectors

def create_embedding_matrix(tokenizer, word_vectors, embedding_dim):
    nb_words = len(tokenizer.word_index) + 1
    word_index = tokenizer.word_index
    embedding_matrix = np.zeros((nb_words, embedding_dim))
    for word, i in word_index.items():
        # words missing from the word2vec vocabulary keep an all-zero vector
        if word in word_vectors.vocab:
            embedding_matrix[i] = word_vectors[word]
    return embedding_matrix

#creating word vectors with the gensim model
def word_embed_meta_data(documents, embedding_dim):

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(documents)
    word_vector = train_word2vec(documents, embedding_dim)
    embedding_matrix = create_embedding_matrix(tokenizer, word_vector, embedding_dim)
    del word_vector
    gc.collect()
    return tokenizer, embedding_matrix

#creating the training dataset...
def create_train_dev_set(tokenizer, questions_pair, is_similar, max_sequence_length, validation_split_ratio):

    questions1 = [x[0] for x in questions_pair]
    questions2 = [x[1] for x in questions_pair]
    train_questions_1 = tokenizer.texts_to_sequences(questions1)
    train_questions_2 = tokenizer.texts_to_sequences(questions2)
    leaks = [[len(set(x1)), len(set(x2)), len(set(x1).intersection(x2))]
             for x1, x2 in zip(train_questions_1, train_questions_2)]

    train_padded_data_1 = pad_sequences(train_questions_1, maxlen=max_sequence_length)
    train_padded_data_2 = pad_sequences(train_questions_2, maxlen=max_sequence_length)
    train_labels = np.array(is_similar)
    leaks = np.array(leaks)

    shuffle_indices = np.random.permutation(np.arange(len(train_labels)))
    train_data_1_shuffled = train_padded_data_1[shuffle_indices]
    train_data_2_shuffled = train_padded_data_2[shuffle_indices]
    train_labels_shuffled = train_labels[shuffle_indices]
    leaks_shuffled = leaks[shuffle_indices]

    dev_idx = max(1, int(len(train_labels_shuffled) * validation_split_ratio))

    del train_padded_data_1
    del train_padded_data_2
    gc.collect()

    train_data_1, val_data_1 = train_data_1_shuffled[:-dev_idx], train_data_1_shuffled[-dev_idx:]
    train_data_2, val_data_2 = train_data_2_shuffled[:-dev_idx], train_data_2_shuffled[-dev_idx:]
    labels_train, labels_val = train_labels_shuffled[:-dev_idx], train_labels_shuffled[-dev_idx:]
    leaks_train, leaks_val = leaks_shuffled[:-dev_idx], leaks_shuffled[-dev_idx:]

    return train_data_1, train_data_2, labels_train, leaks_train, val_data_1, val_data_2, labels_val, leaks_val

def create_test_data(tokenizer, test_questions_pair, max_sequence_length):

    test_questions1 = [x[0] for x in test_questions_pair]
    test_questions2 = [x[1] for x in test_questions_pair]

    test_questions_1 = tokenizer.texts_to_sequences(test_questions1)
    test_questions_2 = tokenizer.texts_to_sequences(test_questions2)
    leaks_test = [[len(set(x1)), len(set(x2)), len(set(x1).intersection(x2))]
                  for x1, x2 in zip(test_questions_1, test_questions_2)]

    leaks_test = np.array(leaks_test)
    test_data_1 = pad_sequences(test_questions_1, maxlen=max_sequence_length)
    test_data_2 = pad_sequences(test_questions_2, maxlen=max_sequence_length)

    return test_data_1, test_data_2, leaks_test

--------------------------------------------------------------------------------