├── README.md
├── glove_vocab.py
├── keras_cnn_model.py
├── keras_cnn_model_v0.1.py
├── keras_cnn_model_v0.2.py
├── keras_predict.py
├── keras_predict_v0.1.py
├── keras_train.py
├── keras_train_v0.1.py
├── keras_train_v0.2.py
└── preprocess_dataset.py

/README.md:
--------------------------------------------------------------------------------

# CNN Text Classification using Keras

keras_cnn_model.py defines the CNN model in Keras for text classification.

keras_train.py creates x_train and y_train. It reads from a directory where each file contains sentences, and those sentences belong to the class named after the file. The order in which the files are read determines the class order.

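As an illustrative sketch (the class file names are made up), class ids simply follow the order in which the files under ./data are listed, and that same order has to be mirrored by class_category in keras_predict.py:

    # minimal sketch of how keras_train.py assigns class ids (illustrative only)
    import os
    for class_id, file_name in enumerate(os.listdir('./data')):
        print(class_id, file_name)  # one file per class, one sentence per line
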
keras_predict.py predicts the class of new sentences. The class order used while training in keras_train.py has to match the order in this file.

## To call keras_predict.py

    python keras_predict.py <saved_model_path> <pickled_tokenizer_path> "<text to classify>"

I have included different versions of the CNN model and the keras_train file here.

## v0.1

### keras_train_v0.1.py

Includes KFold cross-validation code, with a classification_report and confusion matrix created on the best model from cross-validation.

### keras_cnn_model_v0.1.py

Additional code (commented out right now) for using the Adam optimizer and another layer of convolution with an attention layer at the top.

## v0.2

### keras_train_v0.2.py

Includes what revision 0.1 had; additionally it contains code to include pre-trained GloVe vectors using code from glove_vocab.py.

### keras_cnn_model_v0.2.py

It includes layers in the following sequence:

- Embedding layer (includes pre-trained GloVe vectors if supplied)
- Convolution 1D, kernel = 1, stride = 1
- MaxPooling 1D, patch = 3, stride = 1
- Dropout
- Convolution 1D, kernel = 2, stride = 1
- MaxPooling 1D, patch = 2, stride = 1
- Dropout
- Dense, 256 as output
- Dropout
- Dense, 128 as output
- Dense (final layer), 6 as output

### In my dataset, I was able to reach a max validation accuracy of 0.8888888955116272

--------------------------------------------------------------------------------
/glove_vocab.py:
--------------------------------------------------------------------------------

import os
import numpy as np
import pickle
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

class Glove:
    def __init__(self, glove_path, embedding_size, load_from_existing_path=None):
        if load_from_existing_path != None:
            if os.path.isdir(load_from_existing_path) == False:
                raise Exception("path {} is not found".format(load_from_existing_path))
            embeddings_index_path = os.path.join(load_from_existing_path, 'embedding_index.bin')
            embedding_matrix_path = os.path.join(load_from_existing_path, 'embedding_matrix.bin')
            if os.path.isfile(embeddings_index_path) == False or os.path.isfile(embedding_matrix_path) == False:
                raise Exception("file {} and {} not found".format(embeddings_index_path, embedding_matrix_path))
            # pickle.load expects a file object, not a path
            self.embeddings_index = pickle.load(open(embeddings_index_path, 'rb'))
            self.embedding_matrix = pickle.load(open(embedding_matrix_path, 'rb'))
            return
        if os.path.isfile(glove_path) == False:
            raise Exception("glove file {} not found".format(glove_path))
        self.glove_path = glove_path
        self.embedding_size = embedding_size
        self.embeddings_index = {}
        f = open(self.glove_path)
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            self.embeddings_index[word] = coefs
        f.close()

    def get_embedding_index(self):
        return self.embeddings_index

    def create_embedding_matrix(self, word_index):
        """ word_index is tokenizer.word_index, where tokenizer is a fitted keras.preprocessing.text.Tokenizer """
        self.embedding_matrix = np.zeros((len(word_index) + 1, self.embedding_size))
        for word, i in word_index.items():
            embedding_vector = self.embeddings_index.get(word)
            if embedding_vector is not None:
                # words not found in the embedding index stay all-zeros
                self.embedding_matrix[i] = embedding_vector

    def get_vector(self, word_id):
        """ word_id is the number assigned to the word when create_embedding_matrix is called """
        # numpy arrays are indexed directly; they have no .get()
        return self.embedding_matrix[word_id]

    def store(self, store_path):
        if os.path.isdir(store_path) == False:
            raise Exception("unable to save to {} , dir does not exist".format(store_path))
        pickle.dump(self.embeddings_index, open(os.path.join(store_path, 'embedding_index.bin'), 'wb'))
        pickle.dump(self.embedding_matrix, open(os.path.join(store_path, 'embedding_matrix.bin'), 'wb'))

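# Illustrative usage of Glove (the file path is hypothetical; word_index comes from a
# fitted keras Tokenizer, as in keras_train_v0.2.py):
#   glove = Glove('./glove.6B.100d.txt', 100)
#   glove.create_embedding_matrix(tokenizer.word_index)
#   vec = glove.get_vector(5)           # 100-dim vector for the word with index 5
#   matrix = glove.embedding_matrix     # passed as the Embedding weights in the v0.2 model
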
class Vocabulary:
    def __init__(self, max_num_words, max_sequence_length, load_from_existing_path=None):
        if load_from_existing_path != None:
            if os.path.isdir(load_from_existing_path) == False:
                raise Exception("path {} is not found".format(load_from_existing_path))
            vocabulary_path = os.path.join(load_from_existing_path, 'vocabulary.bin')
            if os.path.isfile(vocabulary_path) == False:
                raise Exception("file {} not found".format(vocabulary_path))
            vocab = pickle.load(open(vocabulary_path, 'rb'))
            self.max_num_words = vocab.max_num_words
            self.max_sequence_length = vocab.max_sequence_length
            self.tokenizer = vocab.tokenizer
            self.sequences = vocab.sequences
            self.data = vocab.data
            return
        self.max_num_words = max_num_words
        self.max_sequence_length = max_sequence_length
        self.tokenizer = Tokenizer(num_words=max_num_words)

    def get_word_index(self):
        return self.tokenizer.word_index

    def fit_and_pad(self, texts):
        """ called while training """
        self.tokenizer.fit_on_texts(texts)
        self.sequences = self.tokenizer.texts_to_sequences(texts)
        self.data = pad_sequences(self.sequences, maxlen=self.max_sequence_length)
        return self.data

    def get_padded_sequences(self, texts):
        """ called during inference """
        self.sequences = self.tokenizer.texts_to_sequences(texts)
        self.data = pad_sequences(self.sequences, maxlen=self.max_sequence_length)
        return self.data

    def store(self, store_path):
        if os.path.isdir(store_path) == False:
            raise Exception("unable to save to {} , dir does not exist".format(store_path))
        pickle.dump(self, open(os.path.join(store_path, 'vocabulary.bin'), 'wb'))

def read_class_files_for_training(class_info_file, class_data_directory):
    texts = []         # list of text samples
    labels_index = {}  # dictionary mapping label id to name
    labels = []        # list of label ids
    if os.path.isfile(class_info_file) == False:
        raise Exception("file {} not found".format(class_info_file))
    fh = open(class_info_file)
    classes = fh.readlines()
    fh.close()

    idx = 0
    for cls in classes:
        print(idx)
        cls = cls.replace('\n', '')
        full_path = os.path.join(class_data_directory, cls)
        if os.path.isfile(full_path) == False:
            raise Exception("file {} not found".format(full_path))
        fh = open(full_path)
        lines = fh.readlines()
        fh.close()
        for ln in lines:
            ln = ln.replace('\n', '')
            texts.append(ln)
            labels.append(idx)
        labels_index[idx] = cls
        idx += 1
    labels = to_categorical(np.asarray(labels))
    return (texts, labels, labels_index)

def read_class_file_for_prediction(class_info_file):
    labels_index = {}  # dictionary mapping label id to name
    if os.path.isfile(class_info_file) == False:
        raise Exception("file {} not found".format(class_info_file))
    fh = open(class_info_file)
    classes = fh.readlines()
    fh.close()
    idx = 0
    for cls in classes:
        labels_index[idx] = cls
        idx += 1
    return labels_index

def read_training_file_for_retrieval_model(file_name):
    """
    The training file is supposed to be of the following format:
    line[1]: question | answer | 1 --> for a correct answer to the question
    line[2]: question | answer | 0 --> for an incorrect answer to the question
    """
    fh = open(file_name, 'r')
    lines = fh.readlines()
    fh.close()

    questions = []
    answers = []
    target = []
    for ln in lines:
        ln = ln.strip()
        arr = ln.split('|')
        questions.append(arr[0].strip())
        answers.append(arr[1].strip())
        target.append(arr[2].strip())
    return (questions, answers, target)

--------------------------------------------------------------------------------
/keras_cnn_model.py:
--------------------------------------------------------------------------------

from keras.layers import Conv1D, MaxPooling1D, Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.core import Flatten
from keras.models import Sequential
import keras

def create_model(vocab_size, embedding_size, max_sentence_length, filter_sizes, num_filters, dropout):
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_size, input_length=max_sentence_length))
    #for filter_size in filter_sizes:
    model.add(Conv1D(num_filters, 3, activation='relu'))
    model.add(MaxPooling1D(pool_size=(max_sentence_length - 3 + 1,), strides=1))
    model.add(Dropout(dropout))
    model.add(Flatten())
    model.add(Dense(8, activation='relu'))
    model.add(Activation('softmax'))
    #model.compile(loss=keras.losses.categorical_crossentropy, optimizer=keras.optimizers.SGD(), metrics=['accuracy'])
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    return model

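# Illustrative call, mirroring keras_train.py (filter_sizes is accepted but only used in the
# commented-out loop above; data and labels are the padded sequences and one-hot labels
# built in keras_train.py):
#   model = create_model(vocab_size, 100, 50, (3,), 256, 0.5)
#   model.fit(data, labels, epochs=500, batch_size=16)
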
--------------------------------------------------------------------------------
/keras_cnn_model_v0.1.py:
--------------------------------------------------------------------------------

from keras.layers import Conv1D, MaxPooling1D, Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.core import Flatten
from keras.models import Sequential
from keras.optimizers import Adam
import keras

def create_model(vocab_size, embedding_size, max_sentence_length, filter_sizes, num_filters, dropout):
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_size, input_length=max_sentence_length))
    #for filter_size in filter_sizes:
    #model.add(Dense(50,activation='softmax'))
    #model.add(Conv1D(num_filters,3,activation='relu',padding="same"))
    model.add(Conv1D(num_filters, 3, activation='relu'))
    model.add(MaxPooling1D(pool_size=(max_sentence_length - 3 + 1,), strides=1))
    model.add(Dropout(dropout))
    model.add(Flatten())
    model.add(Dense(6, activation='relu'))
    model.add(Activation('softmax'))
    #model.compile(loss=keras.losses.categorical_crossentropy, optimizer=keras.optimizers.SGD(), metrics=['accuracy'])
    #adam = Adam(lr=0.0001, decay=1e-5)
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['mse', 'acc'])
    return model

--------------------------------------------------------------------------------
/keras_cnn_model_v0.2.py:
--------------------------------------------------------------------------------

from keras.layers import Conv1D, MaxPooling1D, Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.core import Flatten
from keras.models import Sequential
from keras.optimizers import Adam
import keras

def create_model(vocab_size, embedding_size, max_sentence_length, filter_sizes, num_filters, dropout, embedding_matrix=None):
    model = Sequential()
    if embedding_matrix is None:
        model.add(Embedding(vocab_size + 1, embedding_size, input_length=max_sentence_length))
    else:
        model.add(Embedding(vocab_size + 1, embedding_size, weights=[embedding_matrix], input_length=max_sentence_length, trainable=False))
    #for filter_size in filter_sizes:
    #model.add(Dense(50,activation='softmax'))
    #model.add(Conv1D(num_filters,3,activation='relu',padding="same"))
    model.add(Conv1D(num_filters, 1, activation='tanh'))
    model.add(MaxPooling1D(pool_size=(3,), strides=1))
    model.add(Dropout(dropout))
    model.add(Conv1D(num_filters * 2, 2, activation='relu'))
    model.add(MaxPooling1D(pool_size=(2,), strides=1))
    model.add(Dropout(dropout))
    model.add(Flatten())
    model.add(Dense(256, activation='tanh'))
    model.add(Dropout(dropout))
    model.add(Dense(128, activation='tanh'))
    model.add(Dense(6, activation='softmax'))
    #model.add(Activation('softmax'))
    #model.compile(loss=keras.losses.categorical_crossentropy, optimizer=keras.optimizers.SGD(), metrics=['accuracy'])
    adam = Adam(lr=0.0001, decay=1e-5)
    model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['mse', 'acc'])
    return model

--------------------------------------------------------------------------------
/keras_predict.py:
--------------------------------------------------------------------------------

from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
import pickle
import sys

"""
sys.argv[1] = saved model full path
sys.argv[2] = pickled tokenizer full path
sys.argv[3] = text to classify
"""

class_category = ['CONFIDENTIAL','SFS','TOTAL_INDUSTRY_TESTING','EFC','CORPORATE','HR_MISC','AROUND_OFFERING','RAPID_NWTX']  # Change this if the order of classes used while training the model changes

if len(sys.argv) != 4:
    print("Parameters missing")
    sys.exit(1)
model = load_model(sys.argv[1])
tokenizer = pickle.load(open(sys.argv[2], 'rb'))

x_pred = tokenizer.texts_to_sequences([sys.argv[3]])
x_pred = pad_sequences(x_pred, maxlen=50)  # Max len hardcoded here, has to be parameterized in a production version
result = model.predict(x_pred)
print(class_category[result.argmax()])

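# Example invocation (the paths match the defaults written by keras_train.py; the input
# sentence is made up):
#   python keras_predict.py ./keras_saved_model/intent_model.h5 ./keras_saved_model/tokenizer.p "some sentence to classify"
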
--------------------------------------------------------------------------------
/keras_predict_v0.1.py:
--------------------------------------------------------------------------------

from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
import pickle
import sys

"""
sys.argv[1] = saved model full path
sys.argv[2] = pickled tokenizer full path
sys.argv[3] = text to classify
"""

class_category = ['UX ANALYST','SYS ANALYST','CONFIGURATION','BIZ ANALYST','DATA ANALYST','SECURITY']

if len(sys.argv) != 4:
    print("Parameters missing")
    sys.exit(1)
model = load_model(sys.argv[1])
tokenizer = pickle.load(open(sys.argv[2], 'rb'))

x_pred = tokenizer.texts_to_sequences([sys.argv[3]])
x_pred = pad_sequences(x_pred, maxlen=50)  # Max len hardcoded here, has to be parameterized in a production version
result = model.predict(x_pred)
print(class_category[result.argmax()])

--------------------------------------------------------------------------------
/keras_train.py:
--------------------------------------------------------------------------------

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import os
import sys
import pickle
import numpy as np
from keras_cnn_model import create_model
from keras.utils import to_categorical

texts = []
labels = []

def read_inputs(folder_name):
    global texts
    global labels
    dirs = os.listdir(folder_name)
    class_id = 0
    for fn in dirs:
        print("Processing {}".format(fn))
        full_path = os.path.join(folder_name, fn)
        fh = open(full_path)
        lines = fh.readlines()
        fh.close()
        texts = texts + lines
        labels.extend([class_id] * len(lines))  # every sentence in this file gets the same class id
        class_id += 1

if __name__ == '__main__':
    read_inputs('./data')
    tokenizer = Tokenizer(num_words=500)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    vocab_size = len(word_index)
    data = pad_sequences(sequences, maxlen=50)
    print("Length of training data {}".format(len(data)))
    print("Shape of data {}".format(data.shape))
    indices = np.arange(data.shape[0])
    np.random.shuffle(indices)
    print("Indices {}".format(indices))
    data = data[indices]
    labels = to_categorical(np.asarray(labels))  # this converts [0,0,1,1] into one-hot rows like [[1,0,...],[1,0,...],[0,1,...],...]
    print(labels)
    labels = labels[indices]
    model = create_model(vocab_size, 100, 50, (3,), 256, 0.5)  # As Keras does not support multiple filter sizes in a CNN over the same embedding output, proceeding with one CNN layer and one filter size
    """ Ready to train """
    model.fit(data, labels, epochs=500, batch_size=16)
    model.save('./keras_saved_model/intent_model.h5')
    pickle.dump(tokenizer, open('./keras_saved_model/tokenizer.p', 'wb'))

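# Run as (assumes ./data exists with one file per class, one sentence per line, and that
# the ./keras_saved_model directory has already been created for the saved artifacts):
#   python keras_train.py
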
--------------------------------------------------------------------------------
/keras_train_v0.1.py:
--------------------------------------------------------------------------------

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import os
import sys
import pickle
import numpy as np
from keras_cnn_model import create_model
from keras.utils import to_categorical

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix

texts = []
labels = []

def read_inputs(folder_name):
    global texts
    global labels
    dirs = os.listdir(folder_name)
    class_id = 0
    for fn in dirs:
        print("Processing {}".format(fn))
        full_path = os.path.join(folder_name, fn)
        fh = open(full_path)
        lines = fh.readlines()
        fh.close()
        texts = texts + lines
        labels.extend([class_id] * len(lines))  # every sentence in this file gets the same class id
        class_id += 1

if __name__ == '__main__':
    read_inputs('./data')
    tokenizer = Tokenizer(num_words=500)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    vocab_size = len(word_index)
    data = pad_sequences(sequences, maxlen=50)
    print("Length of training data {}".format(len(data)))
    print("Shape of data {}".format(data.shape))
    indices = np.arange(data.shape[0])
    #np.random.shuffle(indices)
    #print("Indices {}".format(indices))
    #data = data[indices]
    labels = np.array(labels)
    labels_cat = to_categorical(labels)
    #labels = to_categorical(np.asarray(labels))  # this converts [0,0,1,1] into one-hot rows
    #print(labels)
    #labels = labels[indices]

    kfold = StratifiedKFold(n_splits=30, shuffle=True, random_state=12)
    cvscores = []
    models = []
    test_data = []

    """ Ready to train """
    print(" data shape {}".format(data.shape))
    print(" labels shape {}".format(labels.shape))
    for train, test in kfold.split(data, labels):
        # As Keras does not support multiple filter sizes in a CNN over the same embedding output, proceeding with one CNN layer and one filter size
        Y = labels_cat[train]
        Y_test = labels_cat[test]
        model = create_model(vocab_size, 100, 50, (3,), 256, 0.3)
        model.fit(data[train], Y, epochs=80, batch_size=16)
        scores = model.evaluate(data[test], Y_test, verbose=1)
        print("{} {}".format(model.metrics_names, scores))
        cvscores.append(scores[2])  # with metrics=['mse','acc'], scores is [loss, mse, acc]
        models.append(model)
        test_data.append(test)
    print(cvscores)
    max_index = np.array(cvscores).argmax()
    model = models[max_index]
    t_data = test_data[max_index]
    predicted = model.predict(data[t_data])
    print(np.round(predicted))
    print(labels_cat[t_data])
    print(classification_report(labels_cat[t_data], np.round(predicted)))
    print(confusion_matrix(np.argmax(labels_cat[t_data], axis=1), np.argmax(np.round(predicted), axis=1)))
    #cvscores.append(scores)
    #model.save('./keras_saved_model/intent_model.2.h5')
    #pickle.dump(tokenizer,open('./keras_saved_model/tokenizer.2.p','wb'))

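# To persist the best cross-validation model, the commented-out lines above can be enabled
# (the ./keras_saved_model directory is assumed to exist):
#   model.save('./keras_saved_model/intent_model.2.h5')
#   pickle.dump(tokenizer, open('./keras_saved_model/tokenizer.2.p', 'wb'))
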
--------------------------------------------------------------------------------
/keras_train_v0.2.py:
--------------------------------------------------------------------------------

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import os
import sys
import pickle
import numpy as np
from keras_cnn_model import create_model  # note: create_model must be the v0.2 variant (keras_cnn_model_v0.2.py), since it is called with embedding_matrix below
from keras.utils import to_categorical

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix

from glove_vocab import Glove

texts = []
labels = []

def read_inputs(folder_name):
    global texts
    global labels
    dirs = os.listdir(folder_name)
    class_id = 0
    for fn in dirs:
        print("Processing {}".format(fn))
        full_path = os.path.join(folder_name, fn)
        fh = open(full_path)
        lines = fh.readlines()
        fh.close()
        texts = texts + lines
        labels.extend([class_id] * len(lines))  # every sentence in this file gets the same class id
        class_id += 1

if __name__ == '__main__':
    read_inputs('./data')
    tokenizer = Tokenizer(num_words=1000)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    vocab_size = len(word_index)
    glove = Glove('/home/cdpai/tensorflow-models/rateresponses/glove_pretrained_word_embedding/glove.6B.100d.txt', 100)
    glove.create_embedding_matrix(word_index)
    data = pad_sequences(sequences, maxlen=50)
    print("Length of training data {}".format(len(data)))
    print("Shape of data {}".format(data.shape))
    indices = np.arange(data.shape[0])
    #np.random.shuffle(indices)
    #print("Indices {}".format(indices))
    #data = data[indices]
    labels = np.array(labels)
    labels_cat = to_categorical(labels)
    #labels = to_categorical(np.asarray(labels))  # this converts [0,0,1,1] into one-hot rows
    #print(labels)
    #labels = labels[indices]

    kfold = StratifiedKFold(n_splits=30, shuffle=True, random_state=12)
    cvscores = []
    models = []
    test_data = []

    """ Ready to train """
    print(" data shape {}".format(data.shape))
    print(" labels shape {}".format(labels.shape))
    for train, test in kfold.split(data, labels):
        # As Keras does not support multiple filter sizes in a CNN over the same embedding output, proceeding with one CNN layer and one filter size
        Y = labels_cat[train]
        Y_test = labels_cat[test]
        model = create_model(vocab_size, 100, 50, (3,), 256, 0.3, embedding_matrix=glove.embedding_matrix)
        model.fit(data[train], Y, epochs=80, batch_size=16)
        scores = model.evaluate(data[test], Y_test, verbose=1)
        print("{} {}".format(model.metrics_names, scores))
        cvscores.append(scores[2])  # with metrics=['mse','acc'], scores is [loss, mse, acc]
        models.append(model)
        test_data.append(test)
    print(cvscores)
    max_index = np.array(cvscores).argmax()
    model = models[max_index]
    t_data = test_data[max_index]
    predicted = model.predict(data[t_data])
    print(np.round(predicted))
    print(labels_cat[t_data])
    print(classification_report(labels_cat[t_data], np.round(predicted)))
    print(confusion_matrix(np.argmax(labels_cat[t_data], axis=1), np.argmax(np.round(predicted), axis=1)))
    #cvscores.append(scores)
    #model.save('./keras_saved_model/intent_model.2.h5')
    #pickle.dump(tokenizer,open('./keras_saved_model/tokenizer.2.p','wb'))

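# Optional: the parsed GloVe vectors can be cached so later runs skip re-reading the large
# .txt file (uses Glove.store / load_from_existing_path from glove_vocab.py; ./glove_cache
# is a hypothetical directory):
#   glove.store('./glove_cache')
#   glove = Glove(None, 100, load_from_existing_path='./glove_cache')
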
--------------------------------------------------------------------------------
/preprocess_dataset.py:
--------------------------------------------------------------------------------

"""
This is preprocessing code for a specific use case; it can be updated to work with other use cases as well.
Usage: python preprocess_dataset.py <input_file>   (the cleaned output is written to <input_file>.1)
"""

import sys
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()

fh = open(sys.argv[1], 'r')
lines = fh.readlines()
fh.close()

arr = []

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stopwrd = set(stopwords.words('english'))
for ln in lines:
    ln = ln.replace(r"don't", "do not")
    ln = ln.replace(r"doesn't", "does not")
    ln = ln.replace(r"can't", "cannot")
    ln = ln.replace(r"caore", "core")
    ln = ln.replace(r'"', ' ')
    ln = ln.replace('  ', ' ')  # collapse double spaces
    ln = ln.replace(r'.', ' . ')
    ln = ln.replace(r'...', ' . ')
    ln = ln.replace(r'-', ' - ')
    ln = ln.replace(r'/', ' / ')
    ln = ln.replace(r"'s", ' ')
    ln = ln.replace(r"'", ' ')
    ln = ln.replace(r"", '')
    words = word_tokenize(ln)
    new_words = [w for w in words if w not in stopwrd]
    new_ln = " ".join(new_words)
    new_ln = new_ln.replace(r'``', '')
    new_ln = new_ln.replace(r"''", '')
    new_ln = new_ln.replace(r"< ! -- rich text -- > ", '')
    #words = word_tokenize(new_ln)
    #new_words = [porter_stemmer.stem(w) for w in words]
    #new_ln = " ".join(new_words)
    arr.append(new_ln.lower())

fw = open(sys.argv[1] + '.1', 'w')

for ln in arr:
    fw.write(ln + "\n")

fw.close()

--------------------------------------------------------------------------------