├── README.md
├── glove_vocab.py
├── keras_cnn_model.py
├── keras_cnn_model_v0.1.py
├── keras_cnn_model_v0.2.py
├── keras_predict.py
├── keras_predict_v0.1.py
├── keras_train.py
├── keras_train_v0.1.py
├── keras_train_v0.2.py
└── preprocess_dataset.py

/README.md:
--------------------------------------------------------------------------------

# CNN Text Classification using Keras

keras_cnn_model.py defines the CNN model in Keras for text classification.

keras_train.py creates x_train and y_train. It reads from a directory where each file contains sentences, and those sentences belong to the class named after the file. The order in which the files are read determines the class order.

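As an illustrative sketch (the class file names are made up), class ids simply follow the order in which the files under ./data are listed, and that same order has to be mirrored by class_category in keras_predict.py:

    # minimal sketch of how keras_train.py assigns class ids (illustrative only)
    import os
    for class_id, file_name in enumerate(os.listdir('./data')):
        print(class_id, file_name)  # one file per class, one sentence per line
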
keras_predict.py predicts the class of new sentences. The class order used while training in keras_train.py has to match the order in this file.

## To call keras_predict.py

    python keras_predict.py <saved_model_path> <pickled_tokenizer_path> "<text to classify>"

I have included different versions of the CNN model and the keras_train file here.

## v0.1

### keras_train_v0.1.py

Includes KFold cross-validation code, with a classification_report and confusion matrix created on the best model from cross-validation.

### keras_cnn_model_v0.1.py

Additional code (commented out right now) for using the Adam optimizer and another layer of convolution with an attention layer at the top.

## v0.2

### keras_train_v0.2.py

Includes what revision 0.1 had; additionally it contains code to include pre-trained GloVe vectors using code from glove_vocab.py.

### keras_cnn_model_v0.2.py

It includes layers in the following sequence:

- Embedding layer (includes pre-trained GloVe vectors if supplied)
- Convolution 1D, kernel = 1, stride = 1
- MaxPooling 1D, patch = 3, stride = 1
- Dropout
- Convolution 1D, kernel = 2, stride = 1
- MaxPooling 1D, patch = 2, stride = 1
- Dropout
- Dense, 256 as output
- Dropout
- Dense, 128 as output
- Dense (final layer), 6 as output

### In my dataset, I was able to reach a max validation accuracy of 0.8888888955116272

--------------------------------------------------------------------------------
/glove_vocab.py:
--------------------------------------------------------------------------------

import os
import numpy as np
import pickle
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

class Glove:
    def __init__(self, glove_path, embedding_size, load_from_existing_path=None):
        if load_from_existing_path != None:
            if os.path.isdir(load_from_existing_path) == False:
                raise Exception("path {} is not found".format(load_from_existing_path))
            embeddings_index_path = os.path.join(load_from_existing_path, 'embedding_index.bin')
            embedding_matrix_path = os.path.join(load_from_existing_path, 'embedding_matrix.bin')
            if os.path.isfile(embeddings_index_path) == False or os.path.isfile(embedding_matrix_path) == False:
                raise Exception("file {} and {} not found".format(embeddings_index_path, embedding_matrix_path))
            # pickle.load expects a file object, not a path
            self.embeddings_index = pickle.load(open(embeddings_index_path, 'rb'))
            self.embedding_matrix = pickle.load(open(embedding_matrix_path, 'rb'))
            return
        if os.path.isfile(glove_path) == False:
            raise Exception("glove file {} not found".format(glove_path))
        self.glove_path = glove_path
        self.embedding_size = embedding_size
        self.embeddings_index = {}
        f = open(self.glove_path)
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            self.embeddings_index[word] = coefs
        f.close()

    def get_embedding_index(self):
        return self.embeddings_index

    def create_embedding_matrix(self, word_index):
        """ word_index is tokenizer.word_index, where tokenizer is a fitted keras.preprocessing.text.Tokenizer """
        self.embedding_matrix = np.zeros((len(word_index) + 1, self.embedding_size))
        for word, i in word_index.items():
            embedding_vector = self.embeddings_index.get(word)
            if embedding_vector is not None:
                # words not found in the embedding index stay all-zeros
                self.embedding_matrix[i] = embedding_vector

    def get_vector(self, word_id):
        """ word_id is the number assigned to the word when create_embedding_matrix is called """
        # numpy arrays are indexed directly; they have no .get()
        return self.embedding_matrix[word_id]

    def store(self, store_path):
        if os.path.isdir(store_path) == False:
            raise Exception("unable to save to {} , dir does not exist".format(store_path))
        pickle.dump(self.embeddings_index, open(os.path.join(store_path, 'embedding_index.bin'), 'wb'))
        pickle.dump(self.embedding_matrix, open(os.path.join(store_path, 'embedding_matrix.bin'), 'wb'))

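# Illustrative usage of Glove (the file path is hypothetical; word_index comes from a
# fitted keras Tokenizer, as in keras_train_v0.2.py):
#   glove = Glove('./glove.6B.100d.txt', 100)
#   glove.create_embedding_matrix(tokenizer.word_index)
#   vec = glove.get_vector(5)           # 100-dim vector for the word with index 5
#   matrix = glove.embedding_matrix     # passed as the Embedding weights in the v0.2 model
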
class Vocabulary:
    def __init__(self, max_num_words, max_sequence_length, load_from_existing_path=None):
        if load_from_existing_path != None:
            if os.path.isdir(load_from_existing_path) == False:
                raise Exception("path {} is not found".format(load_from_existing_path))
            vocabulary_path = os.path.join(load_from_existing_path, 'vocabulary.bin')
            if os.path.isfile(vocabulary_path) == False:
                raise Exception("file {} not found".format(vocabulary_path))
            vocab = pickle.load(open(vocabulary_path, 'rb'))
            self.max_num_words = vocab.max_num_words
            self.max_sequence_length = vocab.max_sequence_length
            self.tokenizer = vocab.tokenizer
            self.sequences = vocab.sequences
            self.data = vocab.data
            return
        self.max_num_words = max_num_words
        self.max_sequence_length = max_sequence_length
        self.tokenizer = Tokenizer(num_words=max_num_words)

    def get_word_index(self):
        return self.tokenizer.word_index

    def fit_and_pad(self, texts):
        """ called while training """
        self.tokenizer.fit_on_texts(texts)
        self.sequences = self.tokenizer.texts_to_sequences(texts)
        self.data = pad_sequences(self.sequences, maxlen=self.max_sequence_length)
        return self.data

    def get_padded_sequences(self, texts):
        """ called during inference """
        self.sequences = self.tokenizer.texts_to_sequences(texts)
        self.data = pad_sequences(self.sequences, maxlen=self.max_sequence_length)
        return self.data

    def store(self, store_path):
        if os.path.isdir(store_path) == False:
            raise Exception("unable to save to {} , dir does not exist".format(store_path))
        pickle.dump(self, open(os.path.join(store_path, 'vocabulary.bin'), 'wb'))

def read_class_files_for_training(class_info_file, class_data_directory):
    texts = []         # list of text samples
    labels_index = {}  # dictionary mapping label id to name
    labels = []        # list of label ids
    if os.path.isfile(class_info_file) == False:
        raise Exception("file {} not found".format(class_info_file))
    fh = open(class_info_file)
    classes = fh.readlines()
    fh.close()

    idx = 0
    for cls in classes:
        print(idx)
        cls = cls.replace('\n', '')
        full_path = os.path.join(class_data_directory, cls)
        if os.path.isfile(full_path) == False:
            raise Exception("file {} not found".format(full_path))
        fh = open(full_path)
        lines = fh.readlines()
        fh.close()
        for ln in lines:
            ln = ln.replace('\n', '')
            texts.append(ln)
            labels.append(idx)
        labels_index[idx] = cls
        idx += 1
    labels = to_categorical(np.asarray(labels))
    return (texts, labels, labels_index)

def read_class_file_for_prediction(class_info_file):
    labels_index = {}  # dictionary mapping label id to name
    if os.path.isfile(class_info_file) == False:
        raise Exception("file {} not found".format(class_info_file))
    fh = open(class_info_file)
    classes = fh.readlines()
    fh.close()
    idx = 0
    for cls in classes:
        labels_index[idx] = cls
        idx += 1
    return labels_index

def read_training_file_for_retrieval_model(file_name):
    """
    The training file is supposed to be of the following format:
    line[1]: question | answer | 1 --> for a correct answer to the question
    line[2]: question | answer | 0 --> for an incorrect answer to the question
    """
    fh = open(file_name, 'r')
    lines = fh.readlines()
    fh.close()

    questions = []
    answers = []
    target = []
    for ln in lines:
        ln = ln.strip()
        arr = ln.split('|')
        questions.append(arr[0].strip())
        answers.append(arr[1].strip())
        target.append(arr[2].strip())
    return (questions, answers, target)

--------------------------------------------------------------------------------
/keras_cnn_model.py:
--------------------------------------------------------------------------------

from keras.layers import Conv1D, MaxPooling1D, Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.core import Flatten
from keras.models import Sequential
import keras

def create_model(vocab_size, embedding_size, max_sentence_length, filter_sizes, num_filters, dropout):
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_size, input_length=max_sentence_length))
    #for filter_size in filter_sizes:
    model.add(Conv1D(num_filters, 3, activation='relu'))
    model.add(MaxPooling1D(pool_size=(max_sentence_length - 3 + 1,), strides=1))
    model.add(Dropout(dropout))
    model.add(Flatten())
    model.add(Dense(8, activation='relu'))
    model.add(Activation('softmax'))
    #model.compile(loss=keras.losses.categorical_crossentropy, optimizer=keras.optimizers.SGD(), metrics=['accuracy'])
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    return model

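# Illustrative call, mirroring keras_train.py (filter_sizes is accepted but only used in the
# commented-out loop above; data and labels are the padded sequences and one-hot labels
# built in keras_train.py):
#   model = create_model(vocab_size, 100, 50, (3,), 256, 0.5)
#   model.fit(data, labels, epochs=500, batch_size=16)
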
--------------------------------------------------------------------------------
/keras_cnn_model_v0.1.py:
--------------------------------------------------------------------------------

from keras.layers import Conv1D, MaxPooling1D, Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.core import Flatten
from keras.models import Sequential
from keras.optimizers import Adam
import keras

def create_model(vocab_size, embedding_size, max_sentence_length, filter_sizes, num_filters, dropout):
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_size, input_length=max_sentence_length))
    #for filter_size in filter_sizes:
    #model.add(Dense(50,activation='softmax'))
    #model.add(Conv1D(num_filters,3,activation='relu',padding="same"))
    model.add(Conv1D(num_filters, 3, activation='relu'))
    model.add(MaxPooling1D(pool_size=(max_sentence_length - 3 + 1,), strides=1))
    model.add(Dropout(dropout))
    model.add(Flatten())
    model.add(Dense(6, activation='relu'))
    model.add(Activation('softmax'))
    #model.compile(loss=keras.losses.categorical_crossentropy, optimizer=keras.optimizers.SGD(), metrics=['accuracy'])
    #adam = Adam(lr=0.0001, decay=1e-5)
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['mse', 'acc'])
    return model

--------------------------------------------------------------------------------
/keras_cnn_model_v0.2.py:
--------------------------------------------------------------------------------

from keras.layers import Conv1D, MaxPooling1D, Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.core import Flatten
from keras.models import Sequential
from keras.optimizers import Adam
import keras

def create_model(vocab_size, embedding_size, max_sentence_length, filter_sizes, num_filters, dropout, embedding_matrix=None):
    model = Sequential()
    if embedding_matrix is None:
        model.add(Embedding(vocab_size + 1, embedding_size, input_length=max_sentence_length))
    else:
        model.add(Embedding(vocab_size + 1, embedding_size, weights=[embedding_matrix], input_length=max_sentence_length, trainable=False))
    #for filter_size in filter_sizes:
    #model.add(Dense(50,activation='softmax'))
    #model.add(Conv1D(num_filters,3,activation='relu',padding="same"))
    model.add(Conv1D(num_filters, 1, activation='tanh'))
    model.add(MaxPooling1D(pool_size=(3,), strides=1))
    model.add(Dropout(dropout))
    model.add(Conv1D(num_filters * 2, 2, activation='relu'))
    model.add(MaxPooling1D(pool_size=(2,), strides=1))
    model.add(Dropout(dropout))
    model.add(Flatten())
    model.add(Dense(256, activation='tanh'))
    model.add(Dropout(dropout))
    model.add(Dense(128, activation='tanh'))
    model.add(Dense(6, activation='softmax'))
    #model.add(Activation('softmax'))
    #model.compile(loss=keras.losses.categorical_crossentropy, optimizer=keras.optimizers.SGD(), metrics=['accuracy'])
    adam = Adam(lr=0.0001, decay=1e-5)
    model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['mse', 'acc'])
    return model

--------------------------------------------------------------------------------
/keras_predict.py:
--------------------------------------------------------------------------------

from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
import pickle
import sys

"""
sys.argv[1] = saved model full path
sys.argv[2] = pickled tokenizer full path
sys.argv[3] = text to classify
"""

class_category = ['CONFIDENTIAL','SFS','TOTAL_INDUSTRY_TESTING','EFC','CORPORATE','HR_MISC','AROUND_OFFERING','RAPID_NWTX']  # Change this if the order of classes used while training the model changes

if len(sys.argv) != 4:
    print("Parameters missing")
    sys.exit(1)
model = load_model(sys.argv[1])
tokenizer = pickle.load(open(sys.argv[2], 'rb'))

x_pred = tokenizer.texts_to_sequences([sys.argv[3]])
x_pred = pad_sequences(x_pred, maxlen=50)  # Max len hardcoded here, has to be parameterized in a production version
result = model.predict(x_pred)
print(class_category[result.argmax()])

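# Example invocation (the paths match the defaults written by keras_train.py; the input
# sentence is made up):
#   python keras_predict.py ./keras_saved_model/intent_model.h5 ./keras_saved_model/tokenizer.p "some sentence to classify"
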
--------------------------------------------------------------------------------
/keras_predict_v0.1.py:
--------------------------------------------------------------------------------

from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
import pickle
import sys

"""
sys.argv[1] = saved model full path
sys.argv[2] = pickled tokenizer full path
sys.argv[3] = text to classify
"""

class_category = ['UX ANALYST','SYS ANALYST','CONFIGURATION','BIZ ANALYST','DATA ANALYST','SECURITY']

if len(sys.argv) != 4:
    print("Parameters missing")
    sys.exit(1)
model = load_model(sys.argv[1])
tokenizer = pickle.load(open(sys.argv[2], 'rb'))

x_pred = tokenizer.texts_to_sequences([sys.argv[3]])
x_pred = pad_sequences(x_pred, maxlen=50)  # Max len hardcoded here, has to be parameterized in a production version
result = model.predict(x_pred)
print(class_category[result.argmax()])

--------------------------------------------------------------------------------
/keras_train.py:
--------------------------------------------------------------------------------

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import os
import sys
import pickle
import numpy as np
from keras_cnn_model import create_model
from keras.utils import to_categorical

texts = []
labels = []

def read_inputs(folder_name):
    global texts
    global labels
    dirs = os.listdir(folder_name)
    class_id = 0
    for fn in dirs:
        print("Processing {}".format(fn))
        full_path = os.path.join(folder_name, fn)
        fh = open(full_path)
        lines = fh.readlines()
        fh.close()
        texts = texts + lines
        labels.extend([class_id] * len(lines))  # every sentence in this file gets the same class id
        class_id += 1

if __name__ == '__main__':
    read_inputs('./data')
    tokenizer = Tokenizer(num_words=500)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    vocab_size = len(word_index)
    data = pad_sequences(sequences, maxlen=50)
    print("Length of training data {}".format(len(data)))
    print("Shape of data {}".format(data.shape))
    indices = np.arange(data.shape[0])
    np.random.shuffle(indices)
    print("Indices {}".format(indices))
    data = data[indices]
    labels = to_categorical(np.asarray(labels))  # this converts [0,0,1,1] into one-hot rows like [[1,0,...],[1,0,...],[0,1,...],...]
    print(labels)
    labels = labels[indices]
    model = create_model(vocab_size, 100, 50, (3,), 256, 0.5)  # As Keras does not support multiple filter sizes in a CNN over the same embedding output, proceeding with one CNN layer and one filter size
    """ Ready to train """
    model.fit(data, labels, epochs=500, batch_size=16)
    model.save('./keras_saved_model/intent_model.h5')
    pickle.dump(tokenizer, open('./keras_saved_model/tokenizer.p', 'wb'))

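# Run as (assumes ./data exists with one file per class, one sentence per line, and that
# the ./keras_saved_model directory has already been created for the saved artifacts):
#   python keras_train.py
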
--------------------------------------------------------------------------------
/keras_train_v0.1.py:
--------------------------------------------------------------------------------

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import os
import sys
import pickle
import numpy as np
from keras_cnn_model import create_model
from keras.utils import to_categorical

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix

texts = []
labels = []

def read_inputs(folder_name):
    global texts
    global labels
    dirs = os.listdir(folder_name)
    class_id = 0
    for fn in dirs:
        print("Processing {}".format(fn))
        full_path = os.path.join(folder_name, fn)
        fh = open(full_path)
        lines = fh.readlines()
        fh.close()
        texts = texts + lines
        labels.extend([class_id] * len(lines))  # every sentence in this file gets the same class id
        class_id += 1

if __name__ == '__main__':
    read_inputs('./data')
    tokenizer = Tokenizer(num_words=500)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    vocab_size = len(word_index)
    data = pad_sequences(sequences, maxlen=50)
    print("Length of training data {}".format(len(data)))
    print("Shape of data {}".format(data.shape))
    indices = np.arange(data.shape[0])
    #np.random.shuffle(indices)
    #print("Indices {}".format(indices))
    #data = data[indices]
    labels = np.array(labels)
    labels_cat = to_categorical(labels)
    #labels = to_categorical(np.asarray(labels))  # this converts [0,0,1,1] into one-hot rows
    #print(labels)
    #labels = labels[indices]

    kfold = StratifiedKFold(n_splits=30, shuffle=True, random_state=12)
    cvscores = []
    models = []
    test_data = []

    """ Ready to train """
    print(" data shape {}".format(data.shape))
    print(" labels shape {}".format(labels.shape))
    for train, test in kfold.split(data, labels):
        # As Keras does not support multiple filter sizes in a CNN over the same embedding output, proceeding with one CNN layer and one filter size
        Y = labels_cat[train]
        Y_test = labels_cat[test]
        model = create_model(vocab_size, 100, 50, (3,), 256, 0.3)
        model.fit(data[train], Y, epochs=80, batch_size=16)
        scores = model.evaluate(data[test], Y_test, verbose=1)
        print("{} {}".format(model.metrics_names, scores))
        cvscores.append(scores[2])  # with metrics=['mse','acc'], scores is [loss, mse, acc]
        models.append(model)
        test_data.append(test)
    print(cvscores)
    max_index = np.array(cvscores).argmax()
    model = models[max_index]
    t_data = test_data[max_index]
    predicted = model.predict(data[t_data])
    print(np.round(predicted))
    print(labels_cat[t_data])
    print(classification_report(labels_cat[t_data], np.round(predicted)))
    print(confusion_matrix(np.argmax(labels_cat[t_data], axis=1), np.argmax(np.round(predicted), axis=1)))
    #cvscores.append(scores)
    #model.save('./keras_saved_model/intent_model.2.h5')
    #pickle.dump(tokenizer,open('./keras_saved_model/tokenizer.2.p','wb'))

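# To persist the best cross-validation model, the commented-out lines above can be enabled
# (the ./keras_saved_model directory is assumed to exist):
#   model.save('./keras_saved_model/intent_model.2.h5')
#   pickle.dump(tokenizer, open('./keras_saved_model/tokenizer.2.p', 'wb'))
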
--------------------------------------------------------------------------------
/keras_train_v0.2.py:
--------------------------------------------------------------------------------

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import os
import sys
import pickle
import numpy as np
from keras_cnn_model import create_model  # note: create_model must be the v0.2 variant (keras_cnn_model_v0.2.py), since it is called with embedding_matrix below
from keras.utils import to_categorical

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix

from glove_vocab import Glove

texts = []
labels = []

def read_inputs(folder_name):
    global texts
    global labels
    dirs = os.listdir(folder_name)
    class_id = 0
    for fn in dirs:
        print("Processing {}".format(fn))
        full_path = os.path.join(folder_name, fn)
        fh = open(full_path)
        lines = fh.readlines()
        fh.close()
        texts = texts + lines
        labels.extend([class_id] * len(lines))  # every sentence in this file gets the same class id
        class_id += 1

if __name__ == '__main__':
    read_inputs('./data')
    tokenizer = Tokenizer(num_words=1000)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    vocab_size = len(word_index)
    glove = Glove('/home/cdpai/tensorflow-models/rateresponses/glove_pretrained_word_embedding/glove.6B.100d.txt', 100)
    glove.create_embedding_matrix(word_index)
    data = pad_sequences(sequences, maxlen=50)
    print("Length of training data {}".format(len(data)))
    print("Shape of data {}".format(data.shape))
    indices = np.arange(data.shape[0])
    #np.random.shuffle(indices)
    #print("Indices {}".format(indices))
    #data = data[indices]
    labels = np.array(labels)
    labels_cat = to_categorical(labels)
    #labels = to_categorical(np.asarray(labels))  # this converts [0,0,1,1] into one-hot rows
    #print(labels)
    #labels = labels[indices]

    kfold = StratifiedKFold(n_splits=30, shuffle=True, random_state=12)
    cvscores = []
    models = []
    test_data = []

    """ Ready to train """
    print(" data shape {}".format(data.shape))
    print(" labels shape {}".format(labels.shape))
    for train, test in kfold.split(data, labels):
        # As Keras does not support multiple filter sizes in a CNN over the same embedding output, proceeding with one CNN layer and one filter size
        Y = labels_cat[train]
        Y_test = labels_cat[test]
        model = create_model(vocab_size, 100, 50, (3,), 256, 0.3, embedding_matrix=glove.embedding_matrix)
        model.fit(data[train], Y, epochs=80, batch_size=16)
        scores = model.evaluate(data[test], Y_test, verbose=1)
        print("{} {}".format(model.metrics_names, scores))
        cvscores.append(scores[2])  # with metrics=['mse','acc'], scores is [loss, mse, acc]
        models.append(model)
        test_data.append(test)
    print(cvscores)
    max_index = np.array(cvscores).argmax()
    model = models[max_index]
    t_data = test_data[max_index]
    predicted = model.predict(data[t_data])
    print(np.round(predicted))
    print(labels_cat[t_data])
    print(classification_report(labels_cat[t_data], np.round(predicted)))
    print(confusion_matrix(np.argmax(labels_cat[t_data], axis=1), np.argmax(np.round(predicted), axis=1)))
    #cvscores.append(scores)
    #model.save('./keras_saved_model/intent_model.2.h5')
    #pickle.dump(tokenizer,open('./keras_saved_model/tokenizer.2.p','wb'))

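# Optional: the parsed GloVe vectors can be cached so later runs skip re-reading the large
# .txt file (uses Glove.store / load_from_existing_path from glove_vocab.py; ./glove_cache
# is a hypothetical directory):
#   glove.store('./glove_cache')
#   glove = Glove(None, 100, load_from_existing_path='./glove_cache')
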
--------------------------------------------------------------------------------
/preprocess_dataset.py:
--------------------------------------------------------------------------------

"""
This is preprocessing code for a specific use case; it can be updated to work with other use cases as well.
Usage: python preprocess_dataset.py <input_file>   (the cleaned output is written to <input_file>.1)
"""

import sys
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()

fh = open(sys.argv[1], 'r')
lines = fh.readlines()
fh.close()

arr = []

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stopwrd = set(stopwords.words('english'))
for ln in lines:
    ln = ln.replace(r"don't", "do not")
    ln = ln.replace(r"doesn't", "does not")
    ln = ln.replace(r"can't", "cannot")
    ln = ln.replace(r"caore", "core")
    ln = ln.replace(r'"', ' ')
    ln = ln.replace('  ', ' ')  # collapse double spaces
    ln = ln.replace(r'.', ' . ')
    ln = ln.replace(r'...', ' . ')
    ln = ln.replace(r'-', ' - ')
    ln = ln.replace(r'/', ' / ')
    ln = ln.replace(r"'s", ' ')
    ln = ln.replace(r"'", ' ')
    ln = ln.replace(r"", '')
    words = word_tokenize(ln)
    new_words = [w for w in words if w not in stopwrd]
    new_ln = " ".join(new_words)
    new_ln = new_ln.replace(r'``', '')
    new_ln = new_ln.replace(r"''", '')
    new_ln = new_ln.replace(r"< ! -- rich text -- > ", '')
    #words = word_tokenize(new_ln)
    #new_words = [porter_stemmer.stem(w) for w in words]
    #new_ln = " ".join(new_words)
    arr.append(new_ln.lower())

fw = open(sys.argv[1] + '.1', 'w')

for ln in arr:
    fw.write(ln + "\n")

fw.close()

--------------------------------------------------------------------------------