├── README.md
├── Test_module_retrain_model_test_CLEANED.py
└── Naive_Bayes_Classifer_main_training_file_test_CLEANED.py

/README.md:
--------------------------------------------------------------------------------
# ECTD-Document-Classification-using-sklearn-and-ML
Classification of eCTD (electronic Common Technical Document) regulatory submission documents into CTD Module 3.2.S sections, using PyPDF2 for text extraction and a scikit-learn Multinomial Naive Bayes classifier. The main training script builds and evaluates the model; the test module classifies new PDFs, collects user feedback and triggers retraining.
--------------------------------------------------------------------------------
/Test_module_retrain_model_test_CLEANED.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
# coding: utf-8
# ### Importing Libraries

# In[1]:
#main_file = 'Naive_Bayes_Classifer_main_training_file_test_CLEANED.py'


import sklearn
import string
import pandas as pd, numpy as np
import PyPDF2
import os, pickle
from time import time
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer


# In[2]:


t0 = time()
testfiles = []
path = input("Enter test directory path: \n")
try:
    for root, dirs, files in os.walk(path):
        for file in files:
            if file.endswith(".pdf"):
                testfiles.append(os.path.join(root, file))
#     print(testfiles)
    testfile_content = []
    for file in testfiles:
        content_data = ""
        with open(file, 'rb') as PDF_fileObj2:
            pdfReader = PyPDF2.PdfFileReader(PDF_fileObj2)
            for i in range(min(4, pdfReader.numPages)):  # only the first four pages are used
                content_data += pdfReader.getPage(i).extractText()
        testfile_content.append(content_data)
    test_data = pd.DataFrame(testfile_content)
    test_data.columns = ['TEXT']
    test_data.info()

except Exception as e:
    print("Invalid Path!\nPlease enter correct path.\nError:", e)


# In[3]:

#Keywords file
Section_keywords = pd.read_excel("") #Keywords filepath
Keywords_df = Section_keywords.copy().fillna(0)
Keywords_df = Keywords_df.drop(Keywords_df.columns[:3], axis=1)

keyword_list = []
for col in Keywords_df.columns:
    for val in Keywords_df[col]:
        if val != 0:
            keyword_list.append(val.lower())
keyword_list_unique = set(keyword_list)


# ### Cleaning & Standardising test document

# In[4]:


test_data['TEXT'] = (test_data['TEXT'].str.lower()
                     .str.replace("\n", " ")
                     .str.replace("\t", " ")
                     .str.strip())

# keep only the identified section keywords
test_data['TEXT'] = test_data['TEXT'].apply(
    lambda text: " ".join(w for w in text.split(" ") if w in keyword_list_unique))

test_data


# ### load the model & vocab

# In[5]:


model_path = "" #Model_filepath
vocab = "" #Vocabulary_filepath
loaded_vectorizer = CountVectorizer(decode_error="replace", vocabulary=pickle.load(open(vocab, "rb")))
model_loaded = pickle.load(open(model_path, 'rb'))


# ### Prediction

# In[6]:


predicted_labels = []
for i in range(len(test_data)):
    test_cv = loaded_vectorizer.transform([test_data.iloc[i]['TEXT']])
    prediction = model_loaded.predict(test_cv.toarray())
    predicted_labels.append(prediction)
    print("Filename: {}\tIndex: {}\tPredicted Label: {}".format(testfiles[i], i, prediction))
#     print("Predicted Label: ", prediction)
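
# In[ ]:


# Note: MultinomialNB also exposes per-class probabilities via predict_proba, which can help
# when reviewing predictions before entering feedback below. A minimal sketch reusing the
# objects loaded above (left commented out so the script's behaviour is unchanged):
# for i in range(len(test_data)):
#     test_cv = loaded_vectorizer.transform([test_data.iloc[i]['TEXT']])
#     confidence = np.max(model_loaded.predict_proba(test_cv.toarray())) * 100
#     print("Index: {}\tConfidence: {:.2f}%".format(i, confidence))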


# ### Taking Feedback as Input
## Feedback section

get_labels = input("Enter Index and Correct Label separated by comma:\n")
get_labels_list = get_labels.split(",")
index = int(get_labels_list[0])
label = [get_labels_list[1].strip()]


# In[8]:

# work on a real copy and replace the predicted label at the given index with the corrected one
predicted_labels_copy = predicted_labels.copy()
predicted_labels_copy[index] = label
print([str(val) for val in predicted_labels_copy])


# ### Taking the data for label

# In[9]:


feedback_data = test_data.iloc[[index]].reset_index(drop=True)
# print(feedback_data)


# ### Re-Training data

# In[53]:


Retraining_data_path = "" #Retraining data directory path
retrain_df = pd.concat([feedback_data, pd.DataFrame([label], columns=["LABEL"])], axis=1)
retrain_df.columns = ['TEXT', 'LABEL']
print(retrain_df)
retrain_df.to_excel(os.path.join(Retraining_data_path, "Retrain_data_" + str(int(time())) + ".xlsx"))


# ### Re-Training the Model

# In[63]:


import glob
retrain_data = [pd.read_excel(file, index_col=[0]) for file in glob.glob("")] #Collecting all feedback data from excel files
full_retrain_data = pd.concat(retrain_data, ignore_index=True)
full_retrain_data.dropna(inplace=True)


# In[ ]:


text_data_df = pd.read_excel("", index_col=[0]) #Training data filepath


# In[67]:


full_data = pd.concat([text_data_df, full_retrain_data])
full_data.to_excel("") #Main Training + Feedback data filepath
print(full_data)

os.system("python " + "") #Specify Main code filepath
print("Time Taken: %0.3fs" % (time() - t0)) #Timer
--------------------------------------------------------------------------------
/Naive_Bayes_Classifer_main_training_file_test_CLEANED.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# coding: utf-8

# In[1]:


import sklearn
import string, re
import pandas as pd, numpy as np
import PyPDF2
import os, pickle
import matplotlib.pyplot as plt, seaborn as sns
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize, pos_tag
from collections import defaultdict
# from sklearn.model_selection import KFold, cross_val_score


# In[3]:


# files_list = []
# for root, dirs, files in os.walk(""): #PDF documents filepath
#     for file in files:
#         if file.endswith(".pdf"):
#             files_list.append(os.path.join(root, file))

# for file in files_list:
#     print(file)


# In[4]:


# file_content = []
# for file in files_list:
#     content_data = ""
#     PDF_fileObj2 = open(file, 'rb')
#     pdfReader = PyPDF2.PdfFileReader(PDF_fileObj2)
#     for i in range(0, pdfReader.numPages):
#         pageObj = pdfReader.getPage(i)
#         if i <= 3: #Extracting the first four pages (i = 0..3) from the PDF
#             content_text = pageObj.extractText()
#             content_data += content_text
#     file_content.append(content_data)
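
# In[ ]:


# Note: the same "open a PDF and read its first few pages" loop appears both in this training
# script and in the test module. A minimal sketch of a reusable helper, assuming the pre-2.0
# PyPDF2 API used throughout this repository; the function name extract_first_pages is
# illustrative and not part of the original pipeline:
def extract_first_pages(pdf_path, max_pages=4):
    """Return the concatenated text of the first `max_pages` pages of a PDF."""
    content_data = ""
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfFileReader(pdf_file)
        for i in range(min(max_pages, pdf_reader.numPages)):
            content_data += pdf_reader.getPage(i).extractText()
    return content_data

# Usage would mirror the commented-out cell above, e.g.:
# file_content = [extract_first_pages(f) for f in files_list]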

# In[4]:


# file_content


# In[5]:


#Exporting to excel
# pd.DataFrame(file_content).to_excel("") #Export to excel file (specify path) to create training data


# In[56]:

#files_list=[]
retrain_file_path = None
for root, dirs, files in os.walk(""): #filepath for retrained data (training data + feedback data).
    for file in files:
        if file.endswith(".xlsx"):
            print(file)
            retrain_file_path = os.path.join(root, file)  # keep the path of the last .xlsx found
#             files_list.append(file)

text_data_df = pd.read_excel(retrain_file_path, index_col=[0])

# In[57]:


text_data_df.info()
text_data_df.head()


# In[58]:


#Stopwords
from nltk.corpus import stopwords
stop_words_full = pd.read_excel("") #Stop words file path (extracted from web)
stop_words_full_list = stop_words_full['stop_words'].tolist()

stop_words = set(stop_words_full_list + stopwords.words('english'))

#Geo-words
geo_words_df = pd.read_excel("") #Geo specific words filepath
geo_words = geo_words_df['geo_words'].tolist()


# ### Number of documents for each section:

# In[59]:


text_data_df['LABEL'].str.strip().value_counts()


# ### Removing punctuation and cleaning

# In[60]:


text_data_df['LABEL'] = text_data_df['LABEL'].str.strip()
punct = [p for p in set(string.punctuation) if p not in (".")]
digits_except_3 = [str(d) for d in range(10) if d != 3]  # tokens starting with "3" are kept (ECTD 3.x section numbers)

def clean_text(text):
    if type(text) == float:  # NaN cells are blanked out later, in the keyword-matching step
        return text
    text = text.lower().replace("\n", " ").replace("\t", " ").strip(" ")
    text = "".join(c for c in text if c not in punct)                                              # drop punctuation except "."
    text = " ".join([c for c in text.split(" ") if not (c[:1].isdigit() and c[1:2] in punct)])     # drop digit + punctuation tokens
    text = " ".join([w for w in text.split() if w not in stop_words])                              # stop words
    text = " ".join([w for w in text.split() if w[:-1] not in stop_words])                         # stop words with one trailing character
    text = " ".join([w for w in text.split() if w not in geo_words])                               # geography-specific words
    text = " ".join([w for w in text.split() if w[:1] not in digits_except_3])                     # tokens starting with a digit other than 3
    text = " ".join([w for w in text.split(" ") if not (w[:1].isdigit() and w[1:].isalpha())])     # tokens that are one digit followed by letters
    text = " ".join([w for w in text.split(" ") if not (w[:3].isdigit() and w[3:].isalpha())])     # tokens that are three digits followed by letters
    text = " ".join([w[:-1] if not w[:1].isdigit() and w.endswith(".") else w for w in text.split(" ")])  # strip a trailing "."
    text = " ".join([w for w in text.split(" ") if 2 < len(w) < 15])                               # keep tokens of reasonable length
    return text

text_data_df['TEXT'] = text_data_df['TEXT'].apply(clean_text)

text_data_df.head()
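
# In[ ]:


# Quick check: clean_text can be exercised on a small made-up string to see which filters fire.
# The sample below is illustrative only and assumes that "specification", "drug" and "product"
# are not in the custom stop-word or geo-word lists:
# print(clean_text("The 3.2.s.4.1 Specification\t of the Drug Product 12mg 2019."))
# # expected to keep something like: "3.2.s.4.1 specification drug product"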

# ### Cleaning II:

# In[61]:


#Removing dot(.) from text, except in tokens of 7-9 characters (section numbers such as "3.2.s.1.1" keep their dots)
text_data_df['TEXT'] = text_data_df['TEXT'].apply(
    lambda t: t if type(t) == float
    else " ".join([w.replace(".", " ") if len(w) > 9 or len(w) < 7 else w for w in t.split(" ")]))


# ### Lemmatization

# In[62]:


# map Penn Treebank POS tags to WordNet POS tags (default: noun)
tag_map = defaultdict(lambda: wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

lemma_function = WordNetLemmatizer()

def lemmatize_text(text):
    if type(text) == float:  # skip NaN cells
        return text
    lemma = [lemma_function.lemmatize(token, tag_map[tag[0]]) for token, tag in pos_tag(word_tokenize(text))]
    return " ".join(lemma)

text_data_df['TEXT'] = text_data_df['TEXT'].apply(lemmatize_text)


# In[63]:


text_data_df.head()


# ### Extracting identified keywords from text

# In[64]:


Section_keywords = pd.read_excel("D:\\Nishant\\ML_Project\\m3\\ML Resources\\M3_Keywords_32S_individual_words_32S7.xlsx")
Keywords_df = Section_keywords.copy().fillna(0)
Keywords_df = Keywords_df.drop(Keywords_df.columns[:3], axis=1)
Keywords_df.head(20)


# In[65]:


keyword_list = []
for col in Keywords_df.columns:
    for val in Keywords_df[col]:
        if val != 0:
            keyword_list.append(val.lower())

#Keywords lemmatization
keyword_lemma = []
for token, tag in pos_tag(keyword_list):
    keyword_lemma.append(lemma_function.lemmatize(token, tag_map[tag[0]]))

print(keyword_list[:5])
print(keyword_lemma[:5])


# In[66]:


keyword_list_unique = set(keyword_lemma)
len(keyword_list_unique)


# In[14]:


# pd.DataFrame(keyword_list_unique).to_excel("D:\\Nishant\\ML_Project\\m3\\Training_Data\\keywords_list.xlsx")


# In[67]:


#Matching & Filtering data with keywords:
text_data_df['TEXT'] = text_data_df['TEXT'].apply(
    lambda t: " " if type(t) == float
    else " ".join([w for w in t.split(" ") if w in keyword_list_unique]))

text_data_df.head()


# ### Randomise data before train test split

# In[68]:


text_data_randomise = text_data_df.sample(frac=1).reset_index(drop=True)
text_data_randomise.head(10)


# In[69]:


np.random.seed(442)
X_train, X_test, y_train, y_test = train_test_split(text_data_randomise['TEXT'], text_data_randomise['LABEL'],
                                                    test_size=0.2, random_state=1)
print(len(X_train), len(y_train), len(X_test), len(y_test))


# In[70]:


X_train.head()


# In[71]:


y_train[:5]


# In[72]:


y_test[:5]


# ### Feature engineering

# In[101]:


# Fit and transform X_train; the token_pattern keeps ECTD-style section numbers (e.g. "3.2.s.1.1") as single tokens
count_vectorizer = CountVectorizer(strip_accents='ascii', lowercase=True, analyzer='word',
                                   max_df=0.25, min_df=0.05, ngram_range=(1, 2),
                                   token_pattern=r'(?ui)\b(?:3\.\w+)+(?:\.\w+)+\b|\b\w*[a-zA-Z]+\w*\b')

X_train_cv = count_vectorizer.fit_transform(X_train)

# Save vectorizer.vocabulary_
# pickle.dump(count_vectorizer.vocabulary_,open("D://Nishant//ML_Project//m3//Trained_Models/vocabulary_32S6_5SEP_FINAL.pkl","wb"))
rand_num = np.random.randint(212)
root_path = "D:/Nishant/ML_Project/m3/Trained_Models/"
pickle.dump(count_vectorizer.vocabulary_, open(root_path + "vocab_" + str(rand_num) + ".pkl", "wb"))

print('Shape of Sparse Matrix: ', X_train_cv.shape)
print('Number of non-zero occurrences: ', X_train_cv.nnz)
print('sparsity: %.2f%%' % (100.0 * X_train_cv.nnz / (X_train_cv.shape[0] * X_train_cv.shape[1])))

# Transform X_test
X_test_cv = count_vectorizer.transform(X_test)
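
# In[ ]:


# Note: the custom token_pattern above is intended to keep ECTD-style section numbers such as
# "3.2.s.1.1" as single tokens, alongside ordinary words. A minimal check of the regex on a
# made-up string (illustrative only):
# sample = "3.2.s.1.1 general information drug substance"
# print(re.findall(r'(?ui)\b(?:3\.\w+)+(?:\.\w+)+\b|\b\w*[a-zA-Z]+\w*\b', sample))
# # -> ['3.2.s.1.1', 'general', 'information', 'drug', 'substance']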

# In[102]:


Features = pd.DataFrame(count_vectorizer.get_feature_names())
Features.head(10)


# ### Model Building

# In[103]:


NB_Model = MultinomialNB(alpha=0.01)
NB_Model.fit(X_train_cv.toarray(), np.array(y_train))


# In[104]:


print(y_test)


# In[105]:


y_pred = NB_Model.predict(X_test_cv.toarray())
y_pred


# In[106]:


# ECTD Module 3.2.S section labels, in sorted order, used as axis labels for the confusion-matrix heatmap below
ticks = ['3.2.S.1.1', '3.2.S.1.2', '3.2.S.1.3', '3.2.S.2.1', '3.2.S.2.2', '3.2.S.2.3', '3.2.S.2.4', '3.2.S.2.5', '3.2.S.2.6',
         '3.2.S.3.1', '3.2.S.3.2', '3.2.S.5', '3.2.S.6']


# In[107]:


acc_score = accuracy_score(y_test, y_pred)
acc_score


# ### Confusion Matrix:

# In[108]:


cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, square=False, annot=True, annot_kws={"size": 14}, xticklabels=ticks, yticklabels=ticks, cmap='coolwarm', cbar=False)
sns.set(font_scale=1)
plt.xlabel('PREDICTED CLASS')
plt.ylabel('ACTUAL CLASS')
#plt.show()


# ### Classification report:

# In[109]:


print('\nClassification report:\n', classification_report(y_test, y_pred))


# ### Class Probabilities:

# In[110]:


probabilities = NB_Model.predict_proba(X_test_cv.toarray())
for j in range(len(probabilities)):
    print([round(i * 100, 2) for i in probabilities[j]])


# ### save the model to disk

# In[111]:

#filename = "D:\\Nishant\\ML_Project\\m3\\Trained_Models\\NB_model_9SEP_32S7.mdl"
rand_num_model = np.random.randint(22)
filename = "D:\\Nishant\\ML_Project\\m3\\Trained_Models\\" + "NB_Model_" + str(rand_num_model) + ".mdl"
#print(filename)
pickle.dump(NB_Model, open(filename, 'wb'))

# ### load the model from disk

# In[32]:


model_loaded = pickle.load(open(filename, 'rb'))
model_loaded


# ### loading vocab

# In[65]:


#vocab = "D://Nishant//ML_Project//m3//Trained_Models/vocabulary_9SEP_32S7.pkl"
loaded_vectorizer = CountVectorizer(decode_error="replace", vocabulary=pickle.load(open(root_path + "vocab_" + str(rand_num) + ".pkl", "rb")))
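
# In[ ]:


# Note: an alternative to persisting only vocabulary_ is to pickle the fitted CountVectorizer
# itself and reload it in one step. A minimal sketch, reusing the root_path and rand_num values
# above (the filename is illustrative):
# pickle.dump(count_vectorizer, open(root_path + "count_vectorizer_" + str(rand_num) + ".pkl", "wb"))
# loaded_vectorizer = pickle.load(open(root_path + "count_vectorizer_" + str(rand_num) + ".pkl", "rb"))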

# ### Test document (test case)

# In[87]:


testfiles = []
for root, dirs, files in os.walk("D://Nishant//ML_Project//m3//Training_data_Rathnadeep/Set-2/"):
    for file in files:
        if file.endswith(".pdf"):
            testfiles.append(os.path.join(root, file))

testfile_content = []
for file in testfiles[:5]:
    print(file)
    content_data = ""
    with open(file, 'rb') as PDF_fileObj2:
        pdfReader = PyPDF2.PdfFileReader(PDF_fileObj2)
        for i in range(min(6, pdfReader.numPages)):  # only the first six pages are used
            content_data += pdfReader.getPage(i).extractText()
    testfile_content.append(content_data.replace("\n", " "))

test_data = pd.DataFrame(testfile_content)
test_data.columns = ['TEXT']
#test_data.info()


# In[55]:

test_data['TEXT'] = (test_data['TEXT'].str.lower()
                     .str.replace("\n", " ")
                     .str.replace("\t", " ")
                     .str.strip())
test_data['TEXT'] = test_data['TEXT'].apply(lambda t: "".join(c for c in t if c not in punct))

test_data.head(2)


# ### Prediction

# In[132]:


for i in range(len(test_data)):
    test_cv = loaded_vectorizer.transform([test_data.iloc[i]['TEXT']])
    prediction = model_loaded.predict(test_cv.toarray())
    probability = np.max(model_loaded.predict_proba(test_cv.toarray())) * 100
    print("Predicted Label:" + "\tDoc " + str(i) + "\t" + str(prediction))
    print("Predicted Probability:\t\t" + str(probability))

--------------------------------------------------------------------------------