├── README.md
├── Test_module_retrain_model_test_CLEANED.py
└── Naive_Bayes_Classifer_main_training_file_test_CLEANED.py

/README.md:
--------------------------------------------------------------------------------
# ECTD-Document-Classification-using-sklearn-and-ML
Classification of eCTD (electronic Common Technical Document) regulatory submission documents into CTD Module 3.2.S sections, using PyPDF2 for text extraction and a scikit-learn Multinomial Naive Bayes classifier. The main training script builds and evaluates the model; the test module classifies new PDFs, collects user feedback and triggers retraining.
--------------------------------------------------------------------------------
/Test_module_retrain_model_test_CLEANED.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
# coding: utf-8
# ### Importing Libraries

# In[1]:
#main_file = 'Naive_Bayes_Classifer_main_training_file_test_CLEANED.py'


import sklearn
import string
import pandas as pd, numpy as np
import PyPDF2
import os, pickle
from time import time
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer


# In[2]:


t0 = time()
testfiles = []
path = input("Enter test directory path: \n")
try:
    for root, dirs, files in os.walk(path):
        for file in files:
            if file.endswith(".pdf"):
                testfiles.append(os.path.join(root, file))
#     print(testfiles)
    testfile_content = []
    for file in testfiles:
        content_data = ""
        with open(file, 'rb') as PDF_fileObj2:
            pdfReader = PyPDF2.PdfFileReader(PDF_fileObj2)
            for i in range(min(4, pdfReader.numPages)):  # only the first four pages are used
                content_data += pdfReader.getPage(i).extractText()
        testfile_content.append(content_data)
    test_data = pd.DataFrame(testfile_content)
    test_data.columns = ['TEXT']
    test_data.info()

except Exception as e:
    print("Invalid Path!\nPlease enter correct path.\nError:", e)


# In[3]:

#Keywords file
Section_keywords = pd.read_excel("") #Keywords filepath
Keywords_df = Section_keywords.copy().fillna(0)
Keywords_df = Keywords_df.drop(Keywords_df.columns[:3], axis=1)

keyword_list = []
for col in Keywords_df.columns:
    for val in Keywords_df[col]:
        if val != 0:
            keyword_list.append(val.lower())
keyword_list_unique = set(keyword_list)


# ### Cleaning & Standardising test document

# In[4]:


test_data['TEXT'] = (test_data['TEXT'].str.lower()
                     .str.replace("\n", " ")
                     .str.replace("\t", " ")
                     .str.strip())

# keep only the identified section keywords
test_data['TEXT'] = test_data['TEXT'].apply(
    lambda text: " ".join(w for w in text.split(" ") if w in keyword_list_unique))

test_data


# ### load the model & vocab

# In[5]:


model_path = "" #Model_filepath
vocab = "" #Vocabulary_filepath
loaded_vectorizer = CountVectorizer(decode_error="replace", vocabulary=pickle.load(open(vocab, "rb")))
model_loaded = pickle.load(open(model_path, 'rb'))


# ### Prediction

# In[6]:


predicted_labels = []
for i in range(len(test_data)):
    test_cv = loaded_vectorizer.transform([test_data.iloc[i]['TEXT']])
    prediction = model_loaded.predict(test_cv.toarray())
    predicted_labels.append(prediction)
    print("Filename: {}\tIndex: {}\tPredicted Label: {}".format(testfiles[i], i, prediction))
#     print("Predicted Label: ", prediction)
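
# In[ ]:


# Note: MultinomialNB also exposes per-class probabilities via predict_proba, which can help
# when reviewing predictions before entering feedback below. A minimal sketch reusing the
# objects loaded above (left commented out so the script's behaviour is unchanged):
# for i in range(len(test_data)):
#     test_cv = loaded_vectorizer.transform([test_data.iloc[i]['TEXT']])
#     confidence = np.max(model_loaded.predict_proba(test_cv.toarray())) * 100
#     print("Index: {}\tConfidence: {:.2f}%".format(i, confidence))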


# ### Taking Feedback as Input
## Feedback section

get_labels = input("Enter Index and Correct Label separated by comma:\n")
get_labels_list = get_labels.split(",")
index = int(get_labels_list[0])
label = [get_labels_list[1].strip()]


# In[8]:

# work on a real copy and replace the predicted label at the given index with the corrected one
predicted_labels_copy = predicted_labels.copy()
predicted_labels_copy[index] = label
print([str(val) for val in predicted_labels_copy])


# ### Taking the data for label

# In[9]:


feedback_data = test_data.iloc[[index]].reset_index(drop=True)
# print(feedback_data)


# ### Re-Training data

# In[53]:


Retraining_data_path = "" #Retraining data directory path
retrain_df = pd.concat([feedback_data, pd.DataFrame([label], columns=["LABEL"])], axis=1)
retrain_df.columns = ['TEXT', 'LABEL']
print(retrain_df)
retrain_df.to_excel(os.path.join(Retraining_data_path, "Retrain_data_" + str(int(time())) + ".xlsx"))


# ### Re-Training the Model

# In[63]:


import glob
retrain_data = [pd.read_excel(file, index_col=[0]) for file in glob.glob("")] #Collecting all feedback data from excel files
full_retrain_data = pd.concat(retrain_data, ignore_index=True)
full_retrain_data.dropna(inplace=True)


# In[ ]:


text_data_df = pd.read_excel("", index_col=[0]) #Training data filepath


# In[67]:


full_data = pd.concat([text_data_df, full_retrain_data])
full_data.to_excel("") #Main Training + Feedback data filepath
print(full_data)

os.system("python " + "") #Specify Main code filepath
print("Time Taken: %0.3fs" % (time() - t0)) #Timer
--------------------------------------------------------------------------------
/Naive_Bayes_Classifer_main_training_file_test_CLEANED.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# coding: utf-8

# In[1]:


import sklearn
import string, re
import pandas as pd, numpy as np
import PyPDF2
import os, pickle
import matplotlib.pyplot as plt, seaborn as sns
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize, pos_tag
from collections import defaultdict
# from sklearn.model_selection import KFold, cross_val_score


# In[3]:


# files_list = []
# for root, dirs, files in os.walk(""): #PDF documents filepath
#     for file in files:
#         if file.endswith(".pdf"):
#             files_list.append(os.path.join(root, file))

# for file in files_list:
#     print(file)


# In[4]:


# file_content = []
# for file in files_list:
#     content_data = ""
#     PDF_fileObj2 = open(file, 'rb')
#     pdfReader = PyPDF2.PdfFileReader(PDF_fileObj2)
#     for i in range(0, pdfReader.numPages):
#         pageObj = pdfReader.getPage(i)
#         if i <= 3: #Extracting the first four pages (i = 0..3) from the PDF
#             content_text = pageObj.extractText()
#             content_data += content_text
#     file_content.append(content_data)
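
# In[ ]:


# Note: the same "open a PDF and read its first few pages" loop appears both in this training
# script and in the test module. A minimal sketch of a reusable helper, assuming the pre-2.0
# PyPDF2 API used throughout this repository; the function name extract_first_pages is
# illustrative and not part of the original pipeline:
def extract_first_pages(pdf_path, max_pages=4):
    """Return the concatenated text of the first `max_pages` pages of a PDF."""
    content_data = ""
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfFileReader(pdf_file)
        for i in range(min(max_pages, pdf_reader.numPages)):
            content_data += pdf_reader.getPage(i).extractText()
    return content_data

# Usage would mirror the commented-out cell above, e.g.:
# file_content = [extract_first_pages(f) for f in files_list]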

# In[4]:


# file_content


# In[5]:


#Exporting to excel
# pd.DataFrame(file_content).to_excel("") #Export to excel file (specify path) to create training data


# In[56]:

#files_list=[]
retrain_file_path = None
for root, dirs, files in os.walk(""): #filepath for retrained data (training data + feedback data).
    for file in files:
        if file.endswith(".xlsx"):
            print(file)
            retrain_file_path = os.path.join(root, file)  # keep the path of the last .xlsx found
#             files_list.append(file)

text_data_df = pd.read_excel(retrain_file_path, index_col=[0])

# In[57]:


text_data_df.info()
text_data_df.head()


# In[58]:


#Stopwords
from nltk.corpus import stopwords
stop_words_full = pd.read_excel("") #Stop words file path (extracted from web)
stop_words_full_list = stop_words_full['stop_words'].tolist()

stop_words = set(stop_words_full_list + stopwords.words('english'))

#Geo-words
geo_words_df = pd.read_excel("") #Geo specific words filepath
geo_words = geo_words_df['geo_words'].tolist()


# ### Number of documents for each section:

# In[59]:


text_data_df['LABEL'].str.strip().value_counts()


# ### Removing punctuation and cleaning

# In[60]:


text_data_df['LABEL'] = text_data_df['LABEL'].str.strip()
punct = [p for p in set(string.punctuation) if p not in (".")]
digits_except_3 = [str(d) for d in range(10) if d != 3]  # tokens starting with "3" are kept (ECTD 3.x section numbers)

def clean_text(text):
    if type(text) == float:  # NaN cells are blanked out later, in the keyword-matching step
        return text
    text = text.lower().replace("\n", " ").replace("\t", " ").strip(" ")
    text = "".join(c for c in text if c not in punct)                                              # drop punctuation except "."
    text = " ".join([c for c in text.split(" ") if not (c[:1].isdigit() and c[1:2] in punct)])     # drop digit + punctuation tokens
    text = " ".join([w for w in text.split() if w not in stop_words])                              # stop words
    text = " ".join([w for w in text.split() if w[:-1] not in stop_words])                         # stop words with one trailing character
    text = " ".join([w for w in text.split() if w not in geo_words])                               # geography-specific words
    text = " ".join([w for w in text.split() if w[:1] not in digits_except_3])                     # tokens starting with a digit other than 3
    text = " ".join([w for w in text.split(" ") if not (w[:1].isdigit() and w[1:].isalpha())])     # tokens that are one digit followed by letters
    text = " ".join([w for w in text.split(" ") if not (w[:3].isdigit() and w[3:].isalpha())])     # tokens that are three digits followed by letters
    text = " ".join([w[:-1] if not w[:1].isdigit() and w.endswith(".") else w for w in text.split(" ")])  # strip a trailing "."
    text = " ".join([w for w in text.split(" ") if 2 < len(w) < 15])                               # keep tokens of reasonable length
    return text

text_data_df['TEXT'] = text_data_df['TEXT'].apply(clean_text)

text_data_df.head()
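
# In[ ]:


# Quick check: clean_text can be exercised on a small made-up string to see which filters fire.
# The sample below is illustrative only and assumes that "specification", "drug" and "product"
# are not in the custom stop-word or geo-word lists:
# print(clean_text("The 3.2.s.4.1 Specification\t of the Drug Product 12mg 2019."))
# # expected to keep something like: "3.2.s.4.1 specification drug product"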

# ### Cleaning II:

# In[61]:


#Removing dot(.) from text, except in tokens of 7-9 characters (section numbers such as "3.2.s.1.1" keep their dots)
text_data_df['TEXT'] = text_data_df['TEXT'].apply(
    lambda t: t if type(t) == float
    else " ".join([w.replace(".", " ") if len(w) > 9 or len(w) < 7 else w for w in t.split(" ")]))


# ### Lemmatization

# In[62]:


# map Penn Treebank POS tags to WordNet POS tags (default: noun)
tag_map = defaultdict(lambda: wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

lemma_function = WordNetLemmatizer()

def lemmatize_text(text):
    if type(text) == float:  # skip NaN cells
        return text
    lemma = [lemma_function.lemmatize(token, tag_map[tag[0]]) for token, tag in pos_tag(word_tokenize(text))]
    return " ".join(lemma)

text_data_df['TEXT'] = text_data_df['TEXT'].apply(lemmatize_text)


# In[63]:


text_data_df.head()


# ### Extracting identified keywords from text

# In[64]:


Section_keywords = pd.read_excel("D:\\Nishant\\ML_Project\\m3\\ML Resources\\M3_Keywords_32S_individual_words_32S7.xlsx")
Keywords_df = Section_keywords.copy().fillna(0)
Keywords_df = Keywords_df.drop(Keywords_df.columns[:3], axis=1)
Keywords_df.head(20)


# In[65]:


keyword_list = []
for col in Keywords_df.columns:
    for val in Keywords_df[col]:
        if val != 0:
            keyword_list.append(val.lower())

#Keywords lemmatization
keyword_lemma = []
for token, tag in pos_tag(keyword_list):
    keyword_lemma.append(lemma_function.lemmatize(token, tag_map[tag[0]]))

print(keyword_list[:5])
print(keyword_lemma[:5])


# In[66]:


keyword_list_unique = set(keyword_lemma)
len(keyword_list_unique)


# In[14]:


# pd.DataFrame(keyword_list_unique).to_excel("D:\\Nishant\\ML_Project\\m3\\Training_Data\\keywords_list.xlsx")


# In[67]:


#Matching & Filtering data with keywords:
text_data_df['TEXT'] = text_data_df['TEXT'].apply(
    lambda t: " " if type(t) == float
    else " ".join([w for w in t.split(" ") if w in keyword_list_unique]))

text_data_df.head()


# ### Randomise data before train test split

# In[68]:


text_data_randomise = text_data_df.sample(frac=1).reset_index(drop=True)
text_data_randomise.head(10)


# In[69]:


np.random.seed(442)
X_train, X_test, y_train, y_test = train_test_split(text_data_randomise['TEXT'], text_data_randomise['LABEL'],
                                                    test_size=0.2, random_state=1)
print(len(X_train), len(y_train), len(X_test), len(y_test))


# In[70]:


X_train.head()


# In[71]:


y_train[:5]


# In[72]:


y_test[:5]


# ### Feature engineering

# In[101]:


# Fit and transform X_train; the token_pattern keeps ECTD-style section numbers (e.g. "3.2.s.1.1") as single tokens
count_vectorizer = CountVectorizer(strip_accents='ascii', lowercase=True, analyzer='word',
                                   max_df=0.25, min_df=0.05, ngram_range=(1, 2),
                                   token_pattern=r'(?ui)\b(?:3\.\w+)+(?:\.\w+)+\b|\b\w*[a-zA-Z]+\w*\b')

X_train_cv = count_vectorizer.fit_transform(X_train)

# Save vectorizer.vocabulary_
# pickle.dump(count_vectorizer.vocabulary_,open("D://Nishant//ML_Project//m3//Trained_Models/vocabulary_32S6_5SEP_FINAL.pkl","wb"))
rand_num = np.random.randint(212)
root_path = "D:/Nishant/ML_Project/m3/Trained_Models/"
pickle.dump(count_vectorizer.vocabulary_, open(root_path + "vocab_" + str(rand_num) + ".pkl", "wb"))

print('Shape of Sparse Matrix: ', X_train_cv.shape)
print('Number of non-zero occurrences: ', X_train_cv.nnz)
print('sparsity: %.2f%%' % (100.0 * X_train_cv.nnz / (X_train_cv.shape[0] * X_train_cv.shape[1])))

# Transform X_test
X_test_cv = count_vectorizer.transform(X_test)
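
# In[ ]:


# Note: the custom token_pattern above is intended to keep ECTD-style section numbers such as
# "3.2.s.1.1" as single tokens, alongside ordinary words. A minimal check of the regex on a
# made-up string (illustrative only):
# sample = "3.2.s.1.1 general information drug substance"
# print(re.findall(r'(?ui)\b(?:3\.\w+)+(?:\.\w+)+\b|\b\w*[a-zA-Z]+\w*\b', sample))
# # -> ['3.2.s.1.1', 'general', 'information', 'drug', 'substance']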

# In[102]:


Features = pd.DataFrame(count_vectorizer.get_feature_names())
Features.head(10)


# ### Model Building

# In[103]:


NB_Model = MultinomialNB(alpha=0.01)
NB_Model.fit(X_train_cv.toarray(), np.array(y_train))


# In[104]:


print(y_test)


# In[105]:


y_pred = NB_Model.predict(X_test_cv.toarray())
y_pred


# In[106]:


# ECTD Module 3.2.S section labels, in sorted order, used as axis labels for the confusion-matrix heatmap below
ticks = ['3.2.S.1.1', '3.2.S.1.2', '3.2.S.1.3', '3.2.S.2.1', '3.2.S.2.2', '3.2.S.2.3', '3.2.S.2.4', '3.2.S.2.5', '3.2.S.2.6',
         '3.2.S.3.1', '3.2.S.3.2', '3.2.S.5', '3.2.S.6']


# In[107]:


acc_score = accuracy_score(y_test, y_pred)
acc_score


# ### Confusion Matrix:

# In[108]:


cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, square=False, annot=True, annot_kws={"size": 14}, xticklabels=ticks, yticklabels=ticks, cmap='coolwarm', cbar=False)
sns.set(font_scale=1)
plt.xlabel('PREDICTED CLASS')
plt.ylabel('ACTUAL CLASS')
#plt.show()


# ### Classification report:

# In[109]:


print('\nClassification report:\n', classification_report(y_test, y_pred))


# ### Class Probabilities:

# In[110]:


probabilities = NB_Model.predict_proba(X_test_cv.toarray())
for j in range(len(probabilities)):
    print([round(i * 100, 2) for i in probabilities[j]])


# ### save the model to disk

# In[111]:

#filename = "D:\\Nishant\\ML_Project\\m3\\Trained_Models\\NB_model_9SEP_32S7.mdl"
rand_num_model = np.random.randint(22)
filename = "D:\\Nishant\\ML_Project\\m3\\Trained_Models\\" + "NB_Model_" + str(rand_num_model) + ".mdl"
#print(filename)
pickle.dump(NB_Model, open(filename, 'wb'))

# ### load the model from disk

# In[32]:


model_loaded = pickle.load(open(filename, 'rb'))
model_loaded


# ### loading vocab

# In[65]:


#vocab = "D://Nishant//ML_Project//m3//Trained_Models/vocabulary_9SEP_32S7.pkl"
loaded_vectorizer = CountVectorizer(decode_error="replace", vocabulary=pickle.load(open(root_path + "vocab_" + str(rand_num) + ".pkl", "rb")))
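
# In[ ]:


# Note: an alternative to persisting only vocabulary_ is to pickle the fitted CountVectorizer
# itself and reload it in one step. A minimal sketch, reusing the root_path and rand_num values
# above (the filename is illustrative):
# pickle.dump(count_vectorizer, open(root_path + "count_vectorizer_" + str(rand_num) + ".pkl", "wb"))
# loaded_vectorizer = pickle.load(open(root_path + "count_vectorizer_" + str(rand_num) + ".pkl", "rb"))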

# ### Test document (test case)

# In[87]:


testfiles = []
for root, dirs, files in os.walk("D://Nishant//ML_Project//m3//Training_data_Rathnadeep/Set-2/"):
    for file in files:
        if file.endswith(".pdf"):
            testfiles.append(os.path.join(root, file))

testfile_content = []
for file in testfiles[:5]:
    print(file)
    content_data = ""
    with open(file, 'rb') as PDF_fileObj2:
        pdfReader = PyPDF2.PdfFileReader(PDF_fileObj2)
        for i in range(min(6, pdfReader.numPages)):  # only the first six pages are used
            content_data += pdfReader.getPage(i).extractText()
    testfile_content.append(content_data.replace("\n", " "))

test_data = pd.DataFrame(testfile_content)
test_data.columns = ['TEXT']
#test_data.info()


# In[55]:

test_data['TEXT'] = (test_data['TEXT'].str.lower()
                     .str.replace("\n", " ")
                     .str.replace("\t", " ")
                     .str.strip())
test_data['TEXT'] = test_data['TEXT'].apply(lambda t: "".join(c for c in t if c not in punct))

test_data.head(2)


# ### Prediction

# In[132]:


for i in range(len(test_data)):
    test_cv = loaded_vectorizer.transform([test_data.iloc[i]['TEXT']])
    prediction = model_loaded.predict(test_cv.toarray())
    probability = np.max(model_loaded.predict_proba(test_cv.toarray())) * 100
    print("Predicted Label:" + "\tDoc " + str(i) + "\t" + str(prediction))
    print("Predicted Probability:\t\t" + str(probability))

--------------------------------------------------------------------------------