├── All_machine_learning _models.py ├── Detailed_performance_metrics (1).xlsx ├── README.md ├── Supplementary_File .pdf ├── TextConvoNet.py ├── requirements.txt └── sample.sh /All_machine_learning _models.py: -------------------------------------------------------------------------------- 1 | # %% [code] 2 | # This Python 3 environment comes with many helpful analytics libraries installed 3 | # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python 4 | # For example, here's several helpful packages to load 5 | 6 | import numpy as np # linear algebra 7 | import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) 8 | 9 | # Input data files are available in the read-only "../input/" directory 10 | # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory 11 | 12 | 13 | # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 14 | # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session 15 | 16 | # %% [code] 17 | 18 | import pandas as pd 19 | import numpy as np 20 | import matplotlib.pyplot as plt 21 | import seaborn as sns 22 | import nltk 23 | from nltk.corpus import stopwords 24 | import string 25 | import math 26 | from sklearn.feature_extraction.text import CountVectorizer 27 | from sklearn.model_selection import train_test_split, cross_val_score 28 | from sklearn.metrics import classification_report 29 | from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve 30 | #from sklearn.grid_search import GridSearchCV 31 | %matplotlib inline 32 | 33 | 34 | '''import bz2 35 | def get_labels_and_texts(file): 36 | labels = [] 37 | texts = [] 38 | for line in bz2.BZ2File(file): 39 | x = line.decode("utf-8") 40 | labels.append(1 if int(x[9]) == 2 else 0) 41 | texts.append(x[10:].strip()) 42 | return np.array(labels), texts 43 | train_labels, train_texts = get_labels_and_texts('/kaggle/input/amazonreviews/train.ft.txt.bz2') 44 | test_labels, test_texts = get_labels_and_texts('/kaggle/input/amazonreviews/test.ft.txt.bz2') 45 | 46 | #data_train['review'][7] 47 | print(train_labels[4]) 48 | print(train_texts[4]) 49 | 50 | # In[6]: 51 | data={"text":train_texts,'stars':train_labels} 52 | data_train=pd.DataFrame(data) 53 | data1={"text":test_texts,'stars':test_labels} 54 | data_test=pd.DataFrame(data1) 55 | ''' 56 | import numpy as np 57 | import pandas as pd 58 | def multiclass_metrics(cnf_matrix): 59 | cnf_matrix=np.asarray(cnf_matrix) 60 | FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix) 61 | FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix) 62 | TP = np.diag(cnf_matrix) 63 | TN = cnf_matrix.sum() - (FP + FN + TP) 64 | FP = FP.astype(float) 65 | FN = FN.astype(float) 66 | TP = TP.astype(float) 67 | TN = TN.astype(float) 68 | 69 | TP=np.sum(TP) 70 | TN=np.sum(TN) 71 | FP=np.sum(FP) 72 | FN=np.sum(FN) 73 | 74 | 75 | accuracy=(TP+TN)/(TP+FP+FN+TN) 76 | precision=TP/(TP+FP) 77 | recalll=TP/(FN+TP) 78 | F1=2*precision*recalll/(precision+recalll) 79 | sensiti=TP/(TP+FN) 80 | specifici=TN/(TN+FP) 81 | numerator=TP*TN - FP*FN 82 | 83 | denominator=np.sqrt((TP+FP)*(FN+TN)*(FP+TN)* (TP+FN)) 84 | MCc=numerator/denominator 85 | G_mean1=np.sqrt(sensiti*precision) 86 | G_mean2=np.sqrt(sensiti*specifici) 87 | print('precision:' ,TP/(TP+FP)) 88 | print('recall:',TP/(FN+TP)) 89 | print("F1:",F1) 90 | print("Specificity:",TN/(TN+FP)) 91 | 
print("Sensitivity ",TP/(TP+FN)) 92 | print('G-mean1:',np.sqrt(sensiti*precision)) 93 | print("G-mean2",np.sqrt(sensiti*specifici)) 94 | print("MCC :",MCc) 95 | acc=[] 96 | pre=[] 97 | recall=[] 98 | f1=[] 99 | specificity=[] 100 | sensitivity=[] 101 | GMean1=[] 102 | Gmean2=[] 103 | MCC=[] 104 | tp=[] 105 | fp=[] 106 | fn=[] 107 | tn=[] 108 | acc.append(accuracy) 109 | pre.append(precision) 110 | recall.append(recalll) 111 | f1.append(F1) 112 | specificity.append(specifici) 113 | sensitivity.append(sensiti) 114 | GMean1.append(G_mean1) 115 | Gmean2.append(G_mean2) 116 | MCC.append(MCc) 117 | tp.append(TP) 118 | fp.append(FP) 119 | tn.append(TN) 120 | fn.append(FN) 121 | data={'accuracy_all':acc,"precision":pre,'recall':recall,'F1_score':f1,'specificity':specificity,'sensitivity':sensitivity,'Gmean1':GMean1,"Gmean2":Gmean2,"MCC":MCC,"TP":tp,"FP":fp,"TN":tn,"FN":fn,} 122 | metric=pd.DataFrame(data) 123 | return metric 124 | 125 | #cnf_matrix=[[1025,0,0,20,0,0,0,0,17],[0,0,0,2,0,0,0,0,3],[83,0,63,5,0,0,0,0,0],[18,0,0,330,0,0,0,0,1],[16,0,0,0,165,0,0,0,0],[51,0,0,0,0,0,0,0,0],[2,0,0,1,0,0,0,0,2],[8,0,0,0,0,0,0,0,0],[32,0,0,2,0,0,0,0,154]] 126 | 127 | 128 | data_train=pd.read_csv('../input/twitter-airline-sentiment/Tweets.csv') 129 | 130 | data_test=pd.read_csv('../input/twitter-airline-sentiment/Tweets.csv') 131 | print("hi") 132 | 133 | data_train=data_train[:10000] 134 | data_test=data_test[10000:] 135 | # In[7]: 136 | 137 | 138 | 139 | data_train.rename(columns={'text':'title','airline_sentiment':'tag'},inplace=True) 140 | data_test.rename(columns={'text':'title','airline_sentiment':'tag'},inplace=True) 141 | # In[94]: 142 | print('jij') 143 | 144 | # In[8]: 145 | 146 | 147 | # In[88]: 148 | 149 | 150 | data_train['title']=data_train['title'].astype(str) 151 | data_test['title']=data_test['title'].astype(str) 152 | #data_train 153 | print('fdd') 154 | 155 | 156 | '''def make_tags(x): #converting the ratings column into 0's and 1's. for binary classifier to take place 157 | if(x<=3): 158 | return 0 159 | else: 160 | return 1 161 | 162 | 163 | 164 | # In[10]: 165 | 166 | 167 | data_train['tag']=data_train['tag'].apply(lambda x: make_tags(x)) 168 | data_test['tag']=data_test['tag'].apply(lambda x: make_tags(x)) 169 | print('sddsd') 170 | ''' 171 | x_train=data_train['title'] 172 | y_train=data_train['tag'] 173 | 174 | test_cnn_data=data_test['title'] 175 | #y_test=data_test['tag'] 176 | 177 | print('sdfsdfsdf') 178 | '''def text_process(text): 179 | nopunc = [char for char in text if char not in string.punctuation] 180 | nopunc = ''.join(nopunc) 181 | return [word for word in nopunc.split() if word.lower() not in stopwords.words('english')] 182 | ''' 183 | print('ddsd') 184 | vocab = CountVectorizer().fit(x_train) 185 | print("dwerty") 186 | print(len(vocab.vocabulary_)) 187 | #print(x_train[2000]) 188 | '''r0 = x[2000] 189 | print(r0) 190 | vocab0 = vocab.transform([r0]) 191 | print(vocab0) 192 | """ 193 | Now the words in the review number 78 have been converted into a vector. 194 | The data that we can see is the transformed words. 195 | If we now get the feature's name - we can get the word back! 
196 | """ 197 | print("Getting the words back:") 198 | print(vocab.get_feature_names()[19648]) 199 | print(vocab.get_feature_names()[10643]) 200 | ''' 201 | 202 | x_train = vocab.transform(x_train) 203 | test_cnn_data=vocab.transform(test_cnn_data) 204 | print("Shape of the sparse matrix: ", x_train.shape) 205 | print(y_train) 206 | 207 | #########MULTIONOMIAL NAIVEBAYES 208 | from sklearn.naive_bayes import MultinomialNB 209 | model = MultinomialNB() 210 | print("hih") 211 | model.fit(x_train,y_train.values) 212 | #predmnb = mnb.predict(x_test) 213 | #print("Confusion Matrix for Multinomial Naive Bayes:") 214 | #print(confusion_matrix(y_test,predmnb)) 215 | #print("Score:",round(accuracy_score(y_test,predmnb)*100,2)) 216 | #print("Classification Report:",classification_report(y_test,predmnb)) 217 | 218 | 219 | 220 | 221 | pred=model.predict(test_cnn_data) 222 | #print(y_test) 223 | y_test=pred 224 | y_test=y_test.tolist() 225 | output_class_pred=[] 226 | '''for i in range(len(y_test)): 227 | if(y_test[i][0]<0.5): 228 | output_class_pred.append(0) 229 | else: 230 | output_class_pred.append(1) 231 | ''' 232 | output_class_pred=y_test 233 | original_ans=data_test['tag'] 234 | original_ans=original_ans.tolist() 235 | 236 | # In[ ]: 237 | from sklearn.metrics import confusion_matrix 238 | from sklearn.metrics import classification_report 239 | 240 | #as its a fake news classifier , so identifying a fake class will be a TP 241 | def check_metric(output_class_pred,original_ans): 242 | rightly_predicted=0 243 | TP=0 244 | for i in range(len(y_test)): 245 | if(original_ans[i]==output_class_pred[i]): 246 | rightly_predicted+=1 247 | 248 | 249 | print("Overall_acuracy:",rightly_predicted/len(output_class_pred)) 250 | print('TP',TP) 251 | accuracy=rightly_predicted/len(y_test) 252 | print(classification_report(original_ans,output_class_pred)) 253 | print(confusion_matrix(original_ans,output_class_pred)) 254 | TN=confusion_matrix(original_ans,output_class_pred)[0][0] 255 | TP=confusion_matrix(original_ans,output_class_pred)[1][1] 256 | FP=confusion_matrix(original_ans,output_class_pred)[0][1] 257 | FN=confusion_matrix(original_ans,output_class_pred)[1][0] 258 | 259 | precision=TP/(TP+FP) 260 | recalll=TP/(FN+TP) 261 | F1=2*precision*recalll/(precision+recalll) 262 | sensiti=TP/(TP+FN) 263 | specifici=TN/(TN+FP) 264 | numerator=TP*TN - FP*FN 265 | 266 | denominator=np.sqrt((TP+FP)*(FN+TN)*(FP+TN)* (TP+FN)) 267 | MCc=numerator/denominator 268 | G_mean1=np.sqrt(sensiti*precision) 269 | G_mean2=np.sqrt(sensiti*specifici) 270 | print('precision:' ,TP/(TP+FP)) 271 | print('recall:',TP/(FN+TP)) 272 | print("F1:",F1) 273 | print("Specificity:",TN/(TN+FP)) 274 | print("Sensitivity ",TP/(TP+FN)) 275 | print('G-mean1:',np.sqrt(sensiti*precision)) 276 | print("G-mean2",np.sqrt(sensiti*specifici)) 277 | print("MCC :",MCc) 278 | acc=[] 279 | pre=[] 280 | recall=[] 281 | f1=[] 282 | specificity=[] 283 | sensitivity=[] 284 | GMean1=[] 285 | Gmean2=[] 286 | MCC=[] 287 | tp=[] 288 | fp=[] 289 | fn=[] 290 | tn=[] 291 | acc.append(accuracy) 292 | pre.append(precision) 293 | recall.append(recalll) 294 | f1.append(F1) 295 | specificity.append(specifici) 296 | sensitivity.append(sensiti) 297 | GMean1.append(G_mean1) 298 | Gmean2.append(G_mean2) 299 | MCC.append(MCc) 300 | tp.append(TP) 301 | fp.append(FP) 302 | tn.append(TN) 303 | fn.append(FN) 304 | 
data={'accuracy_all':acc,"precision":pre,'recall':recall,'F1_score':f1,'specificity':specificity,'sensitivity':sensitivity,'Gmean1':GMean1,"Gmean2":Gmean2,"MCC":MCC,"TP":tp,"FP":fp,"TN":tn,"FN":fn} 305 | metric=pd.DataFrame(data) 306 | return metric 307 | 308 | 309 | 310 | 311 | 312 | cnf_matrix=confusion_matrix(original_ans,output_class_pred) 313 | 314 | 315 | resi=multiclass_metrics(cnf_matrix) 316 | resi.to_csv('results1.csv', mode='w', index = False, header=resi.columns,columns=resi.columns) 317 | 318 | 319 | # In[ ]: 320 | 321 | 322 | 323 | ########RANDOMFOREST 324 | from sklearn.ensemble import RandomForestClassifier 325 | model = RandomForestClassifier() 326 | model.fit(x_train,y_train.values) 327 | 328 | pred=model.predict(test_cnn_data) 329 | print(y_test) 330 | y_test=pred 331 | y_test=y_test.tolist() 332 | output_class_pred=[] 333 | output_class_pred=y_test 334 | original_ans=data_test['tag'] 335 | original_ans=original_ans.tolist() 336 | 337 | cnf_matrix=confusion_matrix(original_ans,output_class_pred) 338 | 339 | 340 | resi=multiclass_metrics(cnf_matrix) 341 | resi.to_csv('results1.csv', mode='a', index = False, header=resi.columns,columns=resi.columns) 342 | 343 | 344 | 345 | ####DECISION TREE 346 | from sklearn.tree import DecisionTreeClassifier 347 | model= DecisionTreeClassifier() 348 | model.fit(x_train,y_train.values) 349 | 350 | pred=model.predict(test_cnn_data) 351 | print(y_test) 352 | y_test=pred 353 | y_test=y_test.tolist() 354 | output_class_pred=[] 355 | output_class_pred=y_test 356 | original_ans=data_test['tag'] 357 | original_ans=original_ans.tolist() 358 | 359 | cnf_matrix=confusion_matrix(original_ans,output_class_pred) 360 | 361 | 362 | resi=multiclass_metrics(cnf_matrix) 363 | resi.to_csv('results1.csv', mode='a', index = False, header=resi.columns,columns=resi.columns) 364 | 365 | 366 | 367 | 368 | #####SVC 369 | from sklearn.svm import SVC 370 | model = SVC(random_state=101) 371 | model.fit(x_train,y_train.values) 372 | pred=model.predict(test_cnn_data) 373 | print(y_test) 374 | y_test=pred 375 | y_test=y_test.tolist() 376 | output_class_pred=[] 377 | output_class_pred=y_test 378 | original_ans=data_test['tag'] 379 | original_ans=original_ans.tolist() 380 | 381 | cnf_matrix=confusion_matrix(original_ans,output_class_pred) 382 | 383 | 384 | resi=multiclass_metrics(cnf_matrix) 385 | resi.to_csv('results1.csv', mode='a', index = False, header=resi.columns,columns=resi.columns) 386 | 387 | 388 | 389 | ####GRADIENT BOOSTING CLASSIFIER 390 | from sklearn.ensemble import GradientBoostingClassifier 391 | model = GradientBoostingClassifier(learning_rate=0.1,max_depth=5,max_features=0.5,random_state=999999) 392 | model.fit(x_train,y_train.values) 393 | 394 | pred=model.predict(test_cnn_data) 395 | print(y_test) 396 | y_test=pred 397 | y_test=y_test.tolist() 398 | output_class_pred=[] 399 | output_class_pred=y_test 400 | original_ans=data_test['tag'] 401 | original_ans=original_ans.tolist() 402 | 403 | cnf_matrix=confusion_matrix(original_ans,output_class_pred) 404 | 405 | 406 | resi=multiclass_metrics(cnf_matrix) 407 | resi.to_csv('results1.csv', mode='a', index = False, header=resi.columns,columns=resi.columns) 408 | 409 | 410 | 411 | #####KNN 412 | from sklearn.neighbors import KNeighborsClassifier 413 | model = KNeighborsClassifier(n_neighbors=10) 414 | model.fit(x_train,y_train.values) 415 | pred=model.predict(test_cnn_data) 416 | print(y_test) 417 | y_test=pred 418 | y_test=y_test.tolist() 419 | output_class_pred=[] 420 | output_class_pred=y_test 421 | 
original_ans=data_test['tag'] 422 | original_ans=original_ans.tolist() 423 | cnf_matrix=confusion_matrix(original_ans,output_class_pred) 424 | 425 | 426 | resi=multiclass_metrics(cnf_matrix) 427 | resi.to_csv('results1.csv', mode='a', index = False, header=resi.columns,columns=resi.columns) 428 | 429 | 430 | 431 | 432 | ####XGBOOST CLASSIFIER 433 | import xgboost 434 | from xgboost import XGBClassifier 435 | model = XGBClassifier() 436 | model.fit(x_train,y_train) 437 | pred=model.predict(test_cnn_data) 438 | print(y_test) 439 | y_test=pred 440 | y_test=y_test.tolist() 441 | output_class_pred=[] 442 | output_class_pred=y_test 443 | original_ans=data_test['tag'] 444 | original_ans=original_ans.tolist() 445 | cnf_matrix=confusion_matrix(original_ans,output_class_pred) 446 | 447 | 448 | resi=multiclass_metrics(cnf_matrix) 449 | resi.to_csv('results1.csv', mode='a', index = False, header=resi.columns,columns=resi.columns) 450 | 451 | 452 | 453 | print(output_class_pred) 454 | print(original_ans) -------------------------------------------------------------------------------- /Detailed_performance_metrics (1).xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sonisanskar/TextConvoNet/a5814e868cc5ef9504774ca9a431d5d3febc3379/Detailed_performance_metrics (1).xlsx -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # TextConvoNet 2 | This repository contains the code for **TextConvoNet**, a novel CNN-based text classification architecture that represents each input paragraph as a 2D (sentence × word) matrix and applies 2D convolutions for classification. 3 |
4 | Click here to view the detailed architecture of [*TextConvoNet*](https://drive.google.com/file/d/1Q7kuPXbtMQtRNGUj-Tmg9hIgSI_2mv5k/view?usp=sharing). 5 |
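The core encoding step can be illustrated with a short sketch: every paragraph is split into sentences, every sentence into word indices, and the result is zero-padded into one fixed-size 2D matrix per paragraph. The snippet below is a minimal, self-contained illustration only: it assumes a toy `vocab` dictionary and a naive period-based sentence split, and the function and variable names are illustrative rather than taken from the repository. The actual pipeline in `TextConvoNet.py` uses NLTK sentence tokenization, a Keras `Tokenizer`, and pre-trained GloVe embeddings.

```python
import numpy as np

def paragraph_to_matrix(paragraph, vocab, m, n):
    """Encode a paragraph as an m x n matrix of word ids:
    m = max sentences per paragraph, n = max words per sentence."""
    sentences = [s.strip() for s in paragraph.split('.') if s.strip()]  # naive sentence split
    matrix = np.zeros((m, n), dtype=np.int64)                           # zero padding on both axes
    for i, sent in enumerate(sentences[:m]):
        ids = [vocab.get(w.lower(), 0) for w in sent.split()][:n]       # 0 = out-of-vocabulary
        matrix[i, :len(ids)] = ids
    return matrix

# Toy example with a hypothetical vocabulary
vocab = {"the": 1, "movie": 2, "was": 3, "great": 4, "acting": 5, "felt": 6, "flat": 7}
print(paragraph_to_matrix("The movie was great. The acting felt flat.", vocab, m=4, n=6))
```

In `TextConvoNet.py`, each such m × n matrix is passed through an embedding layer (giving an m × n × 300 tensor) and then through parallel 2D convolution filters of different kernel sizes, whose pooled outputs are concatenated before the final dense layers.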
6 | 7 | 8 | -------------------------------------------------------------------------------- /Supplementary_File .pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sonisanskar/TextConvoNet/a5814e868cc5ef9504774ca9a431d5d3febc3379/Supplementary_File .pdf -------------------------------------------------------------------------------- /TextConvoNet.py: -------------------------------------------------------------------------------- 1 | # This Python 3 environment comes with many helpful analytics libraries installed 2 | # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python 3 | # For example, here's several helpful packages to load 4 | 5 | import numpy as np # linear algebra 6 | import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) 7 | 8 | # Input data files are available in the read-only "../input/" directory 9 | # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory 10 | 11 | import bz2 12 | import pickle 13 | import os 14 | ''' 15 | for dirname, _, filenames in os.walk('/kaggle/input'): 16 | for filename in filenames: 17 | print(os.path.join(dirname, filename)) 18 | ''' 19 | # You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 20 | # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session 21 | 22 | #!/usr/bin/env python 23 | # coding: utf-8 24 | 25 | # In[2]: 26 | 27 | 28 | import numpy as np 29 | import pandas as pd 30 | import seaborn as sns 31 | import matplotlib as plt 32 | import json 33 | #get_ipython().run_line_magic('matplotlib', 'inline') 34 | ''' 35 | from tensorflow.keras.optimizers import Adam 36 | 37 | # In[3]: 38 | trainfile = bz2.BZ2File('../input/amazonreviews/train.ft.txt.bz2','r') 39 | lines = trainfile.readlines() 40 | 41 | sent_analysis = [] 42 | def sent_list(docs,splitStr='__label__'): 43 | for i in range(1,len(docs)): 44 | text=str(lines[i]) 45 | splitText=text.split(splitStr) 46 | #print(i) 47 | secHalf=splitText[1] 48 | text=secHalf[2:len(secHalf)-1] 49 | sentiment=secHalf[0] 50 | sent_analysis.append([text,sentiment]) 51 | return sent_analysis 52 | 53 | sentiment_list=sent_list(lines[:1000000],splitStr='__label__') 54 | 55 | train_df = pd.DataFrame(sentiment_list,columns=['Text','Sentiment']) 56 | 57 | data_train=train_df[:4000] 58 | data_test=train_df[4000:5000] 59 | ''' 60 | #a=input('path of the taining dataset with fields as title and tag(0,1) ') 61 | #b=input('path of test dataset') 62 | #data_train=pd.read_csv('../input/kuc-hackathon-winter-2018/drugsComTrain_raw.csv') 63 | 64 | 65 | # In[4]: 66 | 67 | 68 | #data_train 69 | 70 | 71 | # In[5]: 72 | 73 | data_train=pd.read_csv('../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv') 74 | data_test=pd.read_csv('../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv') 75 | 76 | data_train=data_train[:40000] 77 | data_test=data_test[40000:] 78 | 79 | 80 | 81 | # In[8]: 82 | data_train.rename(columns={'review':'title','sentiment':'tag'},inplace=True) 83 | data_test.rename(columns={'review':'title','sentiment':'tag'},inplace=True) 84 | 85 | #data_train['rating'].value_counts() 86 | #print('training_dataset',data_train) 87 | #print('training_dataset',data_test) 88 | 89 | # In[9]: 90 | 91 | #print(data_train) 92 | 93 | def make_tags(x): #converting the ratings column into 0's and 
1's. for binary classifier to take place 94 | if(x=="negative"): 95 | return 0 96 | else: 97 | return 1 98 | 99 | 100 | 101 | # In[10]: 102 | 103 | 104 | data_train['tag']=data_train['tag'].apply(lambda x: make_tags(x)) 105 | data_test['tag']=data_test['tag'].apply(lambda x: make_tags(x)) 106 | 107 | #print(data_train) 108 | 109 | count0=(data_train['tag']==0).sum() 110 | count1=(data_train['tag']==1).sum() 111 | if(count0>count1): 112 | imbalance_ratio=(count0)/count1 113 | else: 114 | imbalance_ratio=(count1)/count0 115 | # In[11]: 116 | 117 | print('imbalance_ratio',imbalance_ratio) 118 | #print(data_train) 119 | 120 | 121 | # In[12]: 122 | 123 | 124 | 125 | def no_of_words_in_paragraph(x): 126 | return len(list(x)) 127 | 128 | data_train['no_of_words_in_paragraph']=data_train['title'].apply(lambda x:no_of_words_in_paragraph(x)) 129 | 130 | data_test['no_of_words_in_paragraph']=data_test['title'].apply(lambda x:no_of_words_in_paragraph(x)) 131 | 132 | 133 | 134 | print(data_train) 135 | avg=data_train['no_of_words_in_paragraph'].mean() 136 | maxim=data_train['no_of_words_in_paragraph'].max() 137 | print('average paragraph length',data_train['no_of_words_in_paragraph'].mean()) 138 | print('maximum para length',data_train['no_of_words_in_paragraph'].max()) 139 | print('hii') 140 | excess=(data_train['no_of_words_in_paragraph']>avg).sum() 141 | excess_ratio=excess/len(data_train) 142 | print('excess_ratio',excess_ratio) 143 | 144 | 145 | #applying sentence tokenizer 146 | import nltk.data 147 | tokenizer = nltk.data.load('tokenizers/punkt/PY3/english.pickle') 148 | # Loading PunktSentenceTokenizer using English pickle file 149 | def make_sent_token(x): 150 | return tokenizer.tokenize(x) 151 | #converting each paragraph into separate sentences 152 | 153 | 154 | # In[13]: 155 | 156 | 157 | data_train['sentence_token']=data_train['title'].apply(lambda x: make_sent_token(x)) 158 | 159 | data_test['sentence_token']=data_test['title'].apply(lambda x: make_sent_token(x)) 160 | 161 | 162 | # In[15]: 163 | 164 | 165 | #data_train.drop(columns=['uniqueID','date','usefulCount','condition','drugName'],inplace=True,axis=1)# dropping irrelevant columns 166 | 167 | 168 | # In[16]: 169 | 170 | 171 | #data_test.drop(columns=['uniqueID','date','usefulCount','condition','drugName'],inplace=True,axis=1) 172 | 173 | 174 | # In[17]: 175 | 176 | 177 | #data_train 178 | 179 | 180 | # In[18]: 181 | 182 | 183 | data_train['no_of_sentences']=data_train['sentence_token'].apply(lambda x:len(x)) 184 | 185 | 186 | # In[19]: 187 | 188 | 189 | data_test['no_of_sentences']=data_test['sentence_token'].apply(lambda x:len(x)) 190 | 191 | 192 | # In[20]: 193 | avg_sen_length=data_train['no_of_words_in_paragraph'].sum()/data_train['no_of_sentences'].sum() 194 | print(avg_sen_length) 195 | 196 | #max(data_train['no_of_sentences'])##no of rows in sentence matrix which is to be feed in model(max number of sentence in any paragraph) 197 | 198 | 199 | # In[21]: 200 | 201 | 202 | #len(data_train[data_train['no_of_sentences']==92]['review']) 203 | 204 | 205 | # In[22]: 206 | 207 | 208 | #max(data_test['no_of_sentences']) 209 | 210 | 211 | # In[23]: 212 | 213 | 214 | def max_length_of_sentence(x,y): 215 | sen=x 216 | nu=y 217 | #print(sen) 218 | ma=0 219 | if(nu>1): 220 | l=sen.split('.') 221 | #print(l) 222 | for i in range(len(l)): 223 | k=l[i].replace(',','') 224 | maxi=len(k.split()) 225 | #print(maxi) 226 | if(maxi>ma): 227 | ma=maxi 228 | return ma 229 | else: 230 | return len(sen.split()) 231 | 232 | 233 | 234 | 235 | # In[24]: 
236 | 237 | 238 | data_train['max_words_in_sentence']=data_train.apply(lambda x: max_length_of_sentence(x.title,x.no_of_sentences),axis=1) 239 | 240 | 241 | # In[25]: 242 | 243 | 244 | data_test['max_words_in_sentence']=data_test.apply(lambda x: max_length_of_sentence(x.title,x.no_of_sentences),axis=1) 245 | 246 | 247 | # In[26]: 248 | 249 | 250 | #max(data_train['max_words_in_sentence'])## number of columns in the data to be feeded 251 | 252 | 253 | # In[27]: 254 | 255 | x1=max(data_train['no_of_sentences']) 256 | y1=max(data_train['max_words_in_sentence']) 257 | 258 | x2=max(data_test['no_of_sentences']) 259 | y2=max(data_test['max_words_in_sentence']) 260 | 261 | if(x1>=x2): 262 | m=x1 263 | print(m) 264 | m=m 265 | else: 266 | m=x2 267 | m=m 268 | 269 | if(y1>=y2): 270 | n=y1 271 | else: 272 | n=y2 273 | 274 | #So each para will be converted to a m*n matrix 275 | if(m<5): 276 | m=6 277 | else: 278 | m+=2 279 | print('x1,x2,y1,y2',x1,x2,y1,y2) 280 | 281 | print("m-->",m,n) 282 | #So each para will be converted to a m*n matrix 283 | 284 | 285 | # In[28]: 286 | 287 | 288 | 289 | 290 | # # Major part starts here ..... Now converting the paragraph into required matrix 291 | 292 | # In[29]: 293 | 294 | 295 | import re 296 | import string 297 | from nltk import word_tokenize 298 | from nltk.corpus import stopwords 299 | def make_tokens(text): ##Converting into single tokens in order to create the vocabulary 300 | return word_tokenize(text) 301 | 302 | 303 | data_train['tokens']=data_train['title'].apply(lambda x: make_tokens(x)) 304 | data_test['tokens']=data_test['title'].apply(lambda x: make_tokens(x)) 305 | 306 | 307 | # In[30]: 308 | 309 | 310 | #data_train['tokens'] 311 | 312 | 313 | # In[ ]: 314 | 315 | 316 | #from gensim import models 317 | #word2vec_path = 'GoogleNews-vectors-negative300.bin.gz' 318 | #word2vec = models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True) 319 | 320 | 321 | 322 | embeddings_index = {} 323 | f = open('../input/glove6b300dtxt/glove.6B.300d.txt') 324 | for line in f: 325 | values = line.split(' ') 326 | word = values[0] ## The first entry is the word 327 | coefs = np.asarray(values[1:], dtype='float32') ## These are the vecotrs representing the embedding for the word 328 | embeddings_index[word] = coefs 329 | f.close() 330 | 331 | print('GloVe data loaded') 332 | 333 | # In[ ]: 334 | 335 | 336 | all_training_words = [word for tokens in data_train["tokens"] for word in tokens] 337 | training_sentence_lengths = [len(tokens) for tokens in data_train["tokens"]] 338 | TRAINING_VOCAB = sorted(list(set(all_training_words))) 339 | print("%s words total, with a vocabulary size of %s" % (len(all_training_words), len(TRAINING_VOCAB))) 340 | print("Max sentence length is %s" % max(training_sentence_lengths)) 341 | para_max=max(training_sentence_lengths) 342 | 343 | vocab=len(TRAINING_VOCAB) 344 | 345 | # In[ ]: 346 | 347 | 348 | #len(TRAINING_VOCAB) 349 | 350 | 351 | # In[ ]: 352 | 353 | 354 | from tensorflow.keras.preprocessing.text import Tokenizer 355 | from tensorflow.keras.preprocessing.sequence import pad_sequences 356 | tokenizer = Tokenizer(num_words=len(TRAINING_VOCAB), char_level=False) 357 | tokenizer.fit_on_texts(data_train['title']) # we assigned values 358 | 359 | 360 | # In[ ]: 361 | 362 | 363 | train_word_index = tokenizer.word_index 364 | 365 | 366 | # In[ ]: 367 | 368 | 369 | #print(train_word_index) 370 | 371 | 372 | # In[ ]: 373 | 374 | 375 | #data_train.to_csv('medic_train.csv') 376 | #data_test.to_csv('medic_test.csv') 377 | 378 | 379 | 
# In[ ]: 380 | 381 | 382 | def make_train_seq(x): 383 | return tokenizer.texts_to_sequences(x) 384 | data_train['train_seq']=data_train['sentence_token'].apply(lambda x:make_train_seq(x) ) 385 | data_test['train_seq']=data_test['sentence_token'].apply(lambda x:make_train_seq(x) ) 386 | 387 | 388 | # In[ ]: 389 | 390 | 391 | #(data_train['train_seq']) # here every para has been encoded 392 | 393 | 394 | # In[ ]: 395 | #print(data_train) 396 | 397 | 398 | 399 | 400 | # In[ ]: 401 | 402 | 403 | from tensorflow.keras.preprocessing.sequence import pad_sequences 404 | def padding(x): #now padding each sentence to a length of n...number of columns 405 | MAX_SENTENCE_LENGTH=n #(no of columns) 406 | return pad_sequences(x,maxlen=MAX_SENTENCE_LENGTH,padding='post') 407 | 408 | data_train['padded']=data_train['train_seq'].apply(lambda x:padding(x)) 409 | data_test['padded']=data_test['train_seq'].apply(lambda x:padding(x)) 410 | 411 | 412 | # In[ ]: 413 | 414 | 415 | #(data_train.padded[8]) 416 | 417 | 418 | # In[ ]: 419 | 420 | 421 | 422 | ## More code adapted from the keras reference (https://github.com/keras-team/keras/blob/master/examples/pretrained_word_embeddings.py) 423 | # prepare embedding matrix 424 | from tensorflow.keras.layers import Embedding 425 | from tensorflow.keras.initializers import Constant 426 | 427 | ## EMBEDDING_DIM = ## seems to need to match the embeddings_index dimension 428 | EMBEDDING_DIM = embeddings_index.get('a').shape[0] 429 | print(EMBEDDING_DIM) 430 | #num_words = min(MAX_NUM_WORDS, len(word_index)) + 1 431 | #= np.zeros(len(train_word_index) + 1, EMBEDDING_DIM) 432 | train_embedding_weights = np.zeros((len(train_word_index)+1, 433 | EMBEDDING_DIM)) 434 | for word, i in train_word_index.items(): 435 | #print("sd") 436 | embedding_vector = embeddings_index.get(word) ## This references the loaded embeddings dictionary 437 | if embedding_vector is not None: 438 | train_embedding_weights[i] = embedding_vector 439 | print(train_embedding_weights.shape) 440 | # words not found in embedding index will be all-zeros. 441 | 442 | 443 | # load pre-trained word embeddings into an Embedding layer 444 | # note that we set trainable = False so as to keep the embeddings fixed 445 | #embedding_layer = Embedding(num_words, 446 | # EMBEDDING_DIM, 447 | # embeddings_initializer=Constant(embedding_matrix), 448 | # input_length=MAX_SEQUENCE_LENGTH, 449 | # trainable=False) 450 | 451 | 452 | #EMBEDDING_DIM=300 453 | #train_embedding_weights = np.zeros((len(train_word_index)+1, 454 | #EMBEDDING_DIM)) 455 | #for word,index in train_word_index.items(): 456 | #train_embedding_weights[index,:] = word2vec[word] if word in word2vec else np.random.rand(EMBEDDING_DIM) 457 | #print(train_embedding_weights.shape) 458 | 459 | 460 | # In[43]: 461 | 462 | 463 | def make_full_para(x): #92 cross 192 matrix of a paragraph. (m*n) 464 | l=len(x) 465 | h=m-l #no. 
of extra rows to be added 466 | z=[0]*h*n #1D vector(#addding extra lines for zeroes as padding) 467 | z=np.reshape(z,(h,n)) #reshaping it to match the dimension of paragraph 468 | s=x.tolist()+z.tolist() 469 | return s 470 | 471 | 472 | # In[ ]: 473 | 474 | 475 | 476 | 477 | 478 | # In[ ]: 479 | 480 | 481 | data_train['full_para']=data_train['padded'].apply(lambda x : make_full_para(x)) 482 | data_test['full_para']=data_test['padded'].apply(lambda x : make_full_para(x)) 483 | 484 | 485 | # In[ ]: 486 | 487 | 488 | #data_train.full_para 489 | 490 | 491 | # In[ ]: 492 | 493 | 494 | def create_1d_para(x): 495 | l=[] 496 | for i in x: 497 | l+=i #concatenating all the sentences in a para into a single 1 d arrray 498 | return l 499 | 500 | 501 | 502 | 503 | # In[ ]: 504 | 505 | data_train['single_d_array']=data_train['full_para'].apply(lambda x: create_1d_para(x) ) 506 | data_test['single_d_array']=data_test['full_para'].apply(lambda x: create_1d_para(x) ) 507 | 508 | 509 | # In[ ]: 510 | 511 | 512 | #train_cnn_data=np.array(data_train['single_d_array'].tolist()) 513 | 514 | 515 | # In[ ]: 516 | 517 | 518 | train_cnn_data=np.array(data_train['single_d_array'].tolist()) 519 | test_cnn_data=np.array(data_test['single_d_array'].tolist()) 520 | 521 | 522 | # In[ ]: 523 | 524 | from sklearn.model_selection import train_test_split 525 | y_train=data_train['tag'].values 526 | 527 | 528 | 529 | # In[ ]: 530 | 531 | print('Startting the training') 532 | #from __future__ import print_function 533 | from tensorflow.keras.layers import Embedding 534 | 535 | from tensorflow.keras.preprocessing.text import text_to_word_sequence 536 | import pandas as pd 537 | from tensorflow.keras.preprocessing.text import Tokenizer 538 | import numpy as np 539 | 540 | 541 | from tensorflow.keras.preprocessing import sequence 542 | from tensorflow.keras.models import Sequential,Model 543 | from tensorflow.keras.layers import Dense, Dropout, Activation,Flatten,Bidirectional,GRU,LSTM,SpatialDropout1D,Reshape 544 | from tensorflow.keras.layers import Embedding,concatenate 545 | from tensorflow.keras.layers import Conv2D, GlobalMaxPooling2D,MaxPool2D,MaxPool3D,GlobalAveragePooling2D,Conv3D 546 | from tensorflow.keras.models import Model 547 | from tensorflow.keras.layers import Input 548 | 549 | 550 | # In[ ]: 551 | 552 | filter_sizes = [1,2,3,4] 553 | num_filters = 32 554 | embed_size=300 555 | embedding_matrix=train_embedding_weights 556 | max_features=len(train_word_index)+1 557 | maxlen=m*n 558 | 559 | def get_model(): 560 | inp = Input(shape=(maxlen, )) 561 | x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp) 562 | x = SpatialDropout1D(0.4)(x) 563 | x = Reshape((m, n, 300))(x) 564 | #print(x) 565 | conv_0 = Conv2D(num_filters, kernel_size=(filter_sizes[0], 2), 566 | activation='relu')(x) 567 | conv_1 = Conv2D(num_filters, kernel_size=(filter_sizes[0], 3), 568 | activation='relu')(x) 569 | 570 | conv_2 = Conv2D(num_filters, kernel_size=(filter_sizes[0], 4), 571 | activation='relu')(x) 572 | 573 | 574 | 575 | 576 | 577 | conv_4 = Conv2D(num_filters, kernel_size=(filter_sizes[1], 1), 578 | activation='relu')(x) 579 | conv_5 = Conv2D(num_filters, kernel_size=(filter_sizes[1], 2), activation='relu')(x) 580 | 581 | conv_6 = Conv2D(num_filters, kernel_size=(filter_sizes[1], 3), 582 | activation='relu')(x) 583 | 584 | 585 | 586 | maxpool_0 = MaxPool2D()(conv_0) 587 | maxpool_0=Flatten()(maxpool_0) 588 | maxpool_1 = MaxPool2D()(conv_1) 589 | maxpool_1=Flatten()(maxpool_1) 590 | maxpool_2 = MaxPool2D()(conv_2) 
591 | maxpool_2 = Flatten()(maxpool_2) 592 | 593 | maxpool_4 = MaxPool2D()(conv_4) 594 | maxpool_4=Flatten()(maxpool_4) 595 | maxpool_5 = MaxPool2D()(conv_5) 596 | maxpool_5=Flatten()(maxpool_5) 597 | maxpool_6 = MaxPool2D()(conv_6) 598 | maxpool_6=Flatten()(maxpool_6) 599 | #maxpool_7 = MaxPool2D()(conv_7) 600 | # maxpool_7=Flatten()(maxpool_7) 601 | z = concatenate([maxpool_0, maxpool_1,maxpool_2],axis=1) 602 | w=concatenate([maxpool_4, maxpool_5,maxpool_6],axis=1) 603 | #w=concatenate([maxpool_4, maxpool_5,maxpool_6],axis=1) 604 | #z = concatenate([maxpool_0, maxpool_1,maxpool_2,maxpool_4, maxpool_5,maxpool_6],axis=1) 605 | #z = concatenate([maxpool_0, maxpool_1,maxpool_4, maxpool_5],axis=1) 606 | 607 | #z = Flatten()(z) 608 | z=concatenate([w,z],axis=1) 609 | z=Dense(units=64,activation="relu")(z) 610 | z = Dropout(0.4)(z) 611 | 612 | outp = Dense(1, activation="sigmoid")(z) 613 | 614 | model = Model(inputs=inp, outputs=outp) 615 | 616 | model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy']) 617 | 618 | return model 619 | 620 | 621 | # In[ ]: 622 | 623 | 624 | model=get_model() 625 | 626 | 627 | # In[ ]: 628 | 629 | 630 | print(model.summary()) 631 | 632 | 633 | # In[ ]: 634 | 635 | 636 | 637 | #define callbacks 638 | from tensorflow.keras.callbacks import EarlyStopping 639 | early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=4, verbose=1) 640 | callbacks_list = [early_stopping] 641 | 642 | import time, datetime 643 | start = datetime.datetime.now() 644 | history=model.fit(train_cnn_data, y_train, epochs=10,callbacks=callbacks_list,batch_size=32,validation_split=0.1 ) 645 | end = datetime.datetime.now() 646 | diff1= (end - start) 647 | print('time taken by text_6',diff1) 648 | 649 | 650 | 651 | 652 | 653 | 654 | pred=model.predict(test_cnn_data) 655 | y_test=pred 656 | y_test=y_test.tolist() 657 | output_class_pred=[] 658 | for i in range(len(y_test)): 659 | if(y_test[i][0]<0.5): 660 | output_class_pred.append(0) 661 | else: 662 | output_class_pred.append(1) 663 | 664 | original_ans=data_test['tag'] 665 | original_ans=original_ans.tolist() 666 | 667 | # In[ ]: 668 | 669 | from sklearn.metrics import confusion_matrix 670 | from sklearn.metrics import classification_report 671 | 672 | #as its a fake news classifier , so identifying a fake class will be a TP 673 | def check_metric(output_class_pred,original_ans,diff1): 674 | rightly_predicted=0 675 | TP=0 676 | for i in range(len(y_test)): 677 | if(original_ans[i]==output_class_pred[i]): 678 | rightly_predicted+=1 679 | 680 | 681 | print("Overall_acuracy:",rightly_predicted/len(output_class_pred)) 682 | print('TP',TP) 683 | accuracy=rightly_predicted/len(y_test) 684 | print(classification_report(original_ans,output_class_pred)) 685 | print(confusion_matrix(original_ans,output_class_pred)) 686 | TN=confusion_matrix(original_ans,output_class_pred)[0][0] 687 | TP=confusion_matrix(original_ans,output_class_pred)[1][1] 688 | FP=confusion_matrix(original_ans,output_class_pred)[0][1] 689 | FN=confusion_matrix(original_ans,output_class_pred)[1][0] 690 | 691 | precision=TP/(TP+FP) 692 | recalll=TP/(FN+TP) 693 | F1=2*precision*recalll/(precision+recalll) 694 | sensiti=TP/(TP+FN) 695 | specifici=TN/(TN+FP) 696 | numerator=TP*TN - FP*FN 697 | 698 | denominator=np.sqrt((TP+FP)*(FN+TN)*(FP+TN)* (TP+FN)) 699 | MCc=numerator/denominator 700 | G_mean1=np.sqrt(sensiti*precision) 701 | G_mean2=np.sqrt(sensiti*specifici) 702 | print('precision:' ,TP/(TP+FP)) 703 | print('recall:',TP/(FN+TP)) 704 | 
print("F1:",F1) 705 | print("Specificity:",TN/(TN+FP)) 706 | print("Sensitivity ",TP/(TP+FN)) 707 | print('G-mean1:',np.sqrt(sensiti*precision)) 708 | print("G-mean2",np.sqrt(sensiti*specifici)) 709 | print("MCC :",MCc) 710 | acc=[] 711 | pre=[] 712 | recall=[] 713 | f1=[] 714 | specificity=[] 715 | sensitivity=[] 716 | GMean1=[] 717 | Gmean2=[] 718 | MCC=[] 719 | tp=[] 720 | fp=[] 721 | fn=[] 722 | tn=[] 723 | acc.append(accuracy) 724 | pre.append(precision) 725 | recall.append(recalll) 726 | f1.append(F1) 727 | specificity.append(specifici) 728 | sensitivity.append(sensiti) 729 | GMean1.append(G_mean1) 730 | Gmean2.append(G_mean2) 731 | MCC.append(MCc) 732 | tp.append(TP) 733 | fp.append(FP) 734 | tn.append(TN) 735 | fn.append(FN) 736 | data={'accuracy_all':acc,"precision":pre,'recall':recall,'F1_score':f1,'specificity':specificity,'sensitivity':sensitivity,'Gmean1':GMean1,"Gmean2":Gmean2,"MCC":MCC,"TP":tp,"FP":fp,"TN":tn,"FN":fn,"traintime":diff1,"Exceeding_ratio":excess_ratio,"imbalance_ratio":imbalance_ratio,"Average_length_of_paragraph":avg,"Maximum_length_of_a_paragraph":maxim,"Average_length_of_sentences":avg_sen_length,"Maximum_length_of_a_sentence_in_a_paragraph":n,"Maximum_no_of_sentence_in_any_paragraph":m,"Vocabular_size":vocab,"label0":count0,"label1":count1} 737 | metric=pd.DataFrame(data) 738 | return metric 739 | 740 | print(history.history.keys()) 741 | 742 | resi=check_metric(output_class_pred,original_ans,diff1) 743 | resi.to_csv('results_text.csv', mode='a', index = False, header=resi.columns,columns=resi.columns) 744 | 745 | 746 | 747 | ##### 748 | 749 | filter_sizes = [1,2,3,4] 750 | num_filters = 32 751 | embed_size=300 752 | embedding_matrix=train_embedding_weights 753 | max_features=len(train_word_index)+1 754 | maxlen=m*n 755 | def get_model(): 756 | inp = Input(shape=(maxlen, )) 757 | x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp) 758 | x = SpatialDropout1D(0.4)(x) 759 | x = Reshape((m, n, 300))(x) 760 | #print(x) 761 | conv_0 = Conv2D(num_filters, kernel_size=(filter_sizes[0], 2), 762 | activation='relu')(x) 763 | conv_1 = Conv2D(num_filters, kernel_size=(filter_sizes[0], 3), 764 | activation='relu')(x) 765 | 766 | #conv_2 = Conv2D(num_filters, kernel_size=(filter_sizes[0], 4), 767 | #activation='relu')(x) 768 | 769 | 770 | 771 | 772 | 773 | conv_4 = Conv2D(num_filters, kernel_size=(filter_sizes[1], 1), 774 | activation='relu')(x) 775 | conv_5 = Conv2D(num_filters, kernel_size=(filter_sizes[1], 2), activation='relu')(x) 776 | 777 | #conv_6 = Conv2D(num_filters, kernel_size=(filter_sizes[1], 3), 778 | #activation='relu')(x) 779 | 780 | 781 | 782 | maxpool_0 = MaxPool2D()(conv_0) 783 | maxpool_0=Flatten()(maxpool_0) 784 | maxpool_1 = MaxPool2D()(conv_1) 785 | maxpool_1=Flatten()(maxpool_1) 786 | #maxpool_2 = MaxPool2D()(conv_2) 787 | #maxpool_2 = Flatten()(maxpool_2) 788 | 789 | maxpool_4 = MaxPool2D()(conv_4) 790 | maxpool_4=Flatten()(maxpool_4) 791 | maxpool_5 = MaxPool2D()(conv_5) 792 | maxpool_5=Flatten()(maxpool_5) 793 | #maxpool_6 = MaxPool2D()(conv_6) 794 | #maxpool_6=Flatten()(maxpool_6) 795 | #maxpool_7 = MaxPool2D()(conv_7) 796 | # maxpool_7=Flatten()(maxpool_7) 797 | 798 | #w=concatenate([maxpool_4, maxpool_5,maxpool_6],axis=1) 799 | #z = concatenate([maxpool_0, maxpool_1,maxpool_2,maxpool_4, maxpool_5,maxpool_6],axis=1) 800 | #z = concatenate([maxpool_0, maxpool_1,maxpool_4, maxpool_5],axis=1) 801 | w=concatenate([maxpool_4, maxpool_5],axis=1) 802 | #z = concatenate([maxpool_0, maxpool_1,maxpool_2,maxpool_4, 
maxpool_5,maxpool_6],axis=1) 803 | z = concatenate([maxpool_0, maxpool_1],axis=1) 804 | 805 | #z = Flatten()(z) 806 | z=concatenate([w,z],axis=1) 807 | #z = Flatten()(z) 808 | #z=concatenate([w,z],axis=1) 809 | z=Dense(units=64,activation="relu")(z) 810 | z = Dropout(0.4)(z) 811 | 812 | outp = Dense(1, activation="sigmoid")(z) 813 | 814 | model = Model(inputs=inp, outputs=outp) 815 | 816 | model.compile(loss='binary_crossentropy',optimizer='Adam',metrics=['accuracy']) 817 | 818 | return model 819 | 820 | 821 | # In[ ]: 822 | 823 | 824 | model=get_model() 825 | 826 | 827 | # In[ ]: 828 | 829 | 830 | print(model.summary()) 831 | 832 | 833 | # In[ ]: 834 | 835 | 836 | 837 | #define callbacks 838 | from tensorflow.keras.callbacks import EarlyStopping 839 | early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=4, verbose=1) 840 | callbacks_list = [early_stopping] 841 | 842 | import time, datetime 843 | start = datetime.datetime.now() 844 | 845 | history=model.fit(train_cnn_data, y_train, epochs=10,callbacks=callbacks_list,batch_size=32,validation_split=0.1 ) 846 | 847 | end = datetime.datetime.now() 848 | diff1= (end - start) 849 | print('time taken by text_4',diff1) 850 | 851 | 852 | 853 | 854 | 855 | pred=model.predict(test_cnn_data) 856 | y_test=pred 857 | y_test=y_test.tolist() 858 | output_class_pred=[] 859 | for i in range(len(y_test)): 860 | if(y_test[i][0]<0.5): 861 | output_class_pred.append(0) 862 | else: 863 | output_class_pred.append(1) 864 | 865 | original_ans=data_test['tag'] 866 | original_ans=original_ans.tolist() 867 | 868 | # In[ ]: 869 | from sklearn.metrics import confusion_matrix 870 | from sklearn.metrics import classification_report 871 | 872 | #as its a fake news classifier , so identifying a fake class will be a TP 873 | 874 | 875 | resi=check_metric(output_class_pred,original_ans,diff1) 876 | resi.to_csv('results_text.csv', mode='a', index = False, header=resi.columns,columns=resi.columns) 877 | 878 | 879 | 880 | 881 | 882 | 883 | 884 | # In[ ]: 885 | 886 | 887 | 888 | 889 | 890 | # In[ ]: 891 | 892 | 893 | ## now perparing training data for yoon kim model 894 | 895 | 896 | # In[ ]: 897 | 898 | 899 | def create_single_line_para(x): 900 | l=[] 901 | for i in x: 902 | l+=i #concatenating all the sentences in a para into a single 1 d arrray 903 | return l 904 | 905 | 906 | 907 | # In[ ]: 908 | 909 | 910 | data_train['create_single_line_para']=data_train['train_seq'].apply(lambda x: create_single_line_para(x) ) 911 | data_test['create_single_line_para']=data_test['train_seq'].apply(lambda x: create_single_line_para(x) ) 912 | 913 | 914 | # In[ ]: 915 | 916 | 917 | from tensorflow.keras.preprocessing.sequence import pad_sequences 918 | yoon_kim_train_data=np.array(data_train['create_single_line_para'].tolist()) 919 | yoon_kim_train_data=pad_sequences(yoon_kim_train_data,maxlen=para_max,padding='post') 920 | 921 | # In[ ]: 922 | yoon_kim_test_data=np.array(data_test['create_single_line_para'].tolist()) 923 | yoon_kim_test_data=pad_sequences(yoon_kim_test_data,maxlen=para_max,padding='post') 924 | 925 | 926 | #from __future__ import print_function 927 | from tensorflow.keras.layers import Embedding 928 | 929 | from tensorflow.keras.preprocessing.text import text_to_word_sequence 930 | import pandas as pd 931 | from tensorflow.keras.preprocessing.text import Tokenizer 932 | import numpy as np 933 | 934 | 935 | from tensorflow.keras.preprocessing import sequence 936 | from tensorflow.keras.models import Sequential,Model 937 | from tensorflow.keras.layers 
import Dense, Dropout, Activation,Flatten,Bidirectional,GRU,LSTM 938 | from tensorflow.keras.layers import Embedding,concatenate 939 | from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D,MaxPooling1D,GlobalAveragePooling1D 940 | from tensorflow.keras.models import Model 941 | from tensorflow.keras.layers import Input 942 | 943 | 944 | # In[ ]: 945 | 946 | 947 | train_y=pd.get_dummies(y_train) 948 | 949 | 950 | # In[ ]: 951 | 952 | 953 | trains_y=train_y[[0,1]].values 954 | 955 | 956 | # In[ ]: 957 | 958 | 959 | embed_size=300 960 | embedding_matrix=train_embedding_weights 961 | max_features=len(train_word_index)+1 962 | maxlen=para_max 963 | max_sequence_length=para_max 964 | MAX_SEQUENCE_LENGTH=para_max 965 | EMBEDDING_DIM=300 966 | 967 | 968 | #model3 yoon kim 969 | 970 | 971 | # In[ ]: 972 | 973 | 974 | def ConvNet(embeddings, max_sequence_length, num_words, embedding_dim, trainable=True, extra_conv=False): 975 | 976 | embedding_layer = Embedding(num_words, 977 | embedding_dim, 978 | weights=[embeddings], 979 | input_length=max_sequence_length, 980 | trainable=trainable) 981 | 982 | sequence_input = Input(shape=(max_sequence_length,), dtype='int32') 983 | embedded_sequences = embedding_layer(sequence_input) 984 | 985 | # Yoon Kim model (https://arxiv.org/abs/1408.5882) 986 | convs = [] 987 | filter_sizes = [3,4,5] 988 | 989 | for filter_size in filter_sizes: 990 | l_conv = Conv1D(filters=100, kernel_size=filter_size, activation='relu')(embedded_sequences) 991 | l_pool = MaxPooling1D(pool_size=2)(l_conv) 992 | convs.append(l_pool) 993 | 994 | l_merge = concatenate(convs, axis=1) 995 | 996 | # add a 1D convnet with global maxpooling, instead of Yoon Kim model 997 | #conv = Conv1D(filters=128, kernel_size=3, activation='relu')(embedded_sequences) 998 | #pool = MaxPooling1D(pool_size=2)(conv) 999 | 1000 | #if extra_conv==True: 1001 | #x = Dropout(0.01)(l_merge) 1002 | #else: 1003 | # Original Yoon Kim model 1004 | #x = Dropout(0.001)(pool) 1005 | x = Flatten()(l_merge) 1006 | 1007 | x = Dropout(0.5)(x) 1008 | # Finally, we feed the output into a Sigmoid layer. 1009 | # The reason why sigmoid is used is because we are trying to achieve a binary classification(1,0) 1010 | # for each of the 6 labels, and the sigmoid function will squash the output between the bounds of 0 and 1. 
1011 | preds = Dense(2, activation='softmax')(x) 1012 | 1013 | model = Model(sequence_input, preds) 1014 | model.compile(loss='categorical_crossentropy', 1015 | optimizer='Adam', 1016 | metrics=['acc']) 1017 | model.summary() 1018 | return model 1019 | 1020 | 1021 | # In[ ]: 1022 | 1023 | 1024 | model1 = ConvNet(train_embedding_weights, MAX_SEQUENCE_LENGTH, len(train_word_index)+1, EMBEDDING_DIM, 1025 | True) 1026 | 1027 | 1028 | # In[ ]: 1029 | 1030 | 1031 | training_data=yoon_kim_train_data 1032 | 1033 | 1034 | # In[ ]: 1035 | 1036 | 1037 | testing_data=yoon_kim_test_data 1038 | 1039 | 1040 | # In[ ]: 1041 | 1042 | 1043 | 1044 | #define callbacks 1045 | from tensorflow.keras.callbacks import EarlyStopping 1046 | early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=4, verbose=1) 1047 | callbacks_list = [early_stopping] 1048 | 1049 | import time, datetime 1050 | start = datetime.datetime.now() 1051 | 1052 | hist = model1.fit(training_data, trains_y, epochs=10,callbacks=callbacks_list,batch_size=32,validation_split=0.1 ) 1053 | end = datetime.datetime.now() 1054 | diff1= (end - start) 1055 | print('time taken by yoon',diff1) 1056 | 1057 | 1058 | # In[ ]: 1059 | 1060 | 1061 | pred=model1.predict(testing_data) 1062 | y_test=pred 1063 | y_test=y_test.tolist() 1064 | output_class_pred=[] 1065 | #output_class_pred=[] 1066 | for i in range(len(y_test)): 1067 | m=max(y_test[i]) 1068 | if(y_test[i].index(m)==0): 1069 | output_class_pred.append(0) 1070 | else: 1071 | output_class_pred.append(1) 1072 | 1073 | 1074 | original_ans=data_test['tag'] 1075 | original_ans=original_ans.tolist() 1076 | 1077 | 1078 | # In[ ]: 1079 | 1080 | 1081 | #as its a fake news classifier , so identifying a fake class will be a TP 1082 | resi=check_metric(output_class_pred,original_ans,diff1) 1083 | 1084 | resi.to_csv('results_text', mode='a', index = False, header=resi.columns,columns=resi.columns) 1085 | 1086 | 1087 | 1088 | from tensorflow.keras.models import Model, Sequential 1089 | 1090 | from tensorflow.keras.layers import Dropout, Embedding, concatenate 1091 | from tensorflow.keras.layers import Conv1D, MaxPooling1D, Conv2D, MaxPooling2D, ZeroPadding1D 1092 | from tensorflow.keras.layers import Dense, Input, Flatten, BatchNormalization 1093 | from tensorflow.keras.layers import Concatenate, Dot, Multiply, RepeatVector 1094 | from tensorflow.keras.layers import Bidirectional, TimeDistributed 1095 | from tensorflow.keras.layers import SimpleRNN, LSTM, GRU, Lambda, Permute 1096 | 1097 | #from tensorflow.keras.layers.core import Reshape, Activation 1098 | from tensorflow.keras.optimizers import Adam 1099 | from tensorflow.keras.callbacks import ModelCheckpoint,EarlyStopping,TensorBoard 1100 | #from tensorflow.keras.constraints import maxnorm 1101 | #from tensorflow.keras.regularizers import l2 1102 | 1103 | def ConvNet_vdcnn(embeddings, max_sequence_length, num_words, embedding_dim, trainable=True, extra_conv=False): 1104 | 1105 | embedding_layer = Embedding(num_words, 1106 | embedding_dim, 1107 | weights=[embeddings], 1108 | input_length=max_sequence_length, 1109 | trainable=trainable) 1110 | 1111 | sequence_input = Input(shape=(max_sequence_length,), dtype='int32') 1112 | embedded_sequences = embedding_layer(sequence_input) 1113 | 1114 | 1115 | 1116 | # 4 pairs of convolution blocks followed by pooling 1117 | conv = Conv1D(filters=64, kernel_size=3, strides=2, padding="same")(embedded_sequences) 1118 | 1119 | 1120 | # 4 pairs of convolution blocks followed by pooling 1121 | for filter_size in 
[64, 128, 256, 512]: 1122 | 1123 | # each iteration is a convolution block 1124 | for cb_i in [0,1]: 1125 | conv=(Conv1D(filter_size, 3, padding="same",activation='relu'))(conv) 1126 | #model_1.add(BatchNormalization()) 1127 | #model_1.add(Activation("relu")) 1128 | conv=(Conv1D(filter_size, 1, padding="same",activation='relu'))(conv) 1129 | #model_1.add(BatchNormalization()) 1130 | #model_1.add(Activation("relu")) 1131 | 1132 | conv=(MaxPooling1D(pool_size=2, strides=3))(conv) 1133 | 1134 | # model.add(KMaxPooling(k=2)) 1135 | conv=(Flatten())(conv) 1136 | conv=(Dense(4096, activation="relu"))(conv) 1137 | conv=(Dense(2048, activation="relu"))(conv) 1138 | conv=(Dense(2048, activation="relu"))(conv) 1139 | #(Dense(9, activation="softmax")) 1140 | 1141 | preds = Dense(2, activation='softmax')(conv) 1142 | 1143 | model = Model(sequence_input, preds) 1144 | model.compile(loss='categorical_crossentropy', 1145 | optimizer='Adam',metrics=['acc']) 1146 | print(model.summary()) 1147 | return model 1148 | 1149 | 1150 | 1151 | model1 = ConvNet_vdcnn(train_embedding_weights, MAX_SEQUENCE_LENGTH, len(train_word_index)+1, EMBEDDING_DIM, 1152 | True) 1153 | 1154 | 1155 | 1156 | 1157 | # In[ ]: 1158 | 1159 | 1160 | training_data=yoon_kim_train_data 1161 | 1162 | 1163 | # In[ ]: 1164 | 1165 | 1166 | testing_data=yoon_kim_test_data 1167 | 1168 | 1169 | # In[ ]: 1170 | 1171 | 1172 | 1173 | #define callbacks 1174 | from tensorflow.keras.callbacks import EarlyStopping 1175 | early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=4, verbose=1) 1176 | callbacks_list = [early_stopping] 1177 | 1178 | import time, datetime 1179 | start = datetime.datetime.now() 1180 | 1181 | hist = model1.fit(training_data, trains_y, epochs=10,callbacks=callbacks_list,batch_size=32,validation_split=0.1 ) 1182 | end = datetime.datetime.now() 1183 | diff1= (end - start) 1184 | print('time taken by yoon',diff1) 1185 | 1186 | 1187 | # In[ ]: 1188 | 1189 | 1190 | pred=model1.predict(testing_data) 1191 | y_test=pred 1192 | y_test=y_test.tolist() 1193 | output_class_pred=[] 1194 | #output_class_pred=[] 1195 | for i in range(len(y_test)): 1196 | m=max(y_test[i]) 1197 | if(y_test[i].index(m)==0): 1198 | output_class_pred.append(0) 1199 | else: 1200 | output_class_pred.append(1) 1201 | 1202 | 1203 | original_ans=data_test['tag'] 1204 | original_ans=original_ans.tolist() 1205 | 1206 | 1207 | # In[ ]: 1208 | 1209 | 1210 | #as its a fake news classifier , so identifying a fake class will be a TP 1211 | resi=check_metric(output_class_pred,original_ans,diff1) 1212 | 1213 | resi.to_csv('results_text', mode='a', index = False, header=resi.columns,columns=resi.columns) 1214 | 1215 | 1216 | 1217 | 1218 | 1219 | def ConvNet_clstm(embeddings, max_sequence_length, num_words, embedding_dim, trainable=True, extra_conv=False): 1220 | 1221 | embedding_layer = Embedding(num_words, 1222 | embedding_dim, 1223 | weights=[embeddings], 1224 | input_length=max_sequence_length, 1225 | trainable=trainable) 1226 | 1227 | sequence_input = Input(shape=(max_sequence_length,), dtype='int32') 1228 | embedded_sequences = embedding_layer(sequence_input) 1229 | 1230 | convs = [] 1231 | filter_sizes = [10, 20, 30, 40] 1232 | 1233 | for filter_size in filter_sizes: 1234 | l_conv = Conv1D(filters=64, kernel_size=filter_size, padding='valid', activation='relu')(embedded_sequences) 1235 | convs.append(l_conv) 1236 | 1237 | cnn_feature_maps = Concatenate(axis=1)(convs) 1238 | sentence_encoder = LSTM(64,return_sequences=False)(cnn_feature_maps) 1239 | 
fc_layer =Dense(128, activation="relu")(sentence_encoder) 1240 | #output_layer = Dense(9,activation="softmax")(fc_layer) 1241 | 1242 | #model_1 = Model(inputs=[text_input_layer], outputs=[output_layer]) 1243 | preds = Dense(2, activation='softmax')(fc_layer) 1244 | 1245 | model = Model(sequence_input, preds) 1246 | model.compile(loss='categorical_crossentropy', 1247 | optimizer='Adam', 1248 | metrics=['acc']) 1249 | model.summary() 1250 | return model 1251 | 1252 | 1253 | model1 = ConvNet_clstm(train_embedding_weights, MAX_SEQUENCE_LENGTH, len(train_word_index)+1, EMBEDDING_DIM, 1254 | True) 1255 | 1256 | 1257 | 1258 | 1259 | # In[ ]: 1260 | 1261 | 1262 | training_data=yoon_kim_train_data 1263 | 1264 | 1265 | # In[ ]: 1266 | 1267 | 1268 | testing_data=yoon_kim_test_data 1269 | 1270 | 1271 | # In[ ]: 1272 | 1273 | 1274 | 1275 | #define callbacks 1276 | from tensorflow.keras.callbacks import EarlyStopping 1277 | early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=4, verbose=1) 1278 | callbacks_list = [early_stopping] 1279 | 1280 | import time, datetime 1281 | start = datetime.datetime.now() 1282 | 1283 | hist = model1.fit(training_data, trains_y, epochs=10,callbacks=callbacks_list,batch_size=32,validation_split=0.1 ) 1284 | end = datetime.datetime.now() 1285 | diff1= (end - start) 1286 | print('time taken by yoon',diff1) 1287 | 1288 | 1289 | # In[ ]: 1290 | 1291 | 1292 | pred=model1.predict(testing_data) 1293 | y_test=pred 1294 | y_test=y_test.tolist() 1295 | output_class_pred=[] 1296 | #output_class_pred=[] 1297 | for i in range(len(y_test)): 1298 | m=max(y_test[i]) 1299 | if(y_test[i].index(m)==0): 1300 | output_class_pred.append(0) 1301 | else: 1302 | output_class_pred.append(1) 1303 | 1304 | 1305 | original_ans=data_test['tag'] 1306 | original_ans=original_ans.tolist() 1307 | 1308 | 1309 | # In[ ]: 1310 | 1311 | 1312 | #as its a fake news classifier , so identifying a fake class will be a TP 1313 | resi=check_metric(output_class_pred,original_ans,diff1) 1314 | 1315 | resi.to_csv('results_text', mode='a', index = False, header=resi.columns,columns=resi.columns) 1316 | 1317 | 1318 | 1319 | 1320 | def ConvNet_lstm(embeddings, max_sequence_length, num_words, embedding_dim, trainable=True, extra_conv=False): 1321 | 1322 | 1323 | embedding_layer = Embedding(num_words,embedding_dim,weights=[embeddings],input_length=max_sequence_length,trainable=trainable) 1324 | 1325 | sequence_input = Input(shape=(max_sequence_length,), dtype='int32') 1326 | embedded_sequences = embedding_layer(sequence_input) 1327 | 1328 | sentence_encoder = LSTM(64,return_sequences=False)(embedded_sequences) 1329 | fc_layer =Dense(128, activation="relu")(sentence_encoder) 1330 | #output_layer = Dense(9,activation="softmax")(fc_layer) 1331 | #model_1 = Model(inputs=[text_input_layer], outputs=[output_layer]) 1332 | preds = Dense(2, activation='softmax')(fc_layer) 1333 | 1334 | model = Model(sequence_input, preds) 1335 | model.compile(loss='categorical_crossentropy',optimizer='Adam',metrics=['acc']) 1336 | model.summary() 1337 | return model 1338 | 1339 | 1340 | 1341 | 1342 | 1343 | 1344 | model1 = ConvNet_lstm(train_embedding_weights, MAX_SEQUENCE_LENGTH, len(train_word_index)+1, EMBEDDING_DIM, 1345 | True) 1346 | 1347 | 1348 | 1349 | 1350 | # In[ ]: 1351 | 1352 | 1353 | training_data=yoon_kim_train_data 1354 | 1355 | 1356 | # In[ ]: 1357 | 1358 | 1359 | testing_data=yoon_kim_test_data 1360 | 1361 | 1362 | # In[ ]: 1363 | 1364 | 1365 | 1366 | #define callbacks 1367 | from tensorflow.keras.callbacks 
import EarlyStopping 1368 | early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=4, verbose=1) 1369 | callbacks_list = [early_stopping] 1370 | 1371 | import time, datetime 1372 | start = datetime.datetime.now() 1373 | 1374 | hist = model1.fit(training_data, trains_y, epochs=10,callbacks=callbacks_list,batch_size=32,validation_split=0.1 ) 1375 | end = datetime.datetime.now() 1376 | diff1= (end - start) 1377 | print('time taken by yoon',diff1) 1378 | 1379 | 1380 | # In[ ]: 1381 | 1382 | 1383 | pred=model1.predict(testing_data) 1384 | y_test=pred 1385 | y_test=y_test.tolist() 1386 | output_class_pred=[] 1387 | #output_class_pred=[] 1388 | for i in range(len(y_test)): 1389 | m=max(y_test[i]) 1390 | if(y_test[i].index(m)==0): 1391 | output_class_pred.append(0) 1392 | else: 1393 | output_class_pred.append(1) 1394 | 1395 | 1396 | original_ans=data_test['tag'] 1397 | original_ans=original_ans.tolist() 1398 | 1399 | 1400 | # In[ ]: 1401 | 1402 | 1403 | #as its a fake news classifier , so identifying a fake class will be a TP 1404 | resi=check_metric(output_class_pred,original_ans,diff1) 1405 | 1406 | resi.to_csv('results_text', mode='a', index = False, header=resi.columns,columns=resi.columns) 1407 | 1408 | 1409 | 1410 | #resi.to_csv('results.csv', mode='a', index = False, header=resi.columns,columns=resi.columns) 1411 | 1412 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.9.0 2 | argon2-cffi==20.1.0 3 | arrow==0.13.1 4 | asn1crypto==1.2.0 5 | astor==0.8.0 6 | async-generator==1.10 7 | attrs==19.3.0 8 | Automat==0.8.0 9 | backcall==0.1.0 10 | bcrypt==3.1.7 11 | beautifulsoup4==4.8.2 12 | bleach==3.3.0 13 | blinker==1.4 14 | cachetools==3.1.1 15 | certifi==2019.11.28 16 | cffi==1.13.0 17 | chardet==3.0.4 18 | Click==7.0 19 | cloudpickle==1.3.0 20 | conda==4.8.2 21 | conda-package-handling==1.6.0 22 | constantly==15.1.0 23 | cryptography==2.8 24 | cssselect==1.1.0 25 | cycler==0.10.0 26 | Cython==0.29.17 27 | cytoolz==0.10.1 28 | dask==2.11.0 29 | debugpy==1.3.0 30 | decorator==4.4.1 31 | defusedxml==0.7.1 32 | eli5==0.10.1 33 | entrypoints==0.3 34 | fastcache==1.1.0 35 | future==0.18.2 36 | gast==0.2.2 37 | gensim==4.0.1 38 | gmpy2==2.0.8 39 | google-auth==1.11.2 40 | google-auth-oauthlib==0.4.1 41 | google-pasta==0.1.8 42 | graphviz==0.13.2 43 | grpcio==1.27.2 44 | h5py==2.8.0 45 | hyperlink==19.0.0 46 | idna==2.8 47 | imageio==2.6.1 48 | importlib-metadata==1.5.0 49 | incremental==17.5.0 50 | ipykernel==6.0.0 51 | ipython==7.25.0 52 | ipython-genutils==0.2.0 53 | jedi==0.16.0 54 | Jinja2==2.11.1 55 | joblib==0.14.1 56 | jsonschema==3.2.0 57 | jupyter-client==6.1.12 58 | jupyter-core==4.7.1 59 | jupyterlab-pygments==0.1.2 60 | Keras==2.3.1 61 | Keras-Applications==1.0.8 62 | Keras-Preprocessing==1.1.0 63 | kiwisolver==1.1.0 64 | leveldb==0.201 65 | lightgbm==2.3.0 66 | lxml==4.5.0 67 | Mako==1.1.1 68 | Markdown==3.1.1 69 | MarkupSafe==1.1.1 70 | matplotlib==3.1.3 71 | matplotlib-inline==0.1.2 72 | mistune==0.8.4 73 | mkl-fft==1.0.15 74 | mkl-random==1.1.0 75 | mkl-service==2.3.0 76 | more-itertools==8.2.0 77 | mpmath==1.1.0 78 | nbclient==0.5.3 79 | nbconvert==6.1.0 80 | nbformat==5.1.3 81 | nest-asyncio==1.5.1 82 | networkx==2.4 83 | nltk==3.4.5 84 | nose==1.3.7 85 | notebook==6.4.0 86 | numpy==1.18.1 87 | oauthlib==3.1.0 88 | olefile==0.46 89 | opt-einsum==3.1.0 90 | packaging==20.1 91 | pandas==1.0.1 92 | pandocfilters==1.4.3 
93 | parsel==1.5.2 94 | parso==0.6.1 95 | patsy==0.5.1 96 | peewee==3.10.0 97 | pexpect==4.8.0 98 | pickleshare==0.7.5 99 | Pillow==7.0.0 100 | pluggy==0.13.1 101 | prometheus-client==0.11.0 102 | prompt-toolkit==3.0.3 103 | protobuf==3.11.4 104 | ptyprocess==0.6.0 105 | py==1.8.1 106 | pyasn1==0.4.8 107 | pyasn1-modules==0.2.7 108 | pycosat==0.6.3 109 | pycparser==2.19 110 | PyDispatcher==2.0.5 111 | pyglet==1.5.0 112 | Pygments==2.5.2 113 | pygpu==0.7.6 114 | PyHamcrest==1.9.0 115 | PyJWT==1.7.1 116 | pyOpenSSL==19.0.0 117 | pyparsing==2.4.6 118 | pyrsistent==0.18.0 119 | PySocks==1.7.1 120 | pytest==5.3.5 121 | pytest-runner==5.2 122 | python-dateutil==2.8.1 123 | python-gflags==3.1.2 124 | pytz==2019.3 125 | PyWavelets==1.1.1 126 | PyYAML==5.3 127 | pyzmq==22.1.0 128 | queuelib==1.5.0 129 | requests==2.22.0 130 | requests-oauthlib==1.3.0 131 | rsa==4.0 132 | ruamel-yaml==0.15.46 133 | scapy==2.4.3 134 | scikit-image==0.16.2 135 | scikit-learn==0.22.1 136 | scipy==1.4.1 137 | Scrapy==1.6.0 138 | seaborn==0.10.0 139 | Send2Trash==1.7.1 140 | service-identity==18.1.0 141 | simplejson==3.17.0 142 | singledispatch==3.4.0.3 143 | six==1.12.0 144 | smart-open==5.1.0 145 | soupsieve==1.9.5 146 | SQLAlchemy==1.3.13 147 | statsmodels==0.11.0 148 | sympy==1.5.1 149 | tabulate==0.8.6 150 | tensorboard==2.1.0 151 | tensorflow==2.0.0 152 | tensorflow-estimator==2.0.0 153 | termcolor==1.1.0 154 | terminado==0.10.1 155 | testpath==0.5.0 156 | Theano==1.0.4 157 | toolz==0.10.0 158 | torch==1.3.1 159 | tornado==6.1 160 | tqdm==4.36.1 161 | traitlets==4.3.3 162 | Twisted==19.10.0 163 | urllib3==1.24.2 164 | w3lib==1.21.0 165 | wcwidth==0.1.8 166 | webencodings==0.5.1 167 | Werkzeug==0.16.1 168 | wrapt==1.11.2 169 | wxPython==4.0.4 170 | zipp==2.2.0 171 | zope.interface==4.7.1 172 | -------------------------------------------------------------------------------- /sample.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -N 1 #1 Node 3 | #SBATCH --ntasks-per-node=8 4 | #SBATCH --time=3-00:00:00 5 | #SBATCH --job-name=run_models_gpu_1_16GB 6 | #SBATCH --error=%J.err 7 | #SBATCH --output=%J.out 8 | 9 | #SBATCH --partition=gpu 10 | #SBATCH --gres=gpu:1 11 | 12 | eval "$(conda shell.bash hook)" 13 | 14 | module load python/conda-python/3.7 15 | 16 | python3 /scratch/satyendrac.mnitjaipur/codes/soni/run_models.py --------------------------------------------------------------------------------