├── All_machine_learning _models.py ├── Detailed_performance_metrics (1).xlsx ├── README.md ├── Supplementary_File .pdf ├── TextConvoNet.py ├── requirements.txt └── sample.sh /All_machine_learning _models.py: -------------------------------------------------------------------------------- 1 | # %% [code] 2 | # This Python 3 environment comes with many helpful analytics libraries installed 3 | # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python 4 | # For example, here's several helpful packages to load 5 | 6 | import numpy as np # linear algebra 7 | import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) 8 | 9 | # Input data files are available in the read-only "../input/" directory 10 | # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory 11 | 12 | 13 | # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 14 | # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session 15 | 16 | # %% [code] 17 | 18 | import pandas as pd 19 | import numpy as np 20 | import matplotlib.pyplot as plt 21 | import seaborn as sns 22 | import nltk 23 | from nltk.corpus import stopwords 24 | import string 25 | import math 26 | from sklearn.feature_extraction.text import CountVectorizer 27 | from sklearn.model_selection import train_test_split, cross_val_score 28 | from sklearn.metrics import classification_report 29 | from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve 30 | #from sklearn.grid_search import GridSearchCV 31 | %matplotlib inline 32 | 33 | 34 | '''import bz2 35 | def get_labels_and_texts(file): 36 | labels = [] 37 | texts = [] 38 | for line in bz2.BZ2File(file): 39 | x = line.decode("utf-8") 40 | labels.append(1 if int(x[9]) == 2 else 0) 41 | texts.append(x[10:].strip()) 42 | return np.array(labels), texts 43 | train_labels, train_texts = get_labels_and_texts('/kaggle/input/amazonreviews/train.ft.txt.bz2') 44 | test_labels, test_texts = get_labels_and_texts('/kaggle/input/amazonreviews/test.ft.txt.bz2') 45 | 46 | #data_train['review'][7] 47 | print(train_labels[4]) 48 | print(train_texts[4]) 49 | 50 | # In[6]: 51 | data={"text":train_texts,'stars':train_labels} 52 | data_train=pd.DataFrame(data) 53 | data1={"text":test_texts,'stars':test_labels} 54 | data_test=pd.DataFrame(data1) 55 | ''' 56 | import numpy as np 57 | import pandas as pd 58 | def multiclass_metrics(cnf_matrix): 59 | cnf_matrix=np.asarray(cnf_matrix) 60 | FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix) 61 | FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix) 62 | TP = np.diag(cnf_matrix) 63 | TN = cnf_matrix.sum() - (FP + FN + TP) 64 | FP = FP.astype(float) 65 | FN = FN.astype(float) 66 | TP = TP.astype(float) 67 | TN = TN.astype(float) 68 | 69 | TP=np.sum(TP) 70 | TN=np.sum(TN) 71 | FP=np.sum(FP) 72 | FN=np.sum(FN) 73 | 74 | 75 | accuracy=(TP+TN)/(TP+FP+FN+TN) 76 | precision=TP/(TP+FP) 77 | recalll=TP/(FN+TP) 78 | F1=2*precision*recalll/(precision+recalll) 79 | sensiti=TP/(TP+FN) 80 | specifici=TN/(TN+FP) 81 | numerator=TP*TN - FP*FN 82 | 83 | denominator=np.sqrt((TP+FP)*(FN+TN)*(FP+TN)* (TP+FN)) 84 | MCc=numerator/denominator 85 | G_mean1=np.sqrt(sensiti*precision) 86 | G_mean2=np.sqrt(sensiti*specifici) 87 | print('precision:' ,TP/(TP+FP)) 88 | print('recall:',TP/(FN+TP)) 89 | print("F1:",F1) 90 | print("Specificity:",TN/(TN+FP)) 91 | 
print("Sensitivity ",TP/(TP+FN)) 92 | print('G-mean1:',np.sqrt(sensiti*precision)) 93 | print("G-mean2",np.sqrt(sensiti*specifici)) 94 | print("MCC :",MCc) 95 | acc=[] 96 | pre=[] 97 | recall=[] 98 | f1=[] 99 | specificity=[] 100 | sensitivity=[] 101 | GMean1=[] 102 | Gmean2=[] 103 | MCC=[] 104 | tp=[] 105 | fp=[] 106 | fn=[] 107 | tn=[] 108 | acc.append(accuracy) 109 | pre.append(precision) 110 | recall.append(recalll) 111 | f1.append(F1) 112 | specificity.append(specifici) 113 | sensitivity.append(sensiti) 114 | GMean1.append(G_mean1) 115 | Gmean2.append(G_mean2) 116 | MCC.append(MCc) 117 | tp.append(TP) 118 | fp.append(FP) 119 | tn.append(TN) 120 | fn.append(FN) 121 | data={'accuracy_all':acc,"precision":pre,'recall':recall,'F1_score':f1,'specificity':specificity,'sensitivity':sensitivity,'Gmean1':GMean1,"Gmean2":Gmean2,"MCC":MCC,"TP":tp,"FP":fp,"TN":tn,"FN":fn,} 122 | metric=pd.DataFrame(data) 123 | return metric 124 | 125 | #cnf_matrix=[[1025,0,0,20,0,0,0,0,17],[0,0,0,2,0,0,0,0,3],[83,0,63,5,0,0,0,0,0],[18,0,0,330,0,0,0,0,1],[16,0,0,0,165,0,0,0,0],[51,0,0,0,0,0,0,0,0],[2,0,0,1,0,0,0,0,2],[8,0,0,0,0,0,0,0,0],[32,0,0,2,0,0,0,0,154]] 126 | 127 | 128 | data_train=pd.read_csv('../input/twitter-airline-sentiment/Tweets.csv') 129 | 130 | data_test=pd.read_csv('../input/twitter-airline-sentiment/Tweets.csv') 131 | print("hi") 132 | 133 | data_train=data_train[:10000] 134 | data_test=data_test[10000:] 135 | # In[7]: 136 | 137 | 138 | 139 | data_train.rename(columns={'text':'title','airline_sentiment':'tag'},inplace=True) 140 | data_test.rename(columns={'text':'title','airline_sentiment':'tag'},inplace=True) 141 | # In[94]: 142 | print('jij') 143 | 144 | # In[8]: 145 | 146 | 147 | # In[88]: 148 | 149 | 150 | data_train['title']=data_train['title'].astype(str) 151 | data_test['title']=data_test['title'].astype(str) 152 | #data_train 153 | print('fdd') 154 | 155 | 156 | '''def make_tags(x): #converting the ratings column into 0's and 1's. for binary classifier to take place 157 | if(x<=3): 158 | return 0 159 | else: 160 | return 1 161 | 162 | 163 | 164 | # In[10]: 165 | 166 | 167 | data_train['tag']=data_train['tag'].apply(lambda x: make_tags(x)) 168 | data_test['tag']=data_test['tag'].apply(lambda x: make_tags(x)) 169 | print('sddsd') 170 | ''' 171 | x_train=data_train['title'] 172 | y_train=data_train['tag'] 173 | 174 | test_cnn_data=data_test['title'] 175 | #y_test=data_test['tag'] 176 | 177 | print('sdfsdfsdf') 178 | '''def text_process(text): 179 | nopunc = [char for char in text if char not in string.punctuation] 180 | nopunc = ''.join(nopunc) 181 | return [word for word in nopunc.split() if word.lower() not in stopwords.words('english')] 182 | ''' 183 | print('ddsd') 184 | vocab = CountVectorizer().fit(x_train) 185 | print("dwerty") 186 | print(len(vocab.vocabulary_)) 187 | #print(x_train[2000]) 188 | '''r0 = x[2000] 189 | print(r0) 190 | vocab0 = vocab.transform([r0]) 191 | print(vocab0) 192 | """ 193 | Now the words in the review number 78 have been converted into a vector. 194 | The data that we can see is the transformed words. 195 | If we now get the feature's name - we can get the word back! 
196 | """ 197 | print("Getting the words back:") 198 | print(vocab.get_feature_names()[19648]) 199 | print(vocab.get_feature_names()[10643]) 200 | ''' 201 | 202 | x_train = vocab.transform(x_train) 203 | test_cnn_data=vocab.transform(test_cnn_data) 204 | print("Shape of the sparse matrix: ", x_train.shape) 205 | print(y_train) 206 | 207 | #########MULTIONOMIAL NAIVEBAYES 208 | from sklearn.naive_bayes import MultinomialNB 209 | model = MultinomialNB() 210 | print("hih") 211 | model.fit(x_train,y_train.values) 212 | #predmnb = mnb.predict(x_test) 213 | #print("Confusion Matrix for Multinomial Naive Bayes:") 214 | #print(confusion_matrix(y_test,predmnb)) 215 | #print("Score:",round(accuracy_score(y_test,predmnb)*100,2)) 216 | #print("Classification Report:",classification_report(y_test,predmnb)) 217 | 218 | 219 | 220 | 221 | pred=model.predict(test_cnn_data) 222 | #print(y_test) 223 | y_test=pred 224 | y_test=y_test.tolist() 225 | output_class_pred=[] 226 | '''for i in range(len(y_test)): 227 | if(y_test[i][0]<0.5): 228 | output_class_pred.append(0) 229 | else: 230 | output_class_pred.append(1) 231 | ''' 232 | output_class_pred=y_test 233 | original_ans=data_test['tag'] 234 | original_ans=original_ans.tolist() 235 | 236 | # In[ ]: 237 | from sklearn.metrics import confusion_matrix 238 | from sklearn.metrics import classification_report 239 | 240 | #as its a fake news classifier , so identifying a fake class will be a TP 241 | def check_metric(output_class_pred,original_ans): 242 | rightly_predicted=0 243 | TP=0 244 | for i in range(len(y_test)): 245 | if(original_ans[i]==output_class_pred[i]): 246 | rightly_predicted+=1 247 | 248 | 249 | print("Overall_acuracy:",rightly_predicted/len(output_class_pred)) 250 | print('TP',TP) 251 | accuracy=rightly_predicted/len(y_test) 252 | print(classification_report(original_ans,output_class_pred)) 253 | print(confusion_matrix(original_ans,output_class_pred)) 254 | TN=confusion_matrix(original_ans,output_class_pred)[0][0] 255 | TP=confusion_matrix(original_ans,output_class_pred)[1][1] 256 | FP=confusion_matrix(original_ans,output_class_pred)[0][1] 257 | FN=confusion_matrix(original_ans,output_class_pred)[1][0] 258 | 259 | precision=TP/(TP+FP) 260 | recalll=TP/(FN+TP) 261 | F1=2*precision*recalll/(precision+recalll) 262 | sensiti=TP/(TP+FN) 263 | specifici=TN/(TN+FP) 264 | numerator=TP*TN - FP*FN 265 | 266 | denominator=np.sqrt((TP+FP)*(FN+TN)*(FP+TN)* (TP+FN)) 267 | MCc=numerator/denominator 268 | G_mean1=np.sqrt(sensiti*precision) 269 | G_mean2=np.sqrt(sensiti*specifici) 270 | print('precision:' ,TP/(TP+FP)) 271 | print('recall:',TP/(FN+TP)) 272 | print("F1:",F1) 273 | print("Specificity:",TN/(TN+FP)) 274 | print("Sensitivity ",TP/(TP+FN)) 275 | print('G-mean1:',np.sqrt(sensiti*precision)) 276 | print("G-mean2",np.sqrt(sensiti*specifici)) 277 | print("MCC :",MCc) 278 | acc=[] 279 | pre=[] 280 | recall=[] 281 | f1=[] 282 | specificity=[] 283 | sensitivity=[] 284 | GMean1=[] 285 | Gmean2=[] 286 | MCC=[] 287 | tp=[] 288 | fp=[] 289 | fn=[] 290 | tn=[] 291 | acc.append(accuracy) 292 | pre.append(precision) 293 | recall.append(recalll) 294 | f1.append(F1) 295 | specificity.append(specifici) 296 | sensitivity.append(sensiti) 297 | GMean1.append(G_mean1) 298 | Gmean2.append(G_mean2) 299 | MCC.append(MCc) 300 | tp.append(TP) 301 | fp.append(FP) 302 | tn.append(TN) 303 | fn.append(FN) 304 | 
data={'accuracy_all':acc,"precision":pre,'recall':recall,'F1_score':f1,'specificity':specificity,'sensitivity':sensitivity,'Gmean1':GMean1,"Gmean2":Gmean2,"MCC":MCC,"TP":tp,"FP":fp,"TN":tn,"FN":fn} 305 | metric=pd.DataFrame(data) 306 | return metric 307 | 308 | 309 | 310 | 311 | 312 | cnf_matrix=confusion_matrix(original_ans,output_class_pred) 313 | 314 | 315 | resi=multiclass_metrics(cnf_matrix) 316 | resi.to_csv('results1.csv', mode='w', index = False, header=resi.columns,columns=resi.columns) 317 | 318 | 319 | # In[ ]: 320 | 321 | 322 | 323 | ########RANDOMFOREST 324 | from sklearn.ensemble import RandomForestClassifier 325 | model = RandomForestClassifier() 326 | model.fit(x_train,y_train.values) 327 | 328 | pred=model.predict(test_cnn_data) 329 | print(y_test) 330 | y_test=pred 331 | y_test=y_test.tolist() 332 | output_class_pred=[] 333 | output_class_pred=y_test 334 | original_ans=data_test['tag'] 335 | original_ans=original_ans.tolist() 336 | 337 | cnf_matrix=confusion_matrix(original_ans,output_class_pred) 338 | 339 | 340 | resi=multiclass_metrics(cnf_matrix) 341 | resi.to_csv('results1.csv', mode='a', index = False, header=resi.columns,columns=resi.columns) 342 | 343 | 344 | 345 | ####DECISION TREE 346 | from sklearn.tree import DecisionTreeClassifier 347 | model= DecisionTreeClassifier() 348 | model.fit(x_train,y_train.values) 349 | 350 | pred=model.predict(test_cnn_data) 351 | print(y_test) 352 | y_test=pred 353 | y_test=y_test.tolist() 354 | output_class_pred=[] 355 | output_class_pred=y_test 356 | original_ans=data_test['tag'] 357 | original_ans=original_ans.tolist() 358 | 359 | cnf_matrix=confusion_matrix(original_ans,output_class_pred) 360 | 361 | 362 | resi=multiclass_metrics(cnf_matrix) 363 | resi.to_csv('results1.csv', mode='a', index = False, header=resi.columns,columns=resi.columns) 364 | 365 | 366 | 367 | 368 | #####SVC 369 | from sklearn.svm import SVC 370 | model = SVC(random_state=101) 371 | model.fit(x_train,y_train.values) 372 | pred=model.predict(test_cnn_data) 373 | print(y_test) 374 | y_test=pred 375 | y_test=y_test.tolist() 376 | output_class_pred=[] 377 | output_class_pred=y_test 378 | original_ans=data_test['tag'] 379 | original_ans=original_ans.tolist() 380 | 381 | cnf_matrix=confusion_matrix(original_ans,output_class_pred) 382 | 383 | 384 | resi=multiclass_metrics(cnf_matrix) 385 | resi.to_csv('results1.csv', mode='a', index = False, header=resi.columns,columns=resi.columns) 386 | 387 | 388 | 389 | ####GRADIENT BOOSTING CLASSIFIER 390 | from sklearn.ensemble import GradientBoostingClassifier 391 | model = GradientBoostingClassifier(learning_rate=0.1,max_depth=5,max_features=0.5,random_state=999999) 392 | model.fit(x_train,y_train.values) 393 | 394 | pred=model.predict(test_cnn_data) 395 | print(y_test) 396 | y_test=pred 397 | y_test=y_test.tolist() 398 | output_class_pred=[] 399 | output_class_pred=y_test 400 | original_ans=data_test['tag'] 401 | original_ans=original_ans.tolist() 402 | 403 | cnf_matrix=confusion_matrix(original_ans,output_class_pred) 404 | 405 | 406 | resi=multiclass_metrics(cnf_matrix) 407 | resi.to_csv('results1.csv', mode='a', index = False, header=resi.columns,columns=resi.columns) 408 | 409 | 410 | 411 | #####KNN 412 | from sklearn.neighbors import KNeighborsClassifier 413 | model = KNeighborsClassifier(n_neighbors=10) 414 | model.fit(x_train,y_train.values) 415 | pred=model.predict(test_cnn_data) 416 | print(y_test) 417 | y_test=pred 418 | y_test=y_test.tolist() 419 | output_class_pred=[] 420 | output_class_pred=y_test 421 | 
original_ans=data_test['tag'] 422 | original_ans=original_ans.tolist() 423 | cnf_matrix=confusion_matrix(original_ans,output_class_pred) 424 | 425 | 426 | resi=multiclass_metrics(cnf_matrix) 427 | resi.to_csv('results1.csv', mode='a', index = False, header=resi.columns,columns=resi.columns) 428 | 429 | 430 | 431 | 432 | ####XGBOOST CLASSIFIER 433 | import xgboost 434 | from xgboost import XGBClassifier 435 | model = XGBClassifier() 436 | model.fit(x_train,y_train) 437 | pred=model.predict(test_cnn_data) 438 | print(y_test) 439 | y_test=pred 440 | y_test=y_test.tolist() 441 | output_class_pred=[] 442 | output_class_pred=y_test 443 | original_ans=data_test['tag'] 444 | original_ans=original_ans.tolist() 445 | cnf_matrix=confusion_matrix(original_ans,output_class_pred) 446 | 447 | 448 | resi=multiclass_metrics(cnf_matrix) 449 | resi.to_csv('results1.csv', mode='a', index = False, header=resi.columns,columns=resi.columns) 450 | 451 | 452 | 453 | print(output_class_pred) 454 | print(original_ans) -------------------------------------------------------------------------------- /Detailed_performance_metrics (1).xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sonisanskar/TextConvoNet/a5814e868cc5ef9504774ca9a431d5d3febc3379/Detailed_performance_metrics (1).xlsx -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # TextConvoNet 2 | This repository contains the code for **TextConvoNet**, a novel CNN-based text classification architecture that represents each input paragraph as a 2D (sentence × word) matrix and applies 2D convolutions for classification. 3 |
4 | Click here to view the detailed architecture of [*TextConvoNet*](https://drive.google.com/file/d/1Q7kuPXbtMQtRNGUj-Tmg9hIgSI_2mv5k/view?usp=sharing). 5 |
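The core encoding step can be illustrated with a short sketch: every paragraph is split into sentences, every sentence into word indices, and the result is zero-padded into one fixed-size 2D matrix per paragraph. The snippet below is a minimal, self-contained illustration only: it assumes a toy `vocab` dictionary and a naive period-based sentence split, and the function and variable names are illustrative rather than taken from the repository. The actual pipeline in `TextConvoNet.py` uses NLTK sentence tokenization, a Keras `Tokenizer`, and pre-trained GloVe embeddings.

```python
import numpy as np

def paragraph_to_matrix(paragraph, vocab, m, n):
    """Encode a paragraph as an m x n matrix of word ids:
    m = max sentences per paragraph, n = max words per sentence."""
    sentences = [s.strip() for s in paragraph.split('.') if s.strip()]  # naive sentence split
    matrix = np.zeros((m, n), dtype=np.int64)                           # zero padding on both axes
    for i, sent in enumerate(sentences[:m]):
        ids = [vocab.get(w.lower(), 0) for w in sent.split()][:n]       # 0 = out-of-vocabulary
        matrix[i, :len(ids)] = ids
    return matrix

# Toy example with a hypothetical vocabulary
vocab = {"the": 1, "movie": 2, "was": 3, "great": 4, "acting": 5, "felt": 6, "flat": 7}
print(paragraph_to_matrix("The movie was great. The acting felt flat.", vocab, m=4, n=6))
```

In `TextConvoNet.py`, each such m × n matrix is passed through an embedding layer (giving an m × n × 300 tensor) and then through parallel 2D convolution filters of different kernel sizes, whose pooled outputs are concatenated before the final dense layers.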
6 | 7 | 8 | -------------------------------------------------------------------------------- /Supplementary_File .pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sonisanskar/TextConvoNet/a5814e868cc5ef9504774ca9a431d5d3febc3379/Supplementary_File .pdf -------------------------------------------------------------------------------- /TextConvoNet.py: -------------------------------------------------------------------------------- 1 | # This Python 3 environment comes with many helpful analytics libraries installed 2 | # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python 3 | # For example, here's several helpful packages to load 4 | 5 | import numpy as np # linear algebra 6 | import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) 7 | 8 | # Input data files are available in the read-only "../input/" directory 9 | # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory 10 | 11 | import bz2 12 | import pickle 13 | import os 14 | ''' 15 | for dirname, _, filenames in os.walk('/kaggle/input'): 16 | for filename in filenames: 17 | print(os.path.join(dirname, filename)) 18 | ''' 19 | # You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 20 | # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session 21 | 22 | #!/usr/bin/env python 23 | # coding: utf-8 24 | 25 | # In[2]: 26 | 27 | 28 | import numpy as np 29 | import pandas as pd 30 | import seaborn as sns 31 | import matplotlib as plt 32 | import json 33 | #get_ipython().run_line_magic('matplotlib', 'inline') 34 | ''' 35 | from tensorflow.keras.optimizers import Adam 36 | 37 | # In[3]: 38 | trainfile = bz2.BZ2File('../input/amazonreviews/train.ft.txt.bz2','r') 39 | lines = trainfile.readlines() 40 | 41 | sent_analysis = [] 42 | def sent_list(docs,splitStr='__label__'): 43 | for i in range(1,len(docs)): 44 | text=str(lines[i]) 45 | splitText=text.split(splitStr) 46 | #print(i) 47 | secHalf=splitText[1] 48 | text=secHalf[2:len(secHalf)-1] 49 | sentiment=secHalf[0] 50 | sent_analysis.append([text,sentiment]) 51 | return sent_analysis 52 | 53 | sentiment_list=sent_list(lines[:1000000],splitStr='__label__') 54 | 55 | train_df = pd.DataFrame(sentiment_list,columns=['Text','Sentiment']) 56 | 57 | data_train=train_df[:4000] 58 | data_test=train_df[4000:5000] 59 | ''' 60 | #a=input('path of the taining dataset with fields as title and tag(0,1) ') 61 | #b=input('path of test dataset') 62 | #data_train=pd.read_csv('../input/kuc-hackathon-winter-2018/drugsComTrain_raw.csv') 63 | 64 | 65 | # In[4]: 66 | 67 | 68 | #data_train 69 | 70 | 71 | # In[5]: 72 | 73 | data_train=pd.read_csv('../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv') 74 | data_test=pd.read_csv('../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv') 75 | 76 | data_train=data_train[:40000] 77 | data_test=data_test[40000:] 78 | 79 | 80 | 81 | # In[8]: 82 | data_train.rename(columns={'review':'title','sentiment':'tag'},inplace=True) 83 | data_test.rename(columns={'review':'title','sentiment':'tag'},inplace=True) 84 | 85 | #data_train['rating'].value_counts() 86 | #print('training_dataset',data_train) 87 | #print('training_dataset',data_test) 88 | 89 | # In[9]: 90 | 91 | #print(data_train) 92 | 93 | def make_tags(x): #converting the ratings column into 0's and 
1's. for binary classifier to take place 94 | if(x=="negative"): 95 | return 0 96 | else: 97 | return 1 98 | 99 | 100 | 101 | # In[10]: 102 | 103 | 104 | data_train['tag']=data_train['tag'].apply(lambda x: make_tags(x)) 105 | data_test['tag']=data_test['tag'].apply(lambda x: make_tags(x)) 106 | 107 | #print(data_train) 108 | 109 | count0=(data_train['tag']==0).sum() 110 | count1=(data_train['tag']==1).sum() 111 | if(count0>count1): 112 | imbalance_ratio=(count0)/count1 113 | else: 114 | imbalance_ratio=(count1)/count0 115 | # In[11]: 116 | 117 | print('imbalance_ratio',imbalance_ratio) 118 | #print(data_train) 119 | 120 | 121 | # In[12]: 122 | 123 | 124 | 125 | def no_of_words_in_paragraph(x): 126 | return len(list(x)) 127 | 128 | data_train['no_of_words_in_paragraph']=data_train['title'].apply(lambda x:no_of_words_in_paragraph(x)) 129 | 130 | data_test['no_of_words_in_paragraph']=data_test['title'].apply(lambda x:no_of_words_in_paragraph(x)) 131 | 132 | 133 | 134 | print(data_train) 135 | avg=data_train['no_of_words_in_paragraph'].mean() 136 | maxim=data_train['no_of_words_in_paragraph'].max() 137 | print('average paragraph length',data_train['no_of_words_in_paragraph'].mean()) 138 | print('maximum para length',data_train['no_of_words_in_paragraph'].max()) 139 | print('hii') 140 | excess=(data_train['no_of_words_in_paragraph']>avg).sum() 141 | excess_ratio=excess/len(data_train) 142 | print('excess_ratio',excess_ratio) 143 | 144 | 145 | #applying sentence tokenizer 146 | import nltk.data 147 | tokenizer = nltk.data.load('tokenizers/punkt/PY3/english.pickle') 148 | # Loading PunktSentenceTokenizer using English pickle file 149 | def make_sent_token(x): 150 | return tokenizer.tokenize(x) 151 | #converting each paragraph into separate sentences 152 | 153 | 154 | # In[13]: 155 | 156 | 157 | data_train['sentence_token']=data_train['title'].apply(lambda x: make_sent_token(x)) 158 | 159 | data_test['sentence_token']=data_test['title'].apply(lambda x: make_sent_token(x)) 160 | 161 | 162 | # In[15]: 163 | 164 | 165 | #data_train.drop(columns=['uniqueID','date','usefulCount','condition','drugName'],inplace=True,axis=1)# dropping irrelevant columns 166 | 167 | 168 | # In[16]: 169 | 170 | 171 | #data_test.drop(columns=['uniqueID','date','usefulCount','condition','drugName'],inplace=True,axis=1) 172 | 173 | 174 | # In[17]: 175 | 176 | 177 | #data_train 178 | 179 | 180 | # In[18]: 181 | 182 | 183 | data_train['no_of_sentences']=data_train['sentence_token'].apply(lambda x:len(x)) 184 | 185 | 186 | # In[19]: 187 | 188 | 189 | data_test['no_of_sentences']=data_test['sentence_token'].apply(lambda x:len(x)) 190 | 191 | 192 | # In[20]: 193 | avg_sen_length=data_train['no_of_words_in_paragraph'].sum()/data_train['no_of_sentences'].sum() 194 | print(avg_sen_length) 195 | 196 | #max(data_train['no_of_sentences'])##no of rows in sentence matrix which is to be feed in model(max number of sentence in any paragraph) 197 | 198 | 199 | # In[21]: 200 | 201 | 202 | #len(data_train[data_train['no_of_sentences']==92]['review']) 203 | 204 | 205 | # In[22]: 206 | 207 | 208 | #max(data_test['no_of_sentences']) 209 | 210 | 211 | # In[23]: 212 | 213 | 214 | def max_length_of_sentence(x,y): 215 | sen=x 216 | nu=y 217 | #print(sen) 218 | ma=0 219 | if(nu>1): 220 | l=sen.split('.') 221 | #print(l) 222 | for i in range(len(l)): 223 | k=l[i].replace(',','') 224 | maxi=len(k.split()) 225 | #print(maxi) 226 | if(maxi>ma): 227 | ma=maxi 228 | return ma 229 | else: 230 | return len(sen.split()) 231 | 232 | 233 | 234 | 235 | # In[24]: 
236 | 237 | 238 | data_train['max_words_in_sentence']=data_train.apply(lambda x: max_length_of_sentence(x.title,x.no_of_sentences),axis=1) 239 | 240 | 241 | # In[25]: 242 | 243 | 244 | data_test['max_words_in_sentence']=data_test.apply(lambda x: max_length_of_sentence(x.title,x.no_of_sentences),axis=1) 245 | 246 | 247 | # In[26]: 248 | 249 | 250 | #max(data_train['max_words_in_sentence'])## number of columns in the data to be feeded 251 | 252 | 253 | # In[27]: 254 | 255 | x1=max(data_train['no_of_sentences']) 256 | y1=max(data_train['max_words_in_sentence']) 257 | 258 | x2=max(data_test['no_of_sentences']) 259 | y2=max(data_test['max_words_in_sentence']) 260 | 261 | if(x1>=x2): 262 | m=x1 263 | print(m) 264 | m=m 265 | else: 266 | m=x2 267 | m=m 268 | 269 | if(y1>=y2): 270 | n=y1 271 | else: 272 | n=y2 273 | 274 | #So each para will be converted to a m*n matrix 275 | if(m<5): 276 | m=6 277 | else: 278 | m+=2 279 | print('x1,x2,y1,y2',x1,x2,y1,y2) 280 | 281 | print("m-->",m,n) 282 | #So each para will be converted to a m*n matrix 283 | 284 | 285 | # In[28]: 286 | 287 | 288 | 289 | 290 | # # Major part starts here ..... Now converting the paragraph into required matrix 291 | 292 | # In[29]: 293 | 294 | 295 | import re 296 | import string 297 | from nltk import word_tokenize 298 | from nltk.corpus import stopwords 299 | def make_tokens(text): ##Converting into single tokens in order to create the vocabulary 300 | return word_tokenize(text) 301 | 302 | 303 | data_train['tokens']=data_train['title'].apply(lambda x: make_tokens(x)) 304 | data_test['tokens']=data_test['title'].apply(lambda x: make_tokens(x)) 305 | 306 | 307 | # In[30]: 308 | 309 | 310 | #data_train['tokens'] 311 | 312 | 313 | # In[ ]: 314 | 315 | 316 | #from gensim import models 317 | #word2vec_path = 'GoogleNews-vectors-negative300.bin.gz' 318 | #word2vec = models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True) 319 | 320 | 321 | 322 | embeddings_index = {} 323 | f = open('../input/glove6b300dtxt/glove.6B.300d.txt') 324 | for line in f: 325 | values = line.split(' ') 326 | word = values[0] ## The first entry is the word 327 | coefs = np.asarray(values[1:], dtype='float32') ## These are the vecotrs representing the embedding for the word 328 | embeddings_index[word] = coefs 329 | f.close() 330 | 331 | print('GloVe data loaded') 332 | 333 | # In[ ]: 334 | 335 | 336 | all_training_words = [word for tokens in data_train["tokens"] for word in tokens] 337 | training_sentence_lengths = [len(tokens) for tokens in data_train["tokens"]] 338 | TRAINING_VOCAB = sorted(list(set(all_training_words))) 339 | print("%s words total, with a vocabulary size of %s" % (len(all_training_words), len(TRAINING_VOCAB))) 340 | print("Max sentence length is %s" % max(training_sentence_lengths)) 341 | para_max=max(training_sentence_lengths) 342 | 343 | vocab=len(TRAINING_VOCAB) 344 | 345 | # In[ ]: 346 | 347 | 348 | #len(TRAINING_VOCAB) 349 | 350 | 351 | # In[ ]: 352 | 353 | 354 | from tensorflow.keras.preprocessing.text import Tokenizer 355 | from tensorflow.keras.preprocessing.sequence import pad_sequences 356 | tokenizer = Tokenizer(num_words=len(TRAINING_VOCAB), char_level=False) 357 | tokenizer.fit_on_texts(data_train['title']) # we assigned values 358 | 359 | 360 | # In[ ]: 361 | 362 | 363 | train_word_index = tokenizer.word_index 364 | 365 | 366 | # In[ ]: 367 | 368 | 369 | #print(train_word_index) 370 | 371 | 372 | # In[ ]: 373 | 374 | 375 | #data_train.to_csv('medic_train.csv') 376 | #data_test.to_csv('medic_test.csv') 377 | 378 | 379 | 
# In[ ]: 380 | 381 | 382 | def make_train_seq(x): 383 | return tokenizer.texts_to_sequences(x) 384 | data_train['train_seq']=data_train['sentence_token'].apply(lambda x:make_train_seq(x) ) 385 | data_test['train_seq']=data_test['sentence_token'].apply(lambda x:make_train_seq(x) ) 386 | 387 | 388 | # In[ ]: 389 | 390 | 391 | #(data_train['train_seq']) # here every para has been encoded 392 | 393 | 394 | # In[ ]: 395 | #print(data_train) 396 | 397 | 398 | 399 | 400 | # In[ ]: 401 | 402 | 403 | from tensorflow.keras.preprocessing.sequence import pad_sequences 404 | def padding(x): #now padding each sentence to a length of n...number of columns 405 | MAX_SENTENCE_LENGTH=n #(no of columns) 406 | return pad_sequences(x,maxlen=MAX_SENTENCE_LENGTH,padding='post') 407 | 408 | data_train['padded']=data_train['train_seq'].apply(lambda x:padding(x)) 409 | data_test['padded']=data_test['train_seq'].apply(lambda x:padding(x)) 410 | 411 | 412 | # In[ ]: 413 | 414 | 415 | #(data_train.padded[8]) 416 | 417 | 418 | # In[ ]: 419 | 420 | 421 | 422 | ## More code adapted from the keras reference (https://github.com/keras-team/keras/blob/master/examples/pretrained_word_embeddings.py) 423 | # prepare embedding matrix 424 | from tensorflow.keras.layers import Embedding 425 | from tensorflow.keras.initializers import Constant 426 | 427 | ## EMBEDDING_DIM = ## seems to need to match the embeddings_index dimension 428 | EMBEDDING_DIM = embeddings_index.get('a').shape[0] 429 | print(EMBEDDING_DIM) 430 | #num_words = min(MAX_NUM_WORDS, len(word_index)) + 1 431 | #= np.zeros(len(train_word_index) + 1, EMBEDDING_DIM) 432 | train_embedding_weights = np.zeros((len(train_word_index)+1, 433 | EMBEDDING_DIM)) 434 | for word, i in train_word_index.items(): 435 | #print("sd") 436 | embedding_vector = embeddings_index.get(word) ## This references the loaded embeddings dictionary 437 | if embedding_vector is not None: 438 | train_embedding_weights[i] = embedding_vector 439 | print(train_embedding_weights.shape) 440 | # words not found in embedding index will be all-zeros. 441 | 442 | 443 | # load pre-trained word embeddings into an Embedding layer 444 | # note that we set trainable = False so as to keep the embeddings fixed 445 | #embedding_layer = Embedding(num_words, 446 | # EMBEDDING_DIM, 447 | # embeddings_initializer=Constant(embedding_matrix), 448 | # input_length=MAX_SEQUENCE_LENGTH, 449 | # trainable=False) 450 | 451 | 452 | #EMBEDDING_DIM=300 453 | #train_embedding_weights = np.zeros((len(train_word_index)+1, 454 | #EMBEDDING_DIM)) 455 | #for word,index in train_word_index.items(): 456 | #train_embedding_weights[index,:] = word2vec[word] if word in word2vec else np.random.rand(EMBEDDING_DIM) 457 | #print(train_embedding_weights.shape) 458 | 459 | 460 | # In[43]: 461 | 462 | 463 | def make_full_para(x): #92 cross 192 matrix of a paragraph. (m*n) 464 | l=len(x) 465 | h=m-l #no. 
of extra rows to be added 466 | z=[0]*h*n #1D vector(#addding extra lines for zeroes as padding) 467 | z=np.reshape(z,(h,n)) #reshaping it to match the dimension of paragraph 468 | s=x.tolist()+z.tolist() 469 | return s 470 | 471 | 472 | # In[ ]: 473 | 474 | 475 | 476 | 477 | 478 | # In[ ]: 479 | 480 | 481 | data_train['full_para']=data_train['padded'].apply(lambda x : make_full_para(x)) 482 | data_test['full_para']=data_test['padded'].apply(lambda x : make_full_para(x)) 483 | 484 | 485 | # In[ ]: 486 | 487 | 488 | #data_train.full_para 489 | 490 | 491 | # In[ ]: 492 | 493 | 494 | def create_1d_para(x): 495 | l=[] 496 | for i in x: 497 | l+=i #concatenating all the sentences in a para into a single 1 d arrray 498 | return l 499 | 500 | 501 | 502 | 503 | # In[ ]: 504 | 505 | data_train['single_d_array']=data_train['full_para'].apply(lambda x: create_1d_para(x) ) 506 | data_test['single_d_array']=data_test['full_para'].apply(lambda x: create_1d_para(x) ) 507 | 508 | 509 | # In[ ]: 510 | 511 | 512 | #train_cnn_data=np.array(data_train['single_d_array'].tolist()) 513 | 514 | 515 | # In[ ]: 516 | 517 | 518 | train_cnn_data=np.array(data_train['single_d_array'].tolist()) 519 | test_cnn_data=np.array(data_test['single_d_array'].tolist()) 520 | 521 | 522 | # In[ ]: 523 | 524 | from sklearn.model_selection import train_test_split 525 | y_train=data_train['tag'].values 526 | 527 | 528 | 529 | # In[ ]: 530 | 531 | print('Startting the training') 532 | #from __future__ import print_function 533 | from tensorflow.keras.layers import Embedding 534 | 535 | from tensorflow.keras.preprocessing.text import text_to_word_sequence 536 | import pandas as pd 537 | from tensorflow.keras.preprocessing.text import Tokenizer 538 | import numpy as np 539 | 540 | 541 | from tensorflow.keras.preprocessing import sequence 542 | from tensorflow.keras.models import Sequential,Model 543 | from tensorflow.keras.layers import Dense, Dropout, Activation,Flatten,Bidirectional,GRU,LSTM,SpatialDropout1D,Reshape 544 | from tensorflow.keras.layers import Embedding,concatenate 545 | from tensorflow.keras.layers import Conv2D, GlobalMaxPooling2D,MaxPool2D,MaxPool3D,GlobalAveragePooling2D,Conv3D 546 | from tensorflow.keras.models import Model 547 | from tensorflow.keras.layers import Input 548 | 549 | 550 | # In[ ]: 551 | 552 | filter_sizes = [1,2,3,4] 553 | num_filters = 32 554 | embed_size=300 555 | embedding_matrix=train_embedding_weights 556 | max_features=len(train_word_index)+1 557 | maxlen=m*n 558 | 559 | def get_model(): 560 | inp = Input(shape=(maxlen, )) 561 | x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp) 562 | x = SpatialDropout1D(0.4)(x) 563 | x = Reshape((m, n, 300))(x) 564 | #print(x) 565 | conv_0 = Conv2D(num_filters, kernel_size=(filter_sizes[0], 2), 566 | activation='relu')(x) 567 | conv_1 = Conv2D(num_filters, kernel_size=(filter_sizes[0], 3), 568 | activation='relu')(x) 569 | 570 | conv_2 = Conv2D(num_filters, kernel_size=(filter_sizes[0], 4), 571 | activation='relu')(x) 572 | 573 | 574 | 575 | 576 | 577 | conv_4 = Conv2D(num_filters, kernel_size=(filter_sizes[1], 1), 578 | activation='relu')(x) 579 | conv_5 = Conv2D(num_filters, kernel_size=(filter_sizes[1], 2), activation='relu')(x) 580 | 581 | conv_6 = Conv2D(num_filters, kernel_size=(filter_sizes[1], 3), 582 | activation='relu')(x) 583 | 584 | 585 | 586 | maxpool_0 = MaxPool2D()(conv_0) 587 | maxpool_0=Flatten()(maxpool_0) 588 | maxpool_1 = MaxPool2D()(conv_1) 589 | maxpool_1=Flatten()(maxpool_1) 590 | maxpool_2 = MaxPool2D()(conv_2) 
591 | maxpool_2 = Flatten()(maxpool_2) 592 | 593 | maxpool_4 = MaxPool2D()(conv_4) 594 | maxpool_4=Flatten()(maxpool_4) 595 | maxpool_5 = MaxPool2D()(conv_5) 596 | maxpool_5=Flatten()(maxpool_5) 597 | maxpool_6 = MaxPool2D()(conv_6) 598 | maxpool_6=Flatten()(maxpool_6) 599 | #maxpool_7 = MaxPool2D()(conv_7) 600 | # maxpool_7=Flatten()(maxpool_7) 601 | z = concatenate([maxpool_0, maxpool_1,maxpool_2],axis=1) 602 | w=concatenate([maxpool_4, maxpool_5,maxpool_6],axis=1) 603 | #w=concatenate([maxpool_4, maxpool_5,maxpool_6],axis=1) 604 | #z = concatenate([maxpool_0, maxpool_1,maxpool_2,maxpool_4, maxpool_5,maxpool_6],axis=1) 605 | #z = concatenate([maxpool_0, maxpool_1,maxpool_4, maxpool_5],axis=1) 606 | 607 | #z = Flatten()(z) 608 | z=concatenate([w,z],axis=1) 609 | z=Dense(units=64,activation="relu")(z) 610 | z = Dropout(0.4)(z) 611 | 612 | outp = Dense(1, activation="sigmoid")(z) 613 | 614 | model = Model(inputs=inp, outputs=outp) 615 | 616 | model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy']) 617 | 618 | return model 619 | 620 | 621 | # In[ ]: 622 | 623 | 624 | model=get_model() 625 | 626 | 627 | # In[ ]: 628 | 629 | 630 | print(model.summary()) 631 | 632 | 633 | # In[ ]: 634 | 635 | 636 | 637 | #define callbacks 638 | from tensorflow.keras.callbacks import EarlyStopping 639 | early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=4, verbose=1) 640 | callbacks_list = [early_stopping] 641 | 642 | import time, datetime 643 | start = datetime.datetime.now() 644 | history=model.fit(train_cnn_data, y_train, epochs=10,callbacks=callbacks_list,batch_size=32,validation_split=0.1 ) 645 | end = datetime.datetime.now() 646 | diff1= (end - start) 647 | print('time taken by text_6',diff1) 648 | 649 | 650 | 651 | 652 | 653 | 654 | pred=model.predict(test_cnn_data) 655 | y_test=pred 656 | y_test=y_test.tolist() 657 | output_class_pred=[] 658 | for i in range(len(y_test)): 659 | if(y_test[i][0]<0.5): 660 | output_class_pred.append(0) 661 | else: 662 | output_class_pred.append(1) 663 | 664 | original_ans=data_test['tag'] 665 | original_ans=original_ans.tolist() 666 | 667 | # In[ ]: 668 | 669 | from sklearn.metrics import confusion_matrix 670 | from sklearn.metrics import classification_report 671 | 672 | #as its a fake news classifier , so identifying a fake class will be a TP 673 | def check_metric(output_class_pred,original_ans,diff1): 674 | rightly_predicted=0 675 | TP=0 676 | for i in range(len(y_test)): 677 | if(original_ans[i]==output_class_pred[i]): 678 | rightly_predicted+=1 679 | 680 | 681 | print("Overall_acuracy:",rightly_predicted/len(output_class_pred)) 682 | print('TP',TP) 683 | accuracy=rightly_predicted/len(y_test) 684 | print(classification_report(original_ans,output_class_pred)) 685 | print(confusion_matrix(original_ans,output_class_pred)) 686 | TN=confusion_matrix(original_ans,output_class_pred)[0][0] 687 | TP=confusion_matrix(original_ans,output_class_pred)[1][1] 688 | FP=confusion_matrix(original_ans,output_class_pred)[0][1] 689 | FN=confusion_matrix(original_ans,output_class_pred)[1][0] 690 | 691 | precision=TP/(TP+FP) 692 | recalll=TP/(FN+TP) 693 | F1=2*precision*recalll/(precision+recalll) 694 | sensiti=TP/(TP+FN) 695 | specifici=TN/(TN+FP) 696 | numerator=TP*TN - FP*FN 697 | 698 | denominator=np.sqrt((TP+FP)*(FN+TN)*(FP+TN)* (TP+FN)) 699 | MCc=numerator/denominator 700 | G_mean1=np.sqrt(sensiti*precision) 701 | G_mean2=np.sqrt(sensiti*specifici) 702 | print('precision:' ,TP/(TP+FP)) 703 | print('recall:',TP/(FN+TP)) 704 | 
print("F1:",F1) 705 | print("Specificity:",TN/(TN+FP)) 706 | print("Sensitivity ",TP/(TP+FN)) 707 | print('G-mean1:',np.sqrt(sensiti*precision)) 708 | print("G-mean2",np.sqrt(sensiti*specifici)) 709 | print("MCC :",MCc) 710 | acc=[] 711 | pre=[] 712 | recall=[] 713 | f1=[] 714 | specificity=[] 715 | sensitivity=[] 716 | GMean1=[] 717 | Gmean2=[] 718 | MCC=[] 719 | tp=[] 720 | fp=[] 721 | fn=[] 722 | tn=[] 723 | acc.append(accuracy) 724 | pre.append(precision) 725 | recall.append(recalll) 726 | f1.append(F1) 727 | specificity.append(specifici) 728 | sensitivity.append(sensiti) 729 | GMean1.append(G_mean1) 730 | Gmean2.append(G_mean2) 731 | MCC.append(MCc) 732 | tp.append(TP) 733 | fp.append(FP) 734 | tn.append(TN) 735 | fn.append(FN) 736 | data={'accuracy_all':acc,"precision":pre,'recall':recall,'F1_score':f1,'specificity':specificity,'sensitivity':sensitivity,'Gmean1':GMean1,"Gmean2":Gmean2,"MCC":MCC,"TP":tp,"FP":fp,"TN":tn,"FN":fn,"traintime":diff1,"Exceeding_ratio":excess_ratio,"imbalance_ratio":imbalance_ratio,"Average_length_of_paragraph":avg,"Maximum_length_of_a_paragraph":maxim,"Average_length_of_sentences":avg_sen_length,"Maximum_length_of_a_sentence_in_a_paragraph":n,"Maximum_no_of_sentence_in_any_paragraph":m,"Vocabular_size":vocab,"label0":count0,"label1":count1} 737 | metric=pd.DataFrame(data) 738 | return metric 739 | 740 | print(history.history.keys()) 741 | 742 | resi=check_metric(output_class_pred,original_ans,diff1) 743 | resi.to_csv('results_text.csv', mode='a', index = False, header=resi.columns,columns=resi.columns) 744 | 745 | 746 | 747 | ##### 748 | 749 | filter_sizes = [1,2,3,4] 750 | num_filters = 32 751 | embed_size=300 752 | embedding_matrix=train_embedding_weights 753 | max_features=len(train_word_index)+1 754 | maxlen=m*n 755 | def get_model(): 756 | inp = Input(shape=(maxlen, )) 757 | x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp) 758 | x = SpatialDropout1D(0.4)(x) 759 | x = Reshape((m, n, 300))(x) 760 | #print(x) 761 | conv_0 = Conv2D(num_filters, kernel_size=(filter_sizes[0], 2), 762 | activation='relu')(x) 763 | conv_1 = Conv2D(num_filters, kernel_size=(filter_sizes[0], 3), 764 | activation='relu')(x) 765 | 766 | #conv_2 = Conv2D(num_filters, kernel_size=(filter_sizes[0], 4), 767 | #activation='relu')(x) 768 | 769 | 770 | 771 | 772 | 773 | conv_4 = Conv2D(num_filters, kernel_size=(filter_sizes[1], 1), 774 | activation='relu')(x) 775 | conv_5 = Conv2D(num_filters, kernel_size=(filter_sizes[1], 2), activation='relu')(x) 776 | 777 | #conv_6 = Conv2D(num_filters, kernel_size=(filter_sizes[1], 3), 778 | #activation='relu')(x) 779 | 780 | 781 | 782 | maxpool_0 = MaxPool2D()(conv_0) 783 | maxpool_0=Flatten()(maxpool_0) 784 | maxpool_1 = MaxPool2D()(conv_1) 785 | maxpool_1=Flatten()(maxpool_1) 786 | #maxpool_2 = MaxPool2D()(conv_2) 787 | #maxpool_2 = Flatten()(maxpool_2) 788 | 789 | maxpool_4 = MaxPool2D()(conv_4) 790 | maxpool_4=Flatten()(maxpool_4) 791 | maxpool_5 = MaxPool2D()(conv_5) 792 | maxpool_5=Flatten()(maxpool_5) 793 | #maxpool_6 = MaxPool2D()(conv_6) 794 | #maxpool_6=Flatten()(maxpool_6) 795 | #maxpool_7 = MaxPool2D()(conv_7) 796 | # maxpool_7=Flatten()(maxpool_7) 797 | 798 | #w=concatenate([maxpool_4, maxpool_5,maxpool_6],axis=1) 799 | #z = concatenate([maxpool_0, maxpool_1,maxpool_2,maxpool_4, maxpool_5,maxpool_6],axis=1) 800 | #z = concatenate([maxpool_0, maxpool_1,maxpool_4, maxpool_5],axis=1) 801 | w=concatenate([maxpool_4, maxpool_5],axis=1) 802 | #z = concatenate([maxpool_0, maxpool_1,maxpool_2,maxpool_4, 
maxpool_5,maxpool_6],axis=1) 803 | z = concatenate([maxpool_0, maxpool_1],axis=1) 804 | 805 | #z = Flatten()(z) 806 | z=concatenate([w,z],axis=1) 807 | #z = Flatten()(z) 808 | #z=concatenate([w,z],axis=1) 809 | z=Dense(units=64,activation="relu")(z) 810 | z = Dropout(0.4)(z) 811 | 812 | outp = Dense(1, activation="sigmoid")(z) 813 | 814 | model = Model(inputs=inp, outputs=outp) 815 | 816 | model.compile(loss='binary_crossentropy',optimizer='Adam',metrics=['accuracy']) 817 | 818 | return model 819 | 820 | 821 | # In[ ]: 822 | 823 | 824 | model=get_model() 825 | 826 | 827 | # In[ ]: 828 | 829 | 830 | print(model.summary()) 831 | 832 | 833 | # In[ ]: 834 | 835 | 836 | 837 | #define callbacks 838 | from tensorflow.keras.callbacks import EarlyStopping 839 | early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=4, verbose=1) 840 | callbacks_list = [early_stopping] 841 | 842 | import time, datetime 843 | start = datetime.datetime.now() 844 | 845 | history=model.fit(train_cnn_data, y_train, epochs=10,callbacks=callbacks_list,batch_size=32,validation_split=0.1 ) 846 | 847 | end = datetime.datetime.now() 848 | diff1= (end - start) 849 | print('time taken by text_4',diff1) 850 | 851 | 852 | 853 | 854 | 855 | pred=model.predict(test_cnn_data) 856 | y_test=pred 857 | y_test=y_test.tolist() 858 | output_class_pred=[] 859 | for i in range(len(y_test)): 860 | if(y_test[i][0]<0.5): 861 | output_class_pred.append(0) 862 | else: 863 | output_class_pred.append(1) 864 | 865 | original_ans=data_test['tag'] 866 | original_ans=original_ans.tolist() 867 | 868 | # In[ ]: 869 | from sklearn.metrics import confusion_matrix 870 | from sklearn.metrics import classification_report 871 | 872 | #as its a fake news classifier , so identifying a fake class will be a TP 873 | 874 | 875 | resi=check_metric(output_class_pred,original_ans,diff1) 876 | resi.to_csv('results_text.csv', mode='a', index = False, header=resi.columns,columns=resi.columns) 877 | 878 | 879 | 880 | 881 | 882 | 883 | 884 | # In[ ]: 885 | 886 | 887 | 888 | 889 | 890 | # In[ ]: 891 | 892 | 893 | ## now perparing training data for yoon kim model 894 | 895 | 896 | # In[ ]: 897 | 898 | 899 | def create_single_line_para(x): 900 | l=[] 901 | for i in x: 902 | l+=i #concatenating all the sentences in a para into a single 1 d arrray 903 | return l 904 | 905 | 906 | 907 | # In[ ]: 908 | 909 | 910 | data_train['create_single_line_para']=data_train['train_seq'].apply(lambda x: create_single_line_para(x) ) 911 | data_test['create_single_line_para']=data_test['train_seq'].apply(lambda x: create_single_line_para(x) ) 912 | 913 | 914 | # In[ ]: 915 | 916 | 917 | from tensorflow.keras.preprocessing.sequence import pad_sequences 918 | yoon_kim_train_data=np.array(data_train['create_single_line_para'].tolist()) 919 | yoon_kim_train_data=pad_sequences(yoon_kim_train_data,maxlen=para_max,padding='post') 920 | 921 | # In[ ]: 922 | yoon_kim_test_data=np.array(data_test['create_single_line_para'].tolist()) 923 | yoon_kim_test_data=pad_sequences(yoon_kim_test_data,maxlen=para_max,padding='post') 924 | 925 | 926 | #from __future__ import print_function 927 | from tensorflow.keras.layers import Embedding 928 | 929 | from tensorflow.keras.preprocessing.text import text_to_word_sequence 930 | import pandas as pd 931 | from tensorflow.keras.preprocessing.text import Tokenizer 932 | import numpy as np 933 | 934 | 935 | from tensorflow.keras.preprocessing import sequence 936 | from tensorflow.keras.models import Sequential,Model 937 | from tensorflow.keras.layers 
import Dense, Dropout, Activation,Flatten,Bidirectional,GRU,LSTM 938 | from tensorflow.keras.layers import Embedding,concatenate 939 | from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D,MaxPooling1D,GlobalAveragePooling1D 940 | from tensorflow.keras.models import Model 941 | from tensorflow.keras.layers import Input 942 | 943 | 944 | # In[ ]: 945 | 946 | 947 | train_y=pd.get_dummies(y_train) 948 | 949 | 950 | # In[ ]: 951 | 952 | 953 | trains_y=train_y[[0,1]].values 954 | 955 | 956 | # In[ ]: 957 | 958 | 959 | embed_size=300 960 | embedding_matrix=train_embedding_weights 961 | max_features=len(train_word_index)+1 962 | maxlen=para_max 963 | max_sequence_length=para_max 964 | MAX_SEQUENCE_LENGTH=para_max 965 | EMBEDDING_DIM=300 966 | 967 | 968 | #model3 yoon kim 969 | 970 | 971 | # In[ ]: 972 | 973 | 974 | def ConvNet(embeddings, max_sequence_length, num_words, embedding_dim, trainable=True, extra_conv=False): 975 | 976 | embedding_layer = Embedding(num_words, 977 | embedding_dim, 978 | weights=[embeddings], 979 | input_length=max_sequence_length, 980 | trainable=trainable) 981 | 982 | sequence_input = Input(shape=(max_sequence_length,), dtype='int32') 983 | embedded_sequences = embedding_layer(sequence_input) 984 | 985 | # Yoon Kim model (https://arxiv.org/abs/1408.5882) 986 | convs = [] 987 | filter_sizes = [3,4,5] 988 | 989 | for filter_size in filter_sizes: 990 | l_conv = Conv1D(filters=100, kernel_size=filter_size, activation='relu')(embedded_sequences) 991 | l_pool = MaxPooling1D(pool_size=2)(l_conv) 992 | convs.append(l_pool) 993 | 994 | l_merge = concatenate(convs, axis=1) 995 | 996 | # add a 1D convnet with global maxpooling, instead of Yoon Kim model 997 | #conv = Conv1D(filters=128, kernel_size=3, activation='relu')(embedded_sequences) 998 | #pool = MaxPooling1D(pool_size=2)(conv) 999 | 1000 | #if extra_conv==True: 1001 | #x = Dropout(0.01)(l_merge) 1002 | #else: 1003 | # Original Yoon Kim model 1004 | #x = Dropout(0.001)(pool) 1005 | x = Flatten()(l_merge) 1006 | 1007 | x = Dropout(0.5)(x) 1008 | # Finally, we feed the output into a Sigmoid layer. 1009 | # The reason why sigmoid is used is because we are trying to achieve a binary classification(1,0) 1010 | # for each of the 6 labels, and the sigmoid function will squash the output between the bounds of 0 and 1. 
1011 | preds = Dense(2, activation='softmax')(x) 1012 | 1013 | model = Model(sequence_input, preds) 1014 | model.compile(loss='categorical_crossentropy', 1015 | optimizer='Adam', 1016 | metrics=['acc']) 1017 | model.summary() 1018 | return model 1019 | 1020 | 1021 | # In[ ]: 1022 | 1023 | 1024 | model1 = ConvNet(train_embedding_weights, MAX_SEQUENCE_LENGTH, len(train_word_index)+1, EMBEDDING_DIM, 1025 | True) 1026 | 1027 | 1028 | # In[ ]: 1029 | 1030 | 1031 | training_data=yoon_kim_train_data 1032 | 1033 | 1034 | # In[ ]: 1035 | 1036 | 1037 | testing_data=yoon_kim_test_data 1038 | 1039 | 1040 | # In[ ]: 1041 | 1042 | 1043 | 1044 | #define callbacks 1045 | from tensorflow.keras.callbacks import EarlyStopping 1046 | early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=4, verbose=1) 1047 | callbacks_list = [early_stopping] 1048 | 1049 | import time, datetime 1050 | start = datetime.datetime.now() 1051 | 1052 | hist = model1.fit(training_data, trains_y, epochs=10,callbacks=callbacks_list,batch_size=32,validation_split=0.1 ) 1053 | end = datetime.datetime.now() 1054 | diff1= (end - start) 1055 | print('time taken by yoon',diff1) 1056 | 1057 | 1058 | # In[ ]: 1059 | 1060 | 1061 | pred=model1.predict(testing_data) 1062 | y_test=pred 1063 | y_test=y_test.tolist() 1064 | output_class_pred=[] 1065 | #output_class_pred=[] 1066 | for i in range(len(y_test)): 1067 | m=max(y_test[i]) 1068 | if(y_test[i].index(m)==0): 1069 | output_class_pred.append(0) 1070 | else: 1071 | output_class_pred.append(1) 1072 | 1073 | 1074 | original_ans=data_test['tag'] 1075 | original_ans=original_ans.tolist() 1076 | 1077 | 1078 | # In[ ]: 1079 | 1080 | 1081 | #as its a fake news classifier , so identifying a fake class will be a TP 1082 | resi=check_metric(output_class_pred,original_ans,diff1) 1083 | 1084 | resi.to_csv('results_text', mode='a', index = False, header=resi.columns,columns=resi.columns) 1085 | 1086 | 1087 | 1088 | from tensorflow.keras.models import Model, Sequential 1089 | 1090 | from tensorflow.keras.layers import Dropout, Embedding, concatenate 1091 | from tensorflow.keras.layers import Conv1D, MaxPooling1D, Conv2D, MaxPooling2D, ZeroPadding1D 1092 | from tensorflow.keras.layers import Dense, Input, Flatten, BatchNormalization 1093 | from tensorflow.keras.layers import Concatenate, Dot, Multiply, RepeatVector 1094 | from tensorflow.keras.layers import Bidirectional, TimeDistributed 1095 | from tensorflow.keras.layers import SimpleRNN, LSTM, GRU, Lambda, Permute 1096 | 1097 | #from tensorflow.keras.layers.core import Reshape, Activation 1098 | from tensorflow.keras.optimizers import Adam 1099 | from tensorflow.keras.callbacks import ModelCheckpoint,EarlyStopping,TensorBoard 1100 | #from tensorflow.keras.constraints import maxnorm 1101 | #from tensorflow.keras.regularizers import l2 1102 | 1103 | def ConvNet_vdcnn(embeddings, max_sequence_length, num_words, embedding_dim, trainable=True, extra_conv=False): 1104 | 1105 | embedding_layer = Embedding(num_words, 1106 | embedding_dim, 1107 | weights=[embeddings], 1108 | input_length=max_sequence_length, 1109 | trainable=trainable) 1110 | 1111 | sequence_input = Input(shape=(max_sequence_length,), dtype='int32') 1112 | embedded_sequences = embedding_layer(sequence_input) 1113 | 1114 | 1115 | 1116 | # 4 pairs of convolution blocks followed by pooling 1117 | conv = Conv1D(filters=64, kernel_size=3, strides=2, padding="same")(embedded_sequences) 1118 | 1119 | 1120 | # 4 pairs of convolution blocks followed by pooling 1121 | for filter_size in 
[64, 128, 256, 512]: 1122 | 1123 | # each iteration is a convolution block 1124 | for cb_i in [0,1]: 1125 | conv=(Conv1D(filter_size, 3, padding="same",activation='relu'))(conv) 1126 | #model_1.add(BatchNormalization()) 1127 | #model_1.add(Activation("relu")) 1128 | conv=(Conv1D(filter_size, 1, padding="same",activation='relu'))(conv) 1129 | #model_1.add(BatchNormalization()) 1130 | #model_1.add(Activation("relu")) 1131 | 1132 | conv=(MaxPooling1D(pool_size=2, strides=3))(conv) 1133 | 1134 | # model.add(KMaxPooling(k=2)) 1135 | conv=(Flatten())(conv) 1136 | conv=(Dense(4096, activation="relu"))(conv) 1137 | conv=(Dense(2048, activation="relu"))(conv) 1138 | conv=(Dense(2048, activation="relu"))(conv) 1139 | #(Dense(9, activation="softmax")) 1140 | 1141 | preds = Dense(2, activation='softmax')(conv) 1142 | 1143 | model = Model(sequence_input, preds) 1144 | model.compile(loss='categorical_crossentropy', 1145 | optimizer='Adam',metrics=['acc']) 1146 | print(model.summary()) 1147 | return model 1148 | 1149 | 1150 | 1151 | model1 = ConvNet_vdcnn(train_embedding_weights, MAX_SEQUENCE_LENGTH, len(train_word_index)+1, EMBEDDING_DIM, 1152 | True) 1153 | 1154 | 1155 | 1156 | 1157 | # In[ ]: 1158 | 1159 | 1160 | training_data=yoon_kim_train_data 1161 | 1162 | 1163 | # In[ ]: 1164 | 1165 | 1166 | testing_data=yoon_kim_test_data 1167 | 1168 | 1169 | # In[ ]: 1170 | 1171 | 1172 | 1173 | #define callbacks 1174 | from tensorflow.keras.callbacks import EarlyStopping 1175 | early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=4, verbose=1) 1176 | callbacks_list = [early_stopping] 1177 | 1178 | import time, datetime 1179 | start = datetime.datetime.now() 1180 | 1181 | hist = model1.fit(training_data, trains_y, epochs=10,callbacks=callbacks_list,batch_size=32,validation_split=0.1 ) 1182 | end = datetime.datetime.now() 1183 | diff1= (end - start) 1184 | print('time taken by yoon',diff1) 1185 | 1186 | 1187 | # In[ ]: 1188 | 1189 | 1190 | pred=model1.predict(testing_data) 1191 | y_test=pred 1192 | y_test=y_test.tolist() 1193 | output_class_pred=[] 1194 | #output_class_pred=[] 1195 | for i in range(len(y_test)): 1196 | m=max(y_test[i]) 1197 | if(y_test[i].index(m)==0): 1198 | output_class_pred.append(0) 1199 | else: 1200 | output_class_pred.append(1) 1201 | 1202 | 1203 | original_ans=data_test['tag'] 1204 | original_ans=original_ans.tolist() 1205 | 1206 | 1207 | # In[ ]: 1208 | 1209 | 1210 | #as its a fake news classifier , so identifying a fake class will be a TP 1211 | resi=check_metric(output_class_pred,original_ans,diff1) 1212 | 1213 | resi.to_csv('results_text', mode='a', index = False, header=resi.columns,columns=resi.columns) 1214 | 1215 | 1216 | 1217 | 1218 | 1219 | def ConvNet_clstm(embeddings, max_sequence_length, num_words, embedding_dim, trainable=True, extra_conv=False): 1220 | 1221 | embedding_layer = Embedding(num_words, 1222 | embedding_dim, 1223 | weights=[embeddings], 1224 | input_length=max_sequence_length, 1225 | trainable=trainable) 1226 | 1227 | sequence_input = Input(shape=(max_sequence_length,), dtype='int32') 1228 | embedded_sequences = embedding_layer(sequence_input) 1229 | 1230 | convs = [] 1231 | filter_sizes = [10, 20, 30, 40] 1232 | 1233 | for filter_size in filter_sizes: 1234 | l_conv = Conv1D(filters=64, kernel_size=filter_size, padding='valid', activation='relu')(embedded_sequences) 1235 | convs.append(l_conv) 1236 | 1237 | cnn_feature_maps = Concatenate(axis=1)(convs) 1238 | sentence_encoder = LSTM(64,return_sequences=False)(cnn_feature_maps) 1239 | 
fc_layer =Dense(128, activation="relu")(sentence_encoder) 1240 | #output_layer = Dense(9,activation="softmax")(fc_layer) 1241 | 1242 | #model_1 = Model(inputs=[text_input_layer], outputs=[output_layer]) 1243 | preds = Dense(2, activation='softmax')(fc_layer) 1244 | 1245 | model = Model(sequence_input, preds) 1246 | model.compile(loss='categorical_crossentropy', 1247 | optimizer='Adam', 1248 | metrics=['acc']) 1249 | model.summary() 1250 | return model 1251 | 1252 | 1253 | model1 = ConvNet_clstm(train_embedding_weights, MAX_SEQUENCE_LENGTH, len(train_word_index)+1, EMBEDDING_DIM, 1254 | True) 1255 | 1256 | 1257 | 1258 | 1259 | # In[ ]: 1260 | 1261 | 1262 | training_data=yoon_kim_train_data 1263 | 1264 | 1265 | # In[ ]: 1266 | 1267 | 1268 | testing_data=yoon_kim_test_data 1269 | 1270 | 1271 | # In[ ]: 1272 | 1273 | 1274 | 1275 | #define callbacks 1276 | from tensorflow.keras.callbacks import EarlyStopping 1277 | early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=4, verbose=1) 1278 | callbacks_list = [early_stopping] 1279 | 1280 | import time, datetime 1281 | start = datetime.datetime.now() 1282 | 1283 | hist = model1.fit(training_data, trains_y, epochs=10,callbacks=callbacks_list,batch_size=32,validation_split=0.1 ) 1284 | end = datetime.datetime.now() 1285 | diff1= (end - start) 1286 | print('time taken by yoon',diff1) 1287 | 1288 | 1289 | # In[ ]: 1290 | 1291 | 1292 | pred=model1.predict(testing_data) 1293 | y_test=pred 1294 | y_test=y_test.tolist() 1295 | output_class_pred=[] 1296 | #output_class_pred=[] 1297 | for i in range(len(y_test)): 1298 | m=max(y_test[i]) 1299 | if(y_test[i].index(m)==0): 1300 | output_class_pred.append(0) 1301 | else: 1302 | output_class_pred.append(1) 1303 | 1304 | 1305 | original_ans=data_test['tag'] 1306 | original_ans=original_ans.tolist() 1307 | 1308 | 1309 | # In[ ]: 1310 | 1311 | 1312 | #as its a fake news classifier , so identifying a fake class will be a TP 1313 | resi=check_metric(output_class_pred,original_ans,diff1) 1314 | 1315 | resi.to_csv('results_text', mode='a', index = False, header=resi.columns,columns=resi.columns) 1316 | 1317 | 1318 | 1319 | 1320 | def ConvNet_lstm(embeddings, max_sequence_length, num_words, embedding_dim, trainable=True, extra_conv=False): 1321 | 1322 | 1323 | embedding_layer = Embedding(num_words,embedding_dim,weights=[embeddings],input_length=max_sequence_length,trainable=trainable) 1324 | 1325 | sequence_input = Input(shape=(max_sequence_length,), dtype='int32') 1326 | embedded_sequences = embedding_layer(sequence_input) 1327 | 1328 | sentence_encoder = LSTM(64,return_sequences=False)(embedded_sequences) 1329 | fc_layer =Dense(128, activation="relu")(sentence_encoder) 1330 | #output_layer = Dense(9,activation="softmax")(fc_layer) 1331 | #model_1 = Model(inputs=[text_input_layer], outputs=[output_layer]) 1332 | preds = Dense(2, activation='softmax')(fc_layer) 1333 | 1334 | model = Model(sequence_input, preds) 1335 | model.compile(loss='categorical_crossentropy',optimizer='Adam',metrics=['acc']) 1336 | model.summary() 1337 | return model 1338 | 1339 | 1340 | 1341 | 1342 | 1343 | 1344 | model1 = ConvNet_lstm(train_embedding_weights, MAX_SEQUENCE_LENGTH, len(train_word_index)+1, EMBEDDING_DIM, 1345 | True) 1346 | 1347 | 1348 | 1349 | 1350 | # In[ ]: 1351 | 1352 | 1353 | training_data=yoon_kim_train_data 1354 | 1355 | 1356 | # In[ ]: 1357 | 1358 | 1359 | testing_data=yoon_kim_test_data 1360 | 1361 | 1362 | # In[ ]: 1363 | 1364 | 1365 | 1366 | #define callbacks 1367 | from tensorflow.keras.callbacks 
import EarlyStopping 1368 | early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=4, verbose=1) 1369 | callbacks_list = [early_stopping] 1370 | 1371 | import time, datetime 1372 | start = datetime.datetime.now() 1373 | 1374 | hist = model1.fit(training_data, trains_y, epochs=10,callbacks=callbacks_list,batch_size=32,validation_split=0.1 ) 1375 | end = datetime.datetime.now() 1376 | diff1= (end - start) 1377 | print('time taken by yoon',diff1) 1378 | 1379 | 1380 | # In[ ]: 1381 | 1382 | 1383 | pred=model1.predict(testing_data) 1384 | y_test=pred 1385 | y_test=y_test.tolist() 1386 | output_class_pred=[] 1387 | #output_class_pred=[] 1388 | for i in range(len(y_test)): 1389 | m=max(y_test[i]) 1390 | if(y_test[i].index(m)==0): 1391 | output_class_pred.append(0) 1392 | else: 1393 | output_class_pred.append(1) 1394 | 1395 | 1396 | original_ans=data_test['tag'] 1397 | original_ans=original_ans.tolist() 1398 | 1399 | 1400 | # In[ ]: 1401 | 1402 | 1403 | #as its a fake news classifier , so identifying a fake class will be a TP 1404 | resi=check_metric(output_class_pred,original_ans,diff1) 1405 | 1406 | resi.to_csv('results_text', mode='a', index = False, header=resi.columns,columns=resi.columns) 1407 | 1408 | 1409 | 1410 | #resi.to_csv('results.csv', mode='a', index = False, header=resi.columns,columns=resi.columns) 1411 | 1412 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.9.0 2 | argon2-cffi==20.1.0 3 | arrow==0.13.1 4 | asn1crypto==1.2.0 5 | astor==0.8.0 6 | async-generator==1.10 7 | attrs==19.3.0 8 | Automat==0.8.0 9 | backcall==0.1.0 10 | bcrypt==3.1.7 11 | beautifulsoup4==4.8.2 12 | bleach==3.3.0 13 | blinker==1.4 14 | cachetools==3.1.1 15 | certifi==2019.11.28 16 | cffi==1.13.0 17 | chardet==3.0.4 18 | Click==7.0 19 | cloudpickle==1.3.0 20 | conda==4.8.2 21 | conda-package-handling==1.6.0 22 | constantly==15.1.0 23 | cryptography==2.8 24 | cssselect==1.1.0 25 | cycler==0.10.0 26 | Cython==0.29.17 27 | cytoolz==0.10.1 28 | dask==2.11.0 29 | debugpy==1.3.0 30 | decorator==4.4.1 31 | defusedxml==0.7.1 32 | eli5==0.10.1 33 | entrypoints==0.3 34 | fastcache==1.1.0 35 | future==0.18.2 36 | gast==0.2.2 37 | gensim==4.0.1 38 | gmpy2==2.0.8 39 | google-auth==1.11.2 40 | google-auth-oauthlib==0.4.1 41 | google-pasta==0.1.8 42 | graphviz==0.13.2 43 | grpcio==1.27.2 44 | h5py==2.8.0 45 | hyperlink==19.0.0 46 | idna==2.8 47 | imageio==2.6.1 48 | importlib-metadata==1.5.0 49 | incremental==17.5.0 50 | ipykernel==6.0.0 51 | ipython==7.25.0 52 | ipython-genutils==0.2.0 53 | jedi==0.16.0 54 | Jinja2==2.11.1 55 | joblib==0.14.1 56 | jsonschema==3.2.0 57 | jupyter-client==6.1.12 58 | jupyter-core==4.7.1 59 | jupyterlab-pygments==0.1.2 60 | Keras==2.3.1 61 | Keras-Applications==1.0.8 62 | Keras-Preprocessing==1.1.0 63 | kiwisolver==1.1.0 64 | leveldb==0.201 65 | lightgbm==2.3.0 66 | lxml==4.5.0 67 | Mako==1.1.1 68 | Markdown==3.1.1 69 | MarkupSafe==1.1.1 70 | matplotlib==3.1.3 71 | matplotlib-inline==0.1.2 72 | mistune==0.8.4 73 | mkl-fft==1.0.15 74 | mkl-random==1.1.0 75 | mkl-service==2.3.0 76 | more-itertools==8.2.0 77 | mpmath==1.1.0 78 | nbclient==0.5.3 79 | nbconvert==6.1.0 80 | nbformat==5.1.3 81 | nest-asyncio==1.5.1 82 | networkx==2.4 83 | nltk==3.4.5 84 | nose==1.3.7 85 | notebook==6.4.0 86 | numpy==1.18.1 87 | oauthlib==3.1.0 88 | olefile==0.46 89 | opt-einsum==3.1.0 90 | packaging==20.1 91 | pandas==1.0.1 92 | pandocfilters==1.4.3 
93 | parsel==1.5.2 94 | parso==0.6.1 95 | patsy==0.5.1 96 | peewee==3.10.0 97 | pexpect==4.8.0 98 | pickleshare==0.7.5 99 | Pillow==7.0.0 100 | pluggy==0.13.1 101 | prometheus-client==0.11.0 102 | prompt-toolkit==3.0.3 103 | protobuf==3.11.4 104 | ptyprocess==0.6.0 105 | py==1.8.1 106 | pyasn1==0.4.8 107 | pyasn1-modules==0.2.7 108 | pycosat==0.6.3 109 | pycparser==2.19 110 | PyDispatcher==2.0.5 111 | pyglet==1.5.0 112 | Pygments==2.5.2 113 | pygpu==0.7.6 114 | PyHamcrest==1.9.0 115 | PyJWT==1.7.1 116 | pyOpenSSL==19.0.0 117 | pyparsing==2.4.6 118 | pyrsistent==0.18.0 119 | PySocks==1.7.1 120 | pytest==5.3.5 121 | pytest-runner==5.2 122 | python-dateutil==2.8.1 123 | python-gflags==3.1.2 124 | pytz==2019.3 125 | PyWavelets==1.1.1 126 | PyYAML==5.3 127 | pyzmq==22.1.0 128 | queuelib==1.5.0 129 | requests==2.22.0 130 | requests-oauthlib==1.3.0 131 | rsa==4.0 132 | ruamel-yaml==0.15.46 133 | scapy==2.4.3 134 | scikit-image==0.16.2 135 | scikit-learn==0.22.1 136 | scipy==1.4.1 137 | Scrapy==1.6.0 138 | seaborn==0.10.0 139 | Send2Trash==1.7.1 140 | service-identity==18.1.0 141 | simplejson==3.17.0 142 | singledispatch==3.4.0.3 143 | six==1.12.0 144 | smart-open==5.1.0 145 | soupsieve==1.9.5 146 | SQLAlchemy==1.3.13 147 | statsmodels==0.11.0 148 | sympy==1.5.1 149 | tabulate==0.8.6 150 | tensorboard==2.1.0 151 | tensorflow==2.0.0 152 | tensorflow-estimator==2.0.0 153 | termcolor==1.1.0 154 | terminado==0.10.1 155 | testpath==0.5.0 156 | Theano==1.0.4 157 | toolz==0.10.0 158 | torch==1.3.1 159 | tornado==6.1 160 | tqdm==4.36.1 161 | traitlets==4.3.3 162 | Twisted==19.10.0 163 | urllib3==1.24.2 164 | w3lib==1.21.0 165 | wcwidth==0.1.8 166 | webencodings==0.5.1 167 | Werkzeug==0.16.1 168 | wrapt==1.11.2 169 | wxPython==4.0.4 170 | zipp==2.2.0 171 | zope.interface==4.7.1 172 | -------------------------------------------------------------------------------- /sample.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -N 1 #1 Node 3 | #SBATCH --ntasks-per-node=8 4 | #SBATCH --time=3-00:00:00 5 | #SBATCH --job-name=run_models_gpu_1_16GB 6 | #SBATCH --error=%J.err 7 | #SBATCH --output=%J.out 8 | 9 | #SBATCH --partition=gpu 10 | #SBATCH --gres=gpu:1 11 | 12 | eval "$(conda shell.bash hook)" 13 | 14 | module load python/conda-python/3.7 15 | 16 | python3 /scratch/satyendrac.mnitjaipur/codes/soni/run_models.py --------------------------------------------------------------------------------