├── All_machine_learning _models.py
├── Detailed_performance_metrics (1).xlsx
├── README.md
├── Supplementary_File .pdf
├── TextConvoNet.py
├── requirements.txt
└── sample.sh
/All_machine_learning _models.py:
--------------------------------------------------------------------------------
1 | # %% [code]
2 | # This Python 3 environment comes with many helpful analytics libraries installed
3 | # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
4 | # For example, here's several helpful packages to load
5 |
6 | import numpy as np # linear algebra
7 | import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
8 |
9 | # Input data files are available in the read-only "../input/" directory
10 | # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
11 |
12 |
13 | # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
14 | # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
15 |
16 | # %% [code]
17 |
18 | import pandas as pd
19 | import numpy as np
20 | import matplotlib.pyplot as plt
21 | import seaborn as sns
22 | import nltk
23 | from nltk.corpus import stopwords
24 | import string
25 | import math
26 | from sklearn.feature_extraction.text import CountVectorizer
27 | from sklearn.model_selection import train_test_split, cross_val_score
28 | from sklearn.metrics import classification_report
29 | from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve
30 | #from sklearn.grid_search import GridSearchCV
31 | # %matplotlib inline  # Jupyter magic; commented out so the script also runs outside a notebook
32 |
33 |
34 | '''import bz2
35 | def get_labels_and_texts(file):
36 | labels = []
37 | texts = []
38 | for line in bz2.BZ2File(file):
39 | x = line.decode("utf-8")
40 | labels.append(1 if int(x[9]) == 2 else 0)
41 | texts.append(x[10:].strip())
42 | return np.array(labels), texts
43 | train_labels, train_texts = get_labels_and_texts('/kaggle/input/amazonreviews/train.ft.txt.bz2')
44 | test_labels, test_texts = get_labels_and_texts('/kaggle/input/amazonreviews/test.ft.txt.bz2')
45 |
46 | #data_train['review'][7]
47 | print(train_labels[4])
48 | print(train_texts[4])
49 |
50 | # In[6]:
51 | data={"text":train_texts,'stars':train_labels}
52 | data_train=pd.DataFrame(data)
53 | data1={"text":test_texts,'stars':test_labels}
54 | data_test=pd.DataFrame(data1)
55 | '''
56 | import numpy as np
57 | import pandas as pd
58 | def multiclass_metrics(cnf_matrix):
59 | cnf_matrix=np.asarray(cnf_matrix)
60 | FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)
61 | FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)
62 | TP = np.diag(cnf_matrix)
63 | TN = cnf_matrix.sum() - (FP + FN + TP)
64 | FP = FP.astype(float)
65 | FN = FN.astype(float)
66 | TP = TP.astype(float)
67 | TN = TN.astype(float)
68 |
69 | TP=np.sum(TP)
70 | TN=np.sum(TN)
71 | FP=np.sum(FP)
72 | FN=np.sum(FN)
73 |
74 |
75 | accuracy=(TP+TN)/(TP+FP+FN+TN)
76 | precision=TP/(TP+FP)
77 | recalll=TP/(FN+TP)
78 | F1=2*precision*recalll/(precision+recalll)
79 | sensiti=TP/(TP+FN)
80 | specifici=TN/(TN+FP)
81 | numerator=TP*TN - FP*FN
82 |
83 | denominator=np.sqrt((TP+FP)*(FN+TN)*(FP+TN)* (TP+FN))
84 | MCc=numerator/denominator
85 | G_mean1=np.sqrt(sensiti*precision)
86 | G_mean2=np.sqrt(sensiti*specifici)
87 | print('precision:' ,TP/(TP+FP))
88 | print('recall:',TP/(FN+TP))
89 | print("F1:",F1)
90 | print("Specificity:",TN/(TN+FP))
91 | print("Sensitivity ",TP/(TP+FN))
92 | print('G-mean1:',np.sqrt(sensiti*precision))
93 | print("G-mean2",np.sqrt(sensiti*specifici))
94 | print("MCC :",MCc)
95 | acc=[]
96 | pre=[]
97 | recall=[]
98 | f1=[]
99 | specificity=[]
100 | sensitivity=[]
101 | GMean1=[]
102 | Gmean2=[]
103 | MCC=[]
104 | tp=[]
105 | fp=[]
106 | fn=[]
107 | tn=[]
108 | acc.append(accuracy)
109 | pre.append(precision)
110 | recall.append(recalll)
111 | f1.append(F1)
112 | specificity.append(specifici)
113 | sensitivity.append(sensiti)
114 | GMean1.append(G_mean1)
115 | Gmean2.append(G_mean2)
116 | MCC.append(MCc)
117 | tp.append(TP)
118 | fp.append(FP)
119 | tn.append(TN)
120 | fn.append(FN)
121 | data={'accuracy_all':acc,"precision":pre,'recall':recall,'F1_score':f1,'specificity':specificity,'sensitivity':sensitivity,'Gmean1':GMean1,"Gmean2":Gmean2,"MCC":MCC,"TP":tp,"FP":fp,"TN":tn,"FN":fn,}
122 | metric=pd.DataFrame(data)
123 | return metric
124 |
125 | #cnf_matrix=[[1025,0,0,20,0,0,0,0,17],[0,0,0,2,0,0,0,0,3],[83,0,63,5,0,0,0,0,0],[18,0,0,330,0,0,0,0,1],[16,0,0,0,165,0,0,0,0],[51,0,0,0,0,0,0,0,0],[2,0,0,1,0,0,0,0,2],[8,0,0,0,0,0,0,0,0],[32,0,0,2,0,0,0,0,154]]
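# Hedged sanity check (illustrative only, not part of the original pipeline): for a toy
# 2x2 confusion matrix
#     [[50, 10],
#      [ 5, 35]]
# the function sums TP/FP/FN/TN over both classes (micro-averaging), giving TP=85, FP=15,
# FN=15, TN=85, so accuracy, precision and recall all evaluate to 170/200 = 0.85.
# Uncomment to run:
# print(multiclass_metrics([[50, 10], [5, 35]]))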
126 |
127 |
128 | data_train=pd.read_csv('../input/twitter-airline-sentiment/Tweets.csv')
129 |
130 | data_test=pd.read_csv('../input/twitter-airline-sentiment/Tweets.csv')
131 | print("hi")
132 |
133 | data_train=data_train[:10000]
134 | data_test=data_test[10000:]
135 | # In[7]:
136 |
137 |
138 |
139 | data_train.rename(columns={'text':'title','airline_sentiment':'tag'},inplace=True)
140 | data_test.rename(columns={'text':'title','airline_sentiment':'tag'},inplace=True)
141 | # In[94]:
142 | print("Columns renamed to 'title' and 'tag'")
143 |
144 | # In[8]:
145 |
146 |
147 | # In[88]:
148 |
149 |
150 | data_train['title']=data_train['title'].astype(str)
151 | data_test['title']=data_test['title'].astype(str)
152 | #data_train
153 | print("Text columns converted to str")
154 |
155 |
156 | '''def make_tags(x): #converting the ratings column into 0's and 1's. for binary classifier to take place
157 | if(x<=3):
158 | return 0
159 | else:
160 | return 1
161 |
162 |
163 |
164 | # In[10]:
165 |
166 |
167 | data_train['tag']=data_train['tag'].apply(lambda x: make_tags(x))
168 | data_test['tag']=data_test['tag'].apply(lambda x: make_tags(x))
169 | print('sddsd')
170 | '''
171 | x_train=data_train['title']
172 | y_train=data_train['tag']
173 |
174 | test_cnn_data=data_test['title']
175 | #y_test=data_test['tag']
176 |
177 | print("Extracted training text/labels and test text")
178 | '''def text_process(text):
179 | nopunc = [char for char in text if char not in string.punctuation]
180 | nopunc = ''.join(nopunc)
181 | return [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]
182 | '''
183 | print("Fitting CountVectorizer on the training text...")
184 | vocab = CountVectorizer().fit(x_train)
185 | print("CountVectorizer fitted")
186 | print("Vocabulary size:", len(vocab.vocabulary_))
187 | #print(x_train[2000])
188 | '''r0 = x[2000]
189 | print(r0)
190 | vocab0 = vocab.transform([r0])
191 | print(vocab0)
192 | """
193 | Now the words in the review number 78 have been converted into a vector.
194 | The data that we can see is the transformed words.
195 | If we now get the feature's name - we can get the word back!
196 | """
197 | print("Getting the words back:")
198 | print(vocab.get_feature_names()[19648])
199 | print(vocab.get_feature_names()[10643])
200 | '''
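# Illustrative aside (the example sentence is an assumption for demonstration, not part of
# the original pipeline): the fitted CountVectorizer turns any text into a sparse bag-of-words
# row whose non-zero entries count how often each vocabulary word occurs in that text.
_demo_vec = vocab.transform(["the flight was delayed and the crew was rude"])
print("demo bag-of-words row:", _demo_vec.nnz, "non-zero entries out of", _demo_vec.shape[1], "vocabulary columns")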
201 |
202 | x_train = vocab.transform(x_train)
203 | test_cnn_data=vocab.transform(test_cnn_data)
204 | print("Shape of the sparse matrix: ", x_train.shape)
205 | print(y_train)
206 |
207 | ######### MULTINOMIAL NAIVE BAYES
208 | from sklearn.naive_bayes import MultinomialNB
209 | model = MultinomialNB()
210 | print("hih")
211 | model.fit(x_train,y_train.values)
212 | #predmnb = mnb.predict(x_test)
213 | #print("Confusion Matrix for Multinomial Naive Bayes:")
214 | #print(confusion_matrix(y_test,predmnb))
215 | #print("Score:",round(accuracy_score(y_test,predmnb)*100,2))
216 | #print("Classification Report:",classification_report(y_test,predmnb))
217 |
218 |
219 |
220 |
221 | pred=model.predict(test_cnn_data)
222 | #print(y_test)
223 | y_test=pred
224 | y_test=y_test.tolist()
225 | output_class_pred=[]
226 | '''for i in range(len(y_test)):
227 | if(y_test[i][0]<0.5):
228 | output_class_pred.append(0)
229 | else:
230 | output_class_pred.append(1)
231 | '''
232 | output_class_pred=y_test
233 | original_ans=data_test['tag']
234 | original_ans=original_ans.tolist()
235 |
236 | # In[ ]:
237 | from sklearn.metrics import confusion_matrix
238 | from sklearn.metrics import classification_report
239 |
240 | # label 1 is treated as the positive class, so a correctly identified label-1 sample counts as a TP
241 | def check_metric(output_class_pred,original_ans):
242 |     rightly_predicted=0
243 |     TP=0
244 |     for i in range(len(output_class_pred)):
245 |         if(original_ans[i]==output_class_pred[i]):
246 |             rightly_predicted+=1
247 | 
248 | 
249 |     print("Overall_accuracy:",rightly_predicted/len(output_class_pred))
250 |     print('TP',TP)
251 |     accuracy=rightly_predicted/len(output_class_pred)
252 | print(classification_report(original_ans,output_class_pred))
253 | print(confusion_matrix(original_ans,output_class_pred))
254 | TN=confusion_matrix(original_ans,output_class_pred)[0][0]
255 | TP=confusion_matrix(original_ans,output_class_pred)[1][1]
256 | FP=confusion_matrix(original_ans,output_class_pred)[0][1]
257 | FN=confusion_matrix(original_ans,output_class_pred)[1][0]
258 |
259 | precision=TP/(TP+FP)
260 | recalll=TP/(FN+TP)
261 | F1=2*precision*recalll/(precision+recalll)
262 | sensiti=TP/(TP+FN)
263 | specifici=TN/(TN+FP)
264 | numerator=TP*TN - FP*FN
265 |
266 | denominator=np.sqrt((TP+FP)*(FN+TN)*(FP+TN)* (TP+FN))
267 | MCc=numerator/denominator
268 | G_mean1=np.sqrt(sensiti*precision)
269 | G_mean2=np.sqrt(sensiti*specifici)
270 | print('precision:' ,TP/(TP+FP))
271 | print('recall:',TP/(FN+TP))
272 | print("F1:",F1)
273 | print("Specificity:",TN/(TN+FP))
274 | print("Sensitivity ",TP/(TP+FN))
275 | print('G-mean1:',np.sqrt(sensiti*precision))
276 | print("G-mean2",np.sqrt(sensiti*specifici))
277 | print("MCC :",MCc)
278 | acc=[]
279 | pre=[]
280 | recall=[]
281 | f1=[]
282 | specificity=[]
283 | sensitivity=[]
284 | GMean1=[]
285 | Gmean2=[]
286 | MCC=[]
287 | tp=[]
288 | fp=[]
289 | fn=[]
290 | tn=[]
291 | acc.append(accuracy)
292 | pre.append(precision)
293 | recall.append(recalll)
294 | f1.append(F1)
295 | specificity.append(specifici)
296 | sensitivity.append(sensiti)
297 | GMean1.append(G_mean1)
298 | Gmean2.append(G_mean2)
299 | MCC.append(MCc)
300 | tp.append(TP)
301 | fp.append(FP)
302 | tn.append(TN)
303 | fn.append(FN)
304 | data={'accuracy_all':acc,"precision":pre,'recall':recall,'F1_score':f1,'specificity':specificity,'sensitivity':sensitivity,'Gmean1':GMean1,"Gmean2":Gmean2,"MCC":MCC,"TP":tp,"FP":fp,"TN":tn,"FN":fn}
305 | metric=pd.DataFrame(data)
306 | return metric
307 |
308 |
309 |
310 |
311 |
312 | cnf_matrix=confusion_matrix(original_ans,output_class_pred)
313 |
314 |
315 | resi=multiclass_metrics(cnf_matrix)
316 | resi.to_csv('results1.csv', mode='w', index = False, header=resi.columns,columns=resi.columns)
317 |
318 |
319 | # In[ ]:
320 |
321 |
322 |
323 | ########RANDOMFOREST
324 | from sklearn.ensemble import RandomForestClassifier
325 | model = RandomForestClassifier()
326 | model.fit(x_train,y_train.values)
327 |
328 | pred=model.predict(test_cnn_data)
329 | print(y_test)
330 | y_test=pred
331 | y_test=y_test.tolist()
332 | output_class_pred=[]
333 | output_class_pred=y_test
334 | original_ans=data_test['tag']
335 | original_ans=original_ans.tolist()
336 |
337 | cnf_matrix=confusion_matrix(original_ans,output_class_pred)
338 |
339 |
340 | resi=multiclass_metrics(cnf_matrix)
341 | resi.to_csv('results1.csv', mode='a', index = False, header=resi.columns,columns=resi.columns)
342 |
343 |
344 |
345 | ####DECISION TREE
346 | from sklearn.tree import DecisionTreeClassifier
347 | model= DecisionTreeClassifier()
348 | model.fit(x_train,y_train.values)
349 |
350 | pred=model.predict(test_cnn_data)
351 | print(y_test)
352 | y_test=pred
353 | y_test=y_test.tolist()
354 | output_class_pred=[]
355 | output_class_pred=y_test
356 | original_ans=data_test['tag']
357 | original_ans=original_ans.tolist()
358 |
359 | cnf_matrix=confusion_matrix(original_ans,output_class_pred)
360 |
361 |
362 | resi=multiclass_metrics(cnf_matrix)
363 | resi.to_csv('results1.csv', mode='a', index = False, header=resi.columns,columns=resi.columns)
364 |
365 |
366 |
367 |
368 | #####SVC
369 | from sklearn.svm import SVC
370 | model = SVC(random_state=101)
371 | model.fit(x_train,y_train.values)
372 | pred=model.predict(test_cnn_data)
373 | print(y_test)
374 | y_test=pred
375 | y_test=y_test.tolist()
376 | output_class_pred=[]
377 | output_class_pred=y_test
378 | original_ans=data_test['tag']
379 | original_ans=original_ans.tolist()
380 |
381 | cnf_matrix=confusion_matrix(original_ans,output_class_pred)
382 |
383 |
384 | resi=multiclass_metrics(cnf_matrix)
385 | resi.to_csv('results1.csv', mode='a', index = False, header=resi.columns,columns=resi.columns)
386 |
387 |
388 |
389 | ####GRADIENT BOOSTING CLASSIFIER
390 | from sklearn.ensemble import GradientBoostingClassifier
391 | model = GradientBoostingClassifier(learning_rate=0.1,max_depth=5,max_features=0.5,random_state=999999)
392 | model.fit(x_train,y_train.values)
393 |
394 | pred=model.predict(test_cnn_data)
395 | print(y_test)
396 | y_test=pred
397 | y_test=y_test.tolist()
398 | output_class_pred=[]
399 | output_class_pred=y_test
400 | original_ans=data_test['tag']
401 | original_ans=original_ans.tolist()
402 |
403 | cnf_matrix=confusion_matrix(original_ans,output_class_pred)
404 |
405 |
406 | resi=multiclass_metrics(cnf_matrix)
407 | resi.to_csv('results1.csv', mode='a', index = False, header=resi.columns,columns=resi.columns)
408 |
409 |
410 |
411 | #####KNN
412 | from sklearn.neighbors import KNeighborsClassifier
413 | model = KNeighborsClassifier(n_neighbors=10)
414 | model.fit(x_train,y_train.values)
415 | pred=model.predict(test_cnn_data)
416 | print(y_test)
417 | y_test=pred
418 | y_test=y_test.tolist()
419 | output_class_pred=[]
420 | output_class_pred=y_test
421 | original_ans=data_test['tag']
422 | original_ans=original_ans.tolist()
423 | cnf_matrix=confusion_matrix(original_ans,output_class_pred)
424 |
425 |
426 | resi=multiclass_metrics(cnf_matrix)
427 | resi.to_csv('results1.csv', mode='a', index = False, header=resi.columns,columns=resi.columns)
428 |
429 |
430 |
431 |
432 | ####XGBOOST CLASSIFIER
433 | import xgboost
434 | from xgboost import XGBClassifier
435 | model = XGBClassifier()
436 | model.fit(x_train,y_train)
437 | pred=model.predict(test_cnn_data)
438 | print(y_test)
439 | y_test=pred
440 | y_test=y_test.tolist()
441 | output_class_pred=[]
442 | output_class_pred=y_test
443 | original_ans=data_test['tag']
444 | original_ans=original_ans.tolist()
445 | cnf_matrix=confusion_matrix(original_ans,output_class_pred)
446 |
447 |
448 | resi=multiclass_metrics(cnf_matrix)
449 | resi.to_csv('results1.csv', mode='a', index = False, header=resi.columns,columns=resi.columns)
450 |
451 |
452 |
453 | print(output_class_pred)
454 | print(original_ans)
--------------------------------------------------------------------------------
/Detailed_performance_metrics (1).xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sonisanskar/TextConvoNet/a5814e868cc5ef9504774ca9a431d5d3febc3379/Detailed_performance_metrics (1).xlsx
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # TextConvoNet
2 | The above code is for a novel CNN based Text classification architecture **TextConvoNet** which uses a paragraph matrix and 2D Convolution for text classification tasks.
3 |
4 | Click Here to view the detailed architecture of the [*TextConvoNet*](https://drive.google.com/file/d/1Q7kuPXbtMQtRNGUj-Tmg9hIgSI_2mv5k/view?usp=sharing)
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/Supplementary_File .pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sonisanskar/TextConvoNet/a5814e868cc5ef9504774ca9a431d5d3febc3379/Supplementary_File .pdf
--------------------------------------------------------------------------------
/TextConvoNet.py:
--------------------------------------------------------------------------------
1 | # This Python 3 environment comes with many helpful analytics libraries installed
2 | # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
3 | # For example, here's several helpful packages to load
4 |
5 | import numpy as np # linear algebra
6 | import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
7 |
8 | # Input data files are available in the read-only "../input/" directory
9 | # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
10 |
11 | import bz2
12 | import pickle
13 | import os
14 | '''
15 | for dirname, _, filenames in os.walk('/kaggle/input'):
16 | for filename in filenames:
17 | print(os.path.join(dirname, filename))
18 | '''
19 | # You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
20 | # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
21 |
22 | #!/usr/bin/env python
23 | # coding: utf-8
24 |
25 | # In[2]:
26 |
27 |
28 | import numpy as np
29 | import pandas as pd
30 | import seaborn as sns
31 | import matplotlib as plt
32 | import json
33 | #get_ipython().run_line_magic('matplotlib', 'inline')
34 | '''
35 | from tensorflow.keras.optimizers import Adam
36 |
37 | # In[3]:
38 | trainfile = bz2.BZ2File('../input/amazonreviews/train.ft.txt.bz2','r')
39 | lines = trainfile.readlines()
40 |
41 | sent_analysis = []
42 | def sent_list(docs,splitStr='__label__'):
43 | for i in range(1,len(docs)):
44 | text=str(lines[i])
45 | splitText=text.split(splitStr)
46 | #print(i)
47 | secHalf=splitText[1]
48 | text=secHalf[2:len(secHalf)-1]
49 | sentiment=secHalf[0]
50 | sent_analysis.append([text,sentiment])
51 | return sent_analysis
52 |
53 | sentiment_list=sent_list(lines[:1000000],splitStr='__label__')
54 |
55 | train_df = pd.DataFrame(sentiment_list,columns=['Text','Sentiment'])
56 |
57 | data_train=train_df[:4000]
58 | data_test=train_df[4000:5000]
59 | '''
60 | #a=input('path of the taining dataset with fields as title and tag(0,1) ')
61 | #b=input('path of test dataset')
62 | #data_train=pd.read_csv('../input/kuc-hackathon-winter-2018/drugsComTrain_raw.csv')
63 |
64 |
65 | # In[4]:
66 |
67 |
68 | #data_train
69 |
70 |
71 | # In[5]:
72 |
73 | data_train=pd.read_csv('../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')
74 | data_test=pd.read_csv('../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')
75 |
76 | data_train=data_train[:40000]
77 | data_test=data_test[40000:]
78 |
79 |
80 |
81 | # In[8]:
82 | data_train.rename(columns={'review':'title','sentiment':'tag'},inplace=True)
83 | data_test.rename(columns={'review':'title','sentiment':'tag'},inplace=True)
84 |
85 | #data_train['rating'].value_counts()
86 | #print('training_dataset',data_train)
87 | #print('training_dataset',data_test)
88 |
89 | # In[9]:
90 |
91 | #print(data_train)
92 |
93 | def make_tags(x): #converting the ratings column into 0's and 1's. for binary classifier to take place
94 | if(x=="negative"):
95 | return 0
96 | else:
97 | return 1
98 |
99 |
100 |
101 | # In[10]:
102 |
103 |
104 | data_train['tag']=data_train['tag'].apply(lambda x: make_tags(x))
105 | data_test['tag']=data_test['tag'].apply(lambda x: make_tags(x))
106 |
107 | #print(data_train)
108 |
109 | count0=(data_train['tag']==0).sum()
110 | count1=(data_train['tag']==1).sum()
111 | if(count0>count1):
112 | imbalance_ratio=(count0)/count1
113 | else:
114 | imbalance_ratio=(count1)/count0
115 | # In[11]:
116 |
117 | print('imbalance_ratio',imbalance_ratio)
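# e.g. (hypothetical numbers) 30,000 majority-class rows and 10,000 minority-class rows
# would give imbalance_ratio = 3.0; a perfectly balanced dataset gives 1.0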
118 | #print(data_train)
119 |
120 |
121 | # In[12]:
122 |
123 |
124 |
125 | def no_of_words_in_paragraph(x):
126 | return len(list(x))
127 |
128 | data_train['no_of_words_in_paragraph']=data_train['title'].apply(lambda x:no_of_words_in_paragraph(x))
129 |
130 | data_test['no_of_words_in_paragraph']=data_test['title'].apply(lambda x:no_of_words_in_paragraph(x))
131 |
132 |
133 |
134 | print(data_train)
135 | avg=data_train['no_of_words_in_paragraph'].mean()
136 | maxim=data_train['no_of_words_in_paragraph'].max()
137 | print('average paragraph length',data_train['no_of_words_in_paragraph'].mean())
138 | print('maximum para length',data_train['no_of_words_in_paragraph'].max())
139 | print('hii')
140 | excess=(data_train['no_of_words_in_paragraph']>avg).sum()
141 | excess_ratio=excess/len(data_train)
142 | print('excess_ratio',excess_ratio)
143 |
144 |
145 | #applying sentence tokenizer
146 | import nltk.data
147 | tokenizer = nltk.data.load('tokenizers/punkt/PY3/english.pickle')
148 | # Loading PunktSentenceTokenizer using English pickle file
149 | def make_sent_token(x):
150 | return tokenizer.tokenize(x)
151 | #converting each paragraph into separate sentences
152 |
153 |
154 | # In[13]:
155 |
156 |
157 | data_train['sentence_token']=data_train['title'].apply(lambda x: make_sent_token(x))
158 |
159 | data_test['sentence_token']=data_test['title'].apply(lambda x: make_sent_token(x))
160 |
161 |
162 | # In[15]:
163 |
164 |
165 | #data_train.drop(columns=['uniqueID','date','usefulCount','condition','drugName'],inplace=True,axis=1)# dropping irrelevant columns
166 |
167 |
168 | # In[16]:
169 |
170 |
171 | #data_test.drop(columns=['uniqueID','date','usefulCount','condition','drugName'],inplace=True,axis=1)
172 |
173 |
174 | # In[17]:
175 |
176 |
177 | #data_train
178 |
179 |
180 | # In[18]:
181 |
182 |
183 | data_train['no_of_sentences']=data_train['sentence_token'].apply(lambda x:len(x))
184 |
185 |
186 | # In[19]:
187 |
188 |
189 | data_test['no_of_sentences']=data_test['sentence_token'].apply(lambda x:len(x))
190 |
191 |
192 | # In[20]:
193 | avg_sen_length=data_train['no_of_words_in_paragraph'].sum()/data_train['no_of_sentences'].sum()
194 | print(avg_sen_length)
195 |
196 | #max(data_train['no_of_sentences'])##no of rows in sentence matrix which is to be feed in model(max number of sentence in any paragraph)
197 |
198 |
199 | # In[21]:
200 |
201 |
202 | #len(data_train[data_train['no_of_sentences']==92]['review'])
203 |
204 |
205 | # In[22]:
206 |
207 |
208 | #max(data_test['no_of_sentences'])
209 |
210 |
211 | # In[23]:
212 |
213 |
214 | def max_length_of_sentence(x,y):
215 | sen=x
216 | nu=y
217 | #print(sen)
218 | ma=0
219 | if(nu>1):
220 | l=sen.split('.')
221 | #print(l)
222 | for i in range(len(l)):
223 | k=l[i].replace(',','')
224 | maxi=len(k.split())
225 | #print(maxi)
226 | if(maxi>ma):
227 | ma=maxi
228 | return ma
229 | else:
230 | return len(sen.split())
231 |
232 |
233 |
234 |
235 | # In[24]:
236 |
237 |
238 | data_train['max_words_in_sentence']=data_train.apply(lambda x: max_length_of_sentence(x.title,x.no_of_sentences),axis=1)
239 |
240 |
241 | # In[25]:
242 |
243 |
244 | data_test['max_words_in_sentence']=data_test.apply(lambda x: max_length_of_sentence(x.title,x.no_of_sentences),axis=1)
245 |
246 |
247 | # In[26]:
248 |
249 |
250 | #max(data_train['max_words_in_sentence'])## number of columns in the data to be feeded
251 |
252 |
253 | # In[27]:
254 |
255 | x1=max(data_train['no_of_sentences'])
256 | y1=max(data_train['max_words_in_sentence'])
257 |
258 | x2=max(data_test['no_of_sentences'])
259 | y2=max(data_test['max_words_in_sentence'])
260 |
261 | if(x1>=x2):
262 |     m=x1
263 |     print(m)
264 | else:
265 |     m=x2
266 | 
267 | 
268 |
269 | if(y1>=y2):
270 | n=y1
271 | else:
272 | n=y2
273 |
274 | #So each para will be converted to a m*n matrix
275 | if(m<5):
276 | m=6
277 | else:
278 | m+=2
279 | print('x1,x2,y1,y2',x1,x2,y1,y2)
280 |
281 | print("m-->",m,n)
282 | #So each para will be converted to a m*n matrix
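# Worked toy example (hypothetical m=4, n=6 and made-up word indices, for illustration only):
# the two-sentence paragraph "Great movie. Would watch again." becomes the m x n index matrix
#     [[12, 45,  0,  0,  0,  0],   <- "great movie", right-padded to n=6 words
#      [87, 33, 91,  0,  0,  0],   <- "would watch again", right-padded to n=6 words
#      [ 0,  0,  0,  0,  0,  0],   <- all-zero padding row (no 3rd sentence)
#      [ 0,  0,  0,  0,  0,  0]]   <- all-zero padding row (no 4th sentence)
# The padding to n columns is done by padding() further below and the padding to m rows by make_full_para().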
283 |
284 |
285 | # In[28]:
286 |
287 |
288 |
289 |
290 | # # Major part starts here ..... Now converting the paragraph into required matrix
291 |
292 | # In[29]:
293 |
294 |
295 | import re
296 | import string
297 | from nltk import word_tokenize
298 | from nltk.corpus import stopwords
299 | def make_tokens(text): ##Converting into single tokens in order to create the vocabulary
300 | return word_tokenize(text)
301 |
302 |
303 | data_train['tokens']=data_train['title'].apply(lambda x: make_tokens(x))
304 | data_test['tokens']=data_test['title'].apply(lambda x: make_tokens(x))
305 |
306 |
307 | # In[30]:
308 |
309 |
310 | #data_train['tokens']
311 |
312 |
313 | # In[ ]:
314 |
315 |
316 | #from gensim import models
317 | #word2vec_path = 'GoogleNews-vectors-negative300.bin.gz'
318 | #word2vec = models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)
319 |
320 |
321 |
322 | embeddings_index = {}
323 | f = open('../input/glove6b300dtxt/glove.6B.300d.txt')
324 | for line in f:
325 | values = line.split(' ')
326 | word = values[0] ## The first entry is the word
327 |     coefs = np.asarray(values[1:], dtype='float32') ## These are the vectors representing the embedding for the word
328 | embeddings_index[word] = coefs
329 | f.close()
330 |
331 | print('GloVe data loaded')
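# Each line of glove.6B.300d.txt has the form "<word> v1 v2 ... v300" (a token followed by
# 300 space-separated floats), which is why the loop above takes values[0] as the word and
# values[1:] as its embedding vector.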
332 |
333 | # In[ ]:
334 |
335 |
336 | all_training_words = [word for tokens in data_train["tokens"] for word in tokens]
337 | training_sentence_lengths = [len(tokens) for tokens in data_train["tokens"]]
338 | TRAINING_VOCAB = sorted(list(set(all_training_words)))
339 | print("%s words total, with a vocabulary size of %s" % (len(all_training_words), len(TRAINING_VOCAB)))
340 | print("Max sentence length is %s" % max(training_sentence_lengths))
341 | para_max=max(training_sentence_lengths)
342 |
343 | vocab=len(TRAINING_VOCAB)
344 |
345 | # In[ ]:
346 |
347 |
348 | #len(TRAINING_VOCAB)
349 |
350 |
351 | # In[ ]:
352 |
353 |
354 | from tensorflow.keras.preprocessing.text import Tokenizer
355 | from tensorflow.keras.preprocessing.sequence import pad_sequences
356 | tokenizer = Tokenizer(num_words=len(TRAINING_VOCAB), char_level=False)
357 | tokenizer.fit_on_texts(data_train['title']) # build the word-to-index mapping from the training text
358 |
359 |
360 | # In[ ]:
361 |
362 |
363 | train_word_index = tokenizer.word_index
364 |
365 |
366 | # In[ ]:
367 |
368 |
369 | #print(train_word_index)
370 |
371 |
372 | # In[ ]:
373 |
374 |
375 | #data_train.to_csv('medic_train.csv')
376 | #data_test.to_csv('medic_test.csv')
377 |
378 |
379 | # In[ ]:
380 |
381 |
382 | def make_train_seq(x):
383 | return tokenizer.texts_to_sequences(x)
384 | data_train['train_seq']=data_train['sentence_token'].apply(lambda x:make_train_seq(x) )
385 | data_test['train_seq']=data_test['sentence_token'].apply(lambda x:make_train_seq(x) )
386 |
387 |
388 | # In[ ]:
389 |
390 |
391 | #(data_train['train_seq']) # here every para has been encoded
392 |
393 |
394 | # In[ ]:
395 | #print(data_train)
396 |
397 |
398 |
399 |
400 | # In[ ]:
401 |
402 |
403 | from tensorflow.keras.preprocessing.sequence import pad_sequences
404 | def padding(x): #now padding each sentence to a length of n...number of columns
405 | MAX_SENTENCE_LENGTH=n #(no of columns)
406 | return pad_sequences(x,maxlen=MAX_SENTENCE_LENGTH,padding='post')
407 |
408 | data_train['padded']=data_train['train_seq'].apply(lambda x:padding(x))
409 | data_test['padded']=data_test['train_seq'].apply(lambda x:padding(x))
410 |
411 |
412 | # In[ ]:
413 |
414 |
415 | #(data_train.padded[8])
416 |
417 |
418 | # In[ ]:
419 |
420 |
421 |
422 | ## More code adapted from the keras reference (https://github.com/keras-team/keras/blob/master/examples/pretrained_word_embeddings.py)
423 | # prepare embedding matrix
424 | from tensorflow.keras.layers import Embedding
425 | from tensorflow.keras.initializers import Constant
426 |
427 | ## EMBEDDING_DIM must match the dimensionality of the loaded embeddings (taken from embeddings_index below)
428 | EMBEDDING_DIM = embeddings_index.get('a').shape[0]
429 | print(EMBEDDING_DIM)
430 | #num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
431 | #= np.zeros(len(train_word_index) + 1, EMBEDDING_DIM)
432 | train_embedding_weights = np.zeros((len(train_word_index)+1,
433 | EMBEDDING_DIM))
434 | for word, i in train_word_index.items():
435 | #print("sd")
436 | embedding_vector = embeddings_index.get(word) ## This references the loaded embeddings dictionary
437 | if embedding_vector is not None:
438 | train_embedding_weights[i] = embedding_vector
439 | print(train_embedding_weights.shape)
440 | # words not found in embedding index will be all-zeros.
441 |
442 |
443 | # load pre-trained word embeddings into an Embedding layer
444 | # note that we set trainable = False so as to keep the embeddings fixed
445 | #embedding_layer = Embedding(num_words,
446 | # EMBEDDING_DIM,
447 | # embeddings_initializer=Constant(embedding_matrix),
448 | # input_length=MAX_SEQUENCE_LENGTH,
449 | # trainable=False)
450 |
451 |
452 | #EMBEDDING_DIM=300
453 | #train_embedding_weights = np.zeros((len(train_word_index)+1,
454 | #EMBEDDING_DIM))
455 | #for word,index in train_word_index.items():
456 | #train_embedding_weights[index,:] = word2vec[word] if word in word2vec else np.random.rand(EMBEDDING_DIM)
457 | #print(train_embedding_weights.shape)
458 |
459 |
460 | # In[43]:
461 |
462 |
463 | def make_full_para(x): # pad each paragraph's sentence matrix to the full m x n shape
464 |     l=len(x)
465 |     h=m-l #no. of extra rows to be added
466 |     z=[0]*h*n #1D vector of zeroes (adding extra all-zero rows as padding)
467 | z=np.reshape(z,(h,n)) #reshaping it to match the dimension of paragraph
468 | s=x.tolist()+z.tolist()
469 | return s
470 |
471 |
472 | # In[ ]:
473 |
474 |
475 |
476 |
477 |
478 | # In[ ]:
479 |
480 |
481 | data_train['full_para']=data_train['padded'].apply(lambda x : make_full_para(x))
482 | data_test['full_para']=data_test['padded'].apply(lambda x : make_full_para(x))
483 |
484 |
485 | # In[ ]:
486 |
487 |
488 | #data_train.full_para
489 |
490 |
491 | # In[ ]:
492 |
493 |
494 | def create_1d_para(x):
495 | l=[]
496 | for i in x:
497 |         l+=i #concatenating all the sentences in a paragraph into a single 1-D array
498 | return l
499 |
500 |
501 |
502 |
503 | # In[ ]:
504 |
505 | data_train['single_d_array']=data_train['full_para'].apply(lambda x: create_1d_para(x) )
506 | data_test['single_d_array']=data_test['full_para'].apply(lambda x: create_1d_para(x) )
507 |
508 |
509 | # In[ ]:
510 |
511 |
512 | #train_cnn_data=np.array(data_train['single_d_array'].tolist())
513 |
514 |
515 | # In[ ]:
516 |
517 |
518 | train_cnn_data=np.array(data_train['single_d_array'].tolist())
519 | test_cnn_data=np.array(data_test['single_d_array'].tolist())
520 |
521 |
522 | # In[ ]:
523 |
524 | from sklearn.model_selection import train_test_split
525 | y_train=data_train['tag'].values
526 |
527 |
528 |
529 | # In[ ]:
530 |
531 | print('Starting the training')
532 | #from __future__ import print_function
533 | from tensorflow.keras.layers import Embedding
534 |
535 | from tensorflow.keras.preprocessing.text import text_to_word_sequence
536 | import pandas as pd
537 | from tensorflow.keras.preprocessing.text import Tokenizer
538 | import numpy as np
539 |
540 |
541 | from tensorflow.keras.preprocessing import sequence
542 | from tensorflow.keras.models import Sequential,Model
543 | from tensorflow.keras.layers import Dense, Dropout, Activation,Flatten,Bidirectional,GRU,LSTM,SpatialDropout1D,Reshape
544 | from tensorflow.keras.layers import Embedding,concatenate
545 | from tensorflow.keras.layers import Conv2D, GlobalMaxPooling2D,MaxPool2D,MaxPool3D,GlobalAveragePooling2D,Conv3D
546 | from tensorflow.keras.models import Model
547 | from tensorflow.keras.layers import Input
548 |
549 |
550 | # In[ ]:
551 |
552 | filter_sizes = [1,2,3,4]
553 | num_filters = 32
554 | embed_size=300
555 | embedding_matrix=train_embedding_weights
556 | max_features=len(train_word_index)+1
557 | maxlen=m*n
558 |
559 | def get_model():
560 | inp = Input(shape=(maxlen, ))
561 | x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
562 | x = SpatialDropout1D(0.4)(x)
563 | x = Reshape((m, n, 300))(x)
564 | #print(x)
565 | conv_0 = Conv2D(num_filters, kernel_size=(filter_sizes[0], 2),
566 | activation='relu')(x)
567 | conv_1 = Conv2D(num_filters, kernel_size=(filter_sizes[0], 3),
568 | activation='relu')(x)
569 |
570 | conv_2 = Conv2D(num_filters, kernel_size=(filter_sizes[0], 4),
571 | activation='relu')(x)
572 |
573 |
574 |
575 |
576 |
577 | conv_4 = Conv2D(num_filters, kernel_size=(filter_sizes[1], 1),
578 | activation='relu')(x)
579 | conv_5 = Conv2D(num_filters, kernel_size=(filter_sizes[1], 2), activation='relu')(x)
580 |
581 | conv_6 = Conv2D(num_filters, kernel_size=(filter_sizes[1], 3),
582 | activation='relu')(x)
583 |
584 |
585 |
586 | maxpool_0 = MaxPool2D()(conv_0)
587 | maxpool_0=Flatten()(maxpool_0)
588 | maxpool_1 = MaxPool2D()(conv_1)
589 | maxpool_1=Flatten()(maxpool_1)
590 | maxpool_2 = MaxPool2D()(conv_2)
591 | maxpool_2 = Flatten()(maxpool_2)
592 |
593 | maxpool_4 = MaxPool2D()(conv_4)
594 | maxpool_4=Flatten()(maxpool_4)
595 | maxpool_5 = MaxPool2D()(conv_5)
596 | maxpool_5=Flatten()(maxpool_5)
597 | maxpool_6 = MaxPool2D()(conv_6)
598 | maxpool_6=Flatten()(maxpool_6)
599 | #maxpool_7 = MaxPool2D()(conv_7)
600 | # maxpool_7=Flatten()(maxpool_7)
601 | z = concatenate([maxpool_0, maxpool_1,maxpool_2],axis=1)
602 | w=concatenate([maxpool_4, maxpool_5,maxpool_6],axis=1)
603 | #w=concatenate([maxpool_4, maxpool_5,maxpool_6],axis=1)
604 | #z = concatenate([maxpool_0, maxpool_1,maxpool_2,maxpool_4, maxpool_5,maxpool_6],axis=1)
605 | #z = concatenate([maxpool_0, maxpool_1,maxpool_4, maxpool_5],axis=1)
606 |
607 | #z = Flatten()(z)
608 | z=concatenate([w,z],axis=1)
609 | z=Dense(units=64,activation="relu")(z)
610 | z = Dropout(0.4)(z)
611 |
612 | outp = Dense(1, activation="sigmoid")(z)
613 |
614 | model = Model(inputs=inp, outputs=outp)
615 |
616 | model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
617 |
618 | return model
619 |
620 |
621 | # In[ ]:
622 |
623 |
624 | model=get_model()
625 |
626 |
627 | # In[ ]:
628 |
629 |
630 | print(model.summary())
631 |
632 |
633 | # In[ ]:
634 |
635 |
636 |
637 | #define callbacks
638 | from tensorflow.keras.callbacks import EarlyStopping
639 | early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=4, verbose=1)
640 | callbacks_list = [early_stopping]
641 |
642 | import time, datetime
643 | start = datetime.datetime.now()
644 | history=model.fit(train_cnn_data, y_train, epochs=10,callbacks=callbacks_list,batch_size=32,validation_split=0.1 )
645 | end = datetime.datetime.now()
646 | diff1= (end - start)
647 | print('time taken by text_6',diff1)
648 |
649 |
650 |
651 |
652 |
653 |
654 | pred=model.predict(test_cnn_data)
655 | y_test=pred
656 | y_test=y_test.tolist()
657 | output_class_pred=[]
658 | for i in range(len(y_test)):
659 | if(y_test[i][0]<0.5):
660 | output_class_pred.append(0)
661 | else:
662 | output_class_pred.append(1)
663 |
664 | original_ans=data_test['tag']
665 | original_ans=original_ans.tolist()
666 |
667 | # In[ ]:
668 |
669 | from sklearn.metrics import confusion_matrix
670 | from sklearn.metrics import classification_report
671 |
672 | # label 1 is treated as the positive class, so a correctly identified label-1 sample counts as a TP
673 | def check_metric(output_class_pred,original_ans,diff1):
674 |     rightly_predicted=0
675 |     TP=0
676 |     for i in range(len(output_class_pred)):
677 |         if(original_ans[i]==output_class_pred[i]):
678 |             rightly_predicted+=1
679 | 
680 | 
681 |     print("Overall_accuracy:",rightly_predicted/len(output_class_pred))
682 |     print('TP',TP)
683 |     accuracy=rightly_predicted/len(output_class_pred)
684 | print(classification_report(original_ans,output_class_pred))
685 | print(confusion_matrix(original_ans,output_class_pred))
686 | TN=confusion_matrix(original_ans,output_class_pred)[0][0]
687 | TP=confusion_matrix(original_ans,output_class_pred)[1][1]
688 | FP=confusion_matrix(original_ans,output_class_pred)[0][1]
689 | FN=confusion_matrix(original_ans,output_class_pred)[1][0]
690 |
691 | precision=TP/(TP+FP)
692 | recalll=TP/(FN+TP)
693 | F1=2*precision*recalll/(precision+recalll)
694 | sensiti=TP/(TP+FN)
695 | specifici=TN/(TN+FP)
696 | numerator=TP*TN - FP*FN
697 |
698 | denominator=np.sqrt((TP+FP)*(FN+TN)*(FP+TN)* (TP+FN))
699 | MCc=numerator/denominator
700 | G_mean1=np.sqrt(sensiti*precision)
701 | G_mean2=np.sqrt(sensiti*specifici)
702 | print('precision:' ,TP/(TP+FP))
703 | print('recall:',TP/(FN+TP))
704 | print("F1:",F1)
705 | print("Specificity:",TN/(TN+FP))
706 | print("Sensitivity ",TP/(TP+FN))
707 | print('G-mean1:',np.sqrt(sensiti*precision))
708 | print("G-mean2",np.sqrt(sensiti*specifici))
709 | print("MCC :",MCc)
710 | acc=[]
711 | pre=[]
712 | recall=[]
713 | f1=[]
714 | specificity=[]
715 | sensitivity=[]
716 | GMean1=[]
717 | Gmean2=[]
718 | MCC=[]
719 | tp=[]
720 | fp=[]
721 | fn=[]
722 | tn=[]
723 | acc.append(accuracy)
724 | pre.append(precision)
725 | recall.append(recalll)
726 | f1.append(F1)
727 | specificity.append(specifici)
728 | sensitivity.append(sensiti)
729 | GMean1.append(G_mean1)
730 | Gmean2.append(G_mean2)
731 | MCC.append(MCc)
732 | tp.append(TP)
733 | fp.append(FP)
734 | tn.append(TN)
735 | fn.append(FN)
736 | data={'accuracy_all':acc,"precision":pre,'recall':recall,'F1_score':f1,'specificity':specificity,'sensitivity':sensitivity,'Gmean1':GMean1,"Gmean2":Gmean2,"MCC":MCC,"TP":tp,"FP":fp,"TN":tn,"FN":fn,"traintime":diff1,"Exceeding_ratio":excess_ratio,"imbalance_ratio":imbalance_ratio,"Average_length_of_paragraph":avg,"Maximum_length_of_a_paragraph":maxim,"Average_length_of_sentences":avg_sen_length,"Maximum_length_of_a_sentence_in_a_paragraph":n,"Maximum_no_of_sentence_in_any_paragraph":m,"Vocabular_size":vocab,"label0":count0,"label1":count1}
737 | metric=pd.DataFrame(data)
738 | return metric
739 |
740 | print(history.history.keys())
741 |
742 | resi=check_metric(output_class_pred,original_ans,diff1)
743 | resi.to_csv('results_text.csv', mode='a', index = False, header=resi.columns,columns=resi.columns)
744 |
745 |
746 |
747 | #####
748 |
749 | filter_sizes = [1,2,3,4]
750 | num_filters = 32
751 | embed_size=300
752 | embedding_matrix=train_embedding_weights
753 | max_features=len(train_word_index)+1
754 | maxlen=m*n
755 | def get_model():
756 | inp = Input(shape=(maxlen, ))
757 | x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
758 | x = SpatialDropout1D(0.4)(x)
759 | x = Reshape((m, n, 300))(x)
760 | #print(x)
761 | conv_0 = Conv2D(num_filters, kernel_size=(filter_sizes[0], 2),
762 | activation='relu')(x)
763 | conv_1 = Conv2D(num_filters, kernel_size=(filter_sizes[0], 3),
764 | activation='relu')(x)
765 |
766 | #conv_2 = Conv2D(num_filters, kernel_size=(filter_sizes[0], 4),
767 | #activation='relu')(x)
768 |
769 |
770 |
771 |
772 |
773 | conv_4 = Conv2D(num_filters, kernel_size=(filter_sizes[1], 1),
774 | activation='relu')(x)
775 | conv_5 = Conv2D(num_filters, kernel_size=(filter_sizes[1], 2), activation='relu')(x)
776 |
777 | #conv_6 = Conv2D(num_filters, kernel_size=(filter_sizes[1], 3),
778 | #activation='relu')(x)
779 |
780 |
781 |
782 | maxpool_0 = MaxPool2D()(conv_0)
783 | maxpool_0=Flatten()(maxpool_0)
784 | maxpool_1 = MaxPool2D()(conv_1)
785 | maxpool_1=Flatten()(maxpool_1)
786 | #maxpool_2 = MaxPool2D()(conv_2)
787 | #maxpool_2 = Flatten()(maxpool_2)
788 |
789 | maxpool_4 = MaxPool2D()(conv_4)
790 | maxpool_4=Flatten()(maxpool_4)
791 | maxpool_5 = MaxPool2D()(conv_5)
792 | maxpool_5=Flatten()(maxpool_5)
793 | #maxpool_6 = MaxPool2D()(conv_6)
794 | #maxpool_6=Flatten()(maxpool_6)
795 | #maxpool_7 = MaxPool2D()(conv_7)
796 | # maxpool_7=Flatten()(maxpool_7)
797 |
798 | #w=concatenate([maxpool_4, maxpool_5,maxpool_6],axis=1)
799 | #z = concatenate([maxpool_0, maxpool_1,maxpool_2,maxpool_4, maxpool_5,maxpool_6],axis=1)
800 | #z = concatenate([maxpool_0, maxpool_1,maxpool_4, maxpool_5],axis=1)
801 | w=concatenate([maxpool_4, maxpool_5],axis=1)
802 | #z = concatenate([maxpool_0, maxpool_1,maxpool_2,maxpool_4, maxpool_5,maxpool_6],axis=1)
803 | z = concatenate([maxpool_0, maxpool_1],axis=1)
804 |
805 | #z = Flatten()(z)
806 | z=concatenate([w,z],axis=1)
807 | #z = Flatten()(z)
808 | #z=concatenate([w,z],axis=1)
809 | z=Dense(units=64,activation="relu")(z)
810 | z = Dropout(0.4)(z)
811 |
812 | outp = Dense(1, activation="sigmoid")(z)
813 |
814 | model = Model(inputs=inp, outputs=outp)
815 |
816 | model.compile(loss='binary_crossentropy',optimizer='Adam',metrics=['accuracy'])
817 |
818 | return model
819 |
820 |
821 | # In[ ]:
822 |
823 |
824 | model=get_model()
825 |
826 |
827 | # In[ ]:
828 |
829 |
830 | print(model.summary())
831 |
832 |
833 | # In[ ]:
834 |
835 |
836 |
837 | #define callbacks
838 | from tensorflow.keras.callbacks import EarlyStopping
839 | early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=4, verbose=1)
840 | callbacks_list = [early_stopping]
841 |
842 | import time, datetime
843 | start = datetime.datetime.now()
844 |
845 | history=model.fit(train_cnn_data, y_train, epochs=10,callbacks=callbacks_list,batch_size=32,validation_split=0.1 )
846 |
847 | end = datetime.datetime.now()
848 | diff1= (end - start)
849 | print('time taken by text_4',diff1)
850 |
851 |
852 |
853 |
854 |
855 | pred=model.predict(test_cnn_data)
856 | y_test=pred
857 | y_test=y_test.tolist()
858 | output_class_pred=[]
859 | for i in range(len(y_test)):
860 | if(y_test[i][0]<0.5):
861 | output_class_pred.append(0)
862 | else:
863 | output_class_pred.append(1)
864 |
865 | original_ans=data_test['tag']
866 | original_ans=original_ans.tolist()
867 |
868 | # In[ ]:
869 | from sklearn.metrics import confusion_matrix
870 | from sklearn.metrics import classification_report
871 |
872 | # label 1 is treated as the positive class for the TP/FP counts below
873 |
874 |
875 | resi=check_metric(output_class_pred,original_ans,diff1)
876 | resi.to_csv('results_text.csv', mode='a', index = False, header=resi.columns,columns=resi.columns)
877 |
878 |
879 |
880 |
881 |
882 |
883 |
884 | # In[ ]:
885 |
886 |
887 |
888 |
889 |
890 | # In[ ]:
891 |
892 |
893 | ## now preparing training data for the Yoon Kim baseline model
894 |
895 |
896 | # In[ ]:
897 |
898 |
899 | def create_single_line_para(x):
900 | l=[]
901 | for i in x:
902 |         l+=i #concatenating all the sentences in a paragraph into a single 1-D array
903 | return l
904 |
905 |
906 |
907 | # In[ ]:
908 |
909 |
910 | data_train['create_single_line_para']=data_train['train_seq'].apply(lambda x: create_single_line_para(x) )
911 | data_test['create_single_line_para']=data_test['train_seq'].apply(lambda x: create_single_line_para(x) )
912 |
913 |
914 | # In[ ]:
915 |
916 |
917 | from tensorflow.keras.preprocessing.sequence import pad_sequences
918 | yoon_kim_train_data=np.array(data_train['create_single_line_para'].tolist())
919 | yoon_kim_train_data=pad_sequences(yoon_kim_train_data,maxlen=para_max,padding='post')
920 |
921 | # In[ ]:
922 | yoon_kim_test_data=np.array(data_test['create_single_line_para'].tolist())
923 | yoon_kim_test_data=pad_sequences(yoon_kim_test_data,maxlen=para_max,padding='post')
924 |
925 |
926 | #from __future__ import print_function
927 | from tensorflow.keras.layers import Embedding
928 |
929 | from tensorflow.keras.preprocessing.text import text_to_word_sequence
930 | import pandas as pd
931 | from tensorflow.keras.preprocessing.text import Tokenizer
932 | import numpy as np
933 |
934 |
935 | from tensorflow.keras.preprocessing import sequence
936 | from tensorflow.keras.models import Sequential,Model
937 | from tensorflow.keras.layers import Dense, Dropout, Activation,Flatten,Bidirectional,GRU,LSTM
938 | from tensorflow.keras.layers import Embedding,concatenate
939 | from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D,MaxPooling1D,GlobalAveragePooling1D
940 | from tensorflow.keras.models import Model
941 | from tensorflow.keras.layers import Input
942 |
943 |
944 | # In[ ]:
945 |
946 |
947 | train_y=pd.get_dummies(y_train)
948 |
949 |
950 | # In[ ]:
951 |
952 |
953 | trains_y=train_y[[0,1]].values
954 |
955 |
956 | # In[ ]:
957 |
958 |
959 | embed_size=300
960 | embedding_matrix=train_embedding_weights
961 | max_features=len(train_word_index)+1
962 | maxlen=para_max
963 | max_sequence_length=para_max
964 | MAX_SEQUENCE_LENGTH=para_max
965 | EMBEDDING_DIM=300
966 |
967 |
968 | #model3 yoon kim
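# Baseline for comparison: unlike TextConvoNet's 2D convolutions over the (m x n) sentence
# matrix, this Yoon Kim CNN applies 1D convolutions over the flat padded word sequence
# (yoon_kim_train_data / yoon_kim_test_data).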
969 |
970 |
971 | # In[ ]:
972 |
973 |
974 | def ConvNet(embeddings, max_sequence_length, num_words, embedding_dim, trainable=True, extra_conv=False):
975 |
976 | embedding_layer = Embedding(num_words,
977 | embedding_dim,
978 | weights=[embeddings],
979 | input_length=max_sequence_length,
980 | trainable=trainable)
981 |
982 | sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
983 | embedded_sequences = embedding_layer(sequence_input)
984 |
985 | # Yoon Kim model (https://arxiv.org/abs/1408.5882)
986 | convs = []
987 | filter_sizes = [3,4,5]
988 |
989 | for filter_size in filter_sizes:
990 | l_conv = Conv1D(filters=100, kernel_size=filter_size, activation='relu')(embedded_sequences)
991 | l_pool = MaxPooling1D(pool_size=2)(l_conv)
992 | convs.append(l_pool)
993 |
994 | l_merge = concatenate(convs, axis=1)
995 |
996 | # add a 1D convnet with global maxpooling, instead of Yoon Kim model
997 | #conv = Conv1D(filters=128, kernel_size=3, activation='relu')(embedded_sequences)
998 | #pool = MaxPooling1D(pool_size=2)(conv)
999 |
1000 | #if extra_conv==True:
1001 | #x = Dropout(0.01)(l_merge)
1002 | #else:
1003 | # Original Yoon Kim model
1004 | #x = Dropout(0.001)(pool)
1005 | x = Flatten()(l_merge)
1006 |
1007 | x = Dropout(0.5)(x)
1008 |     # Finally, feed the output into a 2-unit softmax layer
1009 |     # (one unit per class) to produce class probabilities for the
1010 |     # binary (0/1) classification, matching the one-hot targets.
1011 | preds = Dense(2, activation='softmax')(x)
1012 |
1013 | model = Model(sequence_input, preds)
1014 | model.compile(loss='categorical_crossentropy',
1015 | optimizer='Adam',
1016 | metrics=['acc'])
1017 | model.summary()
1018 | return model
1019 |
1020 |
1021 | # In[ ]:
1022 |
1023 |
1024 | model1 = ConvNet(train_embedding_weights, MAX_SEQUENCE_LENGTH, len(train_word_index)+1, EMBEDDING_DIM,
1025 | True)
1026 |
1027 |
1028 | # In[ ]:
1029 |
1030 |
1031 | training_data=yoon_kim_train_data
1032 |
1033 |
1034 | # In[ ]:
1035 |
1036 |
1037 | testing_data=yoon_kim_test_data
1038 |
1039 |
1040 | # In[ ]:
1041 |
1042 |
1043 |
1044 | #define callbacks
1045 | from tensorflow.keras.callbacks import EarlyStopping
1046 | early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=4, verbose=1)
1047 | callbacks_list = [early_stopping]
1048 |
1049 | import time, datetime
1050 | start = datetime.datetime.now()
1051 |
1052 | hist = model1.fit(training_data, trains_y, epochs=10,callbacks=callbacks_list,batch_size=32,validation_split=0.1 )
1053 | end = datetime.datetime.now()
1054 | diff1= (end - start)
1055 | print('time taken by yoon',diff1)
1056 |
1057 |
1058 | # In[ ]:
1059 |
1060 |
1061 | pred=model1.predict(testing_data)
1062 | y_test=pred
1063 | y_test=y_test.tolist()
1064 | output_class_pred=[]
1065 | #output_class_pred=[]
1066 | for i in range(len(y_test)):
1067 | m=max(y_test[i])
1068 | if(y_test[i].index(m)==0):
1069 | output_class_pred.append(0)
1070 | else:
1071 | output_class_pred.append(1)
1072 |
1073 |
1074 | original_ans=data_test['tag']
1075 | original_ans=original_ans.tolist()
1076 |
1077 |
1078 | # In[ ]:
1079 |
1080 |
1081 | # label 1 is treated as the positive class for the TP/FP counts below
1082 | resi=check_metric(output_class_pred,original_ans,diff1)
1083 |
1084 | resi.to_csv('results_text.csv', mode='a', index = False, header=resi.columns,columns=resi.columns)
1085 |
1086 |
1087 |
1088 | from tensorflow.keras.models import Model, Sequential
1089 |
1090 | from tensorflow.keras.layers import Dropout, Embedding, concatenate
1091 | from tensorflow.keras.layers import Conv1D, MaxPooling1D, Conv2D, MaxPooling2D, ZeroPadding1D
1092 | from tensorflow.keras.layers import Dense, Input, Flatten, BatchNormalization
1093 | from tensorflow.keras.layers import Concatenate, Dot, Multiply, RepeatVector
1094 | from tensorflow.keras.layers import Bidirectional, TimeDistributed
1095 | from tensorflow.keras.layers import SimpleRNN, LSTM, GRU, Lambda, Permute
1096 |
1097 | #from tensorflow.keras.layers.core import Reshape, Activation
1098 | from tensorflow.keras.optimizers import Adam
1099 | from tensorflow.keras.callbacks import ModelCheckpoint,EarlyStopping,TensorBoard
1100 | #from tensorflow.keras.constraints import maxnorm
1101 | #from tensorflow.keras.regularizers import l2
1102 |
1103 | def ConvNet_vdcnn(embeddings, max_sequence_length, num_words, embedding_dim, trainable=True, extra_conv=False):
1104 |
1105 | embedding_layer = Embedding(num_words,
1106 | embedding_dim,
1107 | weights=[embeddings],
1108 | input_length=max_sequence_length,
1109 | trainable=trainable)
1110 |
1111 | sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
1112 | embedded_sequences = embedding_layer(sequence_input)
1113 |
1114 |
1115 |
1116 |     # initial temporal convolution over the embedded sequence
1117 | conv = Conv1D(filters=64, kernel_size=3, strides=2, padding="same")(embedded_sequences)
1118 |
1119 |
1120 | # 4 pairs of convolution blocks followed by pooling
1121 | for filter_size in [64, 128, 256, 512]:
1122 |
1123 | # each iteration is a convolution block
1124 | for cb_i in [0,1]:
1125 | conv=(Conv1D(filter_size, 3, padding="same",activation='relu'))(conv)
1126 | #model_1.add(BatchNormalization())
1127 | #model_1.add(Activation("relu"))
1128 | conv=(Conv1D(filter_size, 1, padding="same",activation='relu'))(conv)
1129 | #model_1.add(BatchNormalization())
1130 | #model_1.add(Activation("relu"))
1131 |
1132 | conv=(MaxPooling1D(pool_size=2, strides=3))(conv)
1133 |
1134 | # model.add(KMaxPooling(k=2))
1135 | conv=(Flatten())(conv)
1136 | conv=(Dense(4096, activation="relu"))(conv)
1137 | conv=(Dense(2048, activation="relu"))(conv)
1138 | conv=(Dense(2048, activation="relu"))(conv)
1139 | #(Dense(9, activation="softmax"))
1140 |
1141 | preds = Dense(2, activation='softmax')(conv)
1142 |
1143 | model = Model(sequence_input, preds)
1144 | model.compile(loss='categorical_crossentropy',
1145 | optimizer='Adam',metrics=['acc'])
1146 | print(model.summary())
1147 | return model
1148 |
1149 |
1150 |
1151 | model1 = ConvNet_vdcnn(train_embedding_weights, MAX_SEQUENCE_LENGTH, len(train_word_index)+1, EMBEDDING_DIM,
1152 | True)
1153 |
1154 |
1155 |
1156 |
1157 | # In[ ]:
1158 |
1159 |
1160 | training_data=yoon_kim_train_data
1161 |
1162 |
1163 | # In[ ]:
1164 |
1165 |
1166 | testing_data=yoon_kim_test_data
1167 |
1168 |
1169 | # In[ ]:
1170 |
1171 |
1172 |
1173 | #define callbacks
1174 | from tensorflow.keras.callbacks import EarlyStopping
1175 | early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=4, verbose=1)
1176 | callbacks_list = [early_stopping]
1177 |
1178 | import time, datetime
1179 | start = datetime.datetime.now()
1180 |
1181 | hist = model1.fit(training_data, trains_y, epochs=10,callbacks=callbacks_list,batch_size=32,validation_split=0.1 )
1182 | end = datetime.datetime.now()
1183 | diff1= (end - start)
1184 | print('time taken by vdcnn',diff1)
1185 |
1186 |
1187 | # In[ ]:
1188 |
1189 |
1190 | pred=model1.predict(testing_data)
1191 | y_test=pred
1192 | y_test=y_test.tolist()
1193 | output_class_pred=[]
1194 | #output_class_pred=[]
1195 | for i in range(len(y_test)):
1196 | m=max(y_test[i])
1197 | if(y_test[i].index(m)==0):
1198 | output_class_pred.append(0)
1199 | else:
1200 | output_class_pred.append(1)
1201 |
1202 |
1203 | original_ans=data_test['tag']
1204 | original_ans=original_ans.tolist()
1205 |
1206 |
1207 | # In[ ]:
1208 |
1209 |
1210 | # label 1 is treated as the positive class for the TP/FP counts below
1211 | resi=check_metric(output_class_pred,original_ans,diff1)
1212 |
1213 | resi.to_csv('results_text.csv', mode='a', index = False, header=resi.columns,columns=resi.columns)
1214 |
1215 |
1216 |
1217 |
1218 |
1219 | def ConvNet_clstm(embeddings, max_sequence_length, num_words, embedding_dim, trainable=True, extra_conv=False):
1220 |
1221 | embedding_layer = Embedding(num_words,
1222 | embedding_dim,
1223 | weights=[embeddings],
1224 | input_length=max_sequence_length,
1225 | trainable=trainable)
1226 |
1227 | sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
1228 | embedded_sequences = embedding_layer(sequence_input)
1229 |
1230 | convs = []
1231 | filter_sizes = [10, 20, 30, 40]
1232 |
1233 | for filter_size in filter_sizes:
1234 | l_conv = Conv1D(filters=64, kernel_size=filter_size, padding='valid', activation='relu')(embedded_sequences)
1235 | convs.append(l_conv)
1236 |
1237 | cnn_feature_maps = Concatenate(axis=1)(convs)
1238 | sentence_encoder = LSTM(64,return_sequences=False)(cnn_feature_maps)
1239 | fc_layer =Dense(128, activation="relu")(sentence_encoder)
1240 | #output_layer = Dense(9,activation="softmax")(fc_layer)
1241 |
1242 | #model_1 = Model(inputs=[text_input_layer], outputs=[output_layer])
1243 | preds = Dense(2, activation='softmax')(fc_layer)
1244 |
1245 | model = Model(sequence_input, preds)
1246 | model.compile(loss='categorical_crossentropy',
1247 | optimizer='Adam',
1248 | metrics=['acc'])
1249 | model.summary()
1250 | return model
1251 |
1252 |
1253 | model1 = ConvNet_clstm(train_embedding_weights, MAX_SEQUENCE_LENGTH, len(train_word_index)+1, EMBEDDING_DIM,
1254 | True)
1255 |
1256 |
1257 |
1258 |
1259 | # In[ ]:
1260 |
1261 |
1262 | training_data=yoon_kim_train_data
1263 |
1264 |
1265 | # In[ ]:
1266 |
1267 |
1268 | testing_data=yoon_kim_test_data
1269 |
1270 |
1271 | # In[ ]:
1272 |
1273 |
1274 |
1275 | #define callbacks
1276 | from tensorflow.keras.callbacks import EarlyStopping
1277 | early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=4, verbose=1)
1278 | callbacks_list = [early_stopping]
1279 |
1280 | import time, datetime
1281 | start = datetime.datetime.now()
1282 |
1283 | hist = model1.fit(training_data, trains_y, epochs=10,callbacks=callbacks_list,batch_size=32,validation_split=0.1 )
1284 | end = datetime.datetime.now()
1285 | diff1= (end - start)
1286 | print('time taken by clstm',diff1)
1287 |
1288 |
1289 | # In[ ]:
1290 |
1291 |
1292 | pred=model1.predict(testing_data)
1293 | y_test=pred
1294 | y_test=y_test.tolist()
1295 | output_class_pred=[]
1296 | #output_class_pred=[]
1297 | for i in range(len(y_test)):
1298 | m=max(y_test[i])
1299 | if(y_test[i].index(m)==0):
1300 | output_class_pred.append(0)
1301 | else:
1302 | output_class_pred.append(1)
1303 |
1304 |
1305 | original_ans=data_test['tag']
1306 | original_ans=original_ans.tolist()
1307 |
1308 |
1309 | # In[ ]:
1310 |
1311 |
1312 | # label 1 is treated as the positive class for the TP/FP counts below
1313 | resi=check_metric(output_class_pred,original_ans,diff1)
1314 |
1315 | resi.to_csv('results_text.csv', mode='a', index = False, header=resi.columns,columns=resi.columns)
1316 |
1317 |
1318 |
1319 |
1320 | def ConvNet_lstm(embeddings, max_sequence_length, num_words, embedding_dim, trainable=True, extra_conv=False):
1321 |
1322 |
1323 | embedding_layer = Embedding(num_words,embedding_dim,weights=[embeddings],input_length=max_sequence_length,trainable=trainable)
1324 |
1325 | sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
1326 | embedded_sequences = embedding_layer(sequence_input)
1327 |
1328 | sentence_encoder = LSTM(64,return_sequences=False)(embedded_sequences)
1329 | fc_layer =Dense(128, activation="relu")(sentence_encoder)
1330 | #output_layer = Dense(9,activation="softmax")(fc_layer)
1331 | #model_1 = Model(inputs=[text_input_layer], outputs=[output_layer])
1332 | preds = Dense(2, activation='softmax')(fc_layer)
1333 |
1334 | model = Model(sequence_input, preds)
1335 | model.compile(loss='categorical_crossentropy',optimizer='Adam',metrics=['acc'])
1336 | model.summary()
1337 | return model
1338 |
1339 |
1340 |
1341 |
1342 |
1343 |
1344 | model1 = ConvNet_lstm(train_embedding_weights, MAX_SEQUENCE_LENGTH, len(train_word_index)+1, EMBEDDING_DIM,
1345 | True)
1346 |
1347 |
1348 |
1349 |
1350 | # In[ ]:
1351 |
1352 |
1353 | training_data=yoon_kim_train_data
1354 |
1355 |
1356 | # In[ ]:
1357 |
1358 |
1359 | testing_data=yoon_kim_test_data
1360 |
1361 |
1362 | # In[ ]:
1363 |
1364 |
1365 |
1366 | #define callbacks
1367 | from tensorflow.keras.callbacks import EarlyStopping
1368 | early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=4, verbose=1)
1369 | callbacks_list = [early_stopping]
1370 |
1371 | import time, datetime
1372 | start = datetime.datetime.now()
1373 |
1374 | hist = model1.fit(training_data, trains_y, epochs=10,callbacks=callbacks_list,batch_size=32,validation_split=0.1 )
1375 | end = datetime.datetime.now()
1376 | diff1= (end - start)
1377 | print('time taken by lstm',diff1)
1378 |
1379 |
1380 | # In[ ]:
1381 |
1382 |
1383 | pred=model1.predict(testing_data)
1384 | y_test=pred
1385 | y_test=y_test.tolist()
1386 | output_class_pred=[]
1387 | #output_class_pred=[]
1388 | for i in range(len(y_test)):
1389 | m=max(y_test[i])
1390 | if(y_test[i].index(m)==0):
1391 | output_class_pred.append(0)
1392 | else:
1393 | output_class_pred.append(1)
1394 |
1395 |
1396 | original_ans=data_test['tag']
1397 | original_ans=original_ans.tolist()
1398 |
1399 |
1400 | # In[ ]:
1401 |
1402 |
1403 | # label 1 is treated as the positive class for the TP/FP counts below
1404 | resi=check_metric(output_class_pred,original_ans,diff1)
1405 |
1406 | resi.to_csv('results_text.csv', mode='a', index = False, header=resi.columns,columns=resi.columns)
1407 |
1408 |
1409 |
1410 | #resi.to_csv('results.csv', mode='a', index = False, header=resi.columns,columns=resi.columns)
1411 |
1412 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | absl-py==0.9.0
2 | argon2-cffi==20.1.0
3 | arrow==0.13.1
4 | asn1crypto==1.2.0
5 | astor==0.8.0
6 | async-generator==1.10
7 | attrs==19.3.0
8 | Automat==0.8.0
9 | backcall==0.1.0
10 | bcrypt==3.1.7
11 | beautifulsoup4==4.8.2
12 | bleach==3.3.0
13 | blinker==1.4
14 | cachetools==3.1.1
15 | certifi==2019.11.28
16 | cffi==1.13.0
17 | chardet==3.0.4
18 | Click==7.0
19 | cloudpickle==1.3.0
20 | conda==4.8.2
21 | conda-package-handling==1.6.0
22 | constantly==15.1.0
23 | cryptography==2.8
24 | cssselect==1.1.0
25 | cycler==0.10.0
26 | Cython==0.29.17
27 | cytoolz==0.10.1
28 | dask==2.11.0
29 | debugpy==1.3.0
30 | decorator==4.4.1
31 | defusedxml==0.7.1
32 | eli5==0.10.1
33 | entrypoints==0.3
34 | fastcache==1.1.0
35 | future==0.18.2
36 | gast==0.2.2
37 | gensim==4.0.1
38 | gmpy2==2.0.8
39 | google-auth==1.11.2
40 | google-auth-oauthlib==0.4.1
41 | google-pasta==0.1.8
42 | graphviz==0.13.2
43 | grpcio==1.27.2
44 | h5py==2.8.0
45 | hyperlink==19.0.0
46 | idna==2.8
47 | imageio==2.6.1
48 | importlib-metadata==1.5.0
49 | incremental==17.5.0
50 | ipykernel==6.0.0
51 | ipython==7.25.0
52 | ipython-genutils==0.2.0
53 | jedi==0.16.0
54 | Jinja2==2.11.1
55 | joblib==0.14.1
56 | jsonschema==3.2.0
57 | jupyter-client==6.1.12
58 | jupyter-core==4.7.1
59 | jupyterlab-pygments==0.1.2
60 | Keras==2.3.1
61 | Keras-Applications==1.0.8
62 | Keras-Preprocessing==1.1.0
63 | kiwisolver==1.1.0
64 | leveldb==0.201
65 | lightgbm==2.3.0
66 | lxml==4.5.0
67 | Mako==1.1.1
68 | Markdown==3.1.1
69 | MarkupSafe==1.1.1
70 | matplotlib==3.1.3
71 | matplotlib-inline==0.1.2
72 | mistune==0.8.4
73 | mkl-fft==1.0.15
74 | mkl-random==1.1.0
75 | mkl-service==2.3.0
76 | more-itertools==8.2.0
77 | mpmath==1.1.0
78 | nbclient==0.5.3
79 | nbconvert==6.1.0
80 | nbformat==5.1.3
81 | nest-asyncio==1.5.1
82 | networkx==2.4
83 | nltk==3.4.5
84 | nose==1.3.7
85 | notebook==6.4.0
86 | numpy==1.18.1
87 | oauthlib==3.1.0
88 | olefile==0.46
89 | opt-einsum==3.1.0
90 | packaging==20.1
91 | pandas==1.0.1
92 | pandocfilters==1.4.3
93 | parsel==1.5.2
94 | parso==0.6.1
95 | patsy==0.5.1
96 | peewee==3.10.0
97 | pexpect==4.8.0
98 | pickleshare==0.7.5
99 | Pillow==7.0.0
100 | pluggy==0.13.1
101 | prometheus-client==0.11.0
102 | prompt-toolkit==3.0.3
103 | protobuf==3.11.4
104 | ptyprocess==0.6.0
105 | py==1.8.1
106 | pyasn1==0.4.8
107 | pyasn1-modules==0.2.7
108 | pycosat==0.6.3
109 | pycparser==2.19
110 | PyDispatcher==2.0.5
111 | pyglet==1.5.0
112 | Pygments==2.5.2
113 | pygpu==0.7.6
114 | PyHamcrest==1.9.0
115 | PyJWT==1.7.1
116 | pyOpenSSL==19.0.0
117 | pyparsing==2.4.6
118 | pyrsistent==0.18.0
119 | PySocks==1.7.1
120 | pytest==5.3.5
121 | pytest-runner==5.2
122 | python-dateutil==2.8.1
123 | python-gflags==3.1.2
124 | pytz==2019.3
125 | PyWavelets==1.1.1
126 | PyYAML==5.3
127 | pyzmq==22.1.0
128 | queuelib==1.5.0
129 | requests==2.22.0
130 | requests-oauthlib==1.3.0
131 | rsa==4.0
132 | ruamel-yaml==0.15.46
133 | scapy==2.4.3
134 | scikit-image==0.16.2
135 | scikit-learn==0.22.1
136 | scipy==1.4.1
137 | Scrapy==1.6.0
138 | seaborn==0.10.0
139 | Send2Trash==1.7.1
140 | service-identity==18.1.0
141 | simplejson==3.17.0
142 | singledispatch==3.4.0.3
143 | six==1.12.0
144 | smart-open==5.1.0
145 | soupsieve==1.9.5
146 | SQLAlchemy==1.3.13
147 | statsmodels==0.11.0
148 | sympy==1.5.1
149 | tabulate==0.8.6
150 | tensorboard==2.1.0
151 | tensorflow==2.0.0
152 | tensorflow-estimator==2.0.0
153 | termcolor==1.1.0
154 | terminado==0.10.1
155 | testpath==0.5.0
156 | Theano==1.0.4
157 | toolz==0.10.0
158 | torch==1.3.1
159 | tornado==6.1
160 | tqdm==4.36.1
161 | traitlets==4.3.3
162 | Twisted==19.10.0
163 | urllib3==1.24.2
164 | w3lib==1.21.0
165 | wcwidth==0.1.8
166 | webencodings==0.5.1
167 | Werkzeug==0.16.1
168 | wrapt==1.11.2
169 | wxPython==4.0.4
170 | zipp==2.2.0
171 | zope.interface==4.7.1
172 |
--------------------------------------------------------------------------------
/sample.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH -N 1 #1 Node
3 | #SBATCH --ntasks-per-node=8
4 | #SBATCH --time=3-00:00:00
5 | #SBATCH --job-name=run_models_gpu_1_16GB
6 | #SBATCH --error=%J.err
7 | #SBATCH --output=%J.out
8 |
9 | #SBATCH --partition=gpu
10 | #SBATCH --gres=gpu:1
11 |
12 | eval "$(conda shell.bash hook)"
13 |
14 | module load python/conda-python/3.7
15 |
16 | python3 /scratch/satyendrac.mnitjaipur/codes/soni/run_models.py
--------------------------------------------------------------------------------