├── source ├── ModelRun │ ├── __init__.py │ └── Driver.py ├── Models │ ├── __init__.py │ ├── LogisticRegressionImplementation.py │ └── NaiveBayes.py ├── PerformanceEvaluation │ ├── __init__.py │ ├── EvaluationReports │ │ ├── Naive Bayes Model_Evaluation_Report.txt │ │ ├── Random Forest Model_Evaluation_Report.txt │ │ ├── Logistic Regression Model_Evaluation_Report.txt │ │ └── Support Vector Machine Model_Evaluation_Report.txt │ └── Evaluate.py ├── DataCollectionAndCleaning │ ├── __init__.py │ ├── crawler.py │ └── DataExtractionAndFilter.py └── DataVisualization │ ├── __init__.py │ ├── Plot_Images │ ├── LossVsIterationPlot.png │ ├── Naive Bayes Model-scatterPlot.png │ ├── Random Forest Model-scatterPlot.png │ ├── Logistic Regression Model-scatterPlot.png │ └── Support Vector Machine Model-scatterPlot.png │ └── Visualize.py ├── FakeNewsDetectionProjectReport.pdf └── README.txt /source/ModelRun/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /source/Models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /source/PerformanceEvaluation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /source/DataCollectionAndCleaning/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /source/DataVisualization/__init__.py: -------------------------------------------------------------------------------- 1 | import Visualize -------------------------------------------------------------------------------- /FakeNewsDetectionProjectReport.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ritikavnair/Fake-News-Detection/HEAD/FakeNewsDetectionProjectReport.pdf -------------------------------------------------------------------------------- /source/DataVisualization/Plot_Images/LossVsIterationPlot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ritikavnair/Fake-News-Detection/HEAD/source/DataVisualization/Plot_Images/LossVsIterationPlot.png -------------------------------------------------------------------------------- /source/DataVisualization/Plot_Images/Naive Bayes Model-scatterPlot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ritikavnair/Fake-News-Detection/HEAD/source/DataVisualization/Plot_Images/Naive Bayes Model-scatterPlot.png -------------------------------------------------------------------------------- /source/DataVisualization/Plot_Images/Random Forest Model-scatterPlot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ritikavnair/Fake-News-Detection/HEAD/source/DataVisualization/Plot_Images/Random Forest Model-scatterPlot.png -------------------------------------------------------------------------------- /source/DataVisualization/Plot_Images/Logistic Regression Model-scatterPlot.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ritikavnair/Fake-News-Detection/HEAD/source/DataVisualization/Plot_Images/Logistic Regression Model-scatterPlot.png -------------------------------------------------------------------------------- /source/DataVisualization/Plot_Images/Support Vector Machine Model-scatterPlot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ritikavnair/Fake-News-Detection/HEAD/source/DataVisualization/Plot_Images/Support Vector Machine Model-scatterPlot.png -------------------------------------------------------------------------------- /source/PerformanceEvaluation/EvaluationReports/Naive Bayes Model_Evaluation_Report.txt: -------------------------------------------------------------------------------- 1 | ########################################################################################## 2 | ######################### Evaluation Report of Naive Bayes Model ######################### 3 | ########################################################################################## 4 | 5 | True Negative: 756 6 | False Positive: 1048 7 | Recall: 0.70 8 | Precision: 0.59 9 | False Negative: 658 10 | True Positive: 1538 11 | Accuracy: 0.5735 12 | -------------------------------------------------------------------------------- /source/PerformanceEvaluation/EvaluationReports/Random Forest Model_Evaluation_Report.txt: -------------------------------------------------------------------------------- 1 | ########################################################################################## 2 | ######################## Evaluation Report of Random Forest Model ######################## 3 | ########################################################################################## 4 | 5 | True Negative: 1756 6 | False Positive: 48 7 | Recall: 0.96 8 | Precision: 0.98 9 | False Negative: 82 10 | True Positive: 2114 11 | Accuracy: 0.9675 12 | -------------------------------------------------------------------------------- /source/PerformanceEvaluation/EvaluationReports/Logistic Regression Model_Evaluation_Report.txt: -------------------------------------------------------------------------------- 1 | ########################################################################################## 2 | ##################### Evaluation Report of Logistic Regression Model ##################### 3 | ########################################################################################## 4 | 5 | True Negative: 1750 6 | False Positive: 54 7 | Recall: 0.98 8 | Precision: 0.98 9 | False Negative: 41 10 | True Positive: 2155 11 | Accuracy: 0.97625 12 | -------------------------------------------------------------------------------- /source/PerformanceEvaluation/EvaluationReports/Support Vector Machine Model_Evaluation_Report.txt: -------------------------------------------------------------------------------- 1 | ########################################################################################## 2 | ################### Evaluation Report of Support Vector Machine Model #################### 3 | ########################################################################################## 4 | 5 | True Negative: 1754 6 | False Positive: 45 7 | Recall: 0.97 8 | Precision: 0.98 9 | False Negative: 76 10 | True Positive: 2125 11 | Accuracy: 0.97 12 | -------------------------------------------------------------------------------- /source/DataVisualization/Visualize.py: 
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | 
3 | 
4 | TITLE ={"ModelLogisticRegression":"Logistic Regression Model",
5 | "NaiveBayes":"Naive Bayes Model",
6 | "GridSearchCV":"Support Vector Machine Model",
7 | "RandomForestClassifier":"Random Forest Model"}
8 | 
9 | 
10 | def plotScatterGraphForPrediction(prediction ,y_test,className):
11 | 
12 | ''' scatter plot '''
13 | 
14 | fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(7,7))
15 | fig.suptitle('Fake news vs Real News -> '+TITLE[className])
16 | ax[0].scatter(xrange(len(prediction)), prediction, color='red')
17 | ax[0].set_ylim(-1, 2)
18 | ax[0].set_title('Prediction')
19 | ax[1].scatter(xrange(len(y_test)), y_test, color='green')
20 | ax[1].set_ylim(-1, 2)
21 | ax[1].set_title('Actual')
22 | plt.savefig('../DataVisualization/Plot_Images/'+TITLE[className]+'-scatterPlot.png')
23 | plt.show()
24 | 
25 | def loss_vs_iteration_plot(loss_array):
26 | 
27 | ''' plot loss v/s iteration plot '''
28 | 
29 | plt.title('loss vs iteration')
30 | plt.plot(xrange(len(loss_array)), loss_array)
31 | plt.savefig('../DataVisualization/Plot_Images/LossVsIterationPlot.png')
32 | plt.show()
33 | 
--------------------------------------------------------------------------------
/source/Models/LogisticRegressionImplementation.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | ''' Logistic Regression Model class '''
4 | class ModelLogisticRegression:
5 | 
6 | ''' init method '''
7 | def __init__(self,params): #learning_rate,threshold_tolerance=0.005,maximum_iterations=1000):
8 | self.maximum_iterations = params[2]#maximum_iterations
9 | self.threshold_tolerance = params[1]#threshold_tolerance
10 | self.learning_rate = params[0]#learning_rate
11 | 
12 | ''' train the model '''
13 | def fit(self,X,y):
14 | 
15 | iterations = 1
16 | self.loss_array = []
17 | self.weight = np.array([0] * len(X[0]))
18 | loss_difference = float('inf')
19 | loss = self.calculatelogisticLoss(X, y)
20 | while iterations < self.maximum_iterations and loss_difference > self.threshold_tolerance:
21 | iterations = iterations + 1
22 | scores = []
23 | for x in X:
24 | scores.append(np.dot(self.weight.T,x))
25 | error = sigmoid(scores) - y
26 | gradient = np.dot(X.T,error)
27 | self.weight = self.weight - (self.learning_rate * gradient)
28 | loss_difference = abs(loss - self.calculatelogisticLoss(X,y))
29 | loss = self.calculatelogisticLoss(X,y)
30 | self.loss_array.append(loss)
31 | 
32 | ''' calculation of logistic Loss '''
33 | def calculatelogisticLoss(self,X,y):
34 | res = []
35 | for x in X:
36 | res.append(np.dot(self.weight.T,x))
37 | return -1 * (np.sum((y * np.log(sigmoid(res))) + ((1 - y) * np.log(1 - sigmoid(res)))))
38 | 
39 | ''' test the model '''
40 | def predict(self, X):
41 | res = []
42 | for x in X:
43 | res.append(np.dot(self.weight.T,x))
44 | return np.round(sigmoid(res))
45 | 
46 | 
47 | ''' compute the sigmoid '''
48 | def sigmoid(scores):
49 | res = []
50 | for score in scores:
51 | res.append(1 / (1 + np.exp(-score)))
52 | return np.array(res)
--------------------------------------------------------------------------------
/source/PerformanceEvaluation/Evaluate.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | def accuracy(pred,y_test):
4 | 
5 | ''' compute accuracy '''
6 | 
7 | prediction=np.array(pred).astype(int)
8 | y_test=np.array(y_test).astype(int)
9 | count=0
10 | for i in range(0,len(y_test)):
11 | if prediction[i]==y_test[i]:
12 | count+=1
13 | accuracy=(count/float(len(y_test)))
14 | print "Accuracy on prediction :",accuracy
15 | 
16 | return accuracy
17 | 
18 | 
19 | def precision_recall_evaluation(prediction, y_test):
20 | 
21 | ''' calculate precision, recall '''
22 | 
23 | true_positive = 0
24 | false_positive = 0
25 | false_negative = 0
26 | true_negative = 0
27 | 
28 | for index in range(len(prediction)):
29 | if y_test[index] == 1 and prediction[index] == 1:
30 | true_positive += 1
31 | elif y_test[index] == 0 and prediction[index] == 1:
32 | false_positive += 1
33 | elif y_test[index] == 1 and prediction[index] == 0:
34 | false_negative += 1
35 | else:
36 | true_negative+=1
37 | 
38 | ''' confusion matrix data '''
39 | 
40 | print('True Positive', true_positive)
41 | print('False Positive', false_positive)
42 | print('False Negative', false_negative)
43 | print('True Negative', true_negative)
44 | 
45 | precision = true_positive / (float) (true_positive + false_positive)
46 | recall = true_positive / (float) (true_positive + false_negative)
47 | 
48 | print 'Precision: {0:0.2f}'.format(precision)
49 | print 'Recall: {0:0.2f}'.format(recall)
50 | 
51 | reportResults={'True Positive':true_positive,
52 | 'True Negative':true_negative,
53 | 'False Positive':false_positive,
54 | 'False Negative':false_negative,
55 | 'Precision':'{0:0.2f}'.format(precision),
56 | 'Recall': '{0:0.2f}'.format(recall)}
57 | return reportResults
--------------------------------------------------------------------------------
/README.txt:
--------------------------------------------------------------------------------
1 | #############################################################################################
2 | #################################### FINAL PROJECT #########################################
3 | #############################################################################################
4 | 
5 | GOAL: Detection of Fake News using various Machine Learning Classifier algorithms and evaluation of their
6 | performance
7 | 
8 | SUBMISSION SUMMARY:
9 | 
10 | Task 1: Generation of DataSet
11 | 
12 | > A. The News Aggregator Dataset from the UCI Machine Learning Repository was used to extract real
13 | news. This dataset consists of links to the originally published news articles on their websites.
14 | We extracted these URLs and crawled them to download the news content using BeautifulSoup.
15 | > B. For fake news we used Kaggle’s ‘Getting Real about Fake News’ dataset.
16 | The CSV file with data was available off the shelf for use, and we had to perform minimal text
17 | processing on this data.
18 | 
19 | Task 2: Implementation of Classifier Algorithms:
20 | 
21 | > A. Implemented the Logistic Regression Algorithm from scratch and tested it against the dataset.
22 | > B. Implemented the Naive Bayes Classifier Algorithm from scratch and tested it against the dataset.
23 | > C. Implemented the Random Forest Classifier Algorithm using the 'scikit-learn' library and integrated it with
24 | our data set.
25 | > D. Implemented the Support Vector Machine Classifier Algorithm using the 'scikit-learn' library and integrated
26 | it with our data set.
27 | 
28 | Task 3: Implementation of Evaluation Measures:
29 | 
30 | > A. Implemented methods to calculate accuracy of the prediction algorithms.
31 | > B. Implemented methods to calculate precision of the prediction algorithms.
32 | > C. Implemented methods to calculate recall of the prediction algorithms (a short worked example of these measures follows this list).
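For reference, the three measures above are simple ratios over the confusion-matrix counts printed by
Evaluate.py. A minimal sketch (an editorial illustration only, not part of the submitted source), plugging in
the counts reported in "Naive Bayes Model_Evaluation_Report.txt":

    tp, fp, fn, tn = 1538, 1048, 658, 756                # confusion-matrix counts taken from the report
    precision = tp / float(tp + fp)                      # 1538 / 2586 -> 0.59
    recall = tp / float(tp + fn)                         # 1538 / 2196 -> 0.70
    accuracy = (tp + tn) / float(tp + fp + fn + tn)      # 2294 / 4000 -> 0.5735
    print 'Precision: {0:0.2f} Recall: {1:0.2f} Accuracy: {2:0.4f}'.format(precision, recall, accuracy)

These reproduce the Precision, Recall and Accuracy figures listed in that report.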
33 | 34 | Task 4: Implementation of Data Visualization code via graph plots: 35 | 36 | > A. Implemented methods to plot iteration vs loss graph for Logistic Regression. 37 | > B. Implemented methods to generate scatter plot of predictions and actual results against data set. 38 | 39 | 40 | 41 | ############################################################################################ 42 | 43 | INSTALLATION GUIDE: 44 | 45 | > Download Python 2.7 from : "https://www.python.org/download/releases/2.7/" 46 | > Set Environment variables for Python [for detailed steps refer : 47 | "https://docs.python.org/2/using/windows.html" ] 48 | > Install BeautifulSoup by the following the below steps: 49 | 1. Open command prompt (cmd) in Windows. 50 | 2. Run Command : 'pip install BeautifulSoup4' 51 | 3. Run Command : 'pip install sklearn' 52 | 4. Run Command : 'pip install pandas' 53 | 5. Run Command : 'pip install numpy' 54 | 6. Run Command : 'pip install matplotlib' 55 | 7. Run Command : 'pip install nltk' 56 | 57 | ########################################################################################### 58 | 59 | 60 | STEPS TO RUN PROGRAM: 61 | 62 | > Open Command Prompt in Windows 63 | > Go to the directory {localpath}/FakeNews/ModelRun 64 | > Run the command: 65 | python Driver.py 66 | > Follow the instructions as shown in the command prompt 67 | 68 | ########################################################################################## 69 | Project Members: 70 | ---------------- 71 | Ritika Nair 72 | Shubham Rastogi 73 | Tridiv Nandi 74 | -------------------------------------------------------------------------------- /source/Models/NaiveBayes.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | 4 | class NaiveBayes(): 5 | 6 | def __init__(self,pp): 7 | print "" 8 | 9 | def fit(self,X,y): 10 | ''' initialize the model ''' 11 | 12 | self.X=X[:,1:] 13 | self.y=y 14 | self.docCount,self.vocabularyCount = np.shape(self.X) 15 | #print self.docCount,self.vocabularyCount 16 | self.categories,self.categoryCount=self.createFeatureDictionary() 17 | self.featureCount={} 18 | for classes in self.categories: 19 | self.featureCount[classes]=len(self.categories[classes]) 20 | self.train() 21 | 22 | 23 | 24 | def createFeatureDictionary(self): 25 | 26 | ''' create dictionary for each class 27 | key : class , value = {feature : frequency } ''' 28 | 29 | categories={} 30 | categoryCount={} 31 | for yi in np.unique(self.y): 32 | categories[yi]={} 33 | categoryCount[yi]=len(self.y[self.y==yi]) 34 | 35 | #print categoryCount 36 | 37 | for j in range(self.docCount) : 38 | for i in range(self.vocabularyCount): 39 | if self.X[j][i]!=0: 40 | if i not in categories[self.y[j]]: 41 | categories[self.y[j]][i]=1 42 | else: 43 | categories[self.y[j]][i]+=1 44 | else: 45 | categories[self.y[j]][i]=0 46 | 47 | return categories,categoryCount 48 | 49 | 50 | def train(self): 51 | 52 | ''' train the model ''' 53 | 54 | ''' calculate the prior probabilities 55 | and conditional probabilities ''' 56 | 57 | self.priorProbab={} 58 | self.conditionalProbab={} 59 | 60 | for classes in self.categories: 61 | self.priorProbab[classes]=math.log(self.featureCount[classes]/float(self.docCount)) 62 | self.conditionalProbab[classes]={} 63 | for features in self.categories[classes]: 64 | self.conditionalProbab[classes][features]=\ 65 | math.log((self.categories[classes][features] + 1) / ( float( self.featureCount[classes] + self.vocabularyCount ) ) ) 66 | 67 | 68 | def 
predict(self,X_test): 69 | 70 | pred=[] 71 | for xi in X_test: 72 | pred.append(self.check(xi)) 73 | 74 | return pred 75 | 76 | def check(self,x): 77 | ''' test the data ''' 78 | docfeatures=[] 79 | for i in range(len(x)): 80 | if x[i]!=0: 81 | docfeatures.append(i) 82 | 83 | val={} 84 | for classes in self.categories: 85 | val[classes]=self.priorProbab[classes] 86 | 87 | 88 | unseenProbab={} 89 | for classes in self.categories: 90 | unseenProbab[classes]=math.log(1/float(self.featureCount[classes]+self.vocabularyCount)) 91 | 92 | 93 | for classes in self.categories: 94 | for feature in docfeatures: 95 | if feature in self.categories[classes]: 96 | val[classes]+=self.categories[classes][feature] 97 | else: 98 | val[classes]+=unseenProbab[classes] 99 | 100 | sortedMap=sorted(val.iteritems(),key=lambda(k,v):(v,k),reverse=True) 101 | 102 | return sortedMap[0][0] 103 | 104 | if __name__=='__main__': 105 | 106 | ''' testing Naive Bayes accuracy ''' 107 | n=NaiveBayes(1) 108 | X=np.array([[0,0,0,0,0.33,0.121,0.1121,0,0,0,0,0,0,0,0.33,0.121,0.1121,0,0,0], 109 | [0.134,0,0.111,0.11,0,0.1231,0,0,0,0,0.134,0,0.111,0.11,0,0.1231,0,0,0,0], 110 | [0,0.11,0,0,0.123,1.111,1.566,0,0.221,0,0,0.11,0,0,0.123,1.111,1.566,0,0.221,0], 111 | [0,0,0.12,0,0.01,1.111,1.566,0,0.221,0.11,0,0.11,0,0,0.123,1.111,1.566,0,0.221,0], 112 | [0,0,0.12,0,0.01,1.111,1.566,0,0.221,0.11,0,0,0.12,0,0.01,1.111,1.566,0,0.221,0.11], 113 | [0,0,0.12,0,0.01,1.111,1.566,0,0.221,0.11,0,0,0.12,0,0.01,1.111,1.566,0,0.221,0.11], 114 | [0,0,0.12,0,0.01,1.111,1.566,0,0.221,0.11,0,0,0.12,0,0.01,1.111,1.566,0,0.221,0.11], 115 | [0,0,0.12,0,0.01,1.111,1.566,0,0.221,0.11,0,0,0.12,0,0.01,1.111,1.566,0,0.221,0.11], 116 | [0,0,0.12,0,0.01,1.111,1.566,0,0.221,0.11,0,0,0.12,0,0.01,1.111,1.566,0,0.221,0.11], 117 | [0,0,0.12,0,0.01,1.111,1.566,0,0.221,0.11,0,0,0.12,0,0.01,1.111,1.566,0,0.221,0.11]]) 118 | y=np.array([0,1,0,1,1,1,0,0,1,0]) 119 | n.fit(X[:6], y[:6]) 120 | pred=n.predict(X[7:]) 121 | print pred 122 | print y[7:] 123 | -------------------------------------------------------------------------------- /source/DataCollectionAndCleaning/crawler.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import pandas as pd 4 | import Queue 5 | import threading 6 | import sys 7 | import re 8 | 9 | TEXT_PASSAGE={} 10 | queue=Queue.Queue() 11 | 12 | ''' threaded crawler for crawling data ''' 13 | class ThreadedCrawler(threading.Thread): 14 | def __init__(self,queue): 15 | 16 | threading.Thread.__init__(self) 17 | self.queue=queue 18 | 19 | 20 | def run(self): 21 | while True: 22 | try: 23 | index,url=self.queue.get() 24 | print "thread no: "+str(threading.current_thread()) +", working on :"+str(index) 25 | sys.stdout.flush() 26 | page = requests.get(url) 27 | text = page.text 28 | soupObj = BeautifulSoup(text, 'html.parser') 29 | res = "" 30 | # remove any remaining image tags 31 | for img in soupObj.findAll('img'): 32 | img.decompose() 33 | 34 | # remove all formulas 35 | for mathItems in soupObj.findAll('math'): 36 | mathItems.decompose() 37 | 38 | # remove all tables and their content 39 | for table in soupObj.findAll('table'): 40 | table.decompose() 41 | 42 | # remove content navigation 43 | for conNav in soupObj.findAll('div', {'id':'toc'}): 44 | conNav.decompose() 45 | 46 | # remove script 47 | for scripts in soupObj.findAll('script'): 48 | scripts.decompose() 49 | 50 | 51 | for div in soupObj.find_all("div", {"class": re.compile("content")}): 52 | res += 
div.get_text().encode('UTF-8') 53 | 54 | # Remove newlines and extra spaces 55 | res=res.strip() 56 | res=res.replace("\n", "") 57 | res=res.replace("\t","") 58 | res = " ".join(res.split()) 59 | 60 | TEXT_PASSAGE[index]=res 61 | except Exception: 62 | TEXT_PASSAGE[index]="" 63 | self.queue.task_done() 64 | 65 | def main(dataFrame=pd.DataFrame() ,URL=None,listOfUrls=None): 66 | threadCount=1 67 | if not dataFrame.empty: 68 | #print "Enter the column name from which urls need to be extracted:" 69 | urlCol='URL'#raw_input() 70 | #print "Enter the target column name where the crawled text need to be inserted:" 71 | targetCol='Text'#raw_input() 72 | if dataFrame.size>10: 73 | threadCount=abs(dataFrame.size/100) 74 | startThreads(threadCount) 75 | for index, row in dataFrame.iterrows(): 76 | queue.put((index,row[urlCol])) 77 | queue.join() 78 | count =1 79 | print 'start copying' 80 | for index in TEXT_PASSAGE: 81 | text = TEXT_PASSAGE[index] 82 | if text == '': 83 | dataFrame.drop(index, inplace=True) 84 | continue 85 | dataFrame.at[index, targetCol] = text 86 | print "Completed : " + str(count) 87 | count+=1 88 | return dataFrame 89 | elif listOfUrls!=None: 90 | if len(listOfUrls)>10: 91 | threadCount=abs(len(listOfUrls)/10) 92 | startThreads(threadCount) 93 | for urls in listOfUrls: 94 | queue.put((urls,urls)) 95 | queue.join() 96 | text=[] 97 | return TEXT_PASSAGE 98 | elif URL!=None: 99 | startThreads(threadCount) 100 | queue.put((URL,URL)) 101 | queue.join() 102 | if URL in TEXT_PASSAGE: 103 | return TEXT_PASSAGE[URL] 104 | else: return "" 105 | 106 | def startThreads(threadCount): 107 | 108 | try: 109 | for i in range(threadCount): 110 | print "Thread count :",i 111 | t=ThreadedCrawler(queue) 112 | t.setDaemon(True) 113 | t.start() 114 | except Exception as e: 115 | print "error" 116 | sys.stdout.flush() 117 | print e.__doc__ 118 | print e.message 119 | sys.stdout.flush() 120 | 121 | 122 | def crawlSingleUrl(): 123 | 124 | ''' returns text crawled from url ''' 125 | 126 | print "Enter url to crawl" 127 | url=raw_input() 128 | #text=main(dataFrame=None,URL=url,listOfUrls=None) 129 | text=main(URL=url) 130 | return text 131 | 132 | def crawlListOfUrl(): 133 | 134 | ''' returns a map {url: text} ''' 135 | 136 | print "Enter multiple URLs comma separated (eg. http://www.wikipwdia.com, www.yahoo.com) " 137 | line=raw_input() 138 | urlList=line.split(',') 139 | #text=main(dataFrame=None, URL=None,listOfUrls=urlList) 140 | text=main(listOfUrls=urlList) 141 | return text 142 | 143 | def crawlUrlsFromCSV(): 144 | print "Enter the full path of the csv file: " 145 | f=raw_input() 146 | df = pd.read_csv(f) 147 | df=df.astype(str) 148 | df=main(dataFrame=df) 149 | df.to_csv(f, index=False) 150 | 151 | def crawlUrlsFromDF(df): 152 | df=df.astype(str) 153 | df=main(dataFrame=df) 154 | return df 155 | 156 | def selectTask(): 157 | print "\nSelect the Task to perform using multithreaded crawler :" 158 | print "Enter 1 : To crawl a single URL and return context text or null if the url does not exists." 159 | print "Enter 2 : To crawl a list of URLs. and return list of textual content" 160 | print "Enter 3 : To crawl URLs from a csv file and update the same csv on the target column" 161 | print "Enter 4 : To exit!!!!" 
162 | options={1:crawlSingleUrl,
163 | 2:crawlListOfUrl,
164 | 3:crawlUrlsFromCSV,
165 | 4:crawlUrlsFromDF}
166 | print "Enter Your Choice >>> "
167 | x=input()
168 | if x!=4:
169 | return options[x]()
170 | 
171 | if __name__ == '__main__':
172 | print 'Welcome to multithreaded crawler'
173 | t=selectTask()
174 | #print t
175 | #main('DataSets\RealNewsDataSet1.csv')
176 | 
--------------------------------------------------------------------------------
/source/ModelRun/Driver.py:
--------------------------------------------------------------------------------
1 | import os.path
2 | import pickle
3 | import sys
4 | sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
5 | from sklearn import svm
6 | from sklearn.ensemble import RandomForestClassifier
7 | from sklearn.feature_extraction.text import TfidfVectorizer
8 | from sklearn.metrics.ranking import roc_auc_score
9 | from sklearn.metrics.scorer import make_scorer
10 | from sklearn.model_selection import GridSearchCV
11 | from DataVisualization import Visualize
12 | from Models.LogisticRegressionImplementation import ModelLogisticRegression
13 | from Models.NaiveBayes import NaiveBayes
14 | from PerformanceEvaluation import Evaluate
15 | import numpy as np
16 | import pandas as pd
17 | import nltk
18 | nltk.download('stopwords')
19 | from nltk.corpus import stopwords
20 | 
21 | 
22 | MODEL={1:"Logistic Regression Model",2:"Naive Bayes Model"}
23 | PARAMS={1:(0.0004,0.0003,1000),2:2}
24 | TITLE ={"ModelLogisticRegression":"Logistic Regression Model",
25 | "NaiveBayes":"Naive Bayes Model",
26 | "GridSearchCV":"Support Vector Machine Model",
27 | "RandomForestClassifier":"Random Forest Model"}
28 | 
29 | def fetchData(fileName,modelObj):
30 | data=pd.read_csv(fileName)
31 | print "Enter the size of data to train and test (max data - 20000): "
32 | dataSize=input()
33 | trainRatio=input("Enter the ratio of Train to Test data (for example, 0.8 for 80% Train to 20% Test): ")
34 | data=data[:dataSize]
35 | trainDataSize=int(abs(dataSize*trainRatio))
36 | testStartIndex=trainDataSize
37 | testEndIndex=dataSize
38 | 
39 | ''' fetching data text feature from data set for training '''
40 | X_train=data.iloc[:trainDataSize,2].values
41 | 
42 | ''' fetching real or fake feature from data set for training '''
43 | y_train=data.iloc[:trainDataSize,-1].values
44 | 
45 | ''' fetching data text feature from data set for testing '''
46 | X_test=data.iloc[testStartIndex:testEndIndex,2].values
47 | 
48 | ''' fetching real or fake feature from data set for testing '''
49 | y_test=data.iloc[testStartIndex:testEndIndex,-1].values
50 | 
51 | print "The data split is as follows:"
52 | print "X-train :",len(X_train)
53 | print "Y-train :",len(y_train)
54 | print "X-test :",len(X_test)
55 | print "Y-test :",len(y_test)
56 | 
57 | ''' fetch stop words list from nltk '''
58 | stopwords_=[word.encode('utf-8')for word in list(stopwords.words('english'))]
59 | #print stopwords_
60 | 
61 | ''' Optimization of feature generation based on Model '''
62 | 
63 | if modelObj.__class__.__name__!='GridSearchCV':
64 | maxFeatures=50000
65 | else:
66 | maxFeatures=10000
67 | 
68 | ''' initialize tfidf object '''
69 | ''' feature generation -> tfidf { parameters max_features set to a fixed number to produce results fast,
70 | stop_words are removed by initializing the param stop_words using a
71 | stop words list fetched using NLTK lib }'''
72 | tfidf = TfidfVectorizer(min_df=1,max_features=maxFeatures,stop_words=stopwords_)
73 | 
74 | ''' Generate TF-IDF Feature for train and test data'''
75 | 
tfidfTrain = tfidf.fit_transform(X_train).toarray() 76 | tfidfTest= tfidf.transform(X_test).toarray() 77 | 78 | ''' dimensions of new features generated ''' 79 | print 'Shape of the tfidf vector :', np.shape(tfidfTrain) 80 | 81 | ''' padding constants to the generated tfidfTrain and tfidfTest ''' 82 | constant = np.ones((len(tfidfTrain), 1)) 83 | tfidfTrain = np.hstack((constant,tfidfTrain)) 84 | constant2 = np.ones((len(tfidfTest), 1)) 85 | tfidfTest = np.hstack((constant2,tfidfTest)) 86 | 87 | ''' return the data split ''' 88 | return (tfidfTrain,y_train,tfidfTest,y_test) 89 | 90 | def runModel(modelObj): 91 | print "Enter the file path of the data set to be used: (currently hard coded)" 92 | #fileName=input() 93 | 94 | ''' fetch the data split ''' 95 | X_train,y_train,X_test,y_test=fetchData('../DataSets/FinalDataSet.csv',modelObj) 96 | 97 | #Visualize.plotInitalData(X_train,y_train) 98 | 99 | ''' fit the Train data ''' 100 | modelObj.fit(X_train,y_train) 101 | 102 | ''' predict using test data ''' 103 | pred=modelObj.predict(X_test) 104 | writeValsToPickleFile(pred,'Prediction-'+modelObj.__class__.__name__) 105 | writeValsToPickleFile(y_test, 'Actual_data-'+modelObj.__class__.__name__) 106 | 107 | print "\nEvaluation on test data:\n" 108 | ''' Evaluation of testing data and prediction : based on accuracy, precision , recall of the data ''' 109 | mapResults=Evaluate.precision_recall_evaluation(pred, y_test) 110 | mapResults['Accuracy']=Evaluate.accuracy(pred,y_test) 111 | 112 | print '\n Writing the result to a text file for reference' 113 | writeResultsToTextFile(mapResults,TITLE[modelObj.__class__.__name__]) 114 | 115 | print "\nVisualization\n" 116 | ''' Visualize the output ''' 117 | Visualize.plotScatterGraphForPrediction(pred, y_test,modelObj.__class__.__name__) 118 | if modelObj.__class__.__name__=='ModelLogisticRegression': 119 | loss_array = modelObj.loss_array 120 | writeValsToPickleFile(loss_array, 'loss_data-'+modelObj.__class__.__name__) 121 | Visualize.loss_vs_iteration_plot(loss_array) 122 | 123 | def writeResultsToTextFile(mapResults,model): 124 | fname="../PerformanceEvaluation/EvaluationReports/"+model+'_Evaluation_Report'+'.txt' 125 | if os.path.exists(fname): 126 | os.remove(fname) 127 | fileModel=open(fname,'w') 128 | topic=" Evaluation Report of "+model+" " 129 | hashLen=90-len(topic) 130 | hashLen=hashLen/2 131 | filler="#"*hashLen+topic+"#"*hashLen 132 | if len(filler)<90: 133 | filler+="#" 134 | fileModel.write("#"*90+"\n"+filler+"\n"+"#"*90+"\n\n") 135 | for results in mapResults: 136 | fileModel.write(results+": "+str(mapResults[results])+"\n") 137 | fileModel.close() 138 | 139 | 140 | 141 | def selectTasks(): 142 | while True: 143 | print "\nSelect the Model for classification:" 144 | print "Enter 1 : Logistic Regression" 145 | print "Enter 2 : Naive Bayes" 146 | print "Enter 3 : Support Vector Machne Model using SKlearn library" 147 | print "Enter 4 : Random Forest Model using SKlearn library" 148 | print "Enter 5 : To exit!!!!" 
149 | options={1:ModelLogisticRegression, 150 | 2:NaiveBayes, 151 | 3:svm.SVC, 152 | 4:RandomForestClassifier} 153 | print "Enter Your Choice >>> " 154 | x=input() 155 | if x==5: 156 | break 157 | elif x==4: 158 | print "Classification on Random Forest Model using SKLearn Library" 159 | runModel(options[x](n_jobs=2,random_state=0)) 160 | elif x==3: 161 | print "Classification on Support Vector Machine Model using SKLearn Library" 162 | parameters={'C' :[0.00001,0.0001,0.001,0.01,0.1,1,10],'kernel':['linear'], 'random_state': [1]} 163 | svc=svm.SVC(kernel = 'linear', probability = True, random_state = 0) 164 | roc_auc_scorer = make_scorer(roc_auc_score) 165 | modelObj = GridSearchCV(svc, parameters, scoring=roc_auc_scorer) 166 | runModel(modelObj) 167 | else: 168 | print "Classification on "+MODEL[x] 169 | runModel(options[x](PARAMS[x])) 170 | 171 | def writeValsToPickleFile(data,name): 172 | 173 | fName='../DataVisualization/PickleFilesForActualAndPredicted/'+name+'.pickle' 174 | if os.path.exists(fName): 175 | os.remove(fName) 176 | fileIndex=open(fName,'wb') 177 | pickle.dump(data,fileIndex) 178 | fileIndex.close() 179 | 180 | if __name__=='__main__': 181 | print "Welcome to fake news classifier" 182 | selectTasks() -------------------------------------------------------------------------------- /source/DataCollectionAndCleaning/DataExtractionAndFilter.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import os 3 | from sklearn.utils import shuffle 4 | import crawler 5 | LEN=2 6 | 7 | ''' fetch the raw real news data from csv ''' 8 | 9 | def fetchDataFrameRealNewsCSV(): 10 | print('Fetching real news dataset....') 11 | fake_news_data = pd.read_csv('../DataSets/uci-news-aggregator.csv') 12 | dfReal = pd.DataFrame(data=fake_news_data) 13 | return dfReal 14 | 15 | ''' fetch the raw fake news data from csv ''' 16 | 17 | def fetchDataFrameFakeNewsCSV(): 18 | print('Fetching fake news dataset....') 19 | fake_news_data = pd.read_csv('../DataSets/fake.csv') 20 | dfFake = pd.DataFrame(data=fake_news_data) 21 | return dfFake 22 | 23 | ''' process fake news data ''' 24 | 25 | def processFakeNewsDataFrame(dfFake): 26 | pd.options.mode.chained_assignment = None 27 | dfFake=dfFake.astype('str') 28 | dfFake['country']=pd.Series('united states',index=dfFake.index) 29 | dfFake['Site Country']=dfFake['site_url'].apply(lambda x: str(x).split('.')[0]) 30 | dfFake=dfFake[['site_url','title','text','author','language','country','Site Country','published']] 31 | dfFake['Fake ( fake =1 and Real =0)']=pd.Series('1',index=dfFake.index) 32 | dfFake.columns=[ 'URL', 'Title', 'Text', 'Author', 'Language', 'Site Country', 'Site Name', 'ThreadPublication Date', 'Fake ( fake =1 and Real =0)'] 33 | print (len(dfFake['URL'][1].split('.')[1])==3) 34 | dfFake=dfFake.loc[(dfFake['Language']=='english') & (dfFake['Site Country']=='united states')] 35 | print dfFake['URL'].size 36 | dfFake.reindex(fill_value='') 37 | print "No. 
of fake records : ",dfFake['URL'].size 38 | return dfFake 39 | 40 | ''' process real news data ''' 41 | 42 | def processRealNewsDataFrame(dfReal): 43 | pd.options.mode.chained_assignment = None 44 | dfReal=dfReal.loc[(dfReal['CATEGORY']=='b') & (dfReal['URL']!='') & (dfReal['URL']!=None)] #& dfReal.URL.str.contains('^http') ] 45 | dfReal['Text']=pd.Series('',index=dfReal.index) 46 | dfReal['Fake ( fake =1 and Real =0)']=pd.Series('0',index=dfReal.index) 47 | dfReal['Language']=pd.Series('english',index=dfReal.index) 48 | dfReal['Site Country']=pd.Series('united states',index=dfReal.index) 49 | dfReal=dfReal.drop_duplicates('URL') 50 | dfReal=dfReal[['URL','TITLE','Text','PUBLISHER','Language','Site Country','HOSTNAME','TIMESTAMP','Fake ( fake =1 and Real =0)']] 51 | dfReal.columns=[ 'URL', 'Title', 'Text', 'Author', 'Language', 'Site Country', 'Site Name', 'ThreadPublication Date', 'Fake ( fake =1 and Real =0)'] 52 | dfReal=dfReal.astype('str') 53 | dfReal.reindex() 54 | print 'No. of Real news records: ',dfReal['URL'].size 55 | return dfReal 56 | 57 | ''' extract top real news urls for crawling ''' 58 | 59 | def extractTopRealResultsForCrawling(dfReal): 60 | print "Retrieve top 20000 Real news data" 61 | num=dfReal.size 62 | loop=num/10000 63 | listOfIndex=[] 64 | df=[] 65 | for i in range(0,loop): 66 | listOfIndex.append(dfReal[i*10000:(i+1)*10000]) 67 | df+=[dfReal[i*10000:(i+1)*10000]] 68 | 69 | #print "length of dataframe array retrieved:",len(df[0]) 70 | return df[:LEN] 71 | 72 | ''' filter text records that is not relevant or null ''' 73 | 74 | def filterNullTextContentRecords(df): 75 | keyList =['Page Not Found','The item that you have requested was not found','The address was entered incorrectly',\ 76 | 'The item no longer exists','There has been an error on the site','We apologize for any inconvenience','font-size,font-family',\ 77 | 'text-align','404 - File or directory not found','The resource you are looking for might have been removed', \ 78 | 'had its name changed', 'or is temporarily unavailable.','Return to the previous page',\ 79 | 'If you feel the address you entered is correct you can contact us',\ 80 | 'mentioning the error message received and the item you were trying to reach','It looks like nothing was found at this location.',\ 81 | 'Well, this is unfortunate','Your story was not found','The story you requested could not be found',\ 82 | '404 - File or directory not found','PAGE NOT FOUND','We\'re sorry that the page you\'re looking for cannot be found','Page Not Found - 404',\ 83 | 'Sorry, but the page you were looking for is not here','This is usually the result of a bad or outdated link','ERROR404',\ 84 | 'The case of this missing page is still unsolved','Return to the previous page','The item that you have requested was not found',\ 85 | 'The case of this missing page is still unsolved','The page may no longer exist or may have moved to another web address',\ 86 | 'The page you were looking for cannot be found','The page you requested cannot be found',\ 87 | 'Either it doesn\'t exist or it was removed from the site','It looks like nothing was found at this location',\ 88 | 'This Page Could Not Be Found','404 Sorry, the page you have searched for doesnt exist','Nothing was found at this location',\ 89 | 'ERROR404The case of this missing page is still unsolved','Error 404 Nothing found','404 - File or directory not found',\ 90 | 'Oh no!No content to show for this page','404 The resource or page you are looking for could have been removed, had its name changed, or 
is temporarily unavailable',\
91 | 'Sorry, the page you are looking for cannot be found','Oops! Page Not Found','Oops! Page Not Found',\
92 | '404We\'re sorry, but the page you were looking for doesn\'t exist','Page not found','Pardon Our Interruption',\
93 | '500 - Internal server error','Not found, error 404','The page you are looking for no longer exists','Oops, This Page Could Not Be Found',\
94 | 'The page you\'ve requested can not be displayed','It appears you\'ve missed your intended destination, either through a bad or outdated link',\
95 | 'This might be because:You have typed the web address incorrectly, or the page you were looking for may have been moved, updated or deleted',\
96 | 'we couldn\'t find the page you were looking for','500 - Internal server error.There is a problem with the resource you are looking for, and it cannot be displayed',\
97 | 'We haven\'t been able to serve the page you asked for','We\'re sorry, but we seem to have lost this page','PAGE NOT FOUND',\
98 | 'We\'re sorry,the page you requested could not be found'] # print keyList
99 | print 'No. of records: ',df['URL'].size
100 | pd.options.mode.chained_assignment = None
101 | for index,row in df.iterrows():
102 | if checkText(str(row['Text']),keyList) or len(str(row['Text']))<300:
103 | df.drop(index, inplace=True)
104 | return df
105 | 
106 | ''' test whether the key words exist in the given text '''
107 | 
108 | def checkText(text,keyWords):
109 | 
110 | for key in keyWords:
111 | if key in text:
112 | return True
113 | return False
114 | 
115 | ''' combine and shuffle the data '''
116 | 
117 | def combineAndShuffle(dfCombine):
118 | combinedDf = pd.concat(dfCombine)
119 | combinedDf = shuffle(combinedDf)
120 | return combinedDf
121 | 
122 | ''' extract real and fake news data '''
123 | 
124 | def dataSetExtraction():
125 | dfReal=fetchDataFrameRealNewsCSV()
126 | dfFake=fetchDataFrameFakeNewsCSV()
127 | dfFake=processFakeNewsDataFrame(dfFake)
128 | dfReal=processRealNewsDataFrame(dfReal)
129 | dfCombine=[]
130 | for d in extractTopRealResultsForCrawling(dfReal):
131 | print 'len of dataframe :',d['URL'].size
132 | #d=d[:100]
133 | d=crawler.crawlUrlsFromDF(d)
134 | d=filterNullTextContentRecords(d)
135 | dfCombine+=[d]
136 | dfCombine+=[dfFake]
137 | df=combineAndShuffle(dfCombine)
138 | if os.path.exists('../DataSets/FinalDataSet.csv'):
139 | os.remove('../DataSets/FinalDataSet.csv')
140 | df.to_csv('../DataSets/FinalDataSet.csv', index=False)
141 | print 'No. of records in final data set: ',df['URL'].size
142 | print "Saving New CSV file"
143 | 
144 | if __name__=='__main__':dataSetExtraction()
--------------------------------------------------------------------------------