├── source ├── ModelRun │ ├── __init__.py │ └── Driver.py ├── Models │ ├── __init__.py │ ├── LogisticRegressionImplementation.py │ └── NaiveBayes.py ├── PerformanceEvaluation │ ├── __init__.py │ ├── EvaluationReports │ │ ├── Naive Bayes Model_Evaluation_Report.txt │ │ ├── Random Forest Model_Evaluation_Report.txt │ │ ├── Logistic Regression Model_Evaluation_Report.txt │ │ └── Support Vector Machine Model_Evaluation_Report.txt │ └── Evaluate.py ├── DataCollectionAndCleaning │ ├── __init__.py │ ├── crawler.py │ └── DataExtractionAndFilter.py └── DataVisualization │ ├── __init__.py │ ├── Plot_Images │ ├── LossVsIterationPlot.png │ ├── Naive Bayes Model-scatterPlot.png │ ├── Random Forest Model-scatterPlot.png │ ├── Logistic Regression Model-scatterPlot.png │ └── Support Vector Machine Model-scatterPlot.png │ └── Visualize.py ├── FakeNewsDetectionProjectReport.pdf └── README.txt /source/ModelRun/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /source/Models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /source/PerformanceEvaluation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /source/DataCollectionAndCleaning/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /source/DataVisualization/__init__.py: -------------------------------------------------------------------------------- 1 | import Visualize -------------------------------------------------------------------------------- /FakeNewsDetectionProjectReport.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ritikavnair/Fake-News-Detection/HEAD/FakeNewsDetectionProjectReport.pdf -------------------------------------------------------------------------------- /source/DataVisualization/Plot_Images/LossVsIterationPlot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ritikavnair/Fake-News-Detection/HEAD/source/DataVisualization/Plot_Images/LossVsIterationPlot.png -------------------------------------------------------------------------------- /source/DataVisualization/Plot_Images/Naive Bayes Model-scatterPlot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ritikavnair/Fake-News-Detection/HEAD/source/DataVisualization/Plot_Images/Naive Bayes Model-scatterPlot.png -------------------------------------------------------------------------------- /source/DataVisualization/Plot_Images/Random Forest Model-scatterPlot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ritikavnair/Fake-News-Detection/HEAD/source/DataVisualization/Plot_Images/Random Forest Model-scatterPlot.png -------------------------------------------------------------------------------- /source/DataVisualization/Plot_Images/Logistic Regression Model-scatterPlot.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ritikavnair/Fake-News-Detection/HEAD/source/DataVisualization/Plot_Images/Logistic Regression Model-scatterPlot.png -------------------------------------------------------------------------------- /source/DataVisualization/Plot_Images/Support Vector Machine Model-scatterPlot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ritikavnair/Fake-News-Detection/HEAD/source/DataVisualization/Plot_Images/Support Vector Machine Model-scatterPlot.png -------------------------------------------------------------------------------- /source/PerformanceEvaluation/EvaluationReports/Naive Bayes Model_Evaluation_Report.txt: -------------------------------------------------------------------------------- 1 | ########################################################################################## 2 | ######################### Evaluation Report of Naive Bayes Model ######################### 3 | ########################################################################################## 4 | 5 | True Negative: 756 6 | False Positive: 1048 7 | Recall: 0.70 8 | Precision: 0.59 9 | False Negative: 658 10 | True Positive: 1538 11 | Accuracy: 0.5735 12 | -------------------------------------------------------------------------------- /source/PerformanceEvaluation/EvaluationReports/Random Forest Model_Evaluation_Report.txt: -------------------------------------------------------------------------------- 1 | ########################################################################################## 2 | ######################## Evaluation Report of Random Forest Model ######################## 3 | ########################################################################################## 4 | 5 | True Negative: 1756 6 | False Positive: 48 7 | Recall: 0.96 8 | Precision: 0.98 9 | False Negative: 82 10 | True Positive: 2114 11 | Accuracy: 0.9675 12 | -------------------------------------------------------------------------------- /source/PerformanceEvaluation/EvaluationReports/Logistic Regression Model_Evaluation_Report.txt: -------------------------------------------------------------------------------- 1 | ########################################################################################## 2 | ##################### Evaluation Report of Logistic Regression Model ##################### 3 | ########################################################################################## 4 | 5 | True Negative: 1750 6 | False Positive: 54 7 | Recall: 0.98 8 | Precision: 0.98 9 | False Negative: 41 10 | True Positive: 2155 11 | Accuracy: 0.97625 12 | -------------------------------------------------------------------------------- /source/PerformanceEvaluation/EvaluationReports/Support Vector Machine Model_Evaluation_Report.txt: -------------------------------------------------------------------------------- 1 | ########################################################################################## 2 | ################### Evaluation Report of Support Vector Machine Model #################### 3 | ########################################################################################## 4 | 5 | True Negative: 1754 6 | False Positive: 45 7 | Recall: 0.97 8 | Precision: 0.98 9 | False Negative: 76 10 | True Positive: 2125 11 | Accuracy: 0.97 12 | -------------------------------------------------------------------------------- /source/DataVisualization/Visualize.py: 
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | 
3 | 
4 | TITLE ={"ModelLogisticRegression":"Logistic Regression Model",
5 | "NaiveBayes":"Naive Bayes Model",
6 | "GridSearchCV":"Support Vector Machine Model",
7 | "RandomForestClassifier":"Random Forest Model"}
8 | 
9 | 
10 | def plotScatterGraphForPrediction(prediction ,y_test,className):
11 | 
12 | ''' scatter plot '''
13 | 
14 | fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(7,7))
15 | fig.suptitle('Fake news vs Real News -> '+TITLE[className])
16 | ax[0].scatter(xrange(len(prediction)), prediction, color='red')
17 | ax[0].set_ylim(-1, 2)
18 | ax[0].set_title('Prediction')
19 | ax[1].scatter(xrange(len(y_test)), y_test, color='green')
20 | ax[1].set_ylim(-1, 2)
21 | ax[1].set_title('Actual')
22 | plt.savefig('../DataVisualization/Plot_Images/'+TITLE[className]+'-scatterPlot.png')
23 | plt.show()
24 | 
25 | def loss_vs_iteration_plot(loss_array):
26 | 
27 | ''' plot loss v/s iteration plot '''
28 | 
29 | plt.title('loss vs iteration')
30 | plt.plot(xrange(len(loss_array)), loss_array)
31 | plt.savefig('../DataVisualization/Plot_Images/LossVsIterationPlot.png')
32 | plt.show()
33 | 
--------------------------------------------------------------------------------
/source/Models/LogisticRegressionImplementation.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | ''' Logistic Regression Model class '''
4 | class ModelLogisticRegression:
5 | 
6 | ''' init method '''
7 | def __init__(self,params): #learning_rate,threshold_tolerance=0.005,maximum_iterations=1000):
8 | self.maximum_iterations = params[2]#maximum_iterations
9 | self.threshold_tolerance = params[1]#threshold_tolerance
10 | self.learning_rate = params[0]#learning_rate
11 | 
12 | ''' train the model '''
13 | def fit(self,X,y):
14 | 
15 | iterations = 1
16 | self.loss_array = []
17 | self.weight = np.array([0] * len(X[0]))
18 | loss_difference = float('inf')
19 | loss = self.calculatelogisticLoss(X, y)
20 | while iterations < self.maximum_iterations and loss_difference > self.threshold_tolerance:
21 | iterations = iterations + 1
22 | scores = []
23 | for x in X:
24 | scores.append(np.dot(self.weight.T,x))
25 | error = sigmoid(scores) - y
26 | gradient = np.dot(X.T,error)
27 | self.weight = self.weight - (self.learning_rate * gradient)
28 | loss_difference = abs(loss - self.calculatelogisticLoss(X,y))
29 | loss = self.calculatelogisticLoss(X,y)
30 | self.loss_array.append(loss)
31 | 
32 | ''' calculation of logistic Loss '''
33 | def calculatelogisticLoss(self,X,y):
34 | res = []
35 | for x in X:
36 | res.append(np.dot(self.weight.T,x))
37 | return -1 * (np.sum((y * np.log(sigmoid(res))) + ((1 - y) * np.log(1 - sigmoid(res)))))
38 | 
39 | ''' test the model '''
40 | def predict(self, X):
41 | res = []
42 | for x in X:
43 | res.append(np.dot(self.weight.T,x))
44 | return np.round(sigmoid(res))
45 | 
46 | 
47 | ''' compute the sigmoid '''
48 | def sigmoid(scores):
49 | res = []
50 | for score in scores:
51 | res.append(1 / (1 + np.exp(-score)))
52 | return np.array(res)
--------------------------------------------------------------------------------
/source/PerformanceEvaluation/Evaluate.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | def accuracy(pred,y_test):
4 | 
5 | ''' compute accuracy '''
6 | 
7 | prediction=np.array(pred).astype(int)
8 | y_test=np.array(y_test).astype(int)
9 | count=0
10 | for i in range(0,len(y_test)):
11 | if prediction[i]==y_test[i]:
12 | count+=1
13 | accuracy=(count/float(len(y_test)))
14 | print "Accuracy on prediction :",accuracy
15 | 
16 | return accuracy
17 | 
18 | 
19 | def precision_recall_evaluation(prediction, y_test):
20 | 
21 | ''' calculate precision, recall '''
22 | 
23 | true_positive = 0
24 | false_positive = 0
25 | false_negative = 0
26 | true_negative = 0
27 | 
28 | for index in range(len(prediction)):
29 | if y_test[index] == 1 and prediction[index] == 1:
30 | true_positive += 1
31 | elif y_test[index] == 0 and prediction[index] == 1:
32 | false_positive += 1
33 | elif y_test[index] == 1 and prediction[index] == 0:
34 | false_negative += 1
35 | else:
36 | true_negative+=1
37 | 
38 | ''' confusion matrix data '''
39 | 
40 | print('True Positive', true_positive)
41 | print('False Positive', false_positive)
42 | print('False Negative', false_negative)
43 | print('True Negative', true_negative)
44 | 
45 | precision = true_positive / (float) (true_positive + false_positive)
46 | recall = true_positive / (float) (true_positive + false_negative)
47 | 
48 | print 'Precision: {0:0.2f}'.format(precision)
49 | print 'Recall: {0:0.2f}'.format(recall)
50 | 
51 | reportResults={'True Positive':true_positive,
52 | 'True Negative':true_negative,
53 | 'False Positive':false_positive,
54 | 'False Negative':false_negative,
55 | 'Precision':'{0:0.2f}'.format(precision),
56 | 'Recall': '{0:0.2f}'.format(recall)}
57 | return reportResults
--------------------------------------------------------------------------------
/README.txt:
--------------------------------------------------------------------------------
1 | #############################################################################################
2 | #################################### FINAL PROJECT #########################################
3 | #############################################################################################
4 | 
5 | GOAL: Detection of Fake News using various Machine Learning Classifier algorithms and evaluation of their
6 | performance
7 | 
8 | SUBMISSION SUMMARY:
9 | 
10 | Task 1: Generation of DataSet
11 | 
12 | > A. The News Aggregator Dataset from the UCI Machine Learning Repository was used to extract real
13 | news. This dataset consists of links to the originally published news articles on their websites.
14 | We extracted these URLs and crawled them to download the news content using BeautifulSoup.
15 | > B. For fake news we used Kaggle’s ‘Getting Real about Fake News’ dataset.
16 | The CSV file with data was available off the shelf for use, and we had to perform minimal text
17 | processing on this data.
18 | 
19 | Task 2: Implementation of Classifier Algorithms:
20 | 
21 | > A. Implemented the Logistic Regression Algorithm from scratch and tested it against the dataset.
22 | > B. Implemented the Naive Bayes Classifier Algorithm from scratch and tested it against the dataset.
23 | > C. Implemented the Random Forest Classifier Algorithm using the 'scikit-learn' library and integrated it with
24 | our data set.
25 | > D. Implemented the Support Vector Machine Classifier Algorithm using the 'scikit-learn' library and integrated
26 | it with our data set.
27 | 
28 | Task 3: Implementation of Evaluation Measures:
29 | 
30 | > A. Implemented methods to calculate accuracy of the prediction algorithms.
31 | > B. Implemented methods to calculate precision of the prediction algorithms.
32 | > C. Implemented methods to calculate recall of the prediction algorithms (a short worked example of these measures follows this list).
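For reference, the three measures above are simple ratios over the confusion-matrix counts printed by
Evaluate.py. A minimal sketch (an editorial illustration only, not part of the submitted source), plugging in
the counts reported in "Naive Bayes Model_Evaluation_Report.txt":

    tp, fp, fn, tn = 1538, 1048, 658, 756                # confusion-matrix counts taken from the report
    precision = tp / float(tp + fp)                      # 1538 / 2586 -> 0.59
    recall = tp / float(tp + fn)                         # 1538 / 2196 -> 0.70
    accuracy = (tp + tn) / float(tp + fp + fn + tn)      # 2294 / 4000 -> 0.5735
    print 'Precision: {0:0.2f} Recall: {1:0.2f} Accuracy: {2:0.4f}'.format(precision, recall, accuracy)

These reproduce the Precision, Recall and Accuracy figures listed in that report.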
33 | 34 | Task 4: Implementation of Data Visualization code via graph plots: 35 | 36 | > A. Implemented methods to plot iteration vs loss graph for Logistic Regression. 37 | > B. Implemented methods to generate scatter plot of predictions and actual results against data set. 38 | 39 | 40 | 41 | ############################################################################################ 42 | 43 | INSTALLATION GUIDE: 44 | 45 | > Download Python 2.7 from : "https://www.python.org/download/releases/2.7/" 46 | > Set Environment variables for Python [for detailed steps refer : 47 | "https://docs.python.org/2/using/windows.html" ] 48 | > Install BeautifulSoup by the following the below steps: 49 | 1. Open command prompt (cmd) in Windows. 50 | 2. Run Command : 'pip install BeautifulSoup4' 51 | 3. Run Command : 'pip install sklearn' 52 | 4. Run Command : 'pip install pandas' 53 | 5. Run Command : 'pip install numpy' 54 | 6. Run Command : 'pip install matplotlib' 55 | 7. Run Command : 'pip install nltk' 56 | 57 | ########################################################################################### 58 | 59 | 60 | STEPS TO RUN PROGRAM: 61 | 62 | > Open Command Prompt in Windows 63 | > Go to the directory {localpath}/FakeNews/ModelRun 64 | > Run the command: 65 | python Driver.py 66 | > Follow the instructions as shown in the command prompt 67 | 68 | ########################################################################################## 69 | Project Members: 70 | ---------------- 71 | Ritika Nair 72 | Shubham Rastogi 73 | Tridiv Nandi 74 | -------------------------------------------------------------------------------- /source/Models/NaiveBayes.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | 4 | class NaiveBayes(): 5 | 6 | def __init__(self,pp): 7 | print "" 8 | 9 | def fit(self,X,y): 10 | ''' initialize the model ''' 11 | 12 | self.X=X[:,1:] 13 | self.y=y 14 | self.docCount,self.vocabularyCount = np.shape(self.X) 15 | #print self.docCount,self.vocabularyCount 16 | self.categories,self.categoryCount=self.createFeatureDictionary() 17 | self.featureCount={} 18 | for classes in self.categories: 19 | self.featureCount[classes]=len(self.categories[classes]) 20 | self.train() 21 | 22 | 23 | 24 | def createFeatureDictionary(self): 25 | 26 | ''' create dictionary for each class 27 | key : class , value = {feature : frequency } ''' 28 | 29 | categories={} 30 | categoryCount={} 31 | for yi in np.unique(self.y): 32 | categories[yi]={} 33 | categoryCount[yi]=len(self.y[self.y==yi]) 34 | 35 | #print categoryCount 36 | 37 | for j in range(self.docCount) : 38 | for i in range(self.vocabularyCount): 39 | if self.X[j][i]!=0: 40 | if i not in categories[self.y[j]]: 41 | categories[self.y[j]][i]=1 42 | else: 43 | categories[self.y[j]][i]+=1 44 | else: 45 | categories[self.y[j]][i]=0 46 | 47 | return categories,categoryCount 48 | 49 | 50 | def train(self): 51 | 52 | ''' train the model ''' 53 | 54 | ''' calculate the prior probabilities 55 | and conditional probabilities ''' 56 | 57 | self.priorProbab={} 58 | self.conditionalProbab={} 59 | 60 | for classes in self.categories: 61 | self.priorProbab[classes]=math.log(self.featureCount[classes]/float(self.docCount)) 62 | self.conditionalProbab[classes]={} 63 | for features in self.categories[classes]: 64 | self.conditionalProbab[classes][features]=\ 65 | math.log((self.categories[classes][features] + 1) / ( float( self.featureCount[classes] + self.vocabularyCount ) ) ) 66 | 67 | 68 | def 
predict(self,X_test): 69 | 70 | pred=[] 71 | for xi in X_test: 72 | pred.append(self.check(xi)) 73 | 74 | return pred 75 | 76 | def check(self,x): 77 | ''' test the data ''' 78 | docfeatures=[] 79 | for i in range(len(x)): 80 | if x[i]!=0: 81 | docfeatures.append(i) 82 | 83 | val={} 84 | for classes in self.categories: 85 | val[classes]=self.priorProbab[classes] 86 | 87 | 88 | unseenProbab={} 89 | for classes in self.categories: 90 | unseenProbab[classes]=math.log(1/float(self.featureCount[classes]+self.vocabularyCount)) 91 | 92 | 93 | for classes in self.categories: 94 | for feature in docfeatures: 95 | if feature in self.categories[classes]: 96 | val[classes]+=self.categories[classes][feature] 97 | else: 98 | val[classes]+=unseenProbab[classes] 99 | 100 | sortedMap=sorted(val.iteritems(),key=lambda(k,v):(v,k),reverse=True) 101 | 102 | return sortedMap[0][0] 103 | 104 | if __name__=='__main__': 105 | 106 | ''' testing Naive Bayes accuracy ''' 107 | n=NaiveBayes(1) 108 | X=np.array([[0,0,0,0,0.33,0.121,0.1121,0,0,0,0,0,0,0,0.33,0.121,0.1121,0,0,0], 109 | [0.134,0,0.111,0.11,0,0.1231,0,0,0,0,0.134,0,0.111,0.11,0,0.1231,0,0,0,0], 110 | [0,0.11,0,0,0.123,1.111,1.566,0,0.221,0,0,0.11,0,0,0.123,1.111,1.566,0,0.221,0], 111 | [0,0,0.12,0,0.01,1.111,1.566,0,0.221,0.11,0,0.11,0,0,0.123,1.111,1.566,0,0.221,0], 112 | [0,0,0.12,0,0.01,1.111,1.566,0,0.221,0.11,0,0,0.12,0,0.01,1.111,1.566,0,0.221,0.11], 113 | [0,0,0.12,0,0.01,1.111,1.566,0,0.221,0.11,0,0,0.12,0,0.01,1.111,1.566,0,0.221,0.11], 114 | [0,0,0.12,0,0.01,1.111,1.566,0,0.221,0.11,0,0,0.12,0,0.01,1.111,1.566,0,0.221,0.11], 115 | [0,0,0.12,0,0.01,1.111,1.566,0,0.221,0.11,0,0,0.12,0,0.01,1.111,1.566,0,0.221,0.11], 116 | [0,0,0.12,0,0.01,1.111,1.566,0,0.221,0.11,0,0,0.12,0,0.01,1.111,1.566,0,0.221,0.11], 117 | [0,0,0.12,0,0.01,1.111,1.566,0,0.221,0.11,0,0,0.12,0,0.01,1.111,1.566,0,0.221,0.11]]) 118 | y=np.array([0,1,0,1,1,1,0,0,1,0]) 119 | n.fit(X[:6], y[:6]) 120 | pred=n.predict(X[7:]) 121 | print pred 122 | print y[7:] 123 | -------------------------------------------------------------------------------- /source/DataCollectionAndCleaning/crawler.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import pandas as pd 4 | import Queue 5 | import threading 6 | import sys 7 | import re 8 | 9 | TEXT_PASSAGE={} 10 | queue=Queue.Queue() 11 | 12 | ''' threaded crawler for crawling data ''' 13 | class ThreadedCrawler(threading.Thread): 14 | def __init__(self,queue): 15 | 16 | threading.Thread.__init__(self) 17 | self.queue=queue 18 | 19 | 20 | def run(self): 21 | while True: 22 | try: 23 | index,url=self.queue.get() 24 | print "thread no: "+str(threading.current_thread()) +", working on :"+str(index) 25 | sys.stdout.flush() 26 | page = requests.get(url) 27 | text = page.text 28 | soupObj = BeautifulSoup(text, 'html.parser') 29 | res = "" 30 | # remove any remaining image tags 31 | for img in soupObj.findAll('img'): 32 | img.decompose() 33 | 34 | # remove all formulas 35 | for mathItems in soupObj.findAll('math'): 36 | mathItems.decompose() 37 | 38 | # remove all tables and their content 39 | for table in soupObj.findAll('table'): 40 | table.decompose() 41 | 42 | # remove content navigation 43 | for conNav in soupObj.findAll('div', {'id':'toc'}): 44 | conNav.decompose() 45 | 46 | # remove script 47 | for scripts in soupObj.findAll('script'): 48 | scripts.decompose() 49 | 50 | 51 | for div in soupObj.find_all("div", {"class": re.compile("content")}): 52 | res += 
div.get_text().encode('UTF-8') 53 | 54 | # Remove newlines and extra spaces 55 | res=res.strip() 56 | res=res.replace("\n", "") 57 | res=res.replace("\t","") 58 | res = " ".join(res.split()) 59 | 60 | TEXT_PASSAGE[index]=res 61 | except Exception: 62 | TEXT_PASSAGE[index]="" 63 | self.queue.task_done() 64 | 65 | def main(dataFrame=pd.DataFrame() ,URL=None,listOfUrls=None): 66 | threadCount=1 67 | if not dataFrame.empty: 68 | #print "Enter the column name from which urls need to be extracted:" 69 | urlCol='URL'#raw_input() 70 | #print "Enter the target column name where the crawled text need to be inserted:" 71 | targetCol='Text'#raw_input() 72 | if dataFrame.size>10: 73 | threadCount=abs(dataFrame.size/100) 74 | startThreads(threadCount) 75 | for index, row in dataFrame.iterrows(): 76 | queue.put((index,row[urlCol])) 77 | queue.join() 78 | count =1 79 | print 'start copying' 80 | for index in TEXT_PASSAGE: 81 | text = TEXT_PASSAGE[index] 82 | if text == '': 83 | dataFrame.drop(index, inplace=True) 84 | continue 85 | dataFrame.at[index, targetCol] = text 86 | print "Completed : " + str(count) 87 | count+=1 88 | return dataFrame 89 | elif listOfUrls!=None: 90 | if len(listOfUrls)>10: 91 | threadCount=abs(len(listOfUrls)/10) 92 | startThreads(threadCount) 93 | for urls in listOfUrls: 94 | queue.put((urls,urls)) 95 | queue.join() 96 | text=[] 97 | return TEXT_PASSAGE 98 | elif URL!=None: 99 | startThreads(threadCount) 100 | queue.put((URL,URL)) 101 | queue.join() 102 | if URL in TEXT_PASSAGE: 103 | return TEXT_PASSAGE[URL] 104 | else: return "" 105 | 106 | def startThreads(threadCount): 107 | 108 | try: 109 | for i in range(threadCount): 110 | print "Thread count :",i 111 | t=ThreadedCrawler(queue) 112 | t.setDaemon(True) 113 | t.start() 114 | except Exception as e: 115 | print "error" 116 | sys.stdout.flush() 117 | print e.__doc__ 118 | print e.message 119 | sys.stdout.flush() 120 | 121 | 122 | def crawlSingleUrl(): 123 | 124 | ''' returns text crawled from url ''' 125 | 126 | print "Enter url to crawl" 127 | url=raw_input() 128 | #text=main(dataFrame=None,URL=url,listOfUrls=None) 129 | text=main(URL=url) 130 | return text 131 | 132 | def crawlListOfUrl(): 133 | 134 | ''' returns a map {url: text} ''' 135 | 136 | print "Enter multiple URLs comma separated (eg. http://www.wikipwdia.com, www.yahoo.com) " 137 | line=raw_input() 138 | urlList=line.split(',') 139 | #text=main(dataFrame=None, URL=None,listOfUrls=urlList) 140 | text=main(listOfUrls=urlList) 141 | return text 142 | 143 | def crawlUrlsFromCSV(): 144 | print "Enter the full path of the csv file: " 145 | f=raw_input() 146 | df = pd.read_csv(f) 147 | df=df.astype(str) 148 | df=main(dataFrame=df) 149 | df.to_csv(f, index=False) 150 | 151 | def crawlUrlsFromDF(df): 152 | df=df.astype(str) 153 | df=main(dataFrame=df) 154 | return df 155 | 156 | def selectTask(): 157 | print "\nSelect the Task to perform using multithreaded crawler :" 158 | print "Enter 1 : To crawl a single URL and return context text or null if the url does not exists." 159 | print "Enter 2 : To crawl a list of URLs. and return list of textual content" 160 | print "Enter 3 : To crawl URLs from a csv file and update the same csv on the target column" 161 | print "Enter 4 : To exit!!!!" 
162 | options={1:crawlSingleUrl,
163 | 2:crawlListOfUrl,
164 | 3:crawlUrlsFromCSV,
165 | 4:crawlUrlsFromDF}
166 | print "Enter Your Choice >>> "
167 | x=input()
168 | if x!=4:
169 | return options[x]()
170 | 
171 | if __name__ == '__main__':
172 | print 'Welcome to multithreaded crawler'
173 | t=selectTask()
174 | #print t
175 | #main('DataSets\RealNewsDataSet1.csv')
176 | 
--------------------------------------------------------------------------------
/source/ModelRun/Driver.py:
--------------------------------------------------------------------------------
1 | import os.path
2 | import pickle
3 | import sys
4 | sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
5 | from sklearn import svm
6 | from sklearn.ensemble import RandomForestClassifier
7 | from sklearn.feature_extraction.text import TfidfVectorizer
8 | from sklearn.metrics.ranking import roc_auc_score
9 | from sklearn.metrics.scorer import make_scorer
10 | from sklearn.model_selection import GridSearchCV
11 | from DataVisualization import Visualize
12 | from Models.LogisticRegressionImplementation import ModelLogisticRegression
13 | from Models.NaiveBayes import NaiveBayes
14 | from PerformanceEvaluation import Evaluate
15 | import numpy as np
16 | import pandas as pd
17 | import nltk
18 | nltk.download('stopwords')
19 | from nltk.corpus import stopwords
20 | 
21 | 
22 | MODEL={1:"Logistic Regression Model",2:"Naive Bayes Model"}
23 | PARAMS={1:(0.0004,0.0003,1000),2:2}
24 | TITLE ={"ModelLogisticRegression":"Logistic Regression Model",
25 | "NaiveBayes":"Naive Bayes Model",
26 | "GridSearchCV":"Support Vector Machine Model",
27 | "RandomForestClassifier":"Random Forest Model"}
28 | 
29 | def fetchData(fileName,modelObj):
30 | data=pd.read_csv(fileName)
31 | print "Enter the size of data to train and test (max data - 20000): "
32 | dataSize=input()
33 | trainRatio=input("Enter the ratio of Train to Test data (for example, 0.8 for 80% Train to 20% Test): ")
34 | data=data[:dataSize]
35 | trainDataSize=int(abs(dataSize*trainRatio))
36 | testStartIndex=trainDataSize
37 | testEndIndex=dataSize
38 | 
39 | ''' fetching data text feature from data set for training '''
40 | X_train=data.iloc[:trainDataSize,2].values
41 | 
42 | ''' fetching real or fake feature from data set for training '''
43 | y_train=data.iloc[:trainDataSize,-1].values
44 | 
45 | ''' fetching data text feature from data set for testing '''
46 | X_test=data.iloc[testStartIndex:testEndIndex,2].values
47 | 
48 | ''' fetching real or fake feature from data set for testing '''
49 | y_test=data.iloc[testStartIndex:testEndIndex,-1].values
50 | 
51 | print "The data split is as follows:"
52 | print "X-train :",len(X_train)
53 | print "Y-train :",len(y_train)
54 | print "X-test :",len(X_test)
55 | print "Y-test :",len(y_test)
56 | 
57 | ''' fetch stop words list from nltk '''
58 | stopwords_=[word.encode('utf-8')for word in list(stopwords.words('english'))]
59 | #print stopwords_
60 | 
61 | ''' Optimization of feature generation based on Model '''
62 | 
63 | if modelObj.__class__.__name__!='GridSearchCV':
64 | maxFeatures=50000
65 | else:
66 | maxFeatures=10000
67 | 
68 | ''' initialize tfidf object '''
69 | ''' feature generation -> tfidf { parameters max_features set to a fixed number to produce results fast,
70 | stop_words are removed by initializing the param stop_words using a
71 | stop words list fetched using NLTK lib }'''
72 | tfidf = TfidfVectorizer(min_df=1,max_features=maxFeatures,stop_words=stopwords_)
73 | 
74 | ''' Generate TF-IDF Feature for train and test data'''
75 | 
tfidfTrain = tfidf.fit_transform(X_train).toarray() 76 | tfidfTest= tfidf.transform(X_test).toarray() 77 | 78 | ''' dimensions of new features generated ''' 79 | print 'Shape of the tfidf vector :', np.shape(tfidfTrain) 80 | 81 | ''' padding constants to the generated tfidfTrain and tfidfTest ''' 82 | constant = np.ones((len(tfidfTrain), 1)) 83 | tfidfTrain = np.hstack((constant,tfidfTrain)) 84 | constant2 = np.ones((len(tfidfTest), 1)) 85 | tfidfTest = np.hstack((constant2,tfidfTest)) 86 | 87 | ''' return the data split ''' 88 | return (tfidfTrain,y_train,tfidfTest,y_test) 89 | 90 | def runModel(modelObj): 91 | print "Enter the file path of the data set to be used: (currently hard coded)" 92 | #fileName=input() 93 | 94 | ''' fetch the data split ''' 95 | X_train,y_train,X_test,y_test=fetchData('../DataSets/FinalDataSet.csv',modelObj) 96 | 97 | #Visualize.plotInitalData(X_train,y_train) 98 | 99 | ''' fit the Train data ''' 100 | modelObj.fit(X_train,y_train) 101 | 102 | ''' predict using test data ''' 103 | pred=modelObj.predict(X_test) 104 | writeValsToPickleFile(pred,'Prediction-'+modelObj.__class__.__name__) 105 | writeValsToPickleFile(y_test, 'Actual_data-'+modelObj.__class__.__name__) 106 | 107 | print "\nEvaluation on test data:\n" 108 | ''' Evaluation of testing data and prediction : based on accuracy, precision , recall of the data ''' 109 | mapResults=Evaluate.precision_recall_evaluation(pred, y_test) 110 | mapResults['Accuracy']=Evaluate.accuracy(pred,y_test) 111 | 112 | print '\n Writing the result to a text file for reference' 113 | writeResultsToTextFile(mapResults,TITLE[modelObj.__class__.__name__]) 114 | 115 | print "\nVisualization\n" 116 | ''' Visualize the output ''' 117 | Visualize.plotScatterGraphForPrediction(pred, y_test,modelObj.__class__.__name__) 118 | if modelObj.__class__.__name__=='ModelLogisticRegression': 119 | loss_array = modelObj.loss_array 120 | writeValsToPickleFile(loss_array, 'loss_data-'+modelObj.__class__.__name__) 121 | Visualize.loss_vs_iteration_plot(loss_array) 122 | 123 | def writeResultsToTextFile(mapResults,model): 124 | fname="../PerformanceEvaluation/EvaluationReports/"+model+'_Evaluation_Report'+'.txt' 125 | if os.path.exists(fname): 126 | os.remove(fname) 127 | fileModel=open(fname,'w') 128 | topic=" Evaluation Report of "+model+" " 129 | hashLen=90-len(topic) 130 | hashLen=hashLen/2 131 | filler="#"*hashLen+topic+"#"*hashLen 132 | if len(filler)<90: 133 | filler+="#" 134 | fileModel.write("#"*90+"\n"+filler+"\n"+"#"*90+"\n\n") 135 | for results in mapResults: 136 | fileModel.write(results+": "+str(mapResults[results])+"\n") 137 | fileModel.close() 138 | 139 | 140 | 141 | def selectTasks(): 142 | while True: 143 | print "\nSelect the Model for classification:" 144 | print "Enter 1 : Logistic Regression" 145 | print "Enter 2 : Naive Bayes" 146 | print "Enter 3 : Support Vector Machne Model using SKlearn library" 147 | print "Enter 4 : Random Forest Model using SKlearn library" 148 | print "Enter 5 : To exit!!!!" 
149 | options={1:ModelLogisticRegression, 150 | 2:NaiveBayes, 151 | 3:svm.SVC, 152 | 4:RandomForestClassifier} 153 | print "Enter Your Choice >>> " 154 | x=input() 155 | if x==5: 156 | break 157 | elif x==4: 158 | print "Classification on Random Forest Model using SKLearn Library" 159 | runModel(options[x](n_jobs=2,random_state=0)) 160 | elif x==3: 161 | print "Classification on Support Vector Machine Model using SKLearn Library" 162 | parameters={'C' :[0.00001,0.0001,0.001,0.01,0.1,1,10],'kernel':['linear'], 'random_state': [1]} 163 | svc=svm.SVC(kernel = 'linear', probability = True, random_state = 0) 164 | roc_auc_scorer = make_scorer(roc_auc_score) 165 | modelObj = GridSearchCV(svc, parameters, scoring=roc_auc_scorer) 166 | runModel(modelObj) 167 | else: 168 | print "Classification on "+MODEL[x] 169 | runModel(options[x](PARAMS[x])) 170 | 171 | def writeValsToPickleFile(data,name): 172 | 173 | fName='../DataVisualization/PickleFilesForActualAndPredicted/'+name+'.pickle' 174 | if os.path.exists(fName): 175 | os.remove(fName) 176 | fileIndex=open(fName,'wb') 177 | pickle.dump(data,fileIndex) 178 | fileIndex.close() 179 | 180 | if __name__=='__main__': 181 | print "Welcome to fake news classifier" 182 | selectTasks() -------------------------------------------------------------------------------- /source/DataCollectionAndCleaning/DataExtractionAndFilter.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import os 3 | from sklearn.utils import shuffle 4 | import crawler 5 | LEN=2 6 | 7 | ''' fetch the raw real news data from csv ''' 8 | 9 | def fetchDataFrameRealNewsCSV(): 10 | print('Fetching real news dataset....') 11 | fake_news_data = pd.read_csv('../DataSets/uci-news-aggregator.csv') 12 | dfReal = pd.DataFrame(data=fake_news_data) 13 | return dfReal 14 | 15 | ''' fetch the raw fake news data from csv ''' 16 | 17 | def fetchDataFrameFakeNewsCSV(): 18 | print('Fetching fake news dataset....') 19 | fake_news_data = pd.read_csv('../DataSets/fake.csv') 20 | dfFake = pd.DataFrame(data=fake_news_data) 21 | return dfFake 22 | 23 | ''' process fake news data ''' 24 | 25 | def processFakeNewsDataFrame(dfFake): 26 | pd.options.mode.chained_assignment = None 27 | dfFake=dfFake.astype('str') 28 | dfFake['country']=pd.Series('united states',index=dfFake.index) 29 | dfFake['Site Country']=dfFake['site_url'].apply(lambda x: str(x).split('.')[0]) 30 | dfFake=dfFake[['site_url','title','text','author','language','country','Site Country','published']] 31 | dfFake['Fake ( fake =1 and Real =0)']=pd.Series('1',index=dfFake.index) 32 | dfFake.columns=[ 'URL', 'Title', 'Text', 'Author', 'Language', 'Site Country', 'Site Name', 'ThreadPublication Date', 'Fake ( fake =1 and Real =0)'] 33 | print (len(dfFake['URL'][1].split('.')[1])==3) 34 | dfFake=dfFake.loc[(dfFake['Language']=='english') & (dfFake['Site Country']=='united states')] 35 | print dfFake['URL'].size 36 | dfFake.reindex(fill_value='') 37 | print "No. 
of fake records : ",dfFake['URL'].size 38 | return dfFake 39 | 40 | ''' process real news data ''' 41 | 42 | def processRealNewsDataFrame(dfReal): 43 | pd.options.mode.chained_assignment = None 44 | dfReal=dfReal.loc[(dfReal['CATEGORY']=='b') & (dfReal['URL']!='') & (dfReal['URL']!=None)] #& dfReal.URL.str.contains('^http') ] 45 | dfReal['Text']=pd.Series('',index=dfReal.index) 46 | dfReal['Fake ( fake =1 and Real =0)']=pd.Series('0',index=dfReal.index) 47 | dfReal['Language']=pd.Series('english',index=dfReal.index) 48 | dfReal['Site Country']=pd.Series('united states',index=dfReal.index) 49 | dfReal=dfReal.drop_duplicates('URL') 50 | dfReal=dfReal[['URL','TITLE','Text','PUBLISHER','Language','Site Country','HOSTNAME','TIMESTAMP','Fake ( fake =1 and Real =0)']] 51 | dfReal.columns=[ 'URL', 'Title', 'Text', 'Author', 'Language', 'Site Country', 'Site Name', 'ThreadPublication Date', 'Fake ( fake =1 and Real =0)'] 52 | dfReal=dfReal.astype('str') 53 | dfReal.reindex() 54 | print 'No. of Real news records: ',dfReal['URL'].size 55 | return dfReal 56 | 57 | ''' extract top real news urls for crawling ''' 58 | 59 | def extractTopRealResultsForCrawling(dfReal): 60 | print "Retrieve top 20000 Real news data" 61 | num=dfReal.size 62 | loop=num/10000 63 | listOfIndex=[] 64 | df=[] 65 | for i in range(0,loop): 66 | listOfIndex.append(dfReal[i*10000:(i+1)*10000]) 67 | df+=[dfReal[i*10000:(i+1)*10000]] 68 | 69 | #print "length of dataframe array retrieved:",len(df[0]) 70 | return df[:LEN] 71 | 72 | ''' filter text records that is not relevant or null ''' 73 | 74 | def filterNullTextContentRecords(df): 75 | keyList =['Page Not Found','The item that you have requested was not found','The address was entered incorrectly',\ 76 | 'The item no longer exists','There has been an error on the site','We apologize for any inconvenience','font-size,font-family',\ 77 | 'text-align','404 - File or directory not found','The resource you are looking for might have been removed', \ 78 | 'had its name changed', 'or is temporarily unavailable.','Return to the previous page',\ 79 | 'If you feel the address you entered is correct you can contact us',\ 80 | 'mentioning the error message received and the item you were trying to reach','It looks like nothing was found at this location.',\ 81 | 'Well, this is unfortunate','Your story was not found','The story you requested could not be found',\ 82 | '404 - File or directory not found','PAGE NOT FOUND','We\'re sorry that the page you\'re looking for cannot be found','Page Not Found - 404',\ 83 | 'Sorry, but the page you were looking for is not here','This is usually the result of a bad or outdated link','ERROR404',\ 84 | 'The case of this missing page is still unsolved','Return to the previous page','The item that you have requested was not found',\ 85 | 'The case of this missing page is still unsolved','The page may no longer exist or may have moved to another web address',\ 86 | 'The page you were looking for cannot be found','The page you requested cannot be found',\ 87 | 'Either it doesn\'t exist or it was removed from the site','It looks like nothing was found at this location',\ 88 | 'This Page Could Not Be Found','404 Sorry, the page you have searched for doesnt exist','Nothing was found at this location',\ 89 | 'ERROR404The case of this missing page is still unsolved','Error 404 Nothing found','404 - File or directory not found',\ 90 | 'Oh no!No content to show for this page','404 The resource or page you are looking for could have been removed, had its name changed, or 
is temporarily unavailable',\
91 | 'Sorry, the page you are looking for cannot be found','Oops! Page Not Found','Oops! Page Not Found',\
92 | '404We\'re sorry, but the page you were looking for doesn\'t exist','Page not found','Pardon Our Interruption',\
93 | '500 - Internal server error','Not found, error 404','The page you are looking for no longer exists','Oops, This Page Could Not Be Found',\
94 | 'The page you\'ve requested can not be displayed','It appears you\'ve missed your intended destination, either through a bad or outdated link',\
95 | 'This might be because:You have typed the web address incorrectly, or the page you were looking for may have been moved, updated or deleted',\
96 | 'we couldn\'t find the page you were looking for','500 - Internal server error.There is a problem with the resource you are looking for, and it cannot be displayed',\
97 | 'We haven\'t been able to serve the page you asked for','We\'re sorry, but we seem to have lost this page','PAGE NOT FOUND',\
98 | 'We\'re sorry,the page you requested could not be found'] # print keyList
99 | print 'No. of records: ',df['URL'].size
100 | pd.options.mode.chained_assignment = None
101 | for index,row in df.iterrows():
102 | if checkText(str(row['Text']),keyList) or len(str(row['Text']))<300:
103 | df.drop(index, inplace=True)
104 | return df
105 | 
106 | ''' test whether the key words exist in the given text '''
107 | 
108 | def checkText(text,keyWords):
109 | 
110 | for key in keyWords:
111 | if key in text:
112 | return True
113 | return False
114 | 
115 | ''' combine and shuffle the data '''
116 | 
117 | def combineAndShuffle(dfCombine):
118 | combinedDf = pd.concat(dfCombine)
119 | combinedDf = shuffle(combinedDf)
120 | return combinedDf
121 | 
122 | ''' extract real and fake news data '''
123 | 
124 | def dataSetExtraction():
125 | dfReal=fetchDataFrameRealNewsCSV()
126 | dfFake=fetchDataFrameFakeNewsCSV()
127 | dfFake=processFakeNewsDataFrame(dfFake)
128 | dfReal=processRealNewsDataFrame(dfReal)
129 | dfCombine=[]
130 | for d in extractTopRealResultsForCrawling(dfReal):
131 | print 'len of dataframe :',d['URL'].size
132 | #d=d[:100]
133 | d=crawler.crawlUrlsFromDF(d)
134 | d=filterNullTextContentRecords(d)
135 | dfCombine+=[d]
136 | dfCombine+=[dfFake]
137 | df=combineAndShuffle(dfCombine)
138 | if os.path.exists('../DataSets/FinalDataSet.csv'):
139 | os.remove('../DataSets/FinalDataSet.csv')
140 | df.to_csv('../DataSets/FinalDataSet.csv', index=False)
141 | print 'No. of records in final data set: ',df['URL'].size
142 | print "Saving New CSV file"
143 | 
144 | if __name__=='__main__':dataSetExtraction()
--------------------------------------------------------------------------------