├── Accuracy_checker.py
├── Download_twitter_Api.py
├── InsertTweetDemo.py
├── README.md
├── data
│   ├── dictionary.tsv
│   └── tweetdata.txt
├── depression_sentiment_analysis.py
├── preprocessor.py
└── processed_data
    └── output.xlsx


/Accuracy_checker.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on the day we all start to love ourselves.

@author: Nikie Jo Deocampo
"""

import itertools
import json
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
#from sklearn.metrics import roc_auc_score

tweets_data = []   # raw tweet dicts parsed from tweetdata.txt
x = []             # tweet texts
y = []             # sentiment labels (-1, 0, 1) from output.xlsx
vectorizer = CountVectorizer(stop_words='english')


def retrieveTweet(data_url):
    """Load raw tweets (one JSON object per line) into tweets_data."""
    with open(data_url, "r") as tweets_file:
        for line in tweets_file:
            try:
                tweets_data.append(json.loads(line))
            except json.JSONDecodeError:
                continue


def retrieveProcessedData(Pdata_url):
    """Pair each raw tweet's text with its precomputed sentiment label."""
    sent = pd.read_excel(Pdata_url)
    for i in range(len(tweets_data)):
        if tweets_data[i]['id'] == sent['id'][i]:
            x.append(tweets_data[i]['text'])
            y.append(sent['sentiment'][i])


def plot_confusion_matrix(cm, classes, normalize=False,
                          title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


def nbTrain():
    # Note: each classifier below is evaluated on the same tweets it was
    # trained on, and roc_curve binarizes the 3 labels via pos_label=1,
    # so the printed "accuracy" is an optimistic in-sample AUC.
    from sklearn.naive_bayes import MultinomialNB
    start_timenb = time.time()
    train_features = vectorizer.fit_transform(x)

    actual = y

    nb = MultinomialNB()
    nb.fit(train_features, [int(r) for r in y])

    test_features = vectorizer.transform(x)
    predictions = nb.predict(test_features)
    fpr, tpr, thresholds = metrics.roc_curve(actual, predictions, pos_label=1)
    nbscore = float(metrics.auc(fpr, tpr)) * 100

    # nb_matrix = confusion_matrix(actual, predictions)
    # plt.figure()
    # plot_confusion_matrix(nb_matrix, classes=[-1, 0, 1],
    #                       title='Confusion matrix For NB classifier')

    print("\n")
    print("Naive Bayes Accuracy : \n", nbscore, "%")
    print(" Completion Speed", round(time.time() - start_timenb, 5))
    print()


def datree():
    from sklearn import tree
    start_timedt = time.time()
    train_featurestree = vectorizer.fit_transform(x)
    actual1 = y
    test_features1 = vectorizer.transform(x)
    dtree = tree.DecisionTreeClassifier()

    dtree = dtree.fit(train_featurestree, [int(r) for r in y])

    prediction1 = dtree.predict(test_features1)
    ddd, ttt, thresholds = metrics.roc_curve(actual1, prediction1, pos_label=1)
    dtreescore = float(metrics.auc(ddd, ttt)) * 100
    print("Decision tree Accuracy : \n", dtreescore, "%")
    print(" Completion Speed", round(time.time() - start_timedt, 5))
    print()


def Tsvm():
    from sklearn.svm import SVC
    start_timesvm = time.time()
    train_featuressvm = vectorizer.fit_transform(x)
    actual2 = y
    test_features2 = vectorizer.transform(x)
    svc = SVC()

    svc = svc.fit(train_featuressvm, [int(r) for r in y])
    prediction2 = svc.predict(test_features2)
    sss, vvv, thresholds = metrics.roc_curve(actual2, prediction2, pos_label=1)
    svcscore = float(metrics.auc(sss, vvv)) * 100
    print("Support vector machine Accuracy : \n", svcscore, "%")
    print(" Completion Speed", round(time.time() - start_timesvm, 5))
    print()


def knN():
    from sklearn.neighbors import KNeighborsClassifier
    start_timekn = time.time()
    train_featureskn = vectorizer.fit_transform(x)
    actual3 = y
    test_features3 = vectorizer.transform(x)
    kn = KNeighborsClassifier(n_neighbors=2)

    kn = kn.fit(train_featureskn, [int(i) for i in y])
    prediction3 = kn.predict(test_features3)
    kkk, nnn, thresholds = metrics.roc_curve(actual3, prediction3, pos_label=1)
    knscore = float(metrics.auc(kkk, nnn)) * 100

    print("Kneighborsclassifier Accuracy : \n", knscore, "%")
    print(" Completion Speed", round(time.time() - start_timekn, 5))
    print()


def RanFo():
    from sklearn.ensemble import RandomForestClassifier
    start_timerf = time.time()
    train_featuresrf = vectorizer.fit_transform(x)
    actual4 = y
    test_features4 = vectorizer.transform(x)
    rf = RandomForestClassifier(max_depth=2, random_state=0)

    rf = rf.fit(train_featuresrf, [int(i) for i in y])
    prediction4 = rf.predict(test_features4)
    rrr, fff, thresholds = metrics.roc_curve(actual4, prediction4, pos_label=1)
    rfscore = float(metrics.auc(rrr, fff)) * 100
    print("Random Forest Accuracy : \n", rfscore, "%")
    print(" Completion Speed", round(time.time() - start_timerf, 5))
    print()
    print()


def runall():
    retrieveTweet('data/tweetdata.txt')
    retrieveProcessedData('processed_data/output.xlsx')
    nbTrain()
    datree()
    Tsvm()
    knN()
    RanFo()


def datreeINPUT(inputtweet):
    """Classify a single tweet with a decision tree trained on the full data set."""
    from sklearn import tree
    train_featurestree = vectorizer.fit_transform(x)
    dtree = tree.DecisionTreeClassifier()

    dtree = dtree.fit(train_featurestree, [int(r) for r in y])

    inputdtree = vectorizer.transform([inputtweet])
    predictt = dtree.predict(inputdtree)[0]

    if predictt == 1:
        predictt = "Positive"
    elif predictt == 0:
        predictt = "Neutral"
    elif predictt == -1:
        predictt = "Negative"
    else:
        print("Nothing")

    print("\n*****************")
    print(predictt)
    print("*****************")


runall()

#print("Input your tweet : ")
#inputtweet = input()
#
#datreeINPUT(inputtweet)
--------------------------------------------------------------------------------
/Download_twitter_Api.py:
--------------------------------------------------------------------------------
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream

# Fill in the 4 credentials from your Twitter developer account.
consumer_key = 'YOUR_CONSUMER_KEY'
consumer_secret = 'YOUR_CONSUMER_SECRET'
access_token = 'YOUR_ACCESS_TOKEN'
access_secret = 'YOUR_ACCESS_SECRET'


class StdOutListener(StreamListener):
    """Appends each streamed tweet (raw JSON, one per line) to data/tweetdata.txt."""

    def on_data(self, data):
        with open('data/tweetdata.txt', 'a') as tf:
            tf.write(data)
        print(data)
        return True

    def on_error(self, status):
        print(status)


if __name__ == '__main__':
    listener = StdOutListener()
    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_secret)
    stream = Stream(auth, listener)

    # Collect tweets that match the depression-related keywords.
    stream.filter(track=['depression', 'anxiety', 'mental health',
                         'suicide', 'stress', 'sad'])
--------------------------------------------------------------------------------
/InsertTweetDemo.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on the day we all start to love ourselves.

@author: Nikie Jo Deocampo
"""

import itertools
import json
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
#from sklearn.metrics import roc_auc_score

tweets_data = []   # raw tweet dicts parsed from tweetdata.txt
x = []             # tweet texts
y = []             # sentiment labels (-1, 0, 1) from output.xlsx
vectorizer = CountVectorizer(stop_words='english')


def retrieveTweet(data_url):
    """Load raw tweets (one JSON object per line) into tweets_data."""
    with open(data_url, "r") as tweets_file:
        for line in tweets_file:
            try:
                tweets_data.append(json.loads(line))
            except json.JSONDecodeError:
                continue


def retrieveProcessedData(Pdata_url):
    """Pair each raw tweet's text with its precomputed sentiment label."""
    sent = pd.read_excel(Pdata_url)
    for i in range(len(tweets_data)):
        if tweets_data[i]['id'] == sent['id'][i]:
            x.append(tweets_data[i]['text'])
            y.append(sent['sentiment'][i])


def plot_confusion_matrix(cm, classes, normalize=False,
                          title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


def nbTrain():
    # Note: evaluated on the training tweets themselves; the printed
    # "accuracy" is an optimistic in-sample AUC (pos_label=1 binarizes
    # the 3 labels).
    from sklearn.naive_bayes import MultinomialNB
    start_timenb = time.time()
    train_features = vectorizer.fit_transform(x)

    actual = y

    nb = MultinomialNB()
    nb.fit(train_features, [int(r) for r in y])

    test_features = vectorizer.transform(x)
    predictions = nb.predict(test_features)
    fpr, tpr, thresholds = metrics.roc_curve(actual, predictions, pos_label=1)
    nbscore = float(metrics.auc(fpr, tpr)) * 100

    # nb_matrix = confusion_matrix(actual, predictions)
    # plt.figure()
    # plot_confusion_matrix(nb_matrix, classes=[-1, 0, 1],
    #                       title='Confusion matrix For NB classifier')

    print("\n")
    print("Naive Bayes Accuracy : \n", nbscore, "%")
    print(" Completion Speed", round(time.time() - start_timenb, 5))
    print()


def datree():
    from sklearn import tree
    start_timedt = time.time()
    train_featurestree = vectorizer.fit_transform(x)
    actual1 = y
    test_features1 = vectorizer.transform(x)
    dtree = tree.DecisionTreeClassifier()

    dtree = dtree.fit(train_featurestree, [int(r) for r in y])

    prediction1 = dtree.predict(test_features1)
    ddd, ttt, thresholds = metrics.roc_curve(actual1, prediction1, pos_label=1)
    dtreescore = float(metrics.auc(ddd, ttt)) * 100
    print("Decision tree Accuracy : \n", dtreescore, "%")
    print(" Completion Speed", round(time.time() - start_timedt, 5))
    print()


def Tsvm():
    from sklearn.svm import SVC
    start_timesvm = time.time()
    train_featuressvm = vectorizer.fit_transform(x)
    actual2 = y
    test_features2 = vectorizer.transform(x)
    svc = SVC()

    svc = svc.fit(train_featuressvm, [int(r) for r in y])
    prediction2 = svc.predict(test_features2)
    sss, vvv, thresholds = metrics.roc_curve(actual2, prediction2, pos_label=1)
    svcscore = float(metrics.auc(sss, vvv)) * 100
    print("Support vector machine Accuracy : \n", svcscore, "%")
    print(" Completion Speed", round(time.time() - start_timesvm, 5))
    print()


def knN():
    from sklearn.neighbors import KNeighborsClassifier
    start_timekn = time.time()
    train_featureskn = vectorizer.fit_transform(x)
    actual3 = y
    test_features3 = vectorizer.transform(x)
    kn = KNeighborsClassifier(n_neighbors=2)

    kn = kn.fit(train_featureskn, [int(i) for i in y])
    prediction3 = kn.predict(test_features3)
    kkk, nnn, thresholds = metrics.roc_curve(actual3, prediction3, pos_label=1)
    knscore = float(metrics.auc(kkk, nnn)) * 100

    print("Kneighborsclassifier Accuracy : \n", knscore, "%")
    print(" Completion Speed", round(time.time() - start_timekn, 5))
    print()


def RanFo():
    from sklearn.ensemble import RandomForestClassifier
    start_timerf = time.time()
    train_featuresrf = vectorizer.fit_transform(x)
    actual4 = y
    test_features4 = vectorizer.transform(x)
    rf = RandomForestClassifier(max_depth=2, random_state=0)

    rf = rf.fit(train_featuresrf, [int(i) for i in y])
    prediction4 = rf.predict(test_features4)
    rrr, fff, thresholds = metrics.roc_curve(actual4, prediction4, pos_label=1)
    rfscore = float(metrics.auc(rrr, fff)) * 100
    print("Random Forest Accuracy : \n", rfscore, "%")
    print(" Completion Speed", round(time.time() - start_timerf, 5))
    print()
    print()


def runall():
    retrieveTweet('data/tweetdata.txt')
    retrieveProcessedData('processed_data/output.xlsx')
    # nbTrain()
    # datree()
    # Tsvm()
    # knN()
    # RanFo()


def datreeINPUT(inputtweet):
    """Classify a single tweet with a decision tree trained on the full data set."""
    from sklearn import tree
    train_featurestree = vectorizer.fit_transform(x)
    dtree = tree.DecisionTreeClassifier()

    dtree = dtree.fit(train_featurestree, [int(r) for r in y])

    inputdtree = vectorizer.transform([inputtweet])
    predictt = dtree.predict(inputdtree)[0]

    if predictt == 1:
        predictt = "Positive"
    elif predictt == 0:
        predictt = "Neutral"
    elif predictt == -1:
        predictt = "Negative"
    else:
        print("Nothing")

    print("\n*****************")
    print(predictt)
    print("*****************")


runall()

print("\nInput your tweet : ")
inputtweet = input()

datreeINPUT(inputtweet)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Classification of Depression on Social Media Using Text Mining

Author | Introduction | The Project | Video Demo | References | Acknowledgement

## Author - 저자

- Name: Nikie Jo Elauria Deocampo
- Country: Philippines
- Educational Background:
  - Undergraduate: Bachelor of Science in Information System
  - Graduate: Masters in Information Technology
- School: West Visayas State University
- Mentor: Dr. Bobby Gerardo
- Motto: I work hard so my dog can have a better life.

## Introduction - 소개

Mental illness is prevalent throughout the world, and depression is one of the most common psychological problems; I would like to help as much as I can. Being a fan of Anthony Bourdain and Robin Williams propelled me to explore this study. With the large amount of tweet and Facebook post data available online, I can use machine learning to mine it and produce a meaningful and useful outcome.

Social media generates countless data every day as millions of active users share and communicate with entire communities, changing human interaction. For this project, I will be using Python and various modules and libraries.

## The Project - 프로젝트

Requirements (inferred from the imports used across this repository):

- Python 3
- tweepy
- nltk
- pandas
- numpy
- scikit-learn
- matplotlib

The aim of the project is to predict early signs of depression through social media text mining. Below are the steps to run the Python code using the data sets uploaded in this repository, or you can download your own.

Follow the steps below:

1. Create a Twitter developer account (Register Here). From that account you will need 4 things:
2. `consumer_key = ''`, `consumer_secret = ''`, `access_token = ''`, `access_secret = ''`
3. Using the file "Download_twitter_Api.py", insert the credentials and you can download current tweets using keywords such as depression, anxiety or sadness. When the data sets are ready, you may proceed to the preprocessing stage.
4. Run "preprocessor.py". This stage goes through your data sets and the given dictionary. The dictionary contains words with their corresponding polarity, which is essential to calculating the sentiment of each tweet: each word is separated, tokenized and given its polarity. Every tweet's score is the sum of the polarities of its words divided by the number of words in that tweet (a condensed sketch of this scoring rule appears right after this list).
5. Once preprocessing is done, you can find the output in the directory "processed_data/output.xlsx". Opening it, you will find the ID (tweet) and sentiment of each tweet separated into 2 columns. With this output you now have a Twitter data set and its corresponding sentiment (Positive, Neutral and Negative), filtered by depression keywords.
6. Now for training and predicting. Make sure all files are located in the proper folders, then run "depression_sentiment_analysis.py". The code runs through the output.xlsx file and at the same time recovers the tweet corresponding to the id of each sentiment; using this, we feed the original data to our classifiers. When everything is done you should have the AUC of each classifier listed in the console.
7. But wait, there's more. You can also type in a sample tweet; it will go through the classifier with the highest AUC to predict the sentiment of what you wrote.
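
The following is a condensed, illustrative sketch of the scoring rule that "preprocessor.py" applies in step 4. The three-word `polarity` dictionary and the sample tweet are made up for demonstration; the real word list lives in data/dictionary.tsv, and only words found in the dictionary count toward the denominator.

```python
# Condensed sketch of preprocessor.py's per-tweet scoring rule.
# Assumes nltk's punkt tokenizer data is installed: nltk.download('punkt')
from nltk.tokenize import word_tokenize

# Toy stand-in for data/dictionary.tsv (word -> polarity).
polarity = {"sad": "negative", "happy": "positive", "today": "neutral"}

def score_tweet(text):
    total = 0    # running polarity sum
    matched = 0  # number of tweet words found in the dictionary
    for token in word_tokenize(text):
        if token in polarity:
            total += {"positive": 1, "negative": -1}.get(polarity[token], 0)
            matched += 1
    if matched == 0:
        return None  # tweet shares no words with the dictionary
    avg = total / matched
    # The same thresholds preprocessor.py uses to bin the average polarity.
    if avg >= 0.2:
        return 1     # positive
    elif avg > -0.5:
        return 0     # neutral
    return -1        # negative

print(score_tweet("feeling sad and alone today"))  # sad=-1, today=0 -> avg=-0.5 -> -1 (negative)
```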

What could the result mean? Positive means the person is unlikely to have depression or anxiety. Neutral is the middle level, wherein the user may or may not have depression but may be more prone to becoming depressed; at this stage the user may display some depression-like symptoms. Lastly, Negative is the lowest level, where depression and anxiety symptoms are being detected through the user's tweets. The more negative words the user uses, the more negative the emotion of the tweet.

## Video Tutorials

## Results - 결과들

Below are the matrices for the 5 classifiers, with Decision Tree having the highest score.

Using the same data set to test accuracy, I trained and tested about 10,000 tweets.

AUC is an abbreviation for area under the curve. It is used in classification analysis to determine which of the models predicts the classes best.
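
For reference, the sketch below shows essentially how "depression_sentiment_analysis.py" computes the AUC it prints for each classifier; the `actual` and `predictions` arrays here are made-up examples, not real results.

```python
# Minimal sketch of the per-classifier AUC computation.
from sklearn import metrics

actual      = [1, -1, 0, 1, -1, 1]   # sentiment labels from output.xlsx
predictions = [1, -1, 0, 0, -1, 1]   # labels predicted by a classifier

# pos_label=1 binarizes the three classes into "positive vs. the rest".
fpr, tpr, thresholds = metrics.roc_curve(actual, predictions, pos_label=1)
print(float(metrics.auc(fpr, tpr)) * 100, "%")  # -> roughly 94.4 %
```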

Accuracy:

Completion Time:

## Future Plans - 향후 계획

This study is not yet perfect, and I am still aiming to improve it.

## References - 참고

## Acknowledgement - 승인

This work would not have been possible without the overwhelming support of Jeju National University, the Jeju Development Center and other selfless sponsors. I would like to specifically give a big thanks to Prof. Yungcheol Byun for being the best host ever, and to my mentor Dr. Bobby Gerardo for his help and guidance.

--------------------------------------------------------------------------------
/depression_sentiment_analysis.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on the day we all start to love ourselves.

@author: Nikie Jo Deocampo
"""

import itertools
import json
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
#from sklearn.metrics import roc_auc_score

tweets_data = []   # raw tweet dicts parsed from tweetdata.txt
x = []             # tweet texts
y = []             # sentiment labels (-1, 0, 1) from output.xlsx
vectorizer = CountVectorizer(stop_words='english')


def retrieveTweet(data_url):
    """Load raw tweets (one JSON object per line) into tweets_data."""
    with open(data_url, "r") as tweets_file:
        for line in tweets_file:
            try:
                tweets_data.append(json.loads(line))
            except json.JSONDecodeError:
                continue


def retrieveProcessedData(Pdata_url):
    """Pair each raw tweet's text with its precomputed sentiment label."""
    sent = pd.read_excel(Pdata_url)
    for i in range(len(tweets_data)):
        if tweets_data[i]['id'] == sent['id'][i]:
            x.append(tweets_data[i]['text'])
            y.append(sent['sentiment'][i])


def plot_confusion_matrix(cm, classes, normalize=False,
                          title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


def nbTrain():
    # Note: each classifier below is evaluated on the same tweets it was
    # trained on, and roc_curve binarizes the 3 labels via pos_label=1,
    # so the printed "accuracy" is an optimistic in-sample AUC.
    from sklearn.naive_bayes import MultinomialNB
    start_timenb = time.time()
    train_features = vectorizer.fit_transform(x)

    actual = y

    nb = MultinomialNB()
    nb.fit(train_features, [int(r) for r in y])

    test_features = vectorizer.transform(x)
    predictions = nb.predict(test_features)
    fpr, tpr, thresholds = metrics.roc_curve(actual, predictions, pos_label=1)
    nbscore = float(metrics.auc(fpr, tpr)) * 100

    nb_matrix = confusion_matrix(actual, predictions)
    plt.figure()
    plot_confusion_matrix(nb_matrix, classes=[-1, 0, 1],
                          title='Confusion matrix For NB classifier')

    print("\n")

    # test_try = vectorizer.transform(["Lets help those in need, fight anxiety and bring happiness"])
    # test_try2 = vectorizer.transform(["Dont look down at people with anxiety rather give love and respect to all. shout! Equality."])
    # predictr = nb.predict(test_try)
    # predictt = nb.predict(test_try2)

    # print(predictr)
    # print(predictt)

    print("Naive Bayes Accuracy : \n", nbscore, "%")
    print(" Completion Speed", round(time.time() - start_timenb, 5))
    print()


def datree():
    from sklearn import tree
    start_timedt = time.time()
    train_featurestree = vectorizer.fit_transform(x)
    actual1 = y
    test_features1 = vectorizer.transform(x)
    dtree = tree.DecisionTreeClassifier()

    dtree = dtree.fit(train_featurestree, [int(r) for r in y])

    prediction1 = dtree.predict(test_features1)
    ddd, ttt, thresholds = metrics.roc_curve(actual1, prediction1, pos_label=1)
    dtreescore = float(metrics.auc(ddd, ttt)) * 100
    print("Decision tree Accuracy : \n", dtreescore, "%")
    print(" Completion Speed", round(time.time() - start_timedt, 5))
    print()


def Tsvm():
    from sklearn.svm import SVC
    start_timesvm = time.time()
    train_featuressvm = vectorizer.fit_transform(x)
    actual2 = y
    test_features2 = vectorizer.transform(x)
    svc = SVC()

    svc = svc.fit(train_featuressvm, [int(r) for r in y])
    prediction2 = svc.predict(test_features2)
    sss, vvv, thresholds = metrics.roc_curve(actual2, prediction2, pos_label=1)
    svcscore = float(metrics.auc(sss, vvv)) * 100
    print("Support vector machine Accuracy : \n", svcscore, "%")
    print(" Completion Speed", round(time.time() - start_timesvm, 5))
    print()


def knN():
    from sklearn.neighbors import KNeighborsClassifier
    start_timekn = time.time()
    train_featureskn = vectorizer.fit_transform(x)
    actual3 = y
    test_features3 = vectorizer.transform(x)
    kn = KNeighborsClassifier(n_neighbors=2)

    kn = kn.fit(train_featureskn, [int(i) for i in y])
    prediction3 = kn.predict(test_features3)
    kkk, nnn, thresholds = metrics.roc_curve(actual3, prediction3, pos_label=1)
    knscore = float(metrics.auc(kkk, nnn)) * 100

    print("Kneighborsclassifier Accuracy : \n", knscore, "%")
    print(" Completion Speed", round(time.time() - start_timekn, 5))
    print()


def RanFo():
    from sklearn.ensemble import RandomForestClassifier
    start_timerf = time.time()
    train_featuresrf = vectorizer.fit_transform(x)
    actual4 = y
    test_features4 = vectorizer.transform(x)
    rf = RandomForestClassifier(max_depth=2, random_state=0)

    rf = rf.fit(train_featuresrf, [int(i) for i in y])
    prediction4 = rf.predict(test_features4)
    rrr, fff, thresholds = metrics.roc_curve(actual4, prediction4, pos_label=1)
    rfscore = float(metrics.auc(rrr, fff)) * 100
    print("Random Forest Accuracy : \n", rfscore, "%")
    print(" Completion Speed", round(time.time() - start_timerf, 5))
    print()
    print()


def runall():
    retrieveTweet('data/tweetdata.txt')
    retrieveProcessedData('processed_data/output.xlsx')
    nbTrain()
    datree()
    Tsvm()
    knN()
    RanFo()


def datreeINPUT(inputtweet):
    """Classify a single tweet with a decision tree trained on the full data set."""
    from sklearn import tree
    train_featurestree = vectorizer.fit_transform(x)
    dtree = tree.DecisionTreeClassifier()

    dtree = dtree.fit(train_featurestree, [int(r) for r in y])

    inputdtree = vectorizer.transform([inputtweet])
    predictt = dtree.predict(inputdtree)[0]

    if predictt == 1:
        predictt = "Positive"
    elif predictt == 0:
        predictt = "Neutral"
    elif predictt == -1:
        predictt = "Negative"
    else:
        print("Nothing")

    print("\n*****************")
    print(predictt)
    print("*****************")


runall()

print("Input your tweet : ")
inputtweet = input()

datreeINPUT(inputtweet)
--------------------------------------------------------------------------------
/preprocessor.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Jul 26 16:34:13 2018

@author: Nikie Jo Deocampo
"""
import csv
import json
import re
import string
import time

import pandas as pd
from nltk.tokenize import word_tokenize

tweets_data = []   # raw tweet dicts parsed from tweetdata.txt
x = []             # cleaned tweet texts
y = []             # dictionary entries: [word, polarity]
k = []             # tweet ids, aligned with x
some_milby = []    # output rows: [id, sentiment]

print("===========================")
print("Starting Preprocess Function")
print("=========================== \n\n")


def getdata(dataurl):
    print("===========================")
    print("Retrieving TXT File")
    with open(dataurl, "r") as tweets_file:
        for line in tweets_file:
            try:
                tweets_data.append(json.loads(line))
            except json.JSONDecodeError:
                continue
    print("===========================")
    print("Retrieval Successful")
    print("=========================== \n \n")
    time.sleep(3)
    processdata()


def processdata():
    print("===========================")
    print("Recovering Data Tweets")
    print("===========================")
    time.sleep(1)
    RE_EMOJI = re.compile('[\U00010000-\U0010ffff]', flags=re.UNICODE)
    for i in range(len(tweets_data)):
        q = tweets_data[i]['text']
        o = tweets_data[i]['id_str']
        q = RE_EMOJI.sub(r'', q)  # strip emoji
        cleaned = q.translate(str.maketrans('', '', string.punctuation))  # strip punctuation
        x.append(cleaned)
        k.append(o)
    print("===========================")
    print("Data Tweets Recovered")
    print("===========================\n\n")


def readdict(dataurl):
    print("===========================")
    print("Reading Dictionary")
    print("===========================")
    with open(dataurl) as tsvfile:
        reader = csv.reader(tsvfile, delimiter='\t')
        for row in reader:
            # Keep only the word (column 2) and its polarity (column 5).
            y.append([row[2], row[5]])
    print("===========================")
    print("Dictionary Preparation Done")
    print("===========================\n\n")
    addpolarity()


def addpolarity():
    start_time = time.time()
    counter = 0
    print("===========================")
    print("Processing please wait...")
    print("===========================\n\n")

    for j in x:
        token = word_tokenize(j)
        sumnum = 0    # running polarity total for this tweet
        sum_word = 0  # number of tokens found in the dictionary
        for t in token:
            for d in y:
                if t == d[0]:
                    sentiment = d[1]
                    if sentiment == "positive":
                        sumnum += 1
                    elif sentiment == "negative":
                        sumnum += -1
                    # neutral words contribute 0
                    sum_word += 1
                    break

        if sum_word != 0.0:
            sum_more = sumnum / sum_word
            # Bin the average polarity into 1 / 0 / -1.
            if sum_more >= 0.2:
                sum_more = 1
            elif (sum_more < 0.2) and (sum_more > -0.5):
                sum_more = 0
            elif sum_more <= -0.5:
                sum_more = -1
            else:
                print("****")

        # Note: when a tweet matches no dictionary words, sum_more keeps
        # its value from the previous iteration (a quirk of the original).
        sum_var = []
        varid = k[counter]
        sum_var.append(varid)
        sum_var.append(sum_more)
        some_milby.append(sum_var)
        counter += 1

    print("Processing time: ", round(time.time() - start_time, 8), "Seconds \n\n")

    time.sleep(3)

    print("===========================")
    print("Processing Finished")
    print("===========================")

    savetoxlsx()


def savetoxlsx():
    df = pd.DataFrame(some_milby)
    df.to_excel('processed_data/output.xlsx',
                header=("id", "sentiment"), index=False)

    # file = open("testfile_data.txt","w")
    # file.write(some_milby)
    # file.close()

    print("===========================")
    print("Data Saved!")
    print("===========================")


def runall():
    getdata('data/tweetdata.txt')
    readdict('data/dictionary.tsv')


runall()
--------------------------------------------------------------------------------
/processed_data/output.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/niquejoe/Classification-of-Depression-on-Social-Media-Using-Text-Mining/fc732087c9ff8d06cbdb498cdeb09c31f14b3c67/processed_data/output.xlsx
--------------------------------------------------------------------------------