├── Accuracy_checker.py ├── Download_twitter_Api.py ├── InsertTweetDemo.py ├── README.md ├── data ├── dictionary.tsv └── tweetdata.txt ├── depression_sentiment_analysis.py ├── preprocessor.py └── processed_data └── output.xlsx /Accuracy_checker.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on the day we all start to love our self. 5 | 6 | @author: Nikie Jo Deocampo 7 | """ 8 | 9 | import json 10 | import pandas as pd 11 | import time 12 | import numpy as np 13 | import itertools 14 | import matplotlib.pyplot as plt 15 | from sklearn.metrics import confusion_matrix 16 | from sklearn.feature_extraction.text import CountVectorizer 17 | from sklearn import metrics 18 | #from sklearn.metrics import roc_auc_score 19 | 20 | tweets_data = [] 21 | x = [] 22 | y = [] 23 | vectorizer = CountVectorizer(stop_words='english') 24 | 25 | def retrieveTweet(data_url): 26 | 27 | tweets_data_path = data_url 28 | tweets_file = open(tweets_data_path, "r") 29 | for line in tweets_file: 30 | try: 31 | tweet = json.loads(line) 32 | tweets_data.append(tweet) 33 | except: 34 | continue 35 | 36 | 37 | def retrieveProcessedData(Pdata_url): 38 | sent = pd.read_excel(Pdata_url) 39 | for i in range(len(tweets_data)): 40 | if tweets_data[i]['id']==sent['id'][i]: 41 | x.append(tweets_data[i]['text']) 42 | y.append(sent['sentiment'][i]) 43 | 44 | def plot_confusion_matrix(cm, classes, 45 | normalize=False, 46 | title='Confusion matrix', 47 | cmap=plt.cm.Blues): 48 | plt.imshow(cm, interpolation='nearest', cmap=cmap) 49 | plt.title(title) 50 | plt.colorbar() 51 | tick_marks = np.arange(len(classes)) 52 | plt.xticks(tick_marks, classes, rotation=45) 53 | plt.yticks(tick_marks, classes) 54 | 55 | fmt = '.2f' if normalize else 'd' 56 | thresh = cm.max() / 2. 
57 | for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): 58 | plt.text(j, i, format(cm[i, j], fmt), 59 | horizontalalignment="center", 60 | color="white" if cm[i, j] > thresh else "black") 61 | 62 | plt.tight_layout() 63 | plt.ylabel('True label') 64 | plt.xlabel('Predicted label') 65 | 66 | 67 | def nbTrain(): 68 | from sklearn.naive_bayes import MultinomialNB 69 | start_timenb = time.time() 70 | train_features = vectorizer.fit_transform(x) 71 | 72 | actual = y 73 | 74 | nb = MultinomialNB() 75 | nb.fit(train_features, [int(r) for r in y]) 76 | 77 | test_features = vectorizer.transform(x) 78 | predictions = nb.predict(test_features) 79 | fpr, tpr, thresholds = metrics.roc_curve(actual, predictions, pos_label=1) 80 | nbscore = format(metrics.auc(fpr, tpr)) 81 | nbscore = float(nbscore)*100 82 | 83 | # nb_matrix = confusion_matrix(actual, predictions) 84 | # plt.figure() 85 | # plot_confusion_matrix(nb_matrix, classes=[-1,0,1], title='Confusion matrix For NB classifier') 86 | 87 | print("\n") 88 | 89 | print("Naive Bayes Accuracy : \n", nbscore,"%") 90 | print(" Completion Speed", round((time.time() - start_timenb),5)) 91 | print() 92 | 93 | def datree(): 94 | from sklearn import tree 95 | start_timedt = time.time() 96 | train_featurestree = vectorizer.fit_transform(x) 97 | actual1 = y 98 | test_features1 = vectorizer.transform(x) 99 | dtree = tree.DecisionTreeClassifier() 100 | 101 | dtree = dtree.fit(train_featurestree, [int(r) for r in y]) 102 | 103 | prediction1 = dtree.predict(test_features1) 104 | ddd, ttt, thresholds = metrics.roc_curve(actual1, prediction1, pos_label=1) 105 | dtreescore = format(metrics.auc(ddd, ttt)) 106 | dtreescore = float(dtreescore)*100 107 | print("Decision tree Accuracy : \n", dtreescore, "%") 108 | print(" Completion Speed", round((time.time() - start_timedt),5)) 109 | print() 110 | 111 | def Tsvm(): 112 | from sklearn.svm import SVC 113 | start_timesvm = time.time() 114 | train_featuressvm = 
vectorizer.fit_transform(x) 115 | actual2 = y 116 | test_features2 = vectorizer.transform(x) 117 | svc = SVC() 118 | 119 | svc = svc.fit(train_featuressvm, [int(r) for r in y]) 120 | prediction2 = svc.predict(test_features2) 121 | sss, vvv, thresholds = metrics.roc_curve(actual2, prediction2, pos_label=1) 122 | svc = format(metrics.auc(sss, vvv)) 123 | svc = float(svc)*100 124 | print("Support vector machine Accuracy : \n", svc, "%") 125 | print(" Completion Speed", round((time.time() - start_timesvm),5)) 126 | print() 127 | 128 | def knN(): 129 | from sklearn.neighbors import KNeighborsClassifier 130 | start_timekn = time.time() 131 | train_featureskn = vectorizer.fit_transform(x) 132 | actual3 = y 133 | test_features3 = vectorizer.transform(x) 134 | kn = KNeighborsClassifier(n_neighbors=2) 135 | 136 | 137 | kn = kn.fit(train_featureskn, [int(i) for i in y]) 138 | prediction3 = kn.predict(test_features3) 139 | kkk, nnn, thresholds = metrics.roc_curve(actual3, prediction3, pos_label=1) 140 | kn = format(metrics.auc(kkk, nnn)) 141 | kn = float(kn)*100 142 | 143 | print("Kneighborsclassifier Accuracy : \n", kn, "%") 144 | print(" Completion Speed", round((time.time() - start_timekn),5)) 145 | print() 146 | 147 | def RanFo(): 148 | from sklearn.ensemble import RandomForestClassifier 149 | start_timerf = time.time() 150 | train_featuresrf = vectorizer.fit_transform(x) 151 | actual4 = y 152 | test_features4 = vectorizer.transform(x) 153 | rf = RandomForestClassifier(max_depth=2, random_state=0) 154 | 155 | 156 | rf = rf.fit(train_featuresrf, [int(i) for i in y]) 157 | prediction4 = rf.predict(test_features4) 158 | rrr, fff, thresholds = metrics.roc_curve(actual4, prediction4, pos_label=1) 159 | kn = format(metrics.auc(rrr, fff)) 160 | kn = float(kn)*100 161 | print("Random Forest Accuracy : \n", kn, "%") 162 | print(" Completion Speed", round((time.time() - start_timerf),5)) 163 | print() 164 | print() 165 | 166 | 167 | def runall(): 168 | 
retrieveTweet('data/tweetdata.txt') 169 | retrieveProcessedData('processed_data/output.xlsx') 170 | nbTrain() 171 | datree() 172 | Tsvm() 173 | knN() 174 | RanFo() 175 | 176 | def datreeINPUT(inputtweet): 177 | from sklearn import tree 178 | train_featurestree = vectorizer.fit_transform(x) 179 | dtree = tree.DecisionTreeClassifier() 180 | 181 | dtree = dtree.fit(train_featurestree, [int(r) for r in y]) 182 | 183 | 184 | inputdtree= vectorizer.transform([inputtweet]) 185 | predictt = dtree.predict(inputdtree) 186 | 187 | if predictt == 1: 188 | predictt = "Positive" 189 | elif predictt == 0: 190 | predictt = "Neutral" 191 | elif predictt == -1: 192 | predictt = "Negative" 193 | else: 194 | print("Nothing") 195 | 196 | print("\n*****************") 197 | print(predictt) 198 | print("*****************") 199 | 200 | runall() 201 | 202 | #print("Input your tweet : ") 203 | #inputtweet = input() 204 | # 205 | #datreeINPUT(inputtweet) 206 | -------------------------------------------------------------------------------- /Download_twitter_Api.py: -------------------------------------------------------------------------------- 1 | from tweepy.streaming import StreamListener 2 | from tweepy import OAuthHandler 3 | from tweepy import Stream 4 | 5 | 6 | consumer_key = 'd7RJDeV6M1TdKnXXdY29Zud5O' 7 | consumer_secret = '8LV35luiAco2mBnQ1W6erOnA8cbMwVgxblfHjP5zk5dmAXGwd6' 8 | access_token = '2206645458-9qlftwQ5eiovob7GCp21VrAoFRXi7AJLGt5ts3O' 9 | access_secret = 'Oc9ZKbHSL0reJhZYcU0Vk9UERbVvsTwerIfDUTwiRNGYf' 10 | 11 | 12 | 13 | class StdOutListener(StreamListener): 14 | 15 | def on_data(self, data): 16 | with open('data/tweetdata.txt','a') as tf: 17 | tf.write(data) 18 | print(data) 19 | return True 20 | 21 | def on_error(self, status): 22 | print(status) 23 | 24 | 25 | if __name__ == '__main__': 26 | 27 | 28 | l = StdOutListener() 29 | auth = OAuthHandler(consumer_key, consumer_secret) 30 | auth.set_access_token(access_token, access_secret) 31 | stream = Stream(auth, l) 32 | 33 
| stream.filter(track=['depression', 'anxiety', 'mental health', 'suicide', 'stress', 'sad']) -------------------------------------------------------------------------------- /InsertTweetDemo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on the day we all start to love our self. 5 | 6 | @author: Nikie Jo Deocampo 7 | """ 8 | 9 | import json 10 | import pandas as pd 11 | import time 12 | import numpy as np 13 | import itertools 14 | import matplotlib.pyplot as plt 15 | from sklearn.metrics import confusion_matrix 16 | from sklearn.feature_extraction.text import CountVectorizer 17 | from sklearn import metrics 18 | #from sklearn.metrics import roc_auc_score 19 | 20 | tweets_data = [] 21 | x = [] 22 | y = [] 23 | vectorizer = CountVectorizer(stop_words='english') 24 | 25 | def retrieveTweet(data_url): 26 | 27 | tweets_data_path = data_url 28 | tweets_file = open(tweets_data_path, "r") 29 | for line in tweets_file: 30 | try: 31 | tweet = json.loads(line) 32 | tweets_data.append(tweet) 33 | except: 34 | continue 35 | 36 | 37 | def retrieveProcessedData(Pdata_url): 38 | sent = pd.read_excel(Pdata_url) 39 | for i in range(len(tweets_data)): 40 | if tweets_data[i]['id']==sent['id'][i]: 41 | x.append(tweets_data[i]['text']) 42 | y.append(sent['sentiment'][i]) 43 | 44 | def plot_confusion_matrix(cm, classes, 45 | normalize=False, 46 | title='Confusion matrix', 47 | cmap=plt.cm.Blues): 48 | plt.imshow(cm, interpolation='nearest', cmap=cmap) 49 | plt.title(title) 50 | plt.colorbar() 51 | tick_marks = np.arange(len(classes)) 52 | plt.xticks(tick_marks, classes, rotation=45) 53 | plt.yticks(tick_marks, classes) 54 | 55 | fmt = '.2f' if normalize else 'd' 56 | thresh = cm.max() / 2. 
57 | for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): 58 | plt.text(j, i, format(cm[i, j], fmt), 59 | horizontalalignment="center", 60 | color="white" if cm[i, j] > thresh else "black") 61 | 62 | plt.tight_layout() 63 | plt.ylabel('True label') 64 | plt.xlabel('Predicted label') 65 | 66 | 67 | def nbTrain(): 68 | from sklearn.naive_bayes import MultinomialNB 69 | start_timenb = time.time() 70 | train_features = vectorizer.fit_transform(x) 71 | 72 | actual = y 73 | 74 | nb = MultinomialNB() 75 | nb.fit(train_features, [int(r) for r in y]) 76 | 77 | test_features = vectorizer.transform(x) 78 | predictions = nb.predict(test_features) 79 | fpr, tpr, thresholds = metrics.roc_curve(actual, predictions, pos_label=1) 80 | nbscore = format(metrics.auc(fpr, tpr)) 81 | nbscore = float(nbscore)*100 82 | 83 | # nb_matrix = confusion_matrix(actual, predictions) 84 | # plt.figure() 85 | # plot_confusion_matrix(nb_matrix, classes=[-1,0,1], title='Confusion matrix For NB classifier') 86 | 87 | print("\n") 88 | 89 | print("Naive Bayes Accuracy : \n", nbscore,"%") 90 | print(" Completion Speed", round((time.time() - start_timenb),5)) 91 | print() 92 | 93 | def datree(): 94 | from sklearn import tree 95 | start_timedt = time.time() 96 | train_featurestree = vectorizer.fit_transform(x) 97 | actual1 = y 98 | test_features1 = vectorizer.transform(x) 99 | dtree = tree.DecisionTreeClassifier() 100 | 101 | dtree = dtree.fit(train_featurestree, [int(r) for r in y]) 102 | 103 | prediction1 = dtree.predict(test_features1) 104 | ddd, ttt, thresholds = metrics.roc_curve(actual1, prediction1, pos_label=1) 105 | dtreescore = format(metrics.auc(ddd, ttt)) 106 | dtreescore = float(dtreescore)*100 107 | print("Decision tree Accuracy : \n", dtreescore, "%") 108 | print(" Completion Speed", round((time.time() - start_timedt),5)) 109 | print() 110 | 111 | def Tsvm(): 112 | from sklearn.svm import SVC 113 | start_timesvm = time.time() 114 | train_featuressvm = 
vectorizer.fit_transform(x) 115 | actual2 = y 116 | test_features2 = vectorizer.transform(x) 117 | svc = SVC() 118 | 119 | svc = svc.fit(train_featuressvm, [int(r) for r in y]) 120 | prediction2 = svc.predict(test_features2) 121 | sss, vvv, thresholds = metrics.roc_curve(actual2, prediction2, pos_label=1) 122 | svc = format(metrics.auc(sss, vvv)) 123 | svc = float(svc)*100 124 | print("Support vector machine Accuracy : \n", svc, "%") 125 | print(" Completion Speed", round((time.time() - start_timesvm),5)) 126 | print() 127 | 128 | def knN(): 129 | from sklearn.neighbors import KNeighborsClassifier 130 | start_timekn = time.time() 131 | train_featureskn = vectorizer.fit_transform(x) 132 | actual3 = y 133 | test_features3 = vectorizer.transform(x) 134 | kn = KNeighborsClassifier(n_neighbors=2) 135 | 136 | 137 | kn = kn.fit(train_featureskn, [int(i) for i in y]) 138 | prediction3 = kn.predict(test_features3) 139 | kkk, nnn, thresholds = metrics.roc_curve(actual3, prediction3, pos_label=1) 140 | kn = format(metrics.auc(kkk, nnn)) 141 | kn = float(kn)*100 142 | 143 | print("Kneighborsclassifier Accuracy : \n", kn, "%") 144 | print(" Completion Speed", round((time.time() - start_timekn),5)) 145 | print() 146 | 147 | def RanFo(): 148 | from sklearn.ensemble import RandomForestClassifier 149 | start_timerf = time.time() 150 | train_featuresrf = vectorizer.fit_transform(x) 151 | actual4 = y 152 | test_features4 = vectorizer.transform(x) 153 | rf = RandomForestClassifier(max_depth=2, random_state=0) 154 | 155 | 156 | rf = rf.fit(train_featuresrf, [int(i) for i in y]) 157 | prediction4 = rf.predict(test_features4) 158 | rrr, fff, thresholds = metrics.roc_curve(actual4, prediction4, pos_label=1) 159 | kn = format(metrics.auc(rrr, fff)) 160 | kn = float(kn)*100 161 | print("Random Forest Accuracy : \n", kn, "%") 162 | print(" Completion Speed", round((time.time() - start_timerf),5)) 163 | print() 164 | print() 165 | 166 | 167 | def runall(): 168 | 
retrieveTweet('data/tweetdata.txt') 169 | retrieveProcessedData('processed_data/output.xlsx') 170 | # nbTrain() 171 | # datree() 172 | # Tsvm() 173 | # knN() 174 | # RanFo() 175 | 176 | def datreeINPUT(inputtweet): 177 | from sklearn import tree 178 | train_featurestree = vectorizer.fit_transform(x) 179 | dtree = tree.DecisionTreeClassifier() 180 | 181 | dtree = dtree.fit(train_featurestree, [int(r) for r in y]) 182 | 183 | 184 | inputdtree= vectorizer.transform([inputtweet]) 185 | predictt = dtree.predict(inputdtree) 186 | 187 | if predictt == 1: 188 | predictt = "Positive" 189 | elif predictt == 0: 190 | predictt = "Neutral" 191 | elif predictt == -1: 192 | predictt = "Negative" 193 | else: 194 | print("Nothing") 195 | 196 | print("\n*****************") 197 | print(predictt) 198 | print("*****************") 199 | 200 | runall() 201 | 202 | print("\nInput your tweet : ") 203 | inputtweet = input() 204 | 205 | datreeINPUT(inputtweet) 206 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
12 |
Name: Nikie Jo Elauria Deocampo
13 |Country: Philippines
14 |Educational Background: 15 |
16 |
Undergraduate: Bachelor of Science in Information System
17 |Graduate: Masters in Information Technology
18 |School: West Visayas State University
19 | 20 | 21 |Mentor: Dr. Bobby Gerardo
22 |Motto: I work hard so my dog can have a better life.
23 | 24 | 25 | 26 |Mental illness is prevalent throughout the world, and depression is one of the most common psychological problems I know of; I would like to help as much as I can. Being a fan of Anthony Bourdain and Robin Williams has propelled me to explore this topic. With the large amount of tweets and Facebook posts available online, I can use machine learning to mine that data and produce a meaningful and useful outcome.
Social media generates vast amounts of data every day because millions of active users share and communicate across the entire 29 | community, and it has changed human interaction. For this project, I will be using Python and various modules and libraries.
30 | 31 | 32 |36 |
The aim of the project is to predict early signs of depression through social media text mining. Below are the steps to run the Python code using the data sets uploaded in the repository; alternatively, you can download your own data.
45 | 46 |consumer_key = '', consumer_secret = '', access_token = '', access_secret = ''
53 | 66 | What could the result mean? Positive: the person is unlikely to have depression or anxiety. Neutral: this is the middle level, where the user may or may not have depression but may be more prone to becoming depressed; at this stage the user may display some depression-like symptoms. Lastly, Negative is the lowest level, where depression and anxiety symptoms are detected in the user's tweets. The more negative words the user uses, the more negative emotion the tweet carries. 67 |
68 | 69 | 70 | 71 |AUC is an abbreviation for area under the curve. It is used in classification analysis to determine which of the models used predicts the classes best.
87 | 88 |This study is not yet perfect, and I'm still aiming to improve it.
109 | 110 |130 | This work would not have been possible without the overwhelming support from Jeju National University, Jeju Development Center and other selfless sponsors. I would like to specifically give a big thanks to Prof. Yungcheol Byun for being the best host ever and my mentor Dr. Bobby Gerardo for the help and guidance. 131 |
132 |