├── Accuracy_checker.py
├── Download_twitter_Api.py
├── InsertTweetDemo.py
├── README.md
├── data
│   ├── dictionary.tsv
│   └── tweetdata.txt
├── depression_sentiment_analysis.py
├── preprocessor.py
└── processed_data
    └── output.xlsx


/Accuracy_checker.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on the day we all start to love ourselves.

@author: Nikie Jo Deocampo
"""

import itertools
import json
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
#from sklearn.metrics import roc_auc_score

tweets_data = []   # raw tweet dicts parsed from tweetdata.txt
x = []             # tweet texts
y = []             # sentiment labels (-1, 0, 1) from output.xlsx
vectorizer = CountVectorizer(stop_words='english')


def retrieveTweet(data_url):
    """Load raw tweets (one JSON object per line) into tweets_data."""
    with open(data_url, "r") as tweets_file:
        for line in tweets_file:
            try:
                tweets_data.append(json.loads(line))
            except json.JSONDecodeError:
                continue


def retrieveProcessedData(Pdata_url):
    """Pair each raw tweet's text with its precomputed sentiment label."""
    sent = pd.read_excel(Pdata_url)
    for i in range(len(tweets_data)):
        if tweets_data[i]['id'] == sent['id'][i]:
            x.append(tweets_data[i]['text'])
            y.append(sent['sentiment'][i])


def plot_confusion_matrix(cm, classes, normalize=False,
                          title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


def nbTrain():
    # Note: each classifier below is evaluated on the same tweets it was
    # trained on, and roc_curve binarizes the 3 labels via pos_label=1,
    # so the printed "accuracy" is an optimistic in-sample AUC.
    from sklearn.naive_bayes import MultinomialNB
    start_timenb = time.time()
    train_features = vectorizer.fit_transform(x)

    actual = y

    nb = MultinomialNB()
    nb.fit(train_features, [int(r) for r in y])

    test_features = vectorizer.transform(x)
    predictions = nb.predict(test_features)
    fpr, tpr, thresholds = metrics.roc_curve(actual, predictions, pos_label=1)
    nbscore = float(metrics.auc(fpr, tpr)) * 100

    # nb_matrix = confusion_matrix(actual, predictions)
    # plt.figure()
    # plot_confusion_matrix(nb_matrix, classes=[-1, 0, 1],
    #                       title='Confusion matrix For NB classifier')

    print("\n")
    print("Naive Bayes Accuracy : \n", nbscore, "%")
    print(" Completion Speed", round(time.time() - start_timenb, 5))
    print()


def datree():
    from sklearn import tree
    start_timedt = time.time()
    train_featurestree = vectorizer.fit_transform(x)
    actual1 = y
    test_features1 = vectorizer.transform(x)
    dtree = tree.DecisionTreeClassifier()

    dtree = dtree.fit(train_featurestree, [int(r) for r in y])

    prediction1 = dtree.predict(test_features1)
    ddd, ttt, thresholds = metrics.roc_curve(actual1, prediction1, pos_label=1)
    dtreescore = float(metrics.auc(ddd, ttt)) * 100
    print("Decision tree Accuracy : \n", dtreescore, "%")
    print(" Completion Speed", round(time.time() - start_timedt, 5))
    print()


def Tsvm():
    from sklearn.svm import SVC
    start_timesvm = time.time()
    train_featuressvm = vectorizer.fit_transform(x)
    actual2 = y
    test_features2 = vectorizer.transform(x)
    svc = SVC()

    svc = svc.fit(train_featuressvm, [int(r) for r in y])
    prediction2 = svc.predict(test_features2)
    sss, vvv, thresholds = metrics.roc_curve(actual2, prediction2, pos_label=1)
    svcscore = float(metrics.auc(sss, vvv)) * 100
    print("Support vector machine Accuracy : \n", svcscore, "%")
    print(" Completion Speed", round(time.time() - start_timesvm, 5))
    print()


def knN():
    from sklearn.neighbors import KNeighborsClassifier
    start_timekn = time.time()
    train_featureskn = vectorizer.fit_transform(x)
    actual3 = y
    test_features3 = vectorizer.transform(x)
    kn = KNeighborsClassifier(n_neighbors=2)

    kn = kn.fit(train_featureskn, [int(i) for i in y])
    prediction3 = kn.predict(test_features3)
    kkk, nnn, thresholds = metrics.roc_curve(actual3, prediction3, pos_label=1)
    knscore = float(metrics.auc(kkk, nnn)) * 100

    print("Kneighborsclassifier Accuracy : \n", knscore, "%")
    print(" Completion Speed", round(time.time() - start_timekn, 5))
    print()


def RanFo():
    from sklearn.ensemble import RandomForestClassifier
    start_timerf = time.time()
    train_featuresrf = vectorizer.fit_transform(x)
    actual4 = y
    test_features4 = vectorizer.transform(x)
    rf = RandomForestClassifier(max_depth=2, random_state=0)

    rf = rf.fit(train_featuresrf, [int(i) for i in y])
    prediction4 = rf.predict(test_features4)
    rrr, fff, thresholds = metrics.roc_curve(actual4, prediction4, pos_label=1)
    rfscore = float(metrics.auc(rrr, fff)) * 100
    print("Random Forest Accuracy : \n", rfscore, "%")
    print(" Completion Speed", round(time.time() - start_timerf, 5))
    print()
    print()


def runall():
    retrieveTweet('data/tweetdata.txt')
    retrieveProcessedData('processed_data/output.xlsx')
    nbTrain()
    datree()
    Tsvm()
    knN()
    RanFo()


def datreeINPUT(inputtweet):
    """Classify a single tweet with a decision tree trained on the full data set."""
    from sklearn import tree
    train_featurestree = vectorizer.fit_transform(x)
    dtree = tree.DecisionTreeClassifier()

    dtree = dtree.fit(train_featurestree, [int(r) for r in y])

    inputdtree = vectorizer.transform([inputtweet])
    predictt = dtree.predict(inputdtree)[0]

    if predictt == 1:
        predictt = "Positive"
    elif predictt == 0:
        predictt = "Neutral"
    elif predictt == -1:
        predictt = "Negative"
    else:
        print("Nothing")

    print("\n*****************")
    print(predictt)
    print("*****************")


runall()

#print("Input your tweet : ")
#inputtweet = input()
#
#datreeINPUT(inputtweet)
--------------------------------------------------------------------------------
/Download_twitter_Api.py:
--------------------------------------------------------------------------------
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream

# Fill in the 4 credentials from your Twitter developer account.
consumer_key = 'YOUR_CONSUMER_KEY'
consumer_secret = 'YOUR_CONSUMER_SECRET'
access_token = 'YOUR_ACCESS_TOKEN'
access_secret = 'YOUR_ACCESS_SECRET'


class StdOutListener(StreamListener):
    """Appends each streamed tweet (raw JSON, one per line) to data/tweetdata.txt."""

    def on_data(self, data):
        with open('data/tweetdata.txt', 'a') as tf:
            tf.write(data)
        print(data)
        return True

    def on_error(self, status):
        print(status)


if __name__ == '__main__':
    listener = StdOutListener()
    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_secret)
    stream = Stream(auth, listener)

    # Collect tweets that match the depression-related keywords.
    stream.filter(track=['depression', 'anxiety', 'mental health',
                         'suicide', 'stress', 'sad'])
--------------------------------------------------------------------------------
/InsertTweetDemo.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on the day we all start to love ourselves.

@author: Nikie Jo Deocampo
"""

import itertools
import json
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
#from sklearn.metrics import roc_auc_score

tweets_data = []   # raw tweet dicts parsed from tweetdata.txt
x = []             # tweet texts
y = []             # sentiment labels (-1, 0, 1) from output.xlsx
vectorizer = CountVectorizer(stop_words='english')


def retrieveTweet(data_url):
    """Load raw tweets (one JSON object per line) into tweets_data."""
    with open(data_url, "r") as tweets_file:
        for line in tweets_file:
            try:
                tweets_data.append(json.loads(line))
            except json.JSONDecodeError:
                continue


def retrieveProcessedData(Pdata_url):
    """Pair each raw tweet's text with its precomputed sentiment label."""
    sent = pd.read_excel(Pdata_url)
    for i in range(len(tweets_data)):
        if tweets_data[i]['id'] == sent['id'][i]:
            x.append(tweets_data[i]['text'])
            y.append(sent['sentiment'][i])


def plot_confusion_matrix(cm, classes, normalize=False,
                          title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


def nbTrain():
    # Note: evaluated on the training tweets themselves; the printed
    # "accuracy" is an optimistic in-sample AUC (pos_label=1 binarizes
    # the 3 labels).
    from sklearn.naive_bayes import MultinomialNB
    start_timenb = time.time()
    train_features = vectorizer.fit_transform(x)

    actual = y

    nb = MultinomialNB()
    nb.fit(train_features, [int(r) for r in y])

    test_features = vectorizer.transform(x)
    predictions = nb.predict(test_features)
    fpr, tpr, thresholds = metrics.roc_curve(actual, predictions, pos_label=1)
    nbscore = float(metrics.auc(fpr, tpr)) * 100

    # nb_matrix = confusion_matrix(actual, predictions)
    # plt.figure()
    # plot_confusion_matrix(nb_matrix, classes=[-1, 0, 1],
    #                       title='Confusion matrix For NB classifier')

    print("\n")
    print("Naive Bayes Accuracy : \n", nbscore, "%")
    print(" Completion Speed", round(time.time() - start_timenb, 5))
    print()


def datree():
    from sklearn import tree
    start_timedt = time.time()
    train_featurestree = vectorizer.fit_transform(x)
    actual1 = y
    test_features1 = vectorizer.transform(x)
    dtree = tree.DecisionTreeClassifier()

    dtree = dtree.fit(train_featurestree, [int(r) for r in y])

    prediction1 = dtree.predict(test_features1)
    ddd, ttt, thresholds = metrics.roc_curve(actual1, prediction1, pos_label=1)
    dtreescore = float(metrics.auc(ddd, ttt)) * 100
    print("Decision tree Accuracy : \n", dtreescore, "%")
    print(" Completion Speed", round(time.time() - start_timedt, 5))
    print()


def Tsvm():
    from sklearn.svm import SVC
    start_timesvm = time.time()
    train_featuressvm = vectorizer.fit_transform(x)
    actual2 = y
    test_features2 = vectorizer.transform(x)
    svc = SVC()

    svc = svc.fit(train_featuressvm, [int(r) for r in y])
    prediction2 = svc.predict(test_features2)
    sss, vvv, thresholds = metrics.roc_curve(actual2, prediction2, pos_label=1)
    svcscore = float(metrics.auc(sss, vvv)) * 100
    print("Support vector machine Accuracy : \n", svcscore, "%")
    print(" Completion Speed", round(time.time() - start_timesvm, 5))
    print()


def knN():
    from sklearn.neighbors import KNeighborsClassifier
    start_timekn = time.time()
    train_featureskn = vectorizer.fit_transform(x)
    actual3 = y
    test_features3 = vectorizer.transform(x)
    kn = KNeighborsClassifier(n_neighbors=2)

    kn = kn.fit(train_featureskn, [int(i) for i in y])
    prediction3 = kn.predict(test_features3)
    kkk, nnn, thresholds = metrics.roc_curve(actual3, prediction3, pos_label=1)
    knscore = float(metrics.auc(kkk, nnn)) * 100

    print("Kneighborsclassifier Accuracy : \n", knscore, "%")
    print(" Completion Speed", round(time.time() - start_timekn, 5))
    print()


def RanFo():
    from sklearn.ensemble import RandomForestClassifier
    start_timerf = time.time()
    train_featuresrf = vectorizer.fit_transform(x)
    actual4 = y
    test_features4 = vectorizer.transform(x)
    rf = RandomForestClassifier(max_depth=2, random_state=0)

    rf = rf.fit(train_featuresrf, [int(i) for i in y])
    prediction4 = rf.predict(test_features4)
    rrr, fff, thresholds = metrics.roc_curve(actual4, prediction4, pos_label=1)
    rfscore = float(metrics.auc(rrr, fff)) * 100
    print("Random Forest Accuracy : \n", rfscore, "%")
    print(" Completion Speed", round(time.time() - start_timerf, 5))
    print()
    print()


def runall():
    retrieveTweet('data/tweetdata.txt')
    retrieveProcessedData('processed_data/output.xlsx')
    # nbTrain()
    # datree()
    # Tsvm()
    # knN()
    # RanFo()


def datreeINPUT(inputtweet):
    """Classify a single tweet with a decision tree trained on the full data set."""
    from sklearn import tree
    train_featurestree = vectorizer.fit_transform(x)
    dtree = tree.DecisionTreeClassifier()

    dtree = dtree.fit(train_featurestree, [int(r) for r in y])

    inputdtree = vectorizer.transform([inputtweet])
    predictt = dtree.predict(inputdtree)[0]

    if predictt == 1:
        predictt = "Positive"
    elif predictt == 0:
        predictt = "Neutral"
    elif predictt == -1:
        predictt = "Negative"
    else:
        print("Nothing")

    print("\n*****************")
    print(predictt)
    print("*****************")


runall()

print("\nInput your tweet : ")
inputtweet = input()

datreeINPUT(inputtweet)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Classification of Depression on Social Media Using Text Mining

Author | Introduction | The Project | Video Demo | References | Acknowledgement

## Author - 저자

- Name: Nikie Jo Elauria Deocampo
- Country: Philippines
- Educational Background:
  - Undergraduate: Bachelor of Science in Information System
  - Graduate: Masters in Information Technology
- School: West Visayas State University
- Mentor: Dr. Bobby Gerardo
- Motto: I work hard so my dog can have a better life.

## Introduction - 소개

Mental illness is prevalent throughout the world, and depression is one of the most common psychological problems; I would like to help as much as I can. Being a fan of Anthony Bourdain and Robin Williams propelled me to explore this study. With the large amount of tweet and Facebook post data available online, I can use machine learning to mine it and produce a meaningful and useful outcome.

Social media generates countless data every day as millions of active users share and communicate with entire communities, changing human interaction. For this project, I will be using Python and various modules and libraries.

## The Project - 프로젝트

Requirements (inferred from the imports used across this repository):

- Python 3
- tweepy
- nltk
- pandas
- numpy
- scikit-learn
- matplotlib

The aim of the project is to predict early signs of depression through social media text mining. Below are the steps to run the Python code using the data sets uploaded in this repository, or you can download your own.

Follow the steps below:

1. Create a Twitter developer account (Register Here). From that account you will need 4 things:
2. `consumer_key = ''`, `consumer_secret = ''`, `access_token = ''`, `access_secret = ''`
3. Using the file "Download_twitter_Api.py", insert the credentials and you can download current tweets using keywords such as depression, anxiety or sadness. When the data sets are ready, you may proceed to the preprocessing stage.
4. Run "preprocessor.py". This stage goes through your data sets and the given dictionary. The dictionary contains words with their corresponding polarity, which is essential to calculating the sentiment of each tweet: each word is separated, tokenized and given its polarity. Every tweet's score is the sum of the polarities of its words divided by the number of words in that tweet (a condensed sketch of this scoring rule appears right after this list).
5. Once preprocessing is done, you can find the output in the directory "processed_data/output.xlsx". Opening it, you will find the ID (tweet) and sentiment of each tweet separated into 2 columns. With this output you now have a Twitter data set and its corresponding sentiment (Positive, Neutral and Negative), filtered by depression keywords.
6. Now for training and predicting. Make sure all files are located in the proper folders, then run "depression_sentiment_analysis.py". The code runs through the output.xlsx file and at the same time recovers the tweet corresponding to the id of each sentiment; using this, we feed the original data to our classifiers. When everything is done you should have the AUC of each classifier listed in the console.
7. But wait, there's more. You can also type in a sample tweet; it will go through the classifier with the highest AUC to predict the sentiment of what you wrote.
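
The following is a condensed, illustrative sketch of the scoring rule that "preprocessor.py" applies in step 4. The three-word `polarity` dictionary and the sample tweet are made up for demonstration; the real word list lives in data/dictionary.tsv, and only words found in the dictionary count toward the denominator.

```python
# Condensed sketch of preprocessor.py's per-tweet scoring rule.
# Assumes nltk's punkt tokenizer data is installed: nltk.download('punkt')
from nltk.tokenize import word_tokenize

# Toy stand-in for data/dictionary.tsv (word -> polarity).
polarity = {"sad": "negative", "happy": "positive", "today": "neutral"}

def score_tweet(text):
    total = 0    # running polarity sum
    matched = 0  # number of tweet words found in the dictionary
    for token in word_tokenize(text):
        if token in polarity:
            total += {"positive": 1, "negative": -1}.get(polarity[token], 0)
            matched += 1
    if matched == 0:
        return None  # tweet shares no words with the dictionary
    avg = total / matched
    # The same thresholds preprocessor.py uses to bin the average polarity.
    if avg >= 0.2:
        return 1     # positive
    elif avg > -0.5:
        return 0     # neutral
    return -1        # negative

print(score_tweet("feeling sad and alone today"))  # sad=-1, today=0 -> avg=-0.5 -> -1 (negative)
```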

What could the result mean? Positive means the person is unlikely to have depression or anxiety. Neutral is the middle level, wherein the user may or may not have depression but may be more prone to becoming depressed; at this stage the user may display some depression-like symptoms. Lastly, Negative is the lowest level, where depression and anxiety symptoms are being detected through the user's tweets. The more negative words the user uses, the more negative the emotion of the tweet.

## Video Tutorials

## Results - 결과들

Below are the matrices for the 5 classifiers, with Decision Tree having the highest score.

Using the same data set to test accuracy, I trained and tested about 10,000 tweets.

AUC is an abbreviation for area under the curve. It is used in classification analysis to determine which of the models predicts the classes best.
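
For reference, the sketch below shows essentially how "depression_sentiment_analysis.py" computes the AUC it prints for each classifier; the `actual` and `predictions` arrays here are made-up examples, not real results.

```python
# Minimal sketch of the per-classifier AUC computation.
from sklearn import metrics

actual      = [1, -1, 0, 1, -1, 1]   # sentiment labels from output.xlsx
predictions = [1, -1, 0, 0, -1, 1]   # labels predicted by a classifier

# pos_label=1 binarizes the three classes into "positive vs. the rest".
fpr, tpr, thresholds = metrics.roc_curve(actual, predictions, pos_label=1)
print(float(metrics.auc(fpr, tpr)) * 100, "%")  # -> roughly 94.4 %
```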

Accuracy:

Completion Time:

## Future Plans - 향후 계획

This study is not yet perfect, and I am still aiming to improve it.

## References - 참고

## Acknowledgement - 승인

This work would not have been possible without the overwhelming support of Jeju National University, the Jeju Development Center and other selfless sponsors. I would like to specifically give a big thanks to Prof. Yungcheol Byun for being the best host ever, and to my mentor Dr. Bobby Gerardo for his help and guidance.

--------------------------------------------------------------------------------
/depression_sentiment_analysis.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on the day we all start to love ourselves.

@author: Nikie Jo Deocampo
"""

import itertools
import json
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
#from sklearn.metrics import roc_auc_score

tweets_data = []   # raw tweet dicts parsed from tweetdata.txt
x = []             # tweet texts
y = []             # sentiment labels (-1, 0, 1) from output.xlsx
vectorizer = CountVectorizer(stop_words='english')


def retrieveTweet(data_url):
    """Load raw tweets (one JSON object per line) into tweets_data."""
    with open(data_url, "r") as tweets_file:
        for line in tweets_file:
            try:
                tweets_data.append(json.loads(line))
            except json.JSONDecodeError:
                continue


def retrieveProcessedData(Pdata_url):
    """Pair each raw tweet's text with its precomputed sentiment label."""
    sent = pd.read_excel(Pdata_url)
    for i in range(len(tweets_data)):
        if tweets_data[i]['id'] == sent['id'][i]:
            x.append(tweets_data[i]['text'])
            y.append(sent['sentiment'][i])


def plot_confusion_matrix(cm, classes, normalize=False,
                          title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


def nbTrain():
    # Note: each classifier below is evaluated on the same tweets it was
    # trained on, and roc_curve binarizes the 3 labels via pos_label=1,
    # so the printed "accuracy" is an optimistic in-sample AUC.
    from sklearn.naive_bayes import MultinomialNB
    start_timenb = time.time()
    train_features = vectorizer.fit_transform(x)

    actual = y

    nb = MultinomialNB()
    nb.fit(train_features, [int(r) for r in y])

    test_features = vectorizer.transform(x)
    predictions = nb.predict(test_features)
    fpr, tpr, thresholds = metrics.roc_curve(actual, predictions, pos_label=1)
    nbscore = float(metrics.auc(fpr, tpr)) * 100

    nb_matrix = confusion_matrix(actual, predictions)
    plt.figure()
    plot_confusion_matrix(nb_matrix, classes=[-1, 0, 1],
                          title='Confusion matrix For NB classifier')

    print("\n")

    # test_try = vectorizer.transform(["Lets help those in need, fight anxiety and bring happiness"])
    # test_try2 = vectorizer.transform(["Dont look down at people with anxiety rather give love and respect to all. shout! Equality."])
    # predictr = nb.predict(test_try)
    # predictt = nb.predict(test_try2)

    # print(predictr)
    # print(predictt)

    print("Naive Bayes Accuracy : \n", nbscore, "%")
    print(" Completion Speed", round(time.time() - start_timenb, 5))
    print()


def datree():
    from sklearn import tree
    start_timedt = time.time()
    train_featurestree = vectorizer.fit_transform(x)
    actual1 = y
    test_features1 = vectorizer.transform(x)
    dtree = tree.DecisionTreeClassifier()

    dtree = dtree.fit(train_featurestree, [int(r) for r in y])

    prediction1 = dtree.predict(test_features1)
    ddd, ttt, thresholds = metrics.roc_curve(actual1, prediction1, pos_label=1)
    dtreescore = float(metrics.auc(ddd, ttt)) * 100
    print("Decision tree Accuracy : \n", dtreescore, "%")
    print(" Completion Speed", round(time.time() - start_timedt, 5))
    print()


def Tsvm():
    from sklearn.svm import SVC
    start_timesvm = time.time()
    train_featuressvm = vectorizer.fit_transform(x)
    actual2 = y
    test_features2 = vectorizer.transform(x)
    svc = SVC()

    svc = svc.fit(train_featuressvm, [int(r) for r in y])
    prediction2 = svc.predict(test_features2)
    sss, vvv, thresholds = metrics.roc_curve(actual2, prediction2, pos_label=1)
    svcscore = float(metrics.auc(sss, vvv)) * 100
    print("Support vector machine Accuracy : \n", svcscore, "%")
    print(" Completion Speed", round(time.time() - start_timesvm, 5))
    print()


def knN():
    from sklearn.neighbors import KNeighborsClassifier
    start_timekn = time.time()
    train_featureskn = vectorizer.fit_transform(x)
    actual3 = y
    test_features3 = vectorizer.transform(x)
    kn = KNeighborsClassifier(n_neighbors=2)

    kn = kn.fit(train_featureskn, [int(i) for i in y])
    prediction3 = kn.predict(test_features3)
    kkk, nnn, thresholds = metrics.roc_curve(actual3, prediction3, pos_label=1)
    knscore = float(metrics.auc(kkk, nnn)) * 100

    print("Kneighborsclassifier Accuracy : \n", knscore, "%")
    print(" Completion Speed", round(time.time() - start_timekn, 5))
    print()


def RanFo():
    from sklearn.ensemble import RandomForestClassifier
    start_timerf = time.time()
    train_featuresrf = vectorizer.fit_transform(x)
    actual4 = y
    test_features4 = vectorizer.transform(x)
    rf = RandomForestClassifier(max_depth=2, random_state=0)

    rf = rf.fit(train_featuresrf, [int(i) for i in y])
    prediction4 = rf.predict(test_features4)
    rrr, fff, thresholds = metrics.roc_curve(actual4, prediction4, pos_label=1)
    rfscore = float(metrics.auc(rrr, fff)) * 100
    print("Random Forest Accuracy : \n", rfscore, "%")
    print(" Completion Speed", round(time.time() - start_timerf, 5))
    print()
    print()


def runall():
    retrieveTweet('data/tweetdata.txt')
    retrieveProcessedData('processed_data/output.xlsx')
    nbTrain()
    datree()
    Tsvm()
    knN()
    RanFo()


def datreeINPUT(inputtweet):
    """Classify a single tweet with a decision tree trained on the full data set."""
    from sklearn import tree
    train_featurestree = vectorizer.fit_transform(x)
    dtree = tree.DecisionTreeClassifier()

    dtree = dtree.fit(train_featurestree, [int(r) for r in y])

    inputdtree = vectorizer.transform([inputtweet])
    predictt = dtree.predict(inputdtree)[0]

    if predictt == 1:
        predictt = "Positive"
    elif predictt == 0:
        predictt = "Neutral"
    elif predictt == -1:
        predictt = "Negative"
    else:
        print("Nothing")

    print("\n*****************")
    print(predictt)
    print("*****************")


runall()

print("Input your tweet : ")
inputtweet = input()

datreeINPUT(inputtweet)
--------------------------------------------------------------------------------
/preprocessor.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Jul 26 16:34:13 2018

@author: Nikie Jo Deocampo
"""
import csv
import json
import re
import string
import time

import pandas as pd
from nltk.tokenize import word_tokenize

tweets_data = []   # raw tweet dicts parsed from tweetdata.txt
x = []             # cleaned tweet texts
y = []             # dictionary entries: [word, polarity]
k = []             # tweet ids, aligned with x
some_milby = []    # output rows: [id, sentiment]

print("===========================")
print("Starting Preprocess Function")
print("=========================== \n\n")


def getdata(dataurl):
    print("===========================")
    print("Retrieving TXT File")
    with open(dataurl, "r") as tweets_file:
        for line in tweets_file:
            try:
                tweets_data.append(json.loads(line))
            except json.JSONDecodeError:
                continue
    print("===========================")
    print("Retrieval Successful")
    print("=========================== \n \n")
    time.sleep(3)
    processdata()


def processdata():
    print("===========================")
    print("Recovering Data Tweets")
    print("===========================")
    time.sleep(1)
    RE_EMOJI = re.compile('[\U00010000-\U0010ffff]', flags=re.UNICODE)
    for i in range(len(tweets_data)):
        q = tweets_data[i]['text']
        o = tweets_data[i]['id_str']
        q = RE_EMOJI.sub(r'', q)  # strip emoji
        cleaned = q.translate(str.maketrans('', '', string.punctuation))  # strip punctuation
        x.append(cleaned)
        k.append(o)
    print("===========================")
    print("Data Tweets Recovered")
    print("===========================\n\n")


def readdict(dataurl):
    print("===========================")
    print("Reading Dictionary")
    print("===========================")
    with open(dataurl) as tsvfile:
        reader = csv.reader(tsvfile, delimiter='\t')
        for row in reader:
            # Keep only the word (column 2) and its polarity (column 5).
            y.append([row[2], row[5]])
    print("===========================")
    print("Dictionary Preparation Done")
    print("===========================\n\n")
    addpolarity()


def addpolarity():
    start_time = time.time()
    counter = 0
    print("===========================")
    print("Processing please wait...")
    print("===========================\n\n")

    for j in x:
        token = word_tokenize(j)
        sumnum = 0    # running polarity total for this tweet
        sum_word = 0  # number of tokens found in the dictionary
        for t in token:
            for d in y:
                if t == d[0]:
                    sentiment = d[1]
                    if sentiment == "positive":
                        sumnum += 1
                    elif sentiment == "negative":
                        sumnum += -1
                    # neutral words contribute 0
                    sum_word += 1
                    break

        if sum_word != 0.0:
            sum_more = sumnum / sum_word
            # Bin the average polarity into 1 / 0 / -1.
            if sum_more >= 0.2:
                sum_more = 1
            elif (sum_more < 0.2) and (sum_more > -0.5):
                sum_more = 0
            elif sum_more <= -0.5:
                sum_more = -1
            else:
                print("****")

        # Note: when a tweet matches no dictionary words, sum_more keeps
        # its value from the previous iteration (a quirk of the original).
        sum_var = []
        varid = k[counter]
        sum_var.append(varid)
        sum_var.append(sum_more)
        some_milby.append(sum_var)
        counter += 1

    print("Processing time: ", round(time.time() - start_time, 8), "Seconds \n\n")

    time.sleep(3)

    print("===========================")
    print("Processing Finished")
    print("===========================")

    savetoxlsx()


def savetoxlsx():
    df = pd.DataFrame(some_milby)
    df.to_excel('processed_data/output.xlsx',
                header=("id", "sentiment"), index=False)

    # file = open("testfile_data.txt","w")
    # file.write(some_milby)
    # file.close()

    print("===========================")
    print("Data Saved!")
    print("===========================")


def runall():
    getdata('data/tweetdata.txt')
    readdict('data/dictionary.tsv')


runall()
--------------------------------------------------------------------------------
/processed_data/output.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/niquejoe/Classification-of-Depression-on-Social-Media-Using-Text-Mining/fc732087c9ff8d06cbdb498cdeb09c31f14b3c67/processed_data/output.xlsx
--------------------------------------------------------------------------------