├── CNAME ├── CONTRIBUTING.md ├── link.txt ├── .github ├── FUNDING.yml ├── ISSUE_TEMPLATE │ └── custom.md └── workflows │ └── python-package-conda.yml ├── Model ├── model.pkl └── vectorizer.pkl ├── environment.yml ├── README.md ├── LICENSE ├── twitterapiaccount.py └── twitterapiaccount.ipynb /CNAME: -------------------------------------------------------------------------------- 1 | dtweet.me -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /link.txt: -------------------------------------------------------------------------------- 1 | https://colab.research.google.com/drive/1q7pmP3GtNueW7iA4mVutlMdT7BcFZKJR?usp=sharing 2 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: SUBHADIPMAITI-DEV 4 | 5 | 6 | -------------------------------------------------------------------------------- /Model/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUBHADIPMAITI-DEV/Depression-Detection-System-Using-Machine-Learning/HEAD/Model/model.pkl -------------------------------------------------------------------------------- /Model/vectorizer.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUBHADIPMAITI-DEV/Depression-Detection-System-Using-Machine-Learning/HEAD/Model/vectorizer.pkl -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/custom.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Custom issue template 3 | about: 
Describe this issue template's purpose here. 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | 11 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: my-environment 2 | channels: 3 | - defaults 4 | dependencies: 5 | - python=3.10 6 | - pandas 7 | - numpy 8 | - scikit-learn 9 | - matplotlib 10 | - pip 11 | - pip: 12 | - some-pip-package 13 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | # Depression-Detection-System-Using-Machine-Learning 7 | 8 | 9 | 10 | SYSTEM REQUIREMENTS 11 | 12 | 🔎Hardware Requirements: 13 | Processor: Intel Core i3 (10th Gen) ; 14 | RAM: 8GB ; 15 | Storage: SSD 512 GB ; 16 | Alternatively, the notebook can be run in the cloud on Google Colab, which provides GPU-backed execution. 17 | 18 | 🔎Software Requirements: 19 | OS: Windows 10 ; 20 | Google Colab ; 21 | 22 | 23 | RESULT ANALYSIS 24 | ![image](https://user-images.githubusercontent.com/78700974/204124779-9d353689-2adb-499d-97e3-171c69f4ee8f.png) 25 | 26 | RESULT VIDEO 27 | 28 | 29 | https://github.com/SUBHADIPMAITI-DEV/Depression-Detection-System-Using-Machine-Learning/assets/78700974/f70aed97-32f2-4086-a80c-51770771f1c4 30 | 31 | 32 | 33 | 34 | 35 | 36 | DOWNLOAD THE PROJECT REPORT ➡️ 37 | 38 | 39 | 40 | 41 | 42 | 📁[Depression.Detection.System.Using.Machine.Learning.Report.SM.G1.pdf](https://github.com/user-attachments/files/16267929/Depression.Detection.System.Using.Machine.Learning.Report.SM.G1_removed.pdf) 43 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Subhadip Maiti 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | 
of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.github/workflows/python-package-conda.yml: -------------------------------------------------------------------------------- 1 | name: Python Package using Conda 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build-linux: 7 | runs-on: ubuntu-latest 8 | strategy: 9 | max-parallel: 5 10 | 11 | steps: 12 | - uses: actions/checkout@v3 13 | - name: Set up Python 3.10 14 | uses: actions/setup-python@v3 15 | with: 16 | python-version: '3.10' 17 | - name: Add conda to system path 18 | run: | 19 | # $CONDA is an environment variable pointing to the root of the miniconda directory 20 | echo $CONDA/bin >> $GITHUB_PATH 21 | - name: Install dependencies 22 | run: | 23 | conda env update --file environment.yml --name base 24 | - name: Lint with flake8 25 | run: | 26 | conda install flake8 27 | # stop the build if there are Python syntax errors or undefined names 28 | flake8 . 
--count --select=E9,F63,F7,F82 --show-source --statistics 29 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 30 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 31 | - name: Test with pytest 32 | run: | 33 | conda install pytest 34 | pytest 35 | -------------------------------------------------------------------------------- /twitterapiaccount.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """twitterapiaccount.ipynb 3 | 4 | Automatically generated by Colaboratory. 5 | 6 | Original file is located at 7 | https://colab.research.google.com/drive/1q7pmP3GtNueW7iA4mVutlMdT7BcFZKJR 8 | """ 9 | 10 | from google.colab import drive 11 | drive.mount('/content/drive') 12 | 13 | !pip install better_profanity 14 | import nltk 15 | 16 | nltk.download('punkt') 17 | nltk.download('stopwords') 18 | nltk.download('wordnet') 19 | nltk.download('omw-1.4') 20 | 21 | 22 | 23 | RAW_INPUT_TRAINING_DATA = "/content/drive/MyDrive/twitterapiaccount/dataset_combined_2510_new.csv" 24 | 25 | 26 | 27 | USER_TWEET_DATA_FILE = "user_tweets.csv" 28 | PREPROCESSED_INPUT_TRAINING_DATA = "preprocessed_input_data.csv" 29 | MODEL_FILE = "model.pkl" 30 | VECTORIZER_FILE = "vectorizer.pkl" 31 | 32 | # Commented out IPython magic to ensure Python compatibility. 
# DATA CLEANING: Vectorizer AND NLP

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# %matplotlib inline

import re
from sklearn.feature_extraction.text import TfidfVectorizer

# importing nlp packages
from nltk import stem
from nltk.corpus import stopwords

stemmer = stem.SnowballStemmer('english')
# Kept under its own name so the imported `stopwords` corpus reader is not
# overwritten by the set built from it.
STOP_WORDS = set(stopwords.words('english'))

# Patterns are raw strings: the previous non-raw '[^a-zA-Z\s]' relied on an
# invalid escape sequence being passed through unchanged.
_URL_RE = re.compile(r"http\S+")
_NON_ALPHA_RE = re.compile(r"[^a-zA-Z\s]")


def keep_alpha(s):
    """Strip URLs, digits and special characters from a tweet.

    Args:
        s: raw tweet text (str).

    Returns:
        The text with every ``http...`` token removed and every character that
        is neither an ASCII letter nor whitespace removed. Newlines are turned
        into single spaces so that words on adjacent lines stay separate
        (they used to be deleted, which glued those words together).
    """
    without_urls = _URL_RE.sub("", s)
    letters_only = _NON_ALPHA_RE.sub("", without_urls)
    return letters_only.replace("\n", " ")


def nlp_preprocessing(msg):
    """Lower-case a message, drop English stop words and stem what remains.

    Args:
        msg: message text. Non-string values are reported and returned
            unchanged, which preserves the previous best-effort behaviour.

    Returns:
        A single space-joined string of stemmed tokens (``""`` when nothing
        survives stop-word removal), or ``msg`` itself when it is not a string.
    """
    if not isinstance(msg, str):
        # Explicit guard instead of a blanket try/except: the old handler could
        # also return a half-processed token list if a later step failed.
        print("nlp_preprocessing: expected str, got %s; value left unchanged"
              % type(msg).__name__)
        return msg

    # converting to lowercase, then removing stopwords
    kept_words = [word for word in msg.lower().split() if word not in STOP_WORDS]
    # using a stemmer (getting root form of each word)
    return " ".join(stemmer.stem(word) for word in kept_words)


df = pd.read_csv(RAW_INPUT_TRAINING_DATA)
df.rename(columns={'Text': 'tweet'}, inplace=True)
df = df.dropna()
# Shuffle the rows. drop=True discards the old row labels; without it they were
# saved to the preprocessed CSV as a meaningless "index" column.
df = df.sample(frac=1).reset_index(drop=True)
# df = df.sample(frac=0.1).reset_index(drop=True)


# data preprocessing using NLP : nltk
df['tweet'] = df['tweet'].astype(str)
# remove leading and trailing whitespace
df['tweet'] = df['tweet'].str.strip()

# keep only alphabetic characters
df['tweet'] = df['tweet'].apply(keep_alpha)

# nlp preprocessing to remove stopwords and get base/stem form of each word
df['tweet'] = df['tweet'].apply(nlp_preprocessing)
print(df.head(2))
print(df.tail(2))


print(df['depressed'].value_counts())

# df.to_csv("data//preprocessed_input_data.csv", index=False)
df.to_csv(PREPROCESSED_INPUT_TRAINING_DATA, index=False)

import pandas as pd
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2
# (see the FutureWarning recorded in the notebook output); ConfusionMatrixDisplay
# is the replacement named by that warning.
from sklearn.metrics import ConfusionMatrixDisplay


# to save or to load model
import joblib

svmout = 0
lrout = 0
dtout = 0


def _report_metrics(banner, matrix_label, accuracy_label, y_test, y_pred):
    """Print and plot the evaluation of one classifier; return its accuracy.

    Args:
        banner: header line printed before the results.
        matrix_label: text printed in front of the confusion matrix.
        accuracy_label: text printed in front of the accuracy value.
        y_test: true labels of the held-out rows.
        y_pred: labels predicted for the same rows.

    Returns:
        Accuracy as a percentage rounded to two decimals.
    """
    print(banner)
    print(matrix_label, confusion_matrix(y_test, y_pred))

    # Drawn from the predictions already computed, so the model is not asked to
    # predict the test set a second time just for the plot.
    ConfusionMatrixDisplay.from_predictions(y_test, y_pred)
    plt.show()

    accuracy = round(accuracy_score(y_test, y_pred) * 100, 2)
    print(accuracy_label, accuracy)
    return accuracy


## SVM
def train_svm(X_train, X_test, y_train, y_test):
    """Fit an ``SVC(C=1000)`` and report its test-set performance.

    Returns:
        ``(fitted_model, accuracy_percent)``.
    """
    from sklearn.svm import SVC

    classifier = SVC(C=1000)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)

    accuracy = _report_metrics("\n\n----SVM------",
                               "Confusion matrix SVM:\n",
                               "Accuracy score for SVM: ",
                               y_test, y_pred)
    return classifier, accuracy


## Logistic regression
def train_logistic_regression(X_train, X_test, y_train, y_test):
    """Fit a default ``LogisticRegression`` and report its test-set performance.

    Returns:
        ``(fitted_model, accuracy_percent)``.
    """
    from sklearn.linear_model import LogisticRegression

    logreg = LogisticRegression()
    logreg.fit(X_train, y_train)
    y_pred = logreg.predict(X_test)

    accuracy = _report_metrics("\n\n-----------Logistic Regression-----",
                               "Confusion matrix Logistic Regression:\n",
                               "Accuracy score for Logistic regression: ",
                               y_test, y_pred)
    return logreg, accuracy


## Decision Tree
def train_decision_tree(X_train, X_test, y_train, y_test):
    """Fit a default ``DecisionTreeClassifier`` and report its test-set performance.

    Returns:
        ``(fitted_model, accuracy_percent)``.
    """
    from sklearn.tree import DecisionTreeClassifier

    model = DecisionTreeClassifier()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = _report_metrics("\n\n--------Decision Tree------------",
                               "Confusion matrix Decision Tree:\n",
                               "Accuracy score for Decision Tree: ",
                               y_test, y_pred)
    return model, accuracy


# training ML Model
# df = pd.read_csv("data//preprocessed_input_data.csv")
df = pd.read_csv(PREPROCESSED_INPUT_TRAINING_DATA)
# Tweets that became empty during preprocessing are read back as NaN; drop them.
df = df.dropna()
print(df.head())

y = df['depressed'].values

# train test split (done on the raw text so the vectorizer never sees test rows)
from sklearn.model_selection import train_test_split
tweets_train, tweets_test, y_train, y_test = train_test_split(
    df['tweet'].values, y, test_size=0.2, random_state=42)

# training the vectorizer (convert text data to number data)
# It is fitted on the training rows only; fitting it on the full dataset before
# splitting let test-set vocabulary and document frequencies leak into training.
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(tweets_train)
X_test = vectorizer.transform(tweets_test)

# save vectorizer object to vectorize user tweets later
# joblib.dump(vectorizer, 'vectorizer.pkl')
joblib.dump(vectorizer, VECTORIZER_FILE)


# checking accuracy of SVM
svm_model, svmout = train_svm(X_train, X_test, y_train, y_test)

# checking accuracy of Logistic Regression
lr_model, lrout = train_logistic_regression(X_train, X_test, y_train, y_test)

# checking accuracy of Decision Tree Algorithm
dt_model, dtout = train_decision_tree(X_train, X_test, y_train, y_test)


# PLOTTING
# Separate names are used for the chart data so the label array `y` above is
# not overwritten.
model_names = ['SVM', 'Logistic Regression', 'Decision Tree']
accuracies = [svmout, lrout, dtout]
bar_colors = ['red', 'blue', 'green']

# The figure must be created before drawing: creating it afterwards opened a
# second, empty figure and left the bar chart at the default size.
plt.figure(figsize=(15, 15))
bars = plt.bar(model_names, height=accuracies, color=bar_colors, width=.5)
plt.xlabel('Model')
plt.ylabel('Accuracy %')
plt.title("Depression Analysis")

print("\n\n")
for bar in bars:
    # write each accuracy value just above its bar
    yval = bar.get_height()
    plt.text(bar.get_x(), yval + .5, yval)

plt.show()

print("\n\n")


# The SVM classifier is the model that gets persisted. It was already trained
# and scored above on exactly these splits, so it is reused instead of being
# fitted a second time.
model, accuracy_final = svm_model, svmout


# Save the model as a pickle in a file at given location "model.pkl"
# joblib.dump(model, 'model.pkl')
joblib.dump(model, MODEL_FILE)


# Load/Read the model from the file at given location "model.pkl"
# classification_model = joblib.load('model.pkl')
# NOTE: joblib files are pickles; only load model files produced by this script.
classification_model = joblib.load(MODEL_FILE)

# predicting the model on test data
y_pred = classification_model.predict(X_test)

# calculate the accuracy
print("\n\n Model accuracy: ", round((accuracy_score(y_test, y_pred) * 100), 2))

print("\n\n", confusion_matrix(y_test, y_pred))

# PREDICT TWEETS
# download one user's recent tweets and classify them

import os
import re
import numpy as np
import tweepy
from tweepy import OAuthHandler
from textblob import TextBlob

import pandas as pd
from wordcloud import WordCloud
from better_profanity import profanity
import configparser

import joblib


def download_user_tweets(user=None, limit=50):
    """Download a user's most recent tweets and save them to USER_TWEET_DATA_FILE.

    Args:
        user: Twitter screen name. When ``None`` (the default, and what the
            existing call site uses) the name is asked for on stdin. A name of
            one character or less falls back to ``'elonmusk'``.
        limit: maximum number of tweets to fetch (default 50, the value that
            used to be hard-coded).

    Returns:
        DataFrame with columns ``User`` and ``tweet``, one row per tweet.
    """
    # Credentials are taken from the environment when available so real keys do
    # not have to be written into the source; the placeholders are the
    # previous hard-coded values.
    api_key = os.environ.get('TWITTER_API_KEY', 'api----key----here')
    api_key_secret = os.environ.get('TWITTER_API_KEY_SECRET',
                                    'api------key-------secret--here')
    access_token = os.environ.get('TWITTER_ACCESS_TOKEN',
                                  'access----token-------here')
    access_token_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET',
                                         'access----------token-----secret----here')

    # Access Twitter Data (login to twitter via api)
    auth = tweepy.OAuthHandler(api_key, api_key_secret)
    auth.set_access_token(access_token, access_token_secret)
    api = tweepy.API(auth)

    # user tweets
    if user is None:
        user = input("Enter Twitter username:").strip()
    if len(user) <= 1:
        user = 'elonmusk'

    tweets = tweepy.Cursor(api.user_timeline, screen_name=user, count=200,
                           tweet_mode='extended').items(limit)

    # create DataFrame
    rows = [[tweet.user.screen_name, tweet.full_text] for tweet in tweets]
    df = pd.DataFrame(rows, columns=['User', 'tweet'])

    # save user tweets to csv
    df.to_csv(USER_TWEET_DATA_FILE, index=False)

    return df


def predict_user_tweets(df):
    """Clean the downloaded tweets and classify each one with the saved model.

    Args:
        df: DataFrame with a ``tweet`` column, as returned by
            ``download_user_tweets``. It is not modified.

    Returns:
        List with one predicted label per tweet that still has text after
        cleaning; an empty list when there is nothing to classify.
    """
    if df.empty:
        print("No tweets to classify.")
        return []

    # Work on a copy so the caller's DataFrame is left untouched.
    df = df.copy()

    # user tweet preprocessing using NLP : nltk
    df['tweet'] = df['tweet'].astype(str).str.strip()

    # keep only alphabetic characters
    df['tweet'] = df['tweet'].apply(keep_alpha)

    # nlp preprocessing to remove stopwords and get base/stem form of each word
    df['tweet'] = df['tweet'].apply(nlp_preprocessing)
    df['tweet'] = df['tweet'].str.strip()

    # replace empty rows with NaN and then drop them.
    # Assigned back explicitly: the chained `df['tweet'].replace(..., inplace=True)`
    # form acts on an intermediate object and is not guaranteed to update `df`
    # (pandas warns about it and Copy-on-Write makes it a no-op).
    df['tweet'] = df['tweet'].replace('', np.nan)
    df = df.dropna().reset_index(drop=True)

    if df.empty:
        # The classifier rejects an input with zero rows, so stop here.
        print("No usable tweet text left after preprocessing; nothing to classify.")
        return []

    # NOTE: joblib files are pickles; only load files produced by this script.
    vectorizer = joblib.load(VECTORIZER_FILE)
    features = vectorizer.transform(df['tweet'].values)

    # Load/Read the model from the file at given location "model.pkl"
    # classification_model = joblib.load('model.pkl')
    classification_model = joblib.load(MODEL_FILE)

    # predicting the model on user test data
    y_pred = classification_model.predict(features)

    df['prediction'] = y_pred
    print(df[['tweet', 'prediction']])
    return list(y_pred)


def final_output(predictions):
    total = len(predictions)
    depressed_count = predictions.count("YES")
| 387 | print("\n\n") 388 | 389 | if depressed_count > (total*.6): 390 | print("Result: DEPRESSED 😒") 391 | else: 392 | print("Result: NOT DEPRESSED 😊") 393 | 394 | user_tweets = download_user_tweets() 395 | predictions = predict_user_tweets(user_tweets) 396 | final_output(predictions) 397 | -------------------------------------------------------------------------------- /twitterapiaccount.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "id": "X8bRsVp767xc" 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from google.colab import drive\n", 12 | "drive.mount('/content/drive')" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "metadata": { 19 | "colab": { 20 | "base_uri": "https://localhost:8080/" 21 | }, 22 | "id": "DSFmsgVt7qPE", 23 | "outputId": "999b499e-d3c8-4ada-cb81-ed7c33e46f8d" 24 | }, 25 | "outputs": [ 26 | { 27 | "name": "stdout", 28 | "output_type": "stream", 29 | "text": [ 30 | "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", 31 | "Collecting better_profanity\n", 32 | " Downloading better_profanity-0.7.0-py3-none-any.whl (46 kB)\n", 33 | "\u001b[K |████████████████████████████████| 46 kB 2.1 MB/s \n", 34 | "\u001b[?25hInstalling collected packages: better-profanity\n", 35 | "Successfully installed better-profanity-0.7.0\n" 36 | ] 37 | }, 38 | { 39 | "name": "stderr", 40 | "output_type": "stream", 41 | "text": [ 42 | "[nltk_data] Downloading package punkt to /root/nltk_data...\n", 43 | "[nltk_data] Unzipping tokenizers/punkt.zip.\n", 44 | "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", 45 | "[nltk_data] Unzipping corpora/stopwords.zip.\n", 46 | "[nltk_data] Downloading package wordnet to /root/nltk_data...\n", 47 | "[nltk_data] Downloading package omw-1.4 to /root/nltk_data...\n" 48 | ] 49 | } 50 | ], 51 | 
"source": [ 52 | "!pip install better_profanity \n", 53 | "import nltk\n", 54 | "\n", 55 | "nltk.download('punkt')\n", 56 | "nltk.download('stopwords')\n", 57 | "nltk.download('wordnet')\n", 58 | "nltk.download('omw-1.4')\n", 59 | "\n", 60 | "\n", 61 | "\n", 62 | "RAW_INPUT_TRAINING_DATA = \"/content/drive/MyDrive/twitterapiaccount/dataset_combined_2510_new.csv\"\n", 63 | "\n", 64 | "\n", 65 | "\n", 66 | "USER_TWEET_DATA_FILE = \"user_tweets.csv\" \n", 67 | "PREPROCESSED_INPUT_TRAINING_DATA = \"preprocessed_input_data.csv\"\n", 68 | "MODEL_FILE = \"model.pkl\"\n", 69 | "VECTORIZER_FILE = \"vectorizer.pkl\"\n", 70 | "\n" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": { 77 | "colab": { 78 | "base_uri": "https://localhost:8080/" 79 | }, 80 | "id": "U70IHhO37qR2", 81 | "outputId": "9624348d-4eee-44bd-8a80-ee84997ababd" 82 | }, 83 | "outputs": [ 84 | { 85 | "name": "stdout", 86 | "output_type": "stream", 87 | "text": [ 88 | " index tweet depressed\n", 89 | "0 10197 soooo sunburnt mum earlier wait get fell aslee... NO\n", 90 | "1 24592 even get suicid didnt answer show told viewer ... YES\n", 91 | " index tweet depressed\n", 92 | "37865 26162 got rob laptop award first year achiev year th... YES\n", 93 | "37866 23082 retweet your either gay depress horni hungri l... 
YES\n", 94 | "YES 23012\n", 95 | "NO 14855\n", 96 | "Name: depressed, dtype: int64\n" 97 | ] 98 | } 99 | ], 100 | "source": [ 101 | "\n", 102 | "#DATA CLEANING: Vectorizer AND NLP\n", 103 | "\n", 104 | "import pandas as pd\n", 105 | "import numpy as np\n", 106 | "import matplotlib.pyplot as plt\n", 107 | "%matplotlib inline\n", 108 | "\n", 109 | "import re\n", 110 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 111 | "\n", 112 | "#importing nlp packages\n", 113 | "from nltk import stem\n", 114 | "from nltk.corpus import stopwords\n", 115 | "stemmer = stem.SnowballStemmer('english')\n", 116 | "stopwords = set(stopwords.words('english'))\n", 117 | "\n", 118 | "\n", 119 | "#removing the special characters and numbers and url\n", 120 | "def keep_alpha(s): \n", 121 | "# s = row['content']\n", 122 | " non_url = re.sub(r\"http\\S+\", \"\", s)\n", 123 | " res = re.sub('[^a-zA-Z\\s]', '', non_url)\n", 124 | " res1 = re.sub('\\n', '', res)\n", 125 | " return res1\n", 126 | "\n", 127 | "def nlp_preprocessing(msg):\n", 128 | " try:\n", 129 | " # converting messages to lowercase\n", 130 | " msg = msg.lower()\n", 131 | " # removing stopwords\n", 132 | " msg = [word for word in msg.split() if word not in stopwords]\n", 133 | " # using a stemmer (getting root form of each word of each row)\n", 134 | " msg = \" \".join([stemmer.stem(word) for word in msg])\n", 135 | " \n", 136 | " except Exception as e:\n", 137 | " print(e)\n", 138 | "\n", 139 | " return msg\n", 140 | "\n", 141 | "\n", 142 | "df=pd.read_csv(RAW_INPUT_TRAINING_DATA)\n", 143 | "df.rename(columns = {'Text':'tweet'}, inplace = True)\n", 144 | "df = df.dropna()\n", 145 | "df = df.sample(frac=1).reset_index()\n", 146 | "#df = df.sample(frac=0.1).reset_index()\n", 147 | "\n", 148 | "\n", 149 | "# data preprocessing using NLP : nltk\n", 150 | "df['tweet'] = df['tweet'].astype(str)\n", 151 | "#remove leading and ending whitespaces\n", 152 | "df['tweet'] = df['tweet'].str.strip()\n", 153 | "\n", 154 | "# 
keep only alphabets\n", 155 | "df['tweet'] = df['tweet'].apply(keep_alpha)\n", 156 | "\n", 157 | "# nlp preprocessing to remove stopwords and get base/stem form of each word\n", 158 | "df['tweet'] = df['tweet'].apply(nlp_preprocessing)\n", 159 | "print(df.head(2))\n", 160 | "print(df.tail(2))\n", 161 | "\n", 162 | "\n", 163 | "print(df['depressed'].value_counts())\n", 164 | "\n", 165 | "# df.to_csv(\"data//preprocessed_input_data.csv\", index=False)\n", 166 | "df.to_csv(PREPROCESSED_INPUT_TRAINING_DATA, index=False)\n", 167 | "\n", 168 | "\n" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": { 175 | "colab": { 176 | "background_save": true, 177 | "base_uri": "https://localhost:8080/" 178 | }, 179 | "id": "zW_UfmPz7qUx", 180 | "outputId": "ab74b338-8c46-446d-e307-25390b76e16b" 181 | }, 182 | "outputs": [ 183 | { 184 | "name": "stdout", 185 | "output_type": "stream", 186 | "text": [ 187 | " index tweet depressed\n", 188 | "0 10197 soooo sunburnt mum earlier wait get fell aslee... NO\n", 189 | "1 24592 even get suicid didnt answer show told viewer ... YES\n", 190 | "2 993 depress YES\n", 191 | "3 35428 hate see peopl depress YES\n", 192 | "4 17374 bright side speak tour come end octob im gonna... YES\n", 193 | "\n", 194 | "\n", 195 | "----SVM------\n", 196 | "Confusion matrix SVM:\n", 197 | " [[2727 324]\n", 198 | " [ 317 3995]]\n" 199 | ] 200 | }, 201 | { 202 | "name": "stderr", 203 | "output_type": "stream", 204 | "text": [ 205 | "/usr/local/lib/python3.7/dist-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function plot_confusion_matrix is deprecated; Function `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: ConfusionMatrixDisplay.from_predictions or ConfusionMatrixDisplay.from_estimator.\n", 206 | " warnings.warn(msg, category=FutureWarning)\n" 207 | ] 208 | }, 209 | { 210 | "data": { 211 | "image/png": "\n", 212 | "text/plain": [ 213 | "
" 214 | ] 215 | }, 216 | "metadata": {}, 217 | "output_type": "display_data" 218 | }, 219 | { 220 | "name": "stdout", 221 | "output_type": "stream", 222 | "text": [ 223 | "Accuracy score for SVM: 91.29\n", 224 | "\n", 225 | "\n", 226 | "-----------Logistic Regression-----\n", 227 | "Confusion matrix Logistic Regression:\n", 228 | " [[2682 369]\n", 229 | " [ 336 3976]]\n" 230 | ] 231 | }, 232 | { 233 | "name": "stderr", 234 | "output_type": "stream", 235 | "text": [ 236 | "/usr/local/lib/python3.7/dist-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function plot_confusion_matrix is deprecated; Function `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: ConfusionMatrixDisplay.from_predictions or ConfusionMatrixDisplay.from_estimator.\n", 237 | " warnings.warn(msg, category=FutureWarning)\n" 238 | ] 239 | }, 240 | { 241 | "data": { 242 | "image/png": "\n", 243 | "text/plain": [ 244 | "
" 245 | ] 246 | }, 247 | "metadata": {}, 248 | "output_type": "display_data" 249 | }, 250 | { 251 | "name": "stdout", 252 | "output_type": "stream", 253 | "text": [ 254 | "Accuracy score for Logistic regression: 90.43\n", 255 | "\n", 256 | "\n", 257 | "--------Decision Tree------------\n", 258 | "Confusion matrix Decision Tree:\n", 259 | " [[2662 389]\n", 260 | " [ 463 3849]]\n" 261 | ] 262 | }, 263 | { 264 | "name": "stderr", 265 | "output_type": "stream", 266 | "text": [ 267 | "/usr/local/lib/python3.7/dist-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function plot_confusion_matrix is deprecated; Function `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: ConfusionMatrixDisplay.from_predictions or ConfusionMatrixDisplay.from_estimator.\n", 268 | " warnings.warn(msg, category=FutureWarning)\n" 269 | ] 270 | }, 271 | { 272 | "data": { 273 | "image/png": "\n", 274 | "text/plain": [ 275 | "
" 276 | ] 277 | }, 278 | "metadata": {}, 279 | "output_type": "display_data" 280 | }, 281 | { 282 | "name": "stdout", 283 | "output_type": "stream", 284 | "text": [ 285 | "Accuracy score for Decision Tree: 88.43\n", 286 | "\n", 287 | "\n", 288 | "\n" 289 | ] 290 | }, 291 | { 292 | "data": { 293 | "image/png": "\n", 294 | "text/plain": [ 295 | "
" 296 | ] 297 | }, 298 | "metadata": {}, 299 | "output_type": "display_data" 300 | }, 301 | { 302 | "data": { 303 | "text/plain": [ 304 | "
" 305 | ] 306 | }, 307 | "metadata": {}, 308 | "output_type": "display_data" 309 | }, 310 | { 311 | "name": "stdout", 312 | "output_type": "stream", 313 | "text": [ 314 | "\n", 315 | "\n", 316 | "\n", 317 | "\n", 318 | "\n", 319 | "----SVM------\n", 320 | "Confusion matrix SVM:\n", 321 | " [[2727 324]\n", 322 | " [ 317 3995]]\n" 323 | ] 324 | }, 325 | { 326 | "name": "stderr", 327 | "output_type": "stream", 328 | "text": [ 329 | "/usr/local/lib/python3.7/dist-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function plot_confusion_matrix is deprecated; Function `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: ConfusionMatrixDisplay.from_predictions or ConfusionMatrixDisplay.from_estimator.\n", 330 | " warnings.warn(msg, category=FutureWarning)\n" 331 | ] 332 | }, 333 | { 334 | "data": { 335 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAUsAAAEGCAYAAADscbcsAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAgAElEQVR4nO3dfZxXZZ3/8dd7hvt7EW8Q8WYRbdESldCyTC0V3Xa1NtNsV9ZsLdMyb0rdfWze9LPS9S43pSzdsK0UuzE00vCmVbZUQNEEI8YMBfEGkHsY5ubz++NcA18HZr5nYM7cfHk/H4/zmHOuc51zrsPAh+vmnOsoIjAzs9ZVdXYBzMy6AwdLM7McHCzNzHJwsDQzy8HB0swshx6dXYDt0WNwv+i16+DOLoa1QY+a2s4ugrXBBtayMWq1Pec44Zj+sWx5Q668s5+vfSgiJmzP9YrSrYNlr10Hc8DNn+nsYlgbDPv7P3d2EawNnopHtvscy5Y38PRDe+XKWz18wbDtvmBBunWwNLOuL4BGGju7GNvNwdLMChUEdZGvGd6VOViaWeFcszQzKyMIGirgtWoHSzMrXCMOlmZmrQqgwcHSzKw81yzNzMoIoM59lmZmrQvCzXAzs7ICGrp/rHSwNLNiZW/wdH8OlmZWMNHAds3F0SU4WJpZobIBHgdLM7NWZc9ZOliamZXV6JqlmVnrXLM0M8shEA0V8AUbB0szK5yb4WZmZQRiY1R3djG2m4OlmRUqeyjdzXAzs7I8wGNmVkaEaAjXLM3Mymp0zdLMrHXZAE/3DzXdv25sZl1a0wBPnqU1kvpIelrSc5LmSroqpf9Q0suS5qRlbEqXpFsk1Uh6XtKhJeeaKGlBWibmuY/uH+7NrMtraJ/nLGuBYyNijaSewAxJv0n7vhIRP2uW/0RgdFoOByYBh0saClwBjCOL5bMlTY2It1u7uGuWZlaopjd48iytniezJm32TEtr0wqfDNyVjnsSGCJpOHACMD0ilqcAOR2YUO4+HCzNrHCNUZVrKUdStaQ5wJtkAe+ptOua1NS+SVLvlDYCeLXk8EUpraX0VjlYmlmhsok0ctcsh0maVbKc845zRTRE
xFhgT2C8pIOAy4F3Ae8FhgKXFnEf7rM0s0IFoi7/645LI2Jc2XNGrJD0GDAhIq5PybWS/hu4JG0vBkaWHLZnSlsMHN0s/XflrumapZkVKgIaoirX0hpJu0gaktb7AscBf0r9kEgScArwQjpkKnBmGhU/AlgZEUuAh4DjJe0kaSfg+JTWKtcszaxgaq+H0ocDkyVVk1X0pkTEA5IelbQLIGAO8PmUfxpwElADrAPOAoiI5ZK+DsxM+a6OiOXlLu5gaWaFCmiX1x0j4nngkK2kH9tC/gDOa2HfncCdbbm+g6WZFc6T/5qZlRHIk/+amZWTfQq3+4ea7n8HZtbFyfNZmpmVE5Dr7ZyuzsHSzArnmqWZWRkRcs3SzKycbIDHX3c0MyvD3+AxMysrG+Bxn6WZWVl+g8fMrAy/wWNmllO5j5F1Bw6WZlaoCKhrdLA0M2tV1gx3sDQzK8tv8FguVW/VMeCm16la0QDAhgmD2fAPOzHw2teoXlwHgNY2EP2rWXHL3vR8di39Jy+F+oAeYu1Zu1B3cD+0rpHBl23+KF3V0jpqjxnE2n/dtVPua0fRs3cjN/yihp69guoewRO/HsKPrt+dS7+zkNEHr6ehTsyf05dvf3UkDfWbg8L+B6/j5vsX8I1z92bGr4d04h10Lj861AaSArgxIi5O25cAAyLiyrR9DnBRyr4KuCgiZnRE2TpCVIu1n9mFhv36oHWNDLlwIXVj+7H60j025el/x1s09suaKo2Dqln1HyNo3LkH1QtrGfS1Rbw9eRTRr4oVt+y96ZghX15I7fsGdPj97GjqasVXTx3FhnXVVPcIbryvhpmPDuTRX+zEtefvBcBlt73CiWcs44G7hgFQVRWc/e9LmP2/Azuz6F1EZTTDO+oOaoGPSxrWfIekjwKfAz4QEe8i+37GTyTt3kFlK1wM7UHDfn2y9X5V1I/sRdWy+pIMQa8Zq6n9UPYPq2FUHxp3zv4fa9irF9oYUNf4jnNWLd6IVjZQf2DfjrmJHZrYsC57Xa9Hz6C6ZxABMx8dRPbZFzH/2X4MG1636YiTP7OUGdMGs2KpG28Ajek7POWWrqyjgmU9cDtw4Vb2XQp8JSKWAkTEM8BkWvh2RndX9UYdPV6qpf6APpvSesxdT+OQahr36LVF/l6/X0P9qD7Q852/qt6Pr6b2AwNBXfsvWKWoqgpumz6fe56fy7OPD2D+s/037avuEXz4E28z67HsP7udd6/j/Seu5IHJO3dWcbuUbDS8OtfSlXVk3fhW4NOSBjdLPxCY3SxtVkrfgqRzmj7AXr9yXQHFLND6RgZ98zXW/usuRL/NfzF6P76ajUdt2VyrXlhL/x8uZc15W/ZJ9n5ic03UitfYKL5w3AF8+rAxHDB2HXsfsH7Tvi9+cxEvPNmfF57OukQ+f9Vi7rhmOFEB/XTtoemh9DxLV9ZhwTIiVgF3AV/azvPcHhHjImJcj8H92qdwHaE+GPTN19hw9CA2vr8kyDUEvf+whtoPvjPwVS2tY9A3XmP1hbvTOPydNc7ql2tRQ2xq2lvHWbuqmud+P4D3HrMagE9f9DqDd67ne1du7n/e/+D1XD5pIZOfmscHP7qSL35zMe+bsLKzitwltEczXFIfSU9Lek7SXElXpfR9JT0lqUbSPZJ6pfTeabsm7d+n5FyXp/T5kk7Icw8d3aFyM/AM8N8lafOAw4BHS9IOA+Z2YLmKFcGAW16nYWQvNpyy0zt29ZyzjoYRvWgc1nNTmtY0MOiqxaydOIz6MVv2Sfb+31XUbqUmasUYPLSe+nqxdlU1vfo0cuhRa5hy665MOGMZ445ezaWfHPWOWuTEI/520/rFN73CUw8P4g8PNm9Q7TjacTS8Fjg2ItZI6gnMkPQbssHhmyLibknfBc4GJqWfb0fEfpJOB64FTpM0BjidrPW6B/CwpP0joqG1i3dosEwfN59CdhNN3+y9DrhW0oSIWCZpLPAvwOEdWbYi9Zi3gT6PraZ+n14M+dJC
ANaeuTN14wZkfY/NmtN9fr2C6iV19Lt7Of3uzr79vvLqEcSQ7NfVe8YaVl0xomNvYgc2dLc6Lvn2K1RVQVUVPH7/YJ56eBDTXnmONxb14ub7FwDwf9MG8+ObKmZcsl21x2h4+g74mrTZMy0BHAuckdInA1eSBcuT0zrAz4DvSFJKvzsiaoGXJdUA44E/tHb9zhiquwE4v2kjIqZKGgH8Pj1itBr4p4hY0gllK0T9gX1Zev/+W9235sIt/3GtP21n1p/W8uDA2z/Yt93KZuW9/GJfzjv+gC3ST9rr4LLH3nDhXkUUqVuJEPX5g+UwSbNKtm+PiNubNiRVk41x7Ec2DvISsCIimh4vWQQ01SRGAK9mZYh6SSuBnVP6kyXXKD2mRR0SLCNiQMn6G0C/Zvsnkf1PYGYVqA3N8KURMa6lnampPFbSEOCXwLvaoXi5dP8nRc2sS2vqs2zP0fCIWAE8BrwPGCKpqeK3J7A4rS8GRgKk/YOBZaXpWzmmRQ6WZla49giWknZJNUok9QWOA14kC5qfSNkmAr9K61PTNmn/o6nfcypwehot3xcYDTxd7h78eoGZFaodJ/8dDkxO/ZZVwJSIeEDSPOBuSf8PeBa4I+W/A/hRGsBZTjYCTkTMTQPN88hemDmv3Eg4OFiaWQdoj1cZI+J54JCtpP+FbDS7efoG4NQWznUNcE1bru9gaWaFioB6T/5rZlZeV3+VMQ8HSzMrlD9YZmaWUyVMKuJgaWaF6+pzVebhYGlmhYpwn6WZWQ6iwaPhZmbluc/SzKwMf93RzCyPyPotuzsHSzMrnEfDzczKCA/wmJnl42a4mVkOHg03MysjwsHSzCwXPzpkZpaD+yzNzMoIRKNHw83MyquAiqWDpZkVzAM8ZmY5VUDVsvt3JJhZlxehXEs5kkZKekzSPElzJV2Q0q+UtFjSnLScVHLM5ZJqJM2XdEJJ+oSUViPpsnLXbrFmKem/aOX/g4j4Utk7M7MdXgCNje3WDK8HLo6IZyQNBGZLmp723RQR15dmljSG7HvhBwJ7AA9L2j/tvhU4DlgEzJQ0NSLmtXTh1prhs7btXszMSgTQTn2WEbEEWJLWV0t6ERjRyiEnA3dHRC3wsqQaNn9jvCZ9cxxJd6e8bQ+WETG5dFtSv4hYl+N+zMzeoQ3PWQ6TVFpRuz0ibt9aRkn7AIcATwFHAudLOpOsondxRLxNFkifLDlsEZuD66vN0g9vrWBl+ywlvU/SPOBPaftgSbeVO87MbJPIucDSiBhXsrQUKAcAPwe+HBGrgEnAKGAsWc3zhva+hTwDPDcDJwDLACLiOeCo9i6ImVWqfIM7eR8vktSTLFD+OCJ+ARARb0REQ0Q0At9nc1N7MTCy5PA9U1pL6S3KNRoeEa82S2rIc5yZGdCWmmWrJAm4A3gxIm4sSR9eku1jwAtpfSpwuqTekvYFRgNPAzOB0ZL2ldSLbBBoamvXzvOc5auS3g9EiugXAC/mOM7MLHsovf1Gw48E/hn4o6Q5Ke3fgE9JGptdjb8CnwOIiLmSppAN3NQD50VEA4Ck84GHgGrgzoiY29qF8wTLzwPfJusUfS2d/Ly23J2Z7ejabTR8Rgsnm9bKMdcA12wlfVprxzVXNlhGxFLg03lPaGa2hR3hDR5JfyPpfklvSXpT0q8k/U1HFM7MKkQ79Vl2pjwDPD8BpgDDyZ6Avxf4aZGFMrMK0vRQep6lC8sTLPtFxI8ioj4t/wP0KbpgZlY5IvItXVlr74YPTau/SS+Z3032f8RptKFT1MyM9hsN7zStDfDMJguOTXf5uZJ9AVxeVKHMrLKoi9ca82jt3fB9O7IgZlahusHgTR65Jv+VdBAwhpK+yoi4q6hCmVkl6fqDN3mUDZaSrgCOJguW04ATgRmAg6WZ5VMBNcs8o+GfAD4MvB4RZwEHA4MLLZWZVZbGnEsXlqcZvj4iGiXVSxoEvMk7Z+swM2tZO07+25nyBMtZkoaQTXs0G1gD/KHQUplZ
Rano0fAmEfGFtPpdSQ8CgyLi+WKLZWYVpZKDpaRDW9sXEc8UUyQzs66ntZpla9OyB3BsO5elzXrU1DLs7//c2cWwNnjotTnlM1mXMf6E9vnsVkU3wyPimI4siJlVqKDiX3c0M2sflVyzNDNrLxXdDDczazcVECzzzJQuSf8k6Wtpey9J48sdZ2a2yQ4yU/ptwPuAT6Xt1cCthZXIzCqKIv9S9lzSSEmPSZonaa6kC1L6UEnTJS1IP3dK6ZJ0i6QaSc+XPhIpaWLKv0DSxHLXzhMsD4+I84ANABHxNtArx3FmZplG5VvKqwcujogxwBHAeZLGAJcBj0TEaOCRtA3ZxD+j03IOMAk2TW5+BXA4MB64oinAtiRPsKyTVE2qJEvahS7/yruZdSXtVbOMiCVNL8RExGrgRbLPdJ8MTE7ZJgOnpPWTgbsi8yQwRNJw4ARgekQsTxXA6cCE1q6dJ1jeAvwS2FXSNWTTs30jx3FmZpn8fZbDJM0qWc5p6ZSS9gEOAZ4CdouIJWnX68BuaX0E8GrJYYtSWkvpLcrzbviPJc0mm6ZNwCkR8WK548zMAMhZa0yWRsS4cpkkDQB+Dnw5IlZJm5vwERFS+z+slGc0fC9gHXA/MBVYm9LMzPJpx9FwST3JAuWPI+IXKfmN1Lwm/XwzpS/mnVNK7pnSWkpvUZ5m+K+BB9LPR4C/AL/JcZyZGQBqzLeUPU9WhbwDeDEibizZNRVoGtGeCPyqJP3MNCp+BLAyNdcfAo6XtFMa2Dk+pbUoTzP83c0KeyjwhRaym5kV6Ujgn4E/SmqaleXfgG8BUySdDSwEPpn2TQNOAmrIWshnAUTEcklfB2amfFdHxPLWLtzmN3gi4hlJh7f1ODPbgbVTD2JEzGDz57mb+/BW8gdwXgvnuhO4M++183yw7KKSzSrgUOC1vBcwsx1c2wZ4uqw8NcuBJev1ZH2XPy+mOGZWkSo9WKaH0QdGxCUdVB4zq0SVHCwl9YiIeklHdmSBzKyyiHwj3V1dazXLp8n6J+dImgrcC6xt2lnyfJOZWct2oD7LPsAysm/uBNl/FAE4WJpZPhUeLHdNI+EvsDlINqmAWzezDlMBEaO1YFkNDGDrzzRVwK2bWUep9Gb4koi4usNKYmaVq8KDZff/dqWZdb6o/NHwLV4dMjPbJpVcsyz3UrmZWV6V3mdpZtY+HCzNzMroBp+5zcPB0swKJdwMNzPLxcHSzCwPB0szsxwcLM3MytiBZh0yM9s+DpZmZuVVwuuOeb4bbma2XRT5lrLnke6U9KakF0rSrpS0WNKctJxUsu9ySTWS5ks6oSR9QkqrkXRZnntwsDSzYkUblvJ+CEzYSvpNETE2LdMAJI0BTgcOTMfcJqk6fVvsVuBEYAzwqZS3VW6Gm1nx2u+74Y9L2idn9pOBuyOiFnhZUg0wPu2riYi/AEi6O+Wd19rJXLM0s0I1vcGTsxk+TNKskuWcnJc5X9LzqZm+U0obAbxakmdRSmspvVWuWZpZ4dSYu2q5NCLGtfH0k4Cvk9Vfvw7cAHymjecoy8HSzIpV8EQaEfFG07qk7wMPpM3FwMiSrHumNFpJb5Gb4WZWuPYaDd/quaXhJZsfI/vIIsBU4HRJvSXtC4wm+8T3TGC0pH0l9SIbBJpa7jquWZpZ8dqpZinpp8DRZH2bi4ArgKMljU1X+SvwOYCImCtpCtnATT1wXkQ0pPOcDzxE9mHGOyNibrlrO1iaWeHa63XHiPjUVpLvaCX/NcA1W0mfBkxry7UdLM2seH7d0cysjB3g645mZtvNM6WbmeUV3T9aOliaWeFcs7Q269m7kRt+UUPPXkF1j+CJXw/hR9fvzj+ctZSPffYt9th3I6cedCCrlme/mk+c+ybHfvxtAKqrYeToDZz27gNZvcK/uiJt3CAu/vh+1G2soqEePvh3KznzK68zZ8YAvn/1HtTVidHvWc9FN7xCdQ9YvaKaGy8ayZKFvenZu5GLb3yVfd61AYAzx4+h
74AGqqqgukfwnQf/3Ml318H8dceWSRLwBHBNRPwmpZ0KnA0cB/yxJPvdEfEtSR8le1WpCugJfDsivldE+TpTXa346qmj2LCumuoewY331TDz0YHMndmPp6aP4rqf17wj/88m7crPJu0KwOHHreTj/7rUgbID9OwdXHfvS/Tt30h9HVx0ymgOO3oV/3nBXlw75SX2HFXL5Ot2Z/qUoUw4Yzl337Ibow5czxV3/pVXFvTm1n/fk2unvLTpfNfdW8PgnRs68Y46lwd4WhARIenzwL2SHkvX+QbZNEnPRcTY0vySegK3A+MjYpGk3sA+RZSt84kN66oB6NEzqO4ZRMBLL/Qre+Qxp6zgd/cNKbqABkjQt3/2L7y+TjTUiepq6Nkr2HNULQCHfmg19/zXbkw4YzmvLOjNJ89/E4C9Rtfyxqu9ePutHuy0S32n3UNXUgnBsrDXHSPiBeB+4FLga8BdEfFSC9kHkgXUZenY2oiYX1TZOltVVXDb9Pnc8/xcnn18APOf7V/2mN59Gxl39GpmTBvcASU0gIYGOPcjB3Daew7ikKNWc8Ah62ioF39+ri8AMx4Ywluv9QRg3zEb+L/0u/nTs/14Y1Evli7J9qHg3z41ivNO2J9p/7Nzp9xLpwqyAZ48SxdWdHvuKuAZYCPQNJNIX0lzSvJ8MyLukTQVWCjpEbIX4X8aEVv8f5SmbDoHoA/la2NdUWOj+MJxB9B/UANX3PEyex+wnoXz+7Z6zBHHrWTurP5ugneg6mqY9PB81qys5qqz92Hh/D5cPumvfPeKEdRtFId9aDVVqbpx2vlvMOk/RnDuRw5g379dz34Hrd+078b7ahg2vI4VS3tw2emjGLnfBt59xNrOu7FO4AGeMiJiraR7gDVpAk6A9c2b4SnvZyW9G/gIcAlZ3+a/bCXf7WRNdgZpaLf+FaxdVc1zvx/Ae49ZXTZYfuhkN8E7y4DBDRz8/jXMfGwgp577Fjfel/Urz/7dQBb9pTcA/Qc2csnN2RSJETDx8DHsvnf2V37Y8DoAhgyr58gJK/nTs/12uGBZCQM8HTHrUGNayoqIP0bETWSB8h8LLVUnGTy0nv6Dso7+Xn0aOfSoNbxa06fVY/oNbOA9R6zl9w8O6ogiGrBiWTVrVmZ9y7XrxTOPD2TkfrWsWJrVLzbWiim37cpH/3kZAGtWVlO3UQD85idDOeiINfQf2MiGdVWsW5P9M9uwrorZ/ztw0yj5jqKNk/92WV2iTSdpADAuIn6XksYCCzuvRMUZulsdl3z7FaqqoKoKHr9/ME89PIiTz36LU899i6G71vHdh+fz9KODuPmSbMq9I09cyezHB1K7vrqTS7/jWP5GT66/YC8aG0VjIxz19ys44rhVfP/qPXjq4UFEI/zdxGWM/cAaAF5Z0Jvrv7wXAvY+YAMX3pDVMt9+qwdXnb0vAA31cMzHVvDeY1Z31m11joi2TP7bZSkK7lSVdCVZM/z6tN3AOx8depBsVpB7gFHAemAtcEFEzGrt3IM0NA7Xh4sothXkodfmlM9kXcb4E15l1nMbtD3nGDhkzzjkqAty5X3i/q/O3oaZ0jtE4TXLiLiy2XZL1aOTWkg3s26uqzex8+gSzXAzq2ABVEAz3MHSzIrX/WOlg6WZFc/NcDOzHCphNNzB0syKVSGzDvlTuGZWqOyh9Mi1lD2XdKekNyW9UJI2VNJ0SQvSz51SuiTdIqlG0vOSDi05ZmLKv0DSxDz34WBpZsVrzLmU90Oy2ctKXQY8EhGjgUfSNsCJZN8KH002n8QkyIIr2Sd0DwfGA1c0BdjWOFiaWeHaq2YZEY8Dy5slnwxMTuuTgVNK0u+KzJPAEEnDgROA6RGxPCLeBqazZQDegvsszaxYbeuzHCap9M2929PkOa3ZLSKWpPXXgd3S+gjg1ZJ8i1JaS+mtcrA0s4K16d3wpdvzumOaeLyQ4SQ3w82seMVO/vtGal6Tfr6Z0hcDI0vy7ZnSWkpvlYOlmRUrss9K
5Fm20VSgaUR7IvCrkvQz06j4EcDK1Fx/CDhe0k5pYOf4lNYqN8PNrHjtNLuZpJ8CR5P1bS4iG9X+FjBF0tlkUzt+MmWfRjZBTw2wDjgrK0osl/R1YGbKd3VENB802oKDpZkVr516ESPiUy3s2mKuxsjmnzyvhfPcCdzZlms7WJpZ4dTY/T/v6GBpZsUK8j5w3qU5WJpZoUS+B867OgdLMyueg6WZWQ4OlmZmZbjP0swsH4+Gm5mVtV2vMnYZDpZmVqzAwdLMLJfu3wp3sDSz4vk5SzOzPBwszczKiICG7t8Od7A0s+K5ZmlmloODpZlZGQHk/wZPl+VgaWYFCwj3WZqZtS7wAI+ZWS7uszQzy8HB0sysnMqYSMPfDTezYgXQ2JhvyUHSXyX9UdIcSbNS2lBJ0yUtSD93SumSdIukGknPSzp0W2/DwdLMiheRb8nvmIgYGxHj0vZlwCMRMRp4JG0DnAiMTss5wKRtvQUHSzMrWHrdMc+y7U4GJqf1ycApJel3ReZJYIik4dtyAQdLMytWQERjrgUYJmlWyXLO1s/IbyXNLtm/W0QsSeuvA7ul9RHAqyXHLkppbeYBHjMrXv43eJaWNK1b8oGIWCxpV2C6pD+V7oyIkNTuI0quWZpZ8dqxzzIiFqefbwK/BMYDbzQ1r9PPN1P2xcDIksP3TGlt5mBpZsWKaLfRcEn9JQ1sWgeOB14ApgITU7aJwK/S+lTgzDQqfgSwsqS53iZuhptZ8drvOcvdgF9Kgix+/SQiHpQ0E5gi6WxgIfDJlH8acBJQA6wDztrWCztYmlnBgmhoaJ8zRfwFOHgr6cuAD28lPYDz2uPaDpZmVixP0WZmlpOnaDMza10A4ZqlmVkZ4cl/zcxyaa8Bns6k6MZTJ0l6i+wxgUozDFja2YWwNqnU39neEbHL9pxA0oNkfz55LI2ICdtzvaJ062BZqSTNyvHKl3Uh/p1VPr/BY2aWg4OlmVkODpZd0+2dXQBrM//OKpz7LM3McnDN0swsBwdLM7McHCw7iaSQdEPJ9iWSrizZPkfSn9LytKQPdEpBd3BpHsQZkk4sSTtV0oOSGtIXBpuWy9L+j0p6VtJzkuZJ+lzn3YG1F/dZdhJJG4AlwHsjYqmkS4ABEXGlpI8CVwEnpH2HAvcB4yPi9U4s9g5J0kHAvcAhZG+9PQtMAJ6LiAHN8vYke1FifEQsktQb2Cci5ndwsa2duWbZeerJRlAv3Mq+S4GvRMRSgIh4huyLde0yL5+1TUS8ANxP9nv5GtnXAl9qIftAsoC6LB1b60BZGfxueOe6FXhe0nXN0g8EZjdLm8XmafOt410FPANsBJre1OkraU5Jnm9GxD2SpgILJT0CPAD8NKICZpLYwTlYdqKIWCXpLuBLwPrOLo+1LCLWSroHWBMRtSl5fUSM3Urez0p6N/AR4BLgOOBfOqywVgg3wzvfzcDZQP+StHnAYc3yHQbM7ahC2VY1pqWsiPhjRNxEFij/sdBSWYdwsOxkEbEcmEIWMJtcB1wraWcASWPJaia3dXgBrU0kDZB0dEnSWCpzZqwdjpvhXcMNwPlNGxExVdII4PfpY/GrgX/a1k94WmGa91k+CFwDfFXS98i6VtbiJnhF8KNDZmY5uBluZpaDg6WZWQ4OlmZmOThYmpnl4GBpZpaDg2UFK5kV5wVJ90rqtx3n+qGkT6T1H0ga00reoyW9fxuu8VdJW3wFsKX0ZnnWtPFaV6bJS8xycbCsbOsjYmxEHET2TvPnS3dK2qbnbCPisxExr5UsRwNtDpZmXZmD5Y7jCWC/VOt7Ik32ME9StaT/lDRT0vNNcy+meRy/I2m+pIeBXZtOJOl3ksal9QmSnklzNz4iaR+yoHxhqtV+UNIukn6erjFT0pHp2J0l/VbSXEk/AFTuJiTdJ2l2OuacZvtuSumPSNolpY1Kc0/OTvf9rvb4w7Qdj9/g2QGkGuSJZG+YABwKHBQRL6eAszIi3pvmXvw/Sb8lm0cR
EugAAAIISURBVLvxAGAMsBvZ++p3NjvvLsD3gaPSuYZGxHJJ3yWbcOL6lO8nwE0RMUPSXsBDwN8CVwAzIuJqSX/HO1/5bMln0jX6AjMl/TwilpG9Wz8rIi6U9LV07vPJpsH7fEQskHQ42Sujx27DH6Pt4BwsK1vp63hPAHeQNY+fjoiXU/rxwHua+iOBwcBo4CiyqcUagNckPbqV8x8BPN50rvSe+9Z8BBgjbao4DpI0IF3j4+nYX0t6O8c9fUnSx9L6yFTWZWQTXNyT0v8H+EW6xvuBe0uu3TvHNcy24GBZ2baYQiwFjbWlScAXI+KhZvlOasdyVAFHRMSGrZQltzRBxUeA90XEOkm/A/q0kD3SdVdsbRo1s7Zyn6U9BJybPoeApP0l9QceB05LfZrDgWO2cuyTwFGS9k3HDk3pq8lmDG/yW+CLTRtpFiXSNc5IaScCO5Up62Dg7RQo30VWs21SBTTVjs8ga96vAl6WdGq6hiQdXOYaZlvlYGk/IOuPfEbSC8D3yFocvwQWpH13AX9ofmBEvAWcQ9bkfY7NzeD7gY81DfCQTW48Lg0gzWPzqPxVZMF2Lllz/JUyZX0Q6CHpReBbZMG6yVpgfLqHY4GrU/qngbNT+eYCJ+f4MzHbgmcdMjPLwTVLM7McHCzNzHJwsDQzy8HB0swsBwdLM7McHCzNzHJwsDQzy+H/A7PDtFoMFDA4AAAAAElFTkSuQmCC\n", 336 | "text/plain": [ 337 | "
" 338 | ] 339 | }, 340 | "metadata": {}, 341 | "output_type": "display_data" 342 | }, 343 | { 344 | "name": "stdout", 345 | "output_type": "stream", 346 | "text": [ 347 | "Accuracy score for SVM: 91.29\n", 348 | "\n", 349 | "\n", 350 | " Model accuracy: 91.29\n", 351 | "\n", 352 | "\n", 353 | " [[2727 324]\n", 354 | " [ 317 3995]]\n" 355 | ] 356 | } 357 | ], 358 | "source": [ 359 | "\n", 360 | "\n", 361 | "import pandas as pd\n", 362 | "from sklearn import metrics\n", 363 | "from sklearn.metrics import confusion_matrix \n", 364 | "from sklearn.metrics import accuracy_score \n", 365 | "from sklearn.metrics import plot_confusion_matrix\n", 366 | "\n", 367 | "\n", 368 | "# to save or to load model\n", 369 | "import joblib\n", 370 | "\n", 371 | "svmout=0\n", 372 | "lrout=0\n", 373 | "dtout=0\n", 374 | "\n", 375 | "## SVM\n", 376 | "def train_svm(X_train, X_test, y_train, y_test):\n", 377 | " \n", 378 | " from sklearn import svm \n", 379 | " svm = svm.SVC(C=1000)\n", 380 | " \n", 381 | " # training svm model\n", 382 | " svm.fit(X_train, y_train)\n", 383 | " \n", 384 | " print(\"\\n\\n----SVM------\")\n", 385 | " y_pred = svm.predict(X_test)\n", 386 | " print(\"Confusion matrix SVM:\\n\", confusion_matrix(y_test, y_pred))\n", 387 | "\n", 388 | " plot_confusion_matrix(svm, X_test, y_test) \n", 389 | " plt.show()\n", 390 | "\n", 391 | " svmout=round((accuracy_score(y_test, y_pred) * 100),2)\n", 392 | " # calculate the accuracy\n", 393 | " print(\"Accuracy score for SVM: \", round((accuracy_score(y_test, y_pred) * 100),2))\n", 394 | "\n", 395 | " return svm, svmout\n", 396 | "\n", 397 | " \n", 398 | " \n", 399 | "## Logistic regression\n", 400 | "def train_logistic_regression(X_train, X_test, y_train, y_test):\n", 401 | " from sklearn.linear_model import LogisticRegression \n", 402 | " # Create an instance of the model. \n", 403 | " logreg = LogisticRegression() \n", 404 | " # Training the model. 
\n", 405 | " logreg.fit(X_train,y_train)\n", 406 | " \n", 407 | " #Do prediction. \n", 408 | " y_pred=logreg.predict(X_test)\n", 409 | " \n", 410 | " print(\"\\n\\n-----------Logistic Regression-----\")\n", 411 | " print(\"Confusion matrix Logistic Regression:\\n\",confusion_matrix(y_test, y_pred))\n", 412 | " \n", 413 | " plot_confusion_matrix(logreg, X_test, y_test) \n", 414 | " plt.show()\n", 415 | "\n", 416 | " lrout=round((accuracy_score(y_test, y_pred) * 100),2)\n", 417 | " # calculate the accuracy \n", 418 | " print(\"Accuracy score for Logistic regression: \", round((accuracy_score(y_test, y_pred) * 100),2))\n", 419 | " return logreg, lrout\n", 420 | "\n", 421 | "\n", 422 | "\n", 423 | "## Decision Tree\n", 424 | "def train_decision_tree(X_train, X_test, y_train, y_test):\n", 425 | " from sklearn.tree import DecisionTreeClassifier\n", 426 | " model = DecisionTreeClassifier()\n", 427 | " model.fit(X_train, y_train)\n", 428 | " \n", 429 | "\n", 430 | " y_pred = model.predict(X_test)\n", 431 | " print(\"\\n\\n--------Decision Tree------------\")\n", 432 | " print(\"Confusion matrix Decision Tree:\\n\",confusion_matrix(y_test, y_pred))\n", 433 | "\n", 434 | " plot_confusion_matrix(model, X_test, y_test) \n", 435 | " plt.show()\n", 436 | "\n", 437 | " dtout= round((accuracy_score(y_test, y_pred) * 100),2)\n", 438 | " print(\"Accuracy score for Decision Tree: \", round((accuracy_score(y_test, y_pred) * 100),2))\n", 439 | " return model, dtout\n", 440 | "\n", 441 | "\n", 442 | "\n", 443 | "\n", 444 | "\n", 445 | "# training ML Model \n", 446 | "# df = pd.read_csv(\"data//preprocessed_input_data.csv\")\n", 447 | "df = pd.read_csv(PREPROCESSED_INPUT_TRAINING_DATA)\n", 448 | "df = df.dropna()\n", 449 | "print(df.head())\n", 450 | "\n", 451 | "# training the vectorizer (convert text data to numeric data)\n", 452 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 453 | "vectorizer = TfidfVectorizer()\n", 454 | "X = 
vectorizer.fit_transform(df['tweet'].values )\n", 455 | "y = df['depressed'].values\n", 456 | "\n", 457 | "#save vectorizer object to vectorize user tweets later\n", 458 | "# joblib.dump(vectorizer, 'vectorizer.pkl')\n", 459 | "joblib.dump(vectorizer, VECTORIZER_FILE)\n", 460 | "\n", 461 | "\n", 462 | "# train test split\n", 463 | "from sklearn.model_selection import train_test_split\n", 464 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)\n", 465 | "\n", 466 | "\n", 467 | "\n", 468 | "# checking accuracy of SVM\n", 469 | "svm_model, svmout = train_svm(X_train, X_test, y_train, y_test)\n", 470 | "\n", 471 | "# checking accuracy of Logistic Regression\n", 472 | "lr_model, lrout = train_logistic_regression(X_train, X_test, y_train, y_test)\n", 473 | "\n", 474 | "# checking accuracy of Decision Tree Algorithm\n", 475 | "dt_model, dtout = train_decision_tree(X_train, X_test, y_train, y_test)\n", 476 | "\n", 477 | "\n", 478 | "\n", 479 | "#PLOTTING\n", 480 | "\n", 481 | "# data = {'SVM':svmout, 'Logistic Regression':lrout, 'Decision Tree':dtout}\n", 482 | "# courses = list(data.keys())\n", 483 | "# values = list(data.values())\n", 484 | "\n", 485 | "# fig = plt.figure(figsize = (10, 5))\n", 486 | "# # creating the bar plot\n", 487 | "# plt.bar(courses, values, color ='maroon',\n", 488 | "# \t\twidth = 0.4)\n", 489 | "# plt.xlabel(\"Tweet\")\n", 490 | "# plt.ylabel(\"No. 
of tweets\")\n", 491 | "# plt.title(\"Depression Analysis\")\n", 492 | "# plt.show()\n", 493 | "\n", 494 | "x = ['SVM', 'Logistic Regression', 'Decision Tree']\n", 495 | "y = [svmout, lrout, dtout]\n", 496 | "color = ['red', 'blue', 'green']\n", 497 | "bars = plt.bar(x, height=y, color=color, width=.5)\n", 498 | "xlocs, xlabs = plt.xticks()\n", 499 | "# reference x so you don't need to change the range each time x changes\n", 500 | "xlocs=[i for i in x]\n", 501 | "xlabs=[i for i in x]\n", 502 | "plt.xlabel('Model')\n", 503 | "plt.ylabel('Accuracy %')\n", 504 | "plt.xticks(xlocs, xlabs)\n", 505 | "plt.title(\"Depression Analysis\")\n", 506 | "\n", 507 | "print(\"\\n\\n\")\n", 508 | "for bar in bars:\n", 509 | " yval = bar.get_height()\n", 510 | " plt.text(bar.get_x(), yval + .5, yval)\n", 511 | "\n", 512 | "plt.figure(figsize=(15, 15))\n", 513 | "plt.show()\n", 514 | "\n", 515 | "print(\"\\n\\n\")\n", 516 | "\n", 517 | "\n", 518 | "# choose the SVM classifier based on high accuracy score\n", 519 | "model, accuracy_final = train_svm(X_train, X_test, y_train, y_test)\n", 520 | "\n", 521 | "\n", 522 | "# Save the model as a pickle in a file at given location \"model.pkl\"\n", 523 | "#joblib.dump(model, 'model.pkl')\n", 524 | "joblib.dump(model, MODEL_FILE)\n", 525 | "\n", 526 | "\n", 527 | "# Load/Read the model from the file at given location \"model.pkl\"\n", 528 | "# classification_model = joblib.load('model.pkl')\n", 529 | "classification_model = joblib.load(MODEL_FILE)\n", 530 | "\n", 531 | "# predicting with the reloaded model on test data\n", 532 | "y_pred=classification_model.predict(X_test)\n", 533 | "\n", 534 | "# calculate the accuracy \n", 535 | "print(\"\\n\\n Model accuracy: \", round((accuracy_score(y_test, y_pred) * 100), 2))\n", 536 | "\n", 537 | "print(\"\\n\\n\", confusion_matrix(y_test, y_pred))\n", 538 | "\n", 539 | "\n", 540 | "\n" 541 | ] 542 | }, 543 | { 544 | "cell_type": "code", 545 | "execution_count": null, 546 | "metadata": { 547 | "id": "RSuxIzzD7qZE" 548 | 
}, 549 | "outputs": [], 550 | "source": [ 551 | "#PREDICT TWEETS\n", 552 | "# download a user's timeline tweets by username\n", 553 | "\n", 554 | "import re \n", 555 | "import numpy as np\n", 556 | "import tweepy \n", 557 | "from tweepy import OAuthHandler \n", 558 | "from textblob import TextBlob \n", 559 | "\n", 560 | "import pandas as pd\n", 561 | "from wordcloud import WordCloud\n", 562 | "from better_profanity import profanity\n", 563 | "import configparser\n", 564 | "\n", 565 | "import joblib\n", 566 | "\n", 567 | "def download_user_tweets():\n", 568 | " # set twitter credentials \n", 569 | " api_key = 'Replace you Twitter API key'\n", 570 | " api_key_secret = 'Replace you Twitter API key secret'\n", 571 | " access_token = 'Replace you Twitter access token'\n", 572 | " access_token_secret = 'Replace you Twitter access token secret'\n", 573 | " \n", 574 | " # Access Twitter Data (login to twitter via api) \n", 575 | " auth = tweepy.OAuthHandler(api_key, api_key_secret)\n", 576 | " auth.set_access_token(access_token, access_token_secret)\n", 577 | " api = tweepy.API(auth)\n", 578 | " \n", 579 | " # read configs\n", 580 | " # config = configparser.ConfigParser()\n", 581 | " # config.read('config.ini') \n", 582 | " # consumer_key = config['twitter']['api_key']\n", 583 | " # consumer_secret = config['twitter']['api_key_secret'] \n", 584 | " # access_token = config['twitter']['access_token']\n", 585 | " # access_token_secret = config['twitter']['access_token_secret'] \n", 586 | " # authentication\n", 587 | " # auth = tweepy.OAuthHandler(api_key, api_key_secret)\n", 588 | " # auth.set_access_token(access_token, access_token_secret) \n", 589 | " # api = tweepy.API(auth)\n", 590 | " \n", 591 | " # user tweets\n", 592 | " user = input(\"Enter Twitter username:\").strip()\n", 593 | " if len(user)<=1:\n", 594 | " user = 'elonmusk'\n", 595 | " limit=50\n", 596 | " \n", 597 | " tweets = tweepy.Cursor(api.user_timeline, screen_name=user, count=200, 
tweet_mode='extended').items(limit)\n", 598 | " \n", 599 | " # tweets = api.user_timeline(screen_name=user, count=limit, tweet_mode='extended')\n", 600 | " \n", 601 | " # create DataFrame\n", 602 | " columns = ['User', 'tweet']\n", 603 | " data = []\n", 604 | " \n", 605 | " for tweet in tweets:\n", 606 | " data.append([tweet.user.screen_name, tweet.full_text])\n", 607 | " \n", 608 | " df = pd.DataFrame(data, columns=columns)\n", 609 | " \n", 610 | "# print(df.head())\n", 611 | "# print(\"\\n\\n\")\n", 612 | " # save user tweets to csv\n", 613 | "# print(\"LOGGER: saving user tweets to : \", USER_TWEET_DATA_FILE)\n", 614 | " df.to_csv(USER_TWEET_DATA_FILE, index=False)\n", 615 | "\n", 616 | " return df\n", 617 | "\n", 618 | "\n", 619 | "def predict_user_tweets(df):\n", 620 | " \n", 621 | " # user tweet preprocessing using NLP : nltk\n", 622 | "\n", 623 | " df['tweet'] = df['tweet'].astype(str)\n", 624 | " #remove leading and ending whitespaces\n", 625 | " df['tweet'] = df['tweet'].str.strip()\n", 626 | "\n", 627 | " # keep only alphabets\n", 628 | " df['tweet'] = df['tweet'].apply(keep_alpha)\n", 629 | "\n", 630 | " # nlp preprocessing to remove stopwords and get base/stem form of each word\n", 631 | " df['tweet'] = df['tweet'].apply(nlp_preprocessing)\n", 632 | " df['tweet'] = df['tweet'].str.strip()\n", 633 | "\n", 634 | " # replace empty rows with NAN and then drop them\n", 635 | " df['tweet'].replace('', np.nan, inplace=True)\n", 636 | " df = df.dropna()\n", 637 | " df = df.reset_index(drop=True)\n", 638 | " \n", 639 | " vectorizer = joblib.load(VECTORIZER_FILE)\n", 640 | " X_test = vectorizer.transform(df['tweet'].values )\n", 641 | " \n", 642 | " # Load/Read the model from the file at given location \"model.pkl\"\n", 643 | " # classification_model = joblib.load('model.pkl')\n", 644 | " classification_model = joblib.load(MODEL_FILE)\n", 645 | " \n", 646 | " # predicting the model on user test data\n", 647 | " y_pred=classification_model.predict(X_test)\n", 648 
| " \n", 649 | "# print(y_pred)\n", 650 | " df['prediction'] = y_pred\n", 651 | " print(df[['tweet', 'prediction']])\n", 652 | " return list(y_pred)\n", 653 | "\n", 654 | "\n", 655 | "def final_output(predictions):\n", 656 | " total = len(predictions)\n", 657 | " depressed_count = predictions.count(\"YES\")\n", 658 | " \n", 659 | " print(\"\\n\\n\")\n", 660 | " \n", 661 | " if depressed_count > (total*.6):\n", 662 | " print(\"Result: DEPRESSED 😒\")\n", 663 | " else:\n", 664 | " print(\"Result: NOT DEPRESSED 😊\")\n", 665 | "\n", 666 | "user_tweets = download_user_tweets()\n", 667 | "predictions = predict_user_tweets(user_tweets)\n", 668 | "final_output(predictions)" 669 | ] 670 | } 671 | ], 672 | "metadata": { 673 | "colab": { 674 | "provenance": [] 675 | }, 676 | "kernelspec": { 677 | "display_name": "Python 3", 678 | "name": "python3" 679 | }, 680 | "language_info": { 681 | "name": "python" 682 | } 683 | }, 684 | "nbformat": 4, 685 | "nbformat_minor": 0 686 | } 687 | --------------------------------------------------------------------------------