├── SentimentAnalysis.ipynb
├── .ipynb_checkpoints
│   ├── SentimentAnalysis-checkpoint.ipynb
│   └── emotions-checkpoint.csv
└── emotions.csv

/SentimentAnalysis.ipynb:
--------------------------------------------------------------------------------
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "from bs4 import BeautifulSoup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "url = 'https://www.goodreads.com/quotes/tag/{}?page={}'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "emotions = ['love', 'religion']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "complete = url.format(emotions[0], 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'https://www.goodreads.com/quotes/tag/love?page=1'"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "complete"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_quotes(complete):\n",
    "    data = requests.get(complete)\n",
    "    soup = BeautifulSoup(data.text, 'html.parser')\n",
    "    divs = soup.find_all('div', attrs={'class' : 'quoteText'})\n",
    "    # first line of each div is the quote; [1:-1] drops the surrounding quote marks\n",
    "    quotes = [div.text.strip().split('\\n')[0][1:-1] for div in divs]\n",
    "    return quotes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "quotes = get_quotes(complete)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processed page 1 for love\n",
      "Processed page 2 for love\n",
      "Processed page 3 for love\n",
      "Processed page 4 for love\n",
      "Processed page 5 for love\n",
      "Processed page 1 for religion\n",
      "Processed page 2 for religion\n",
      "Processed page 3 for religion\n",
      "Processed page 4 for religion\n",
      "Processed page 5 for religion\n"
     ]
    }
   ],
   "source": [
    "X, y = [], []\n",
    "\n",
    "for emotion in emotions:\n",
    "    for i in range(1, 6):\n",
    "        complete = url.format(emotion, i)\n",
    "        quotes = get_quotes(complete)\n",
    "        X.extend(quotes)\n",
    "        y.extend([emotion] * len(quotes))\n",
    "        print(f'Processed page {i} for {emotion}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'The very essence of romance is uncertainty.'"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X[100]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'religion'"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y[200]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.DataFrame(list(zip(y, X)), columns=['emotion', 'quotes'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.to_csv('emotions.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import CountVectorizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "vect = CountVectorizer(max_features=500)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.tokenize import RegexpTokenizer\n",
    "from nltk.stem import PorterStemmer\n",
    "from nltk.corpus import stopwords"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = RegexpTokenizer(r'\\w+')\n",
    "sw = set(stopwords.words('english'))\n",
    "ps = PorterStemmer()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "def getStemmedQuote(quote):\n",
    "    quote = quote.lower()\n",
    "    \n",
    "    # tokenize\n",
    "    tokens = tokenizer.tokenize(quote)\n",
    "    \n",
    "    # remove stopwords\n",
    "    new_tokens = [token for token in tokens if token not in sw]\n",
    "    \n",
    "    # stem each remaining token\n",
    "    stemmed_token = [ps.stem(token) for token in new_tokens]\n",
    "    \n",
    "    clean_quote = ' '.join(stemmed_token)\n",
    "    \n",
    "    return clean_quote\n",
    "\n",
    "def getStemmedQuotes(quotes):\n",
    "    d = []\n",
    "    for quote in quotes:\n",
    "        d.append(getStemmedQuote(quote))\n",
    "    return d"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = getStemmedQuotes(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n",
       "        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',\n",
       "        lowercase=True, max_df=1.0, max_features=500, min_df=1,\n",
       "        ngram_range=(1, 1), preprocessor=None, stop_words=None,\n",
       "        strip_accents=None, token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b',\n",
       "        tokenizer=None, vocabulary=None)"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vect.fit(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "500"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
"len(vect.vocabulary_)" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": 22, 309 | "metadata": {}, 310 | "outputs": [], 311 | "source": [ 312 | "X_mod = vect.transform(X).todense()" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 23, 318 | "metadata": {}, 319 | "outputs": [], 320 | "source": [ 321 | "from sklearn.model_selection import train_test_split" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 24, 327 | "metadata": {}, 328 | "outputs": [], 329 | "source": [ 330 | "X_train, X_test, y_train, y_test = train_test_split(\n", 331 | "... X_mod, y, test_size=0.33, random_state=42)" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": 25, 337 | "metadata": {}, 338 | "outputs": [], 339 | "source": [ 340 | "from sklearn.naive_bayes import BernoulliNB" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": 26, 346 | "metadata": {}, 347 | "outputs": [], 348 | "source": [ 349 | "model = BernoulliNB()" 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": 27, 355 | "metadata": {}, 356 | "outputs": [ 357 | { 358 | "data": { 359 | "text/plain": [ 360 | "BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)" 361 | ] 362 | }, 363 | "execution_count": 27, 364 | "metadata": {}, 365 | "output_type": "execute_result" 366 | } 367 | ], 368 | "source": [ 369 | "model.fit(X_train, y_train)" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": 28, 375 | "metadata": {}, 376 | "outputs": [ 377 | { 378 | "data": { 379 | "text/plain": [ 380 | "0.8080808080808081" 381 | ] 382 | }, 383 | "execution_count": 28, 384 | "metadata": {}, 385 | "output_type": "execute_result" 386 | } 387 | ], 388 | "source": [ 389 | "model.score(X_test, y_test)" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": 29, 395 | "metadata": {}, 396 | "outputs": [], 397 | "source": [ 398 | "line = \"You're just too good to be true can't take my eyes off you you'd be like heaven to touch I wanna hold you so much I love you baby\"" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": 30, 404 | "metadata": {}, 405 | "outputs": [], 406 | "source": [ 407 | "X_vec = vect.transform([line]).todense()" 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": 31, 413 | "metadata": {}, 414 | "outputs": [ 415 | { 416 | "data": { 417 | "text/plain": [ 418 | "array(['love'], dtype=', encoding='utf-8', input='content',\n", 271 | " lowercase=True, max_df=1.0, max_features=500, min_df=1,\n", 272 | " ngram_range=(1, 1), preprocessor=None, stop_words=None,\n", 273 | " strip_accents=None, token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b',\n", 274 | " tokenizer=None, vocabulary=None)" 275 | ] 276 | }, 277 | "execution_count": 20, 278 | "metadata": {}, 279 | "output_type": "execute_result" 280 | } 281 | ], 282 | "source": [ 283 | "vect.fit(X)" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": 21, 289 | "metadata": {}, 290 | "outputs": [ 291 | { 292 | "data": { 293 | "text/plain": [ 294 | "500" 295 | ] 296 | }, 297 | "execution_count": 21, 298 | "metadata": {}, 299 | "output_type": "execute_result" 300 | } 301 | ], 302 | "source": [ 303 | "len(vect.vocabulary_)" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": 22, 309 | "metadata": {}, 310 | "outputs": [], 311 | "source": [ 312 | "X_mod = vect.transform(X).todense()" 313 | ] 314 | }, 315 | { 316 | "cell_type": 
"code", 317 | "execution_count": 23, 318 | "metadata": {}, 319 | "outputs": [], 320 | "source": [ 321 | "from sklearn.model_selection import train_test_split" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 24, 327 | "metadata": {}, 328 | "outputs": [], 329 | "source": [ 330 | "X_train, X_test, y_train, y_test = train_test_split(\n", 331 | "... X_mod, y, test_size=0.33, random_state=42)" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": 25, 337 | "metadata": {}, 338 | "outputs": [], 339 | "source": [ 340 | "from sklearn.naive_bayes import BernoulliNB" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": 26, 346 | "metadata": {}, 347 | "outputs": [], 348 | "source": [ 349 | "model = BernoulliNB()" 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": 27, 355 | "metadata": {}, 356 | "outputs": [ 357 | { 358 | "data": { 359 | "text/plain": [ 360 | "BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)" 361 | ] 362 | }, 363 | "execution_count": 27, 364 | "metadata": {}, 365 | "output_type": "execute_result" 366 | } 367 | ], 368 | "source": [ 369 | "model.fit(X_train, y_train)" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": 28, 375 | "metadata": {}, 376 | "outputs": [ 377 | { 378 | "data": { 379 | "text/plain": [ 380 | "0.8080808080808081" 381 | ] 382 | }, 383 | "execution_count": 28, 384 | "metadata": {}, 385 | "output_type": "execute_result" 386 | } 387 | ], 388 | "source": [ 389 | "model.score(X_test, y_test)" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": 29, 395 | "metadata": {}, 396 | "outputs": [], 397 | "source": [ 398 | "line = \"You're just too good to be true can't take my eyes off you you'd be like heaven to touch I wanna hold you so much I love you baby\"" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": 30, 404 | "metadata": {}, 405 | "outputs": [], 406 | "source": [ 407 | "X_vec = vect.transform([line]).todense()" 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": 31, 413 | "metadata": {}, 414 | "outputs": [ 415 | { 416 | "data": { 417 | "text/plain": [ 418 | "array(['love'], dtype='