├── Equity-Evaluation-Corpus.csv ├── README.md ├── emotion_classification.ipynb ├── text_emotion.csv └── tweets_clean.txt /README.md: -------------------------------------------------------------------------------- 1 | ## Text Emotion Classification 2 | 3 | In this project, we try to classify text according to the emotion it expresses. This is a multi-class sentiment analysis problem. We combine three datasets, namely the [equity evaluation corpus](https://raw.githubusercontent.com/abishekarun/Text-Emotion-Classification/master/Equity-Evaluation-Corpus.csv), [text emotion](https://raw.githubusercontent.com/abishekarun/Text-Emotion-Classification/master/text_emotion.csv) and [cleaned tweets](https://raw.githubusercontent.com/abishekarun/Text-Emotion-Classification/master/tweets_clean.txt), to create the larger dataset used in this project. 4 | 5 | The Jupyter notebook for this project is [here](https://nbviewer.jupyter.org/github/abishekarun/Text-Emotion-Classification/blob/master/emotion_classification.ipynb). 6 | 7 | The resources that helped me are: 8 | 9 | + [Emotion Classification in Microblog Text](https://pdfs.semanticscholar.org/c804/78e361ed8f5fd5400fdbd4f6a6f37a2e4b57.pdf) 10 | + [Emotxt: A Toolkit for Emotion Recognition](https://arxiv.org/ftp/arxiv/papers/1708/1708.03892.pdf) 11 | + [Emotion Detection](https://www.microsoft.com/developerblog/2015/11/29/emotion-detection-and-recognition-from-text-using-deep-learning/) 12 | + [ANN for Emotion Recognition](https://medium.com/data-science-group-iitr/artificial-neural-network-for-text-classification-b7aa5994d985) 13 | -------------------------------------------------------------------------------- /emotion_classification.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import numpy as np\n", 11 | "import re\n", 12 | "\n", 13 | "import nltk\n", 14 | "from nltk.corpus import stopwords\n", 15 | "\n", 16 | "from scipy.stats import itemfreq\n", 17 | "from sklearn.model_selection import train_test_split\n", 18 | "from sklearn.naive_bayes import MultinomialNB\n", 19 | "from sklearn.pipeline import Pipeline\n", 20 | "from sklearn.preprocessing import LabelEncoder\n", 21 | "from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer,HashingVectorizer\n", 22 | "from sklearn.metrics import confusion_matrix\n", 23 | "\n", 24 | "pd.options.mode.chained_assignment = None" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "data1 = pd.read_csv('text_emotion.csv',encoding = \"ISO-8859-1\")" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 3, 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "data": { 43 | "text/html": [ 44 | "<div>
\n", 45 | "\n", 58 | "\n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | "
tweet_idsentimentauthorcontent
01956967341emptyxoshayzers@tiffanylue i know i was listenin to bad habi...
11956967666sadnesswannamamaLayin n bed with a headache ughhhh...waitin o...
21956967696sadnesscoolfunkyFuneral ceremony...gloomy friday...
31956967789enthusiasmczareaquinowants to hang out with friends SOON!
41956968416neutralxkilljoyx@dannycastillo We want to trade with someone w...
51956968477worryxxxPEACHESxxxRe-pinging @ghostridah14: why didn't you go to...
61956968487sadnessShansBeeI should be sleep, but im not! thinking about ...
71956968636worrymcsleazyHmmm. http://www.djhero.com/ is down
81956969035sadnessnic0lepaula@charviray Charlene my love. I miss you
91956969172sadnessIngenue_Em@kelcouch I'm sorry at least it's Friday?
\n", 141 | "
" 142 | ], 143 | "text/plain": [ 144 | " tweet_id sentiment author \\\n", 145 | "0 1956967341 empty xoshayzers \n", 146 | "1 1956967666 sadness wannamama \n", 147 | "2 1956967696 sadness coolfunky \n", 148 | "3 1956967789 enthusiasm czareaquino \n", 149 | "4 1956968416 neutral xkilljoyx \n", 150 | "5 1956968477 worry xxxPEACHESxxx \n", 151 | "6 1956968487 sadness ShansBee \n", 152 | "7 1956968636 worry mcsleazy \n", 153 | "8 1956969035 sadness nic0lepaula \n", 154 | "9 1956969172 sadness Ingenue_Em \n", 155 | "\n", 156 | " content \n", 157 | "0 @tiffanylue i know i was listenin to bad habi... \n", 158 | "1 Layin n bed with a headache ughhhh...waitin o... \n", 159 | "2 Funeral ceremony...gloomy friday... \n", 160 | "3 wants to hang out with friends SOON! \n", 161 | "4 @dannycastillo We want to trade with someone w... \n", 162 | "5 Re-pinging @ghostridah14: why didn't you go to... \n", 163 | "6 I should be sleep, but im not! thinking about ... \n", 164 | "7 Hmmm. http://www.djhero.com/ is down \n", 165 | "8 @charviray Charlene my love. I miss you \n", 166 | "9 @kelcouch I'm sorry at least it's Friday? " 167 | ] 168 | }, 169 | "execution_count": 3, 170 | "metadata": {}, 171 | "output_type": "execute_result" 172 | } 173 | ], 174 | "source": [ 175 | "data1.head(10)" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 4, 181 | "metadata": {}, 182 | "outputs": [ 183 | { 184 | "data": { 185 | "text/plain": [ 186 | "(40000, 4)" 187 | ] 188 | }, 189 | "execution_count": 4, 190 | "metadata": {}, 191 | "output_type": "execute_result" 192 | } 193 | ], 194 | "source": [ 195 | "data1.shape" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 5, 201 | "metadata": { 202 | "scrolled": true 203 | }, 204 | "outputs": [ 205 | { 206 | "name": "stdout", 207 | "output_type": "stream", 208 | "text": [ 209 | "\n", 210 | "RangeIndex: 40000 entries, 0 to 39999\n", 211 | "Data columns (total 4 columns):\n", 212 | "tweet_id 40000 non-null int64\n", 213 | "sentiment 40000 non-null object\n", 214 | "author 40000 non-null object\n", 215 | "content 40000 non-null object\n", 216 | "dtypes: int64(1), object(3)\n", 217 | "memory usage: 1.2+ MB\n" 218 | ] 219 | } 220 | ], 221 | "source": [ 222 | "data1.info()" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 6, 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [ 231 | "data1=data1[['tweet_id','sentiment','content']].copy()" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 7, 237 | "metadata": {}, 238 | "outputs": [ 239 | { 240 | "data": { 241 | "text/plain": [ 242 | "neutral 8638\n", 243 | "worry 8459\n", 244 | "happiness 5209\n", 245 | "sadness 5165\n", 246 | "love 3842\n", 247 | "surprise 2187\n", 248 | "fun 1776\n", 249 | "relief 1526\n", 250 | "hate 1323\n", 251 | "empty 827\n", 252 | "enthusiasm 759\n", 253 | "boredom 179\n", 254 | "anger 110\n", 255 | "Name: sentiment, dtype: int64" 256 | ] 257 | }, 258 | "execution_count": 7, 259 | "metadata": {}, 260 | "output_type": "execute_result" 261 | } 262 | ], 263 | "source": [ 264 | "data1.sentiment.value_counts()" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 8, 270 | "metadata": {}, 271 | "outputs": [], 272 | "source": [ 273 | "data1.sentiment = np.where((data1.sentiment == 'neutral') |(data1.sentiment == 'empty')|(data1.sentiment == 'boredom'),'neutral',data1.sentiment)" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 9, 279 | "metadata": {}, 280 | 
"outputs": [], 281 | "source": [ 282 | "data1.sentiment = np.where((data1.sentiment == 'fun') |(data1.sentiment == 'enthusiasm'),'fun',data1.sentiment)" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": 10, 288 | "metadata": {}, 289 | "outputs": [], 290 | "source": [ 291 | "data1=data1[data1.sentiment !='neutral']" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": 11, 297 | "metadata": {}, 298 | "outputs": [ 299 | { 300 | "data": { 301 | "text/plain": [ 302 | "worry 8459\n", 303 | "happiness 5209\n", 304 | "sadness 5165\n", 305 | "love 3842\n", 306 | "fun 2535\n", 307 | "surprise 2187\n", 308 | "relief 1526\n", 309 | "hate 1323\n", 310 | "anger 110\n", 311 | "Name: sentiment, dtype: int64" 312 | ] 313 | }, 314 | "execution_count": 11, 315 | "metadata": {}, 316 | "output_type": "execute_result" 317 | } 318 | ], 319 | "source": [ 320 | "data1.sentiment.value_counts()" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": 12, 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [ 329 | "data2=pd.read_csv('tweets_clean.txt',sep='\t',header=None)" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": 13, 335 | "metadata": {}, 336 | "outputs": [ 337 | { 338 | "data": { 339 | "text/html": [ 340 | "
\n", 341 | "\n", 354 | "\n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | "
012
0145353048817012736:Thinks that @melbahughes had a great 50th birt...:: surprise
1144279638024257536:Como una expresión tan simple, una sola oració...:: sadness
2140499585285111809:the moment when you get another follower and y...:: joy
3145207578270507009:Be the greatest dancer of your life! practice ...:: joy
4139502146390470656:eww.. my moms starting to make her annual rum ...:: disgust
5146042696899887106:If ur heart hurts all the time for tht person ...:: joy
6145492569609084928:I feel awful, and it's way too freaking early....:: joy
7145903955229151232:So chuffed for safc fans! Bet me dar comes in ...:: joy
8142717613234069504:Making art and viewing art are different at th...:: fear
9144183822873927680:Soooo dooowwwn!! Move on, get some sleep... Me...:: anger
\n", 426 | "
" 427 | ], 428 | "text/plain": [ 429 | " 0 1 \\\n", 430 | "0 145353048817012736: Thinks that @melbahughes had a great 50th birt... \n", 431 | "1 144279638024257536: Como una expresión tan simple, una sola oració... \n", 432 | "2 140499585285111809: the moment when you get another follower and y... \n", 433 | "3 145207578270507009: Be the greatest dancer of your life! practice ... \n", 434 | "4 139502146390470656: eww.. my moms starting to make her annual rum ... \n", 435 | "5 146042696899887106: If ur heart hurts all the time for tht person ... \n", 436 | "6 145492569609084928: I feel awful, and it's way too freaking early.... \n", 437 | "7 145903955229151232: So chuffed for safc fans! Bet me dar comes in ... \n", 438 | "8 142717613234069504: Making art and viewing art are different at th... \n", 439 | "9 144183822873927680: Soooo dooowwwn!! Move on, get some sleep... Me... \n", 440 | "\n", 441 | " 2 \n", 442 | "0 :: surprise \n", 443 | "1 :: sadness \n", 444 | "2 :: joy \n", 445 | "3 :: joy \n", 446 | "4 :: disgust \n", 447 | "5 :: joy \n", 448 | "6 :: joy \n", 449 | "7 :: joy \n", 450 | "8 :: fear \n", 451 | "9 :: anger " 452 | ] 453 | }, 454 | "execution_count": 13, 455 | "metadata": {}, 456 | "output_type": "execute_result" 457 | } 458 | ], 459 | "source": [ 460 | "data2.head(10)" 461 | ] 462 | }, 463 | { 464 | "cell_type": "code", 465 | "execution_count": 14, 466 | "metadata": {}, 467 | "outputs": [], 468 | "source": [ 469 | "data2.columns=['tweet_id','content','sentiment']" 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": 15, 475 | "metadata": {}, 476 | "outputs": [], 477 | "source": [ 478 | "data2.sentiment = data2.sentiment.str.replace(':: ','')" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": 16, 484 | "metadata": {}, 485 | "outputs": [ 486 | { 487 | "data": { 488 | "text/plain": [ 489 | "joy 8240\n", 490 | "surprise 3849\n", 491 | "sadness 3830\n", 492 | "fear 2816\n", 493 | "anger 1555\n", 494 | "disgust 761\n", 495 | "Name: sentiment, dtype: int64" 496 | ] 497 | }, 498 | "execution_count": 16, 499 | "metadata": {}, 500 | "output_type": "execute_result" 501 | } 502 | ], 503 | "source": [ 504 | "data2.sentiment.value_counts()" 505 | ] 506 | }, 507 | { 508 | "cell_type": "code", 509 | "execution_count": 17, 510 | "metadata": {}, 511 | "outputs": [], 512 | "source": [ 513 | "# Emotions to keep\n", 514 | "\n", 515 | "# worry,happpy(happiness,joy),surprise,sadness,love,fear,anger,hate(disgust+hate),relief,fun(fun+enthusiasm)" 516 | ] 517 | }, 518 | { 519 | "cell_type": "code", 520 | "execution_count": 18, 521 | "metadata": {}, 522 | "outputs": [], 523 | "source": [ 524 | "data = data1.append(data2)" 525 | ] 526 | }, 527 | { 528 | "cell_type": "code", 529 | "execution_count": 19, 530 | "metadata": {}, 531 | "outputs": [ 532 | { 533 | "data": { 534 | "text/html": [ 535 | "
\n", 536 | "\n", 549 | "\n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | "
contentsentimenttweet_id
1Layin n bed with a headache ughhhh...waitin o...sadness1956967666
2Funeral ceremony...gloomy friday...sadness1956967696
3wants to hang out with friends SOON!fun1956967789
5Re-pinging @ghostridah14: why didn't you go to...worry1956968477
6I should be sleep, but im not! thinking about ...sadness1956968487
7Hmmm. http://www.djhero.com/ is downworry1956968636
8@charviray Charlene my love. I miss yousadness1956969035
9@kelcouch I'm sorry at least it's Friday?sadness1956969172
11Choked on her retainersworry1956969531
12Ugh! I have to beat this stupid song to get to...sadness1956970047
\n", 621 | "
" 622 | ], 623 | "text/plain": [ 624 | " content sentiment tweet_id\n", 625 | "1 Layin n bed with a headache ughhhh...waitin o... sadness 1956967666\n", 626 | "2 Funeral ceremony...gloomy friday... sadness 1956967696\n", 627 | "3 wants to hang out with friends SOON! fun 1956967789\n", 628 | "5 Re-pinging @ghostridah14: why didn't you go to... worry 1956968477\n", 629 | "6 I should be sleep, but im not! thinking about ... sadness 1956968487\n", 630 | "7 Hmmm. http://www.djhero.com/ is down worry 1956968636\n", 631 | "8 @charviray Charlene my love. I miss you sadness 1956969035\n", 632 | "9 @kelcouch I'm sorry at least it's Friday? sadness 1956969172\n", 633 | "11 Choked on her retainers worry 1956969531\n", 634 | "12 Ugh! I have to beat this stupid song to get to... sadness 1956970047" 635 | ] 636 | }, 637 | "execution_count": 19, 638 | "metadata": {}, 639 | "output_type": "execute_result" 640 | } 641 | ], 642 | "source": [ 643 | "data.head(10)" 644 | ] 645 | }, 646 | { 647 | "cell_type": "code", 648 | "execution_count": 20, 649 | "metadata": {}, 650 | "outputs": [], 651 | "source": [ 652 | "data.sentiment = np.where((data.sentiment == 'disgust') |(data.sentiment == 'hate'),'hate',data.sentiment)" 653 | ] 654 | }, 655 | { 656 | "cell_type": "code", 657 | "execution_count": 21, 658 | "metadata": {}, 659 | "outputs": [ 660 | { 661 | "data": { 662 | "text/plain": [ 663 | "sadness 8995\n", 664 | "worry 8459\n", 665 | "joy 8240\n", 666 | "surprise 6036\n", 667 | "happiness 5209\n", 668 | "love 3842\n", 669 | "fear 2816\n", 670 | "fun 2535\n", 671 | "hate 2084\n", 672 | "anger 1665\n", 673 | "relief 1526\n", 674 | "Name: sentiment, dtype: int64" 675 | ] 676 | }, 677 | "execution_count": 21, 678 | "metadata": {}, 679 | "output_type": "execute_result" 680 | } 681 | ], 682 | "source": [ 683 | "data.sentiment.value_counts()" 684 | ] 685 | }, 686 | { 687 | "cell_type": "code", 688 | "execution_count": 22, 689 | "metadata": {}, 690 | "outputs": [], 691 | "source": [ 692 | "data=data[data.sentiment.isin(['sadness','worry','joy'])]" 693 | ] 694 | }, 695 | { 696 | "cell_type": "code", 697 | "execution_count": 23, 698 | "metadata": {}, 699 | "outputs": [ 700 | { 701 | "data": { 702 | "text/plain": [ 703 | "sadness 8995\n", 704 | "worry 8459\n", 705 | "joy 8240\n", 706 | "Name: sentiment, dtype: int64" 707 | ] 708 | }, 709 | "execution_count": 23, 710 | "metadata": {}, 711 | "output_type": "execute_result" 712 | } 713 | ], 714 | "source": [ 715 | "data.sentiment.value_counts()" 716 | ] 717 | }, 718 | { 719 | "cell_type": "code", 720 | "execution_count": 24, 721 | "metadata": {}, 722 | "outputs": [], 723 | "source": [ 724 | "#data3 = pd.read_csv('Equity-Evaluation-Corpus.csv',sep=',')" 725 | ] 726 | }, 727 | { 728 | "cell_type": "code", 729 | "execution_count": 25, 730 | "metadata": {}, 731 | "outputs": [], 732 | "source": [ 733 | "#data3.Emotion.value_counts()" 734 | ] 735 | }, 736 | { 737 | "cell_type": "markdown", 738 | "metadata": {}, 739 | "source": [ 740 | "## Clean Text" 741 | ] 742 | }, 743 | { 744 | "cell_type": "markdown", 745 | "metadata": {}, 746 | "source": [ 747 | "#### Remove irrelevant characters other than alphanumeric and space" 748 | ] 749 | }, 750 | { 751 | "cell_type": "code", 752 | "execution_count": 26, 753 | "metadata": {}, 754 | "outputs": [], 755 | "source": [ 756 | "data['content']=data['content'].str.replace('[^A-Za-z0-9\\s]+', '')" 757 | ] 758 | }, 759 | { 760 | "cell_type": "markdown", 761 | "metadata": {}, 762 | "source": [ 763 | "#### Remove links from the text" 764 | ] 765 | 
}, 766 | { 767 | "cell_type": "code", 768 | "execution_count": 27, 769 | "metadata": {}, 770 | "outputs": [], 771 | "source": [ 772 | "data['content']=data['content'].str.replace('http\\S+|www.\\S+', '', case=False)" 773 | ] 774 | }, 775 | { 776 | "cell_type": "markdown", 777 | "metadata": {}, 778 | "source": [ 779 | "#### Convert everything to lowercase" 780 | ] 781 | }, 782 | { 783 | "cell_type": "code", 784 | "execution_count": 28, 785 | "metadata": {}, 786 | "outputs": [], 787 | "source": [ 788 | "data['content']=data['content'].str.lower()" 789 | ] 790 | }, 791 | { 792 | "cell_type": "markdown", 793 | "metadata": {}, 794 | "source": [ 795 | "#### Assign Target Variable" 796 | ] 797 | }, 798 | { 799 | "cell_type": "code", 800 | "execution_count": 29, 801 | "metadata": {}, 802 | "outputs": [], 803 | "source": [ 804 | "target=data.sentiment\n", 805 | "data = data.drop(['sentiment'],axis=1)" 806 | ] 807 | }, 808 | { 809 | "cell_type": "code", 810 | "execution_count": 30, 811 | "metadata": {}, 812 | "outputs": [], 813 | "source": [ 814 | "le=LabelEncoder()\n", 815 | "target=le.fit_transform(target)" 816 | ] 817 | }, 818 | { 819 | "cell_type": "markdown", 820 | "metadata": {}, 821 | "source": [ 822 | "### Split Data into train & test" 823 | ] 824 | }, 825 | { 826 | "cell_type": "code", 827 | "execution_count": 31, 828 | "metadata": {}, 829 | "outputs": [], 830 | "source": [ 831 | "X_train, X_test, y_train, y_test = train_test_split(data,target,stratify=target,test_size=0.4, random_state=42)" 832 | ] 833 | }, 834 | { 835 | "cell_type": "markdown", 836 | "metadata": {}, 837 | "source": [ 838 | "##### Check if the split divides the classes uniformly" 839 | ] 840 | }, 841 | { 842 | "cell_type": "code", 843 | "execution_count": 32, 844 | "metadata": {}, 845 | "outputs": [ 846 | { 847 | "data": { 848 | "text/plain": [ 849 | "array([[ 0, 4944],\n", 850 | " [ 1, 5397],\n", 851 | " [ 2, 5075]], dtype=int64)" 852 | ] 853 | }, 854 | "execution_count": 32, 855 | "metadata": {}, 856 | "output_type": "execute_result" 857 | } 858 | ], 859 | "source": [ 860 | "itemfreq(y_train)" 861 | ] 862 | }, 863 | { 864 | "cell_type": "code", 865 | "execution_count": 33, 866 | "metadata": {}, 867 | "outputs": [ 868 | { 869 | "data": { 870 | "text/plain": [ 871 | "array([[ 0, 3296],\n", 872 | " [ 1, 3598],\n", 873 | " [ 2, 3384]], dtype=int64)" 874 | ] 875 | }, 876 | "execution_count": 33, 877 | "metadata": {}, 878 | "output_type": "execute_result" 879 | } 880 | ], 881 | "source": [ 882 | "itemfreq(y_test)" 883 | ] 884 | }, 885 | { 886 | "cell_type": "markdown", 887 | "metadata": {}, 888 | "source": [ 889 | "### Tokenization \n", 890 | "\n", 891 | "Tokenization can be done in a variety of ways, namely **Bag of words, tf-idf, Glove, word2vec ,fasttext **etc. 
Let's see how they can be applied and how they affect the accuracy" 892 | ] 893 | }, 894 | { 895 | "cell_type": "markdown", 896 | "metadata": {}, 897 | "source": [ 898 | "#### Bag of Words " 899 | ] 900 | }, 901 | { 902 | "cell_type": "code", 903 | "execution_count": 34, 904 | "metadata": {}, 905 | "outputs": [ 906 | { 907 | "name": "stdout", 908 | "output_type": "stream", 909 | "text": [ 910 | "Shape of Term Frequency Matrix: (15416, 25747)\n" 911 | ] 912 | } 913 | ], 914 | "source": [ 915 | "# Extracting features from text files\n", 916 | "count_vect = CountVectorizer()\n", 917 | "X_train_counts = count_vect.fit_transform(X_train.content)\n", 918 | "X_test_counts = count_vect.transform(X_test.content)\n", 919 | "print('Shape of Term Frequency Matrix: ',X_train_counts.shape)" 920 | ] 921 | }, 922 | { 923 | "cell_type": "markdown", 924 | "metadata": {}, 925 | "source": [ 926 | "#### Naive Bayes Model" 927 | ] 928 | }, 929 | { 930 | "cell_type": "code", 931 | "execution_count": 35, 932 | "metadata": {}, 933 | "outputs": [ 934 | { 935 | "name": "stdout", 936 | "output_type": "stream", 937 | "text": [ 938 | "59.2625024324\n" 939 | ] 940 | } 941 | ], 942 | "source": [ 943 | "# Machine Learning\n", 944 | "# Training a Naive Bayes (NB) classifier on the training data.\n", 945 | "clf = MultinomialNB().fit(X_train_counts,y_train)\n", 946 | "predicted = clf.predict(X_test_counts)\n", 947 | "nb_clf_accuracy = np.mean(predicted == y_test) * 100\n", 948 | "print(nb_clf_accuracy)" 949 | ] 950 | }, 951 | { 952 | "cell_type": "markdown", 953 | "metadata": {}, 954 | "source": [ 955 | "#### The same thing can be done using a Pipeline\n", 956 | "\n", 957 | "Let's take a look at how it can be done.
\n", 958 | "First, let's define a function for printing the accuracy" 959 | ] 960 | }, 961 | { 962 | "cell_type": "code", 963 | "execution_count": 36, 964 | "metadata": {}, 965 | "outputs": [], 966 | "source": [ 967 | "def print_acc(model):\n", 968 | "    predicted = model.predict(X_test.content)\n", 969 | "    accuracy = np.mean(predicted == y_test) * 100\n", 970 | "    print(accuracy)" 971 | ] 972 | }, 973 | { 974 | "cell_type": "code", 975 | "execution_count": 37, 976 | "metadata": {}, 977 | "outputs": [ 978 | { 979 | "name": "stdout", 980 | "output_type": "stream", 981 | "text": [ 982 | "59.2625024324\n" 983 | ] 984 | } 985 | ], 986 | "source": [ 987 | "nb_clf = Pipeline([('vect', CountVectorizer()), ('clf', MultinomialNB())])\n", 988 | "nb_clf = nb_clf.fit(X_train.content,y_train)\n", 989 | "print_acc(nb_clf)" 990 | ] 991 | }, 992 | { 993 | "cell_type": "markdown", 994 | "metadata": {}, 995 | "source": [ 996 | "#### TF-IDF Transformer" 997 | ] 998 | }, 999 | { 1000 | "cell_type": "code", 1001 | "execution_count": 38, 1002 | "metadata": {}, 1003 | "outputs": [ 1004 | { 1005 | "name": "stdout", 1006 | "output_type": "stream", 1007 | "text": [ 1008 | "58.5425179996\n" 1009 | ] 1010 | } 1011 | ], 1012 | "source": [ 1013 | "nb_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])\n", 1014 | "nb_clf = nb_clf.fit(X_train.content,y_train)\n", 1015 | "print_acc(nb_clf)" 1016 | ] 1017 | }, 1018 | { 1019 | "cell_type": "markdown", 1020 | "metadata": {}, 1021 | "source": [ 1022 | "#### Hash Vectorizer\n", 1023 | "\n", 1024 | "Note: Naive Bayes requires non-negative input. Therefore, `alternate_sign` must be set to `False` in the HashingVectorizer for it to work with the Naive Bayes algorithm." 1025 | ] 1026 | }, 1027 | { 1028 | "cell_type": "code", 1029 | "execution_count": 39, 1030 | "metadata": {}, 1031 | "outputs": [ 1032 | { 1033 | "name": "stdout", 1034 | "output_type": "stream", 1035 | "text": [ 1036 | "53.765323993\n" 1037 | ] 1038 | } 1039 | ], 1040 | "source": [ 1041 | "nb_clf = Pipeline([('vect', HashingVectorizer(n_features=2500,alternate_sign=False)), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])\n", 1042 | "nb_clf = nb_clf.fit(X_train.content,y_train)\n", 1043 | "print_acc(nb_clf)" 1044 | ] 1045 | }, 1046 | { 1047 | "cell_type": "code", 1048 | "execution_count": 40, 1049 | "metadata": { 1050 | "scrolled": true 1051 | }, 1052 | "outputs": [ 1053 | { 1054 | "data": { 1055 | "text/plain": [ 1056 | "array([[2430, 517, 349],\n", 1057 | " [ 551, 1989, 1058],\n", 1058 | " [ 434, 1278, 1672]], dtype=int64)" 1059 | ] 1060 | }, 1061 | "execution_count": 40, 1062 | "metadata": {}, 1063 | "output_type": "execute_result" 1064 | } 1065 | ], 1066 | "source": [ 1067 | "confusion_matrix(y_test,predicted)" 1068 | ] 1069 | }, 1070 | { 1071 | "cell_type": "markdown", 1072 | "metadata": {}, 1073 | "source": [ 1074 | "#### Remove Stop Words" 1075 | ] 1076 | }, 1077 | { 1078 | "cell_type": "code", 1079 | "execution_count": 41, 1080 | "metadata": {}, 1081 | "outputs": [ 1082 | { 1083 | "name": "stdout", 1084 | "output_type": "stream", 1085 | "text": [ 1086 | "58.23117338\n" 1087 | ] 1088 | } 1089 | ], 1090 | "source": [ 1091 | "stop_words = set(stopwords.words('english'))\n", 1092 | "nb_clf = Pipeline([('vect', CountVectorizer(stop_words=stop_words)), ('clf', MultinomialNB())])\n", 1093 | "nb_clf = nb_clf.fit(X_train.content,y_train)\n", 1094 | "print_acc(nb_clf)" 1095 | ] 1096 | }, 1097 | { 1098 | "cell_type": "code", 1099 | "execution_count": 42, 1100 |
"metadata": {}, 1101 | "outputs": [ 1102 | { 1103 | "name": "stdout", 1104 | "output_type": "stream", 1105 | "text": [ 1106 | "57.6084841409\n" 1107 | ] 1108 | } 1109 | ], 1110 | "source": [ 1111 | "nb_clf = Pipeline([('vect', CountVectorizer(stop_words=stop_words)), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])\n", 1112 | "nb_clf = nb_clf.fit(X_train.content,y_train)\n", 1113 | "print_acc(nb_clf)" 1114 | ] 1115 | }, 1116 | { 1117 | "cell_type": "markdown", 1118 | "metadata": {}, 1119 | "source": [ 1120 | "#### Lemmatization" 1121 | ] 1122 | }, 1123 | { 1124 | "cell_type": "code", 1125 | "execution_count": 43, 1126 | "metadata": {}, 1127 | "outputs": [ 1128 | { 1129 | "name": "stderr", 1130 | "output_type": "stream", 1131 | "text": [ 1132 | "c:\\program files\\python35\\lib\\site-packages\\pandas\\core\\indexing.py:601: SettingWithCopyWarning: \n", 1133 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 1134 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 1135 | "\n", 1136 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 1137 | " self.obj[item_labels[indexer[info_axis]]] = value\n" 1138 | ] 1139 | } 1140 | ], 1141 | "source": [ 1142 | "w_tokenizer = nltk.tokenize.WhitespaceTokenizer()\n", 1143 | "lemmatizer = nltk.stem.WordNetLemmatizer()\n", 1144 | "\n", 1145 | "def lemmatize_text(text):\n", 1146 | " return ' '.join([lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)])\n", 1147 | "X_train.loc[:,'content'] = X_train['content'].apply(lemmatize_text)\n", 1148 | "X_test.loc[:,'content'] = X_test['content'].apply(lemmatize_text)" 1149 | ] 1150 | }, 1151 | { 1152 | "cell_type": "code", 1153 | "execution_count": 44, 1154 | "metadata": { 1155 | "scrolled": true 1156 | }, 1157 | "outputs": [ 1158 | { 1159 | "name": "stdout", 1160 | "output_type": "stream", 1161 | "text": [ 1162 | "57.4138937536\n" 1163 | ] 1164 | } 1165 | ], 1166 | "source": [ 1167 | "nb_clf = Pipeline([('vect', CountVectorizer(stop_words=stop_words)), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])\n", 1168 | "nb_clf = nb_clf.fit(X_train.content,y_train)\n", 1169 | "print_acc(nb_clf)" 1170 | ] 1171 | }, 1172 | { 1173 | "cell_type": "code", 1174 | "execution_count": 45, 1175 | "metadata": {}, 1176 | "outputs": [], 1177 | "source": [ 1178 | "#### Spell Correction with Flashtext" 1179 | ] 1180 | }, 1181 | { 1182 | "cell_type": "markdown", 1183 | "metadata": {}, 1184 | "source": [ 1185 | "#### Do the same pipeline with NLTK, spacy and pytorch" 1186 | ] 1187 | } 1188 | ], 1189 | "metadata": { 1190 | "kernelspec": { 1191 | "display_name": "Python 3", 1192 | "language": "python", 1193 | "name": "python3" 1194 | }, 1195 | "language_info": { 1196 | "codemirror_mode": { 1197 | "name": "ipython", 1198 | "version": 3 1199 | }, 1200 | "file_extension": ".py", 1201 | "mimetype": "text/x-python", 1202 | "name": "python", 1203 | "nbconvert_exporter": "python", 1204 | "pygments_lexer": "ipython3", 1205 | "version": "3.5.2" 1206 | } 1207 | }, 1208 | "nbformat": 4, 1209 | "nbformat_minor": 2 1210 | } 1211 | --------------------------------------------------------------------------------