├── Equity-Evaluation-Corpus.csv
├── README.md
├── emotion_classification.ipynb
├── text_emotion.csv
└── tweets_clean.txt
/README.md:
--------------------------------------------------------------------------------
1 | ## Text Emotion Classification
2 |
3 | In this project, we try to classify text according to the emotion it expresses. This is a multi-class sentiment analysis problem. We combine three different datasets, namely the [Equity Evaluation Corpus](https://raw.githubusercontent.com/abishekarun/Text-Emotion-Classification/master/Equity-Evaluation-Corpus.csv), [text emotion](https://raw.githubusercontent.com/abishekarun/Text-Emotion-Classification/master/text_emotion.csv) and [cleaned tweets](https://raw.githubusercontent.com/abishekarun/Text-Emotion-Classification/master/tweets_clean.txt), to create the larger dataset used in this problem.
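
A minimal sketch, assuming the column layout shown in the notebook, of how the tweet datasets can be loaded and stacked with pandas (`pd.concat` is used here in place of the older `DataFrame.append`):

```python
import pandas as pd

# text_emotion.csv: tweet_id, sentiment, author, content
data1 = pd.read_csv('text_emotion.csv', encoding='ISO-8859-1')

# tweets_clean.txt: tab-separated, no header; labels look like ":: joy"
data2 = pd.read_csv('tweets_clean.txt', sep='\t', header=None,
                    names=['tweet_id', 'content', 'sentiment'])
data2['sentiment'] = data2['sentiment'].str.replace(':: ', '', regex=False)

# Stack the two tweet datasets into one frame
data = pd.concat([data1[['tweet_id', 'sentiment', 'content']], data2],
                 ignore_index=True, sort=False)

# The Equity Evaluation Corpus is a regular CSV:
# pd.read_csv('Equity-Evaluation-Corpus.csv')
```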
4 |
5 | The Jupyter Notebook for this project is [here](https://nbviewer.jupyter.org/github/abishekarun/Text-Emotion-Classification/blob/master/emotion_classification.ipynb).
6 |
7 | The resources that helped me are:
8 |
9 | + [Emotion Classification in Microblog Text](https://pdfs.semanticscholar.org/c804/78e361ed8f5fd5400fdbd4f6a6f37a2e4b57.pdf)
10 | + [Emotxt: A Toolkit for Emotion Recognition](https://arxiv.org/ftp/arxiv/papers/1708/1708.03892.pdf)
11 | + [Emotion Detection](https://www.microsoft.com/developerblog/2015/11/29/emotion-detection-and-recognition-from-text-using-deep-learning/)
12 | + [ANN for Emotion Recognition](https://medium.com/data-science-group-iitr/artificial-neural-network-for-text-classification-b7aa5994d985)
13 |
--------------------------------------------------------------------------------
/emotion_classification.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd\n",
10 | "import numpy as np\n",
11 | "import re\n",
12 | "\n",
13 | "import nltk\n",
14 | "from nltk.corpus import stopwords\n",
15 | "\n",
16 | "from scipy.stats import itemfreq\n",
17 | "from sklearn.model_selection import train_test_split\n",
18 | "from sklearn.naive_bayes import MultinomialNB\n",
19 | "from sklearn.pipeline import Pipeline\n",
20 | "from sklearn.preprocessing import LabelEncoder\n",
21 | "from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer,HashingVectorizer\n",
22 | "from sklearn.metrics import confusion_matrix\n",
23 | "\n",
24 | "pd.options.mode.chained_assignment = None"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": 2,
30 | "metadata": {},
31 | "outputs": [],
32 | "source": [
33 | "data1 = pd.read_csv('text_emotion.csv',encoding = \"ISO-8859-1\")"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 3,
39 | "metadata": {},
40 | "outputs": [
41 | {
42 | "data": {
43 | "text/html": [
44 | "
\n",
45 | "\n",
58 | "
\n",
59 | " \n",
60 | " \n",
61 | " | \n",
62 | " tweet_id | \n",
63 | " sentiment | \n",
64 | " author | \n",
65 | " content | \n",
66 | "
\n",
67 | " \n",
68 | " \n",
69 | " \n",
70 | " 0 | \n",
71 | " 1956967341 | \n",
72 | " empty | \n",
73 | " xoshayzers | \n",
74 | " @tiffanylue i know i was listenin to bad habi... | \n",
75 | "
\n",
76 | " \n",
77 | " 1 | \n",
78 | " 1956967666 | \n",
79 | " sadness | \n",
80 | " wannamama | \n",
81 | " Layin n bed with a headache ughhhh...waitin o... | \n",
82 | "
\n",
83 | " \n",
84 | " 2 | \n",
85 | " 1956967696 | \n",
86 | " sadness | \n",
87 | " coolfunky | \n",
88 | " Funeral ceremony...gloomy friday... | \n",
89 | "
\n",
90 | " \n",
91 | " 3 | \n",
92 | " 1956967789 | \n",
93 | " enthusiasm | \n",
94 | " czareaquino | \n",
95 | " wants to hang out with friends SOON! | \n",
96 | "
\n",
97 | " \n",
98 | " 4 | \n",
99 | " 1956968416 | \n",
100 | " neutral | \n",
101 | " xkilljoyx | \n",
102 | " @dannycastillo We want to trade with someone w... | \n",
103 | "
\n",
104 | " \n",
105 | " 5 | \n",
106 | " 1956968477 | \n",
107 | " worry | \n",
108 | " xxxPEACHESxxx | \n",
109 | " Re-pinging @ghostridah14: why didn't you go to... | \n",
110 | "
\n",
111 | " \n",
112 | " 6 | \n",
113 | " 1956968487 | \n",
114 | " sadness | \n",
115 | " ShansBee | \n",
116 | " I should be sleep, but im not! thinking about ... | \n",
117 | "
\n",
118 | " \n",
119 | " 7 | \n",
120 | " 1956968636 | \n",
121 | " worry | \n",
122 | " mcsleazy | \n",
123 | " Hmmm. http://www.djhero.com/ is down | \n",
124 | "
\n",
125 | " \n",
126 | " 8 | \n",
127 | " 1956969035 | \n",
128 | " sadness | \n",
129 | " nic0lepaula | \n",
130 | " @charviray Charlene my love. I miss you | \n",
131 | "
\n",
132 | " \n",
133 | " 9 | \n",
134 | " 1956969172 | \n",
135 | " sadness | \n",
136 | " Ingenue_Em | \n",
137 | " @kelcouch I'm sorry at least it's Friday? | \n",
138 | "
\n",
139 | " \n",
140 | "
\n",
141 | "
"
142 | ],
143 | "text/plain": [
144 | " tweet_id sentiment author \\\n",
145 | "0 1956967341 empty xoshayzers \n",
146 | "1 1956967666 sadness wannamama \n",
147 | "2 1956967696 sadness coolfunky \n",
148 | "3 1956967789 enthusiasm czareaquino \n",
149 | "4 1956968416 neutral xkilljoyx \n",
150 | "5 1956968477 worry xxxPEACHESxxx \n",
151 | "6 1956968487 sadness ShansBee \n",
152 | "7 1956968636 worry mcsleazy \n",
153 | "8 1956969035 sadness nic0lepaula \n",
154 | "9 1956969172 sadness Ingenue_Em \n",
155 | "\n",
156 | " content \n",
157 | "0 @tiffanylue i know i was listenin to bad habi... \n",
158 | "1 Layin n bed with a headache ughhhh...waitin o... \n",
159 | "2 Funeral ceremony...gloomy friday... \n",
160 | "3 wants to hang out with friends SOON! \n",
161 | "4 @dannycastillo We want to trade with someone w... \n",
162 | "5 Re-pinging @ghostridah14: why didn't you go to... \n",
163 | "6 I should be sleep, but im not! thinking about ... \n",
164 | "7 Hmmm. http://www.djhero.com/ is down \n",
165 | "8 @charviray Charlene my love. I miss you \n",
166 | "9 @kelcouch I'm sorry at least it's Friday? "
167 | ]
168 | },
169 | "execution_count": 3,
170 | "metadata": {},
171 | "output_type": "execute_result"
172 | }
173 | ],
174 | "source": [
175 | "data1.head(10)"
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": 4,
181 | "metadata": {},
182 | "outputs": [
183 | {
184 | "data": {
185 | "text/plain": [
186 | "(40000, 4)"
187 | ]
188 | },
189 | "execution_count": 4,
190 | "metadata": {},
191 | "output_type": "execute_result"
192 | }
193 | ],
194 | "source": [
195 | "data1.shape"
196 | ]
197 | },
198 | {
199 | "cell_type": "code",
200 | "execution_count": 5,
201 | "metadata": {
202 | "scrolled": true
203 | },
204 | "outputs": [
205 | {
206 | "name": "stdout",
207 | "output_type": "stream",
208 | "text": [
209 | "\n",
210 | "RangeIndex: 40000 entries, 0 to 39999\n",
211 | "Data columns (total 4 columns):\n",
212 | "tweet_id 40000 non-null int64\n",
213 | "sentiment 40000 non-null object\n",
214 | "author 40000 non-null object\n",
215 | "content 40000 non-null object\n",
216 | "dtypes: int64(1), object(3)\n",
217 | "memory usage: 1.2+ MB\n"
218 | ]
219 | }
220 | ],
221 | "source": [
222 | "data1.info()"
223 | ]
224 | },
225 | {
226 | "cell_type": "code",
227 | "execution_count": 6,
228 | "metadata": {},
229 | "outputs": [],
230 | "source": [
231 | "data1=data1[['tweet_id','sentiment','content']].copy()"
232 | ]
233 | },
234 | {
235 | "cell_type": "code",
236 | "execution_count": 7,
237 | "metadata": {},
238 | "outputs": [
239 | {
240 | "data": {
241 | "text/plain": [
242 | "neutral 8638\n",
243 | "worry 8459\n",
244 | "happiness 5209\n",
245 | "sadness 5165\n",
246 | "love 3842\n",
247 | "surprise 2187\n",
248 | "fun 1776\n",
249 | "relief 1526\n",
250 | "hate 1323\n",
251 | "empty 827\n",
252 | "enthusiasm 759\n",
253 | "boredom 179\n",
254 | "anger 110\n",
255 | "Name: sentiment, dtype: int64"
256 | ]
257 | },
258 | "execution_count": 7,
259 | "metadata": {},
260 | "output_type": "execute_result"
261 | }
262 | ],
263 | "source": [
264 | "data1.sentiment.value_counts()"
265 | ]
266 | },
267 | {
268 | "cell_type": "code",
269 | "execution_count": 8,
270 | "metadata": {},
271 | "outputs": [],
272 | "source": [
273 | "data1.sentiment = np.where((data1.sentiment == 'neutral') |(data1.sentiment == 'empty')|(data1.sentiment == 'boredom'),'neutral',data1.sentiment)"
274 | ]
275 | },
276 | {
277 | "cell_type": "code",
278 | "execution_count": 9,
279 | "metadata": {},
280 | "outputs": [],
281 | "source": [
282 | "data1.sentiment = np.where((data1.sentiment == 'fun') |(data1.sentiment == 'enthusiasm'),'fun',data1.sentiment)"
283 | ]
284 | },
285 | {
286 | "cell_type": "code",
287 | "execution_count": 10,
288 | "metadata": {},
289 | "outputs": [],
290 | "source": [
291 | "data1=data1[data1.sentiment !='neutral']"
292 | ]
293 | },
294 | {
295 | "cell_type": "code",
296 | "execution_count": 11,
297 | "metadata": {},
298 | "outputs": [
299 | {
300 | "data": {
301 | "text/plain": [
302 | "worry 8459\n",
303 | "happiness 5209\n",
304 | "sadness 5165\n",
305 | "love 3842\n",
306 | "fun 2535\n",
307 | "surprise 2187\n",
308 | "relief 1526\n",
309 | "hate 1323\n",
310 | "anger 110\n",
311 | "Name: sentiment, dtype: int64"
312 | ]
313 | },
314 | "execution_count": 11,
315 | "metadata": {},
316 | "output_type": "execute_result"
317 | }
318 | ],
319 | "source": [
320 | "data1.sentiment.value_counts()"
321 | ]
322 | },
323 | {
324 | "cell_type": "code",
325 | "execution_count": 12,
326 | "metadata": {},
327 | "outputs": [],
328 | "source": [
329 | "data2=pd.read_csv('tweets_clean.txt',sep='\t',header=None)"
330 | ]
331 | },
332 | {
333 | "cell_type": "code",
334 | "execution_count": 13,
335 | "metadata": {},
336 | "outputs": [
337 | {
338 | "data": {
339 | "text/html": [
340 | "\n",
341 | "\n",
354 | "
\n",
355 | " \n",
356 | " \n",
357 | " | \n",
358 | " 0 | \n",
359 | " 1 | \n",
360 | " 2 | \n",
361 | "
\n",
362 | " \n",
363 | " \n",
364 | " \n",
365 | " 0 | \n",
366 | " 145353048817012736: | \n",
367 | " Thinks that @melbahughes had a great 50th birt... | \n",
368 | " :: surprise | \n",
369 | "
\n",
370 | " \n",
371 | " 1 | \n",
372 | " 144279638024257536: | \n",
373 | " Como una expresión tan simple, una sola oració... | \n",
374 | " :: sadness | \n",
375 | "
\n",
376 | " \n",
377 | " 2 | \n",
378 | " 140499585285111809: | \n",
379 | " the moment when you get another follower and y... | \n",
380 | " :: joy | \n",
381 | "
\n",
382 | " \n",
383 | " 3 | \n",
384 | " 145207578270507009: | \n",
385 | " Be the greatest dancer of your life! practice ... | \n",
386 | " :: joy | \n",
387 | "
\n",
388 | " \n",
389 | " 4 | \n",
390 | " 139502146390470656: | \n",
391 | " eww.. my moms starting to make her annual rum ... | \n",
392 | " :: disgust | \n",
393 | "
\n",
394 | " \n",
395 | " 5 | \n",
396 | " 146042696899887106: | \n",
397 | " If ur heart hurts all the time for tht person ... | \n",
398 | " :: joy | \n",
399 | "
\n",
400 | " \n",
401 | " 6 | \n",
402 | " 145492569609084928: | \n",
403 | " I feel awful, and it's way too freaking early.... | \n",
404 | " :: joy | \n",
405 | "
\n",
406 | " \n",
407 | " 7 | \n",
408 | " 145903955229151232: | \n",
409 | " So chuffed for safc fans! Bet me dar comes in ... | \n",
410 | " :: joy | \n",
411 | "
\n",
412 | " \n",
413 | " 8 | \n",
414 | " 142717613234069504: | \n",
415 | " Making art and viewing art are different at th... | \n",
416 | " :: fear | \n",
417 | "
\n",
418 | " \n",
419 | " 9 | \n",
420 | " 144183822873927680: | \n",
421 | " Soooo dooowwwn!! Move on, get some sleep... Me... | \n",
422 | " :: anger | \n",
423 | "
\n",
424 | " \n",
425 | "
\n",
426 | "
"
427 | ],
428 | "text/plain": [
429 | " 0 1 \\\n",
430 | "0 145353048817012736: Thinks that @melbahughes had a great 50th birt... \n",
431 | "1 144279638024257536: Como una expresión tan simple, una sola oració... \n",
432 | "2 140499585285111809: the moment when you get another follower and y... \n",
433 | "3 145207578270507009: Be the greatest dancer of your life! practice ... \n",
434 | "4 139502146390470656: eww.. my moms starting to make her annual rum ... \n",
435 | "5 146042696899887106: If ur heart hurts all the time for tht person ... \n",
436 | "6 145492569609084928: I feel awful, and it's way too freaking early.... \n",
437 | "7 145903955229151232: So chuffed for safc fans! Bet me dar comes in ... \n",
438 | "8 142717613234069504: Making art and viewing art are different at th... \n",
439 | "9 144183822873927680: Soooo dooowwwn!! Move on, get some sleep... Me... \n",
440 | "\n",
441 | " 2 \n",
442 | "0 :: surprise \n",
443 | "1 :: sadness \n",
444 | "2 :: joy \n",
445 | "3 :: joy \n",
446 | "4 :: disgust \n",
447 | "5 :: joy \n",
448 | "6 :: joy \n",
449 | "7 :: joy \n",
450 | "8 :: fear \n",
451 | "9 :: anger "
452 | ]
453 | },
454 | "execution_count": 13,
455 | "metadata": {},
456 | "output_type": "execute_result"
457 | }
458 | ],
459 | "source": [
460 | "data2.head(10)"
461 | ]
462 | },
463 | {
464 | "cell_type": "code",
465 | "execution_count": 14,
466 | "metadata": {},
467 | "outputs": [],
468 | "source": [
469 | "data2.columns=['tweet_id','content','sentiment']"
470 | ]
471 | },
472 | {
473 | "cell_type": "code",
474 | "execution_count": 15,
475 | "metadata": {},
476 | "outputs": [],
477 | "source": [
478 | "data2.sentiment = data2.sentiment.str.replace(':: ','')"
479 | ]
480 | },
481 | {
482 | "cell_type": "code",
483 | "execution_count": 16,
484 | "metadata": {},
485 | "outputs": [
486 | {
487 | "data": {
488 | "text/plain": [
489 | "joy 8240\n",
490 | "surprise 3849\n",
491 | "sadness 3830\n",
492 | "fear 2816\n",
493 | "anger 1555\n",
494 | "disgust 761\n",
495 | "Name: sentiment, dtype: int64"
496 | ]
497 | },
498 | "execution_count": 16,
499 | "metadata": {},
500 | "output_type": "execute_result"
501 | }
502 | ],
503 | "source": [
504 | "data2.sentiment.value_counts()"
505 | ]
506 | },
507 | {
508 | "cell_type": "code",
509 | "execution_count": 17,
510 | "metadata": {},
511 | "outputs": [],
512 | "source": [
513 | "# Emotions to keep\n",
514 | "\n",
515 | "# worry,happpy(happiness,joy),surprise,sadness,love,fear,anger,hate(disgust+hate),relief,fun(fun+enthusiasm)"
516 | ]
517 | },
518 | {
519 | "cell_type": "code",
520 | "execution_count": 18,
521 | "metadata": {},
522 | "outputs": [],
523 | "source": [
524 | "data = data1.append(data2)"
525 | ]
526 | },
527 | {
528 | "cell_type": "code",
529 | "execution_count": 19,
530 | "metadata": {},
531 | "outputs": [
532 | {
533 | "data": {
534 | "text/html": [
535 | "\n",
536 | "\n",
549 | "
\n",
550 | " \n",
551 | " \n",
552 | " | \n",
553 | " content | \n",
554 | " sentiment | \n",
555 | " tweet_id | \n",
556 | "
\n",
557 | " \n",
558 | " \n",
559 | " \n",
560 | " 1 | \n",
561 | " Layin n bed with a headache ughhhh...waitin o... | \n",
562 | " sadness | \n",
563 | " 1956967666 | \n",
564 | "
\n",
565 | " \n",
566 | " 2 | \n",
567 | " Funeral ceremony...gloomy friday... | \n",
568 | " sadness | \n",
569 | " 1956967696 | \n",
570 | "
\n",
571 | " \n",
572 | " 3 | \n",
573 | " wants to hang out with friends SOON! | \n",
574 | " fun | \n",
575 | " 1956967789 | \n",
576 | "
\n",
577 | " \n",
578 | " 5 | \n",
579 | " Re-pinging @ghostridah14: why didn't you go to... | \n",
580 | " worry | \n",
581 | " 1956968477 | \n",
582 | "
\n",
583 | " \n",
584 | " 6 | \n",
585 | " I should be sleep, but im not! thinking about ... | \n",
586 | " sadness | \n",
587 | " 1956968487 | \n",
588 | "
\n",
589 | " \n",
590 | " 7 | \n",
591 | " Hmmm. http://www.djhero.com/ is down | \n",
592 | " worry | \n",
593 | " 1956968636 | \n",
594 | "
\n",
595 | " \n",
596 | " 8 | \n",
597 | " @charviray Charlene my love. I miss you | \n",
598 | " sadness | \n",
599 | " 1956969035 | \n",
600 | "
\n",
601 | " \n",
602 | " 9 | \n",
603 | " @kelcouch I'm sorry at least it's Friday? | \n",
604 | " sadness | \n",
605 | " 1956969172 | \n",
606 | "
\n",
607 | " \n",
608 | " 11 | \n",
609 | " Choked on her retainers | \n",
610 | " worry | \n",
611 | " 1956969531 | \n",
612 | "
\n",
613 | " \n",
614 | " 12 | \n",
615 | " Ugh! I have to beat this stupid song to get to... | \n",
616 | " sadness | \n",
617 | " 1956970047 | \n",
618 | "
\n",
619 | " \n",
620 | "
\n",
621 | "
"
622 | ],
623 | "text/plain": [
624 | " content sentiment tweet_id\n",
625 | "1 Layin n bed with a headache ughhhh...waitin o... sadness 1956967666\n",
626 | "2 Funeral ceremony...gloomy friday... sadness 1956967696\n",
627 | "3 wants to hang out with friends SOON! fun 1956967789\n",
628 | "5 Re-pinging @ghostridah14: why didn't you go to... worry 1956968477\n",
629 | "6 I should be sleep, but im not! thinking about ... sadness 1956968487\n",
630 | "7 Hmmm. http://www.djhero.com/ is down worry 1956968636\n",
631 | "8 @charviray Charlene my love. I miss you sadness 1956969035\n",
632 | "9 @kelcouch I'm sorry at least it's Friday? sadness 1956969172\n",
633 | "11 Choked on her retainers worry 1956969531\n",
634 | "12 Ugh! I have to beat this stupid song to get to... sadness 1956970047"
635 | ]
636 | },
637 | "execution_count": 19,
638 | "metadata": {},
639 | "output_type": "execute_result"
640 | }
641 | ],
642 | "source": [
643 | "data.head(10)"
644 | ]
645 | },
646 | {
647 | "cell_type": "code",
648 | "execution_count": 20,
649 | "metadata": {},
650 | "outputs": [],
651 | "source": [
652 | "data.sentiment = np.where((data.sentiment == 'disgust') |(data.sentiment == 'hate'),'hate',data.sentiment)"
653 | ]
654 | },
655 | {
656 | "cell_type": "code",
657 | "execution_count": 21,
658 | "metadata": {},
659 | "outputs": [
660 | {
661 | "data": {
662 | "text/plain": [
663 | "sadness 8995\n",
664 | "worry 8459\n",
665 | "joy 8240\n",
666 | "surprise 6036\n",
667 | "happiness 5209\n",
668 | "love 3842\n",
669 | "fear 2816\n",
670 | "fun 2535\n",
671 | "hate 2084\n",
672 | "anger 1665\n",
673 | "relief 1526\n",
674 | "Name: sentiment, dtype: int64"
675 | ]
676 | },
677 | "execution_count": 21,
678 | "metadata": {},
679 | "output_type": "execute_result"
680 | }
681 | ],
682 | "source": [
683 | "data.sentiment.value_counts()"
684 | ]
685 | },
686 | {
687 | "cell_type": "code",
688 | "execution_count": 22,
689 | "metadata": {},
690 | "outputs": [],
691 | "source": [
692 | "data=data[data.sentiment.isin(['sadness','worry','joy'])]"
693 | ]
694 | },
695 | {
696 | "cell_type": "code",
697 | "execution_count": 23,
698 | "metadata": {},
699 | "outputs": [
700 | {
701 | "data": {
702 | "text/plain": [
703 | "sadness 8995\n",
704 | "worry 8459\n",
705 | "joy 8240\n",
706 | "Name: sentiment, dtype: int64"
707 | ]
708 | },
709 | "execution_count": 23,
710 | "metadata": {},
711 | "output_type": "execute_result"
712 | }
713 | ],
714 | "source": [
715 | "data.sentiment.value_counts()"
716 | ]
717 | },
718 | {
719 | "cell_type": "code",
720 | "execution_count": 24,
721 | "metadata": {},
722 | "outputs": [],
723 | "source": [
724 | "#data3 = pd.read_csv('Equity-Evaluation-Corpus.csv',sep=',')"
725 | ]
726 | },
727 | {
728 | "cell_type": "code",
729 | "execution_count": 25,
730 | "metadata": {},
731 | "outputs": [],
732 | "source": [
733 | "#data3.Emotion.value_counts()"
734 | ]
735 | },
736 | {
737 | "cell_type": "markdown",
738 | "metadata": {},
739 | "source": [
740 | "## Clean Text"
741 | ]
742 | },
743 | {
744 | "cell_type": "markdown",
745 | "metadata": {},
746 | "source": [
747 | "#### Remove irrelevant characters other than alphanumeric and space"
748 | ]
749 | },
750 | {
751 | "cell_type": "code",
752 | "execution_count": 26,
753 | "metadata": {},
754 | "outputs": [],
755 | "source": [
756 | "data['content']=data['content'].str.replace('[^A-Za-z0-9\\s]+', '')"
757 | ]
758 | },
759 | {
760 | "cell_type": "markdown",
761 | "metadata": {},
762 | "source": [
763 | "#### Remove links from the text"
764 | ]
765 | },
766 | {
767 | "cell_type": "code",
768 | "execution_count": 27,
769 | "metadata": {},
770 | "outputs": [],
771 | "source": [
772 | "data['content']=data['content'].str.replace('http\\S+|www.\\S+', '', case=False)"
773 | ]
774 | },
775 | {
776 | "cell_type": "markdown",
777 | "metadata": {},
778 | "source": [
779 | "#### Convert everything to lowercase"
780 | ]
781 | },
782 | {
783 | "cell_type": "code",
784 | "execution_count": 28,
785 | "metadata": {},
786 | "outputs": [],
787 | "source": [
788 | "data['content']=data['content'].str.lower()"
789 | ]
790 | },
791 | {
792 | "cell_type": "markdown",
793 | "metadata": {},
794 | "source": [
795 | "#### Assign Target Variable"
796 | ]
797 | },
798 | {
799 | "cell_type": "code",
800 | "execution_count": 29,
801 | "metadata": {},
802 | "outputs": [],
803 | "source": [
804 | "target=data.sentiment\n",
805 | "data = data.drop(['sentiment'],axis=1)"
806 | ]
807 | },
808 | {
809 | "cell_type": "code",
810 | "execution_count": 30,
811 | "metadata": {},
812 | "outputs": [],
813 | "source": [
814 | "le=LabelEncoder()\n",
815 | "target=le.fit_transform(target)"
816 | ]
817 | },
818 | {
819 | "cell_type": "markdown",
820 | "metadata": {},
821 | "source": [
822 | "### Split Data into train & test"
823 | ]
824 | },
825 | {
826 | "cell_type": "code",
827 | "execution_count": 31,
828 | "metadata": {},
829 | "outputs": [],
830 | "source": [
831 | "X_train, X_test, y_train, y_test = train_test_split(data,target,stratify=target,test_size=0.4, random_state=42)"
832 | ]
833 | },
834 | {
835 | "cell_type": "markdown",
836 | "metadata": {},
837 | "source": [
838 | "##### Check if the split divides the classes uniformly"
839 | ]
840 | },
841 | {
842 | "cell_type": "code",
843 | "execution_count": 32,
844 | "metadata": {},
845 | "outputs": [
846 | {
847 | "data": {
848 | "text/plain": [
849 | "array([[ 0, 4944],\n",
850 | " [ 1, 5397],\n",
851 | " [ 2, 5075]], dtype=int64)"
852 | ]
853 | },
854 | "execution_count": 32,
855 | "metadata": {},
856 | "output_type": "execute_result"
857 | }
858 | ],
859 | "source": [
860 | "itemfreq(y_train)"
861 | ]
862 | },
863 | {
864 | "cell_type": "code",
865 | "execution_count": 33,
866 | "metadata": {},
867 | "outputs": [
868 | {
869 | "data": {
870 | "text/plain": [
871 | "array([[ 0, 3296],\n",
872 | " [ 1, 3598],\n",
873 | " [ 2, 3384]], dtype=int64)"
874 | ]
875 | },
876 | "execution_count": 33,
877 | "metadata": {},
878 | "output_type": "execute_result"
879 | }
880 | ],
881 | "source": [
882 | "itemfreq(y_test)"
883 | ]
884 | },
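  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`scipy.stats.itemfreq` is deprecated (and removed in newer SciPy releases); as a sketch, the same per-class counts can be obtained with `np.unique`:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Same class counts without scipy.stats.itemfreq\n",
    "np.unique(y_train, return_counts=True), np.unique(y_test, return_counts=True)"
   ]
  },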
885 | {
886 | "cell_type": "markdown",
887 | "metadata": {},
888 | "source": [
889 | "### Tokenization \n",
890 | "\n",
891 | "Tokenization can be done in a variety of ways, namely **Bag of words, tf-idf, Glove, word2vec ,fasttext **etc. Lets see how they can be applied and how they affect the accuracy"
892 | ]
893 | },
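  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Bag of Words, tf-idf and a hashing vectorizer are applied in the cells below. As a rough sketch of the embedding route mentioned above, averaged word2vec vectors could be built with gensim along these lines (gensim 4 API; gensim 3 uses `size` instead of `vector_size`). The resulting dense vectors can be negative, so they need a classifier such as LogisticRegression rather than MultinomialNB."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rough sketch: averaged word2vec features with gensim (assumes gensim is installed).\n",
    "from gensim.models import Word2Vec\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "\n",
    "tokenized = [t.split() for t in X_train.content]\n",
    "w2v = Word2Vec(sentences=tokenized, vector_size=100, window=5, min_count=2, seed=42)\n",
    "\n",
    "def avg_vector(text):\n",
    "    words = [w for w in text.split() if w in w2v.wv]\n",
    "    if not words:\n",
    "        return np.zeros(w2v.vector_size)\n",
    "    return np.mean([w2v.wv[w] for w in words], axis=0)\n",
    "\n",
    "X_train_w2v = np.vstack([avg_vector(t) for t in X_train.content])\n",
    "X_test_w2v = np.vstack([avg_vector(t) for t in X_test.content])\n",
    "\n",
    "# Dense embeddings can be negative, so MultinomialNB is not suitable here.\n",
    "lr_clf = LogisticRegression(max_iter=1000).fit(X_train_w2v, y_train)\n",
    "print(np.mean(lr_clf.predict(X_test_w2v) == y_test) * 100)"
   ]
  },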
894 | {
895 | "cell_type": "markdown",
896 | "metadata": {},
897 | "source": [
898 | "#### Bag of Words "
899 | ]
900 | },
901 | {
902 | "cell_type": "code",
903 | "execution_count": 34,
904 | "metadata": {},
905 | "outputs": [
906 | {
907 | "name": "stdout",
908 | "output_type": "stream",
909 | "text": [
910 | "Shape of Term Frequency Matrix: (15416, 25747)\n"
911 | ]
912 | }
913 | ],
914 | "source": [
915 | "# Extracting features from text files\n",
916 | "count_vect = CountVectorizer()\n",
917 | "X_train_counts = count_vect.fit_transform(X_train.content)\n",
918 | "X_test_counts =count_vect.transform(X_test.content)\n",
919 | "print('Shape of Term Frequency Matrix: ',X_train_counts.shape)"
920 | ]
921 | },
922 | {
923 | "cell_type": "markdown",
924 | "metadata": {},
925 | "source": [
926 | "#### Naive Bayes Model"
927 | ]
928 | },
929 | {
930 | "cell_type": "code",
931 | "execution_count": 35,
932 | "metadata": {},
933 | "outputs": [
934 | {
935 | "name": "stdout",
936 | "output_type": "stream",
937 | "text": [
938 | "59.2625024324\n"
939 | ]
940 | }
941 | ],
942 | "source": [
943 | "# Machine Learning\n",
944 | "# Training Naive Bayes (NB) classifier on training data.\n",
945 | "clf = MultinomialNB().fit(X_train_counts,y_train)\n",
946 | "predicted = clf.predict(X_test_counts)\n",
947 | "nb_clf_accuracy = np.mean(predicted == y_test) * 100\n",
948 | "print(nb_clf_accuracy)"
949 | ]
950 | },
951 | {
952 | "cell_type": "markdown",
953 | "metadata": {},
954 | "source": [
955 | "#### Same thing can be done using a Pipeline\n",
956 | "\n",
957 | "Lets take a look at how it can be done.
\n",
958 | "First lets define a function for printing accuracy"
959 | ]
960 | },
961 | {
962 | "cell_type": "code",
963 | "execution_count": 36,
964 | "metadata": {},
965 | "outputs": [],
966 | "source": [
967 | "def print_acc(model):\n",
968 | " predicted = model.predict(X_test.content)\n",
969 | " accuracy = np.mean(predicted == y_test) * 100\n",
970 | " print(accuracy)"
971 | ]
972 | },
973 | {
974 | "cell_type": "code",
975 | "execution_count": 37,
976 | "metadata": {},
977 | "outputs": [
978 | {
979 | "name": "stdout",
980 | "output_type": "stream",
981 | "text": [
982 | "59.2625024324\n"
983 | ]
984 | }
985 | ],
986 | "source": [
987 | "nb_clf = Pipeline([('vect', CountVectorizer()), ('clf', MultinomialNB())])\n",
988 | "nb_clf = nb_clf.fit(X_train.content,y_train)\n",
989 | "print_acc(nb_clf)"
990 | ]
991 | },
992 | {
993 | "cell_type": "markdown",
994 | "metadata": {},
995 | "source": [
996 | "#### TF IDF transformer"
997 | ]
998 | },
999 | {
1000 | "cell_type": "code",
1001 | "execution_count": 38,
1002 | "metadata": {},
1003 | "outputs": [
1004 | {
1005 | "name": "stdout",
1006 | "output_type": "stream",
1007 | "text": [
1008 | "58.5425179996\n"
1009 | ]
1010 | }
1011 | ],
1012 | "source": [
1013 | "nb_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])\n",
1014 | "nb_clf = nb_clf.fit(X_train.content,y_train)\n",
1015 | "print_acc(nb_clf)"
1016 | ]
1017 | },
1018 | {
1019 | "cell_type": "markdown",
1020 | "metadata": {},
1021 | "source": [
1022 | "#### Hash Vectorizer\n",
1023 | "\n",
1024 | "Note: Naive Bayes requires input to be non negative. Therefore, the alternate sign should be set to false in Hashing Vectorizer to make it work with naive bayes algorithm"
1025 | ]
1026 | },
1027 | {
1028 | "cell_type": "code",
1029 | "execution_count": 39,
1030 | "metadata": {},
1031 | "outputs": [
1032 | {
1033 | "name": "stdout",
1034 | "output_type": "stream",
1035 | "text": [
1036 | "53.765323993\n"
1037 | ]
1038 | }
1039 | ],
1040 | "source": [
1041 | "nb_clf = Pipeline([('vect', HashingVectorizer(n_features=2500,alternate_sign=False)), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])\n",
1042 | "nb_clf = nb_clf.fit(X_train.content,y_train)\n",
1043 | "print_acc(nb_clf)"
1044 | ]
1045 | },
1046 | {
1047 | "cell_type": "code",
1048 | "execution_count": 40,
1049 | "metadata": {
1050 | "scrolled": true
1051 | },
1052 | "outputs": [
1053 | {
1054 | "data": {
1055 | "text/plain": [
1056 | "array([[2430, 517, 349],\n",
1057 | " [ 551, 1989, 1058],\n",
1058 | " [ 434, 1278, 1672]], dtype=int64)"
1059 | ]
1060 | },
1061 | "execution_count": 40,
1062 | "metadata": {},
1063 | "output_type": "execute_result"
1064 | }
1065 | ],
1066 | "source": [
1067 | "confusion_matrix(y_test,predicted)"
1068 | ]
1069 | },
1070 | {
1071 | "cell_type": "markdown",
1072 | "metadata": {},
1073 | "source": [
1074 | "#### Remove Stop Words"
1075 | ]
1076 | },
1077 | {
1078 | "cell_type": "code",
1079 | "execution_count": 41,
1080 | "metadata": {},
1081 | "outputs": [
1082 | {
1083 | "name": "stdout",
1084 | "output_type": "stream",
1085 | "text": [
1086 | "58.23117338\n"
1087 | ]
1088 | }
1089 | ],
1090 | "source": [
1091 | "stop_words = set(stopwords.words('english'))\n",
1092 | "nb_clf = Pipeline([('vect', CountVectorizer(stop_words=stop_words)), ('clf', MultinomialNB())])\n",
1093 | "nb_clf = nb_clf.fit(X_train.content,y_train)\n",
1094 | "print_acc(nb_clf)"
1095 | ]
1096 | },
1097 | {
1098 | "cell_type": "code",
1099 | "execution_count": 42,
1100 | "metadata": {},
1101 | "outputs": [
1102 | {
1103 | "name": "stdout",
1104 | "output_type": "stream",
1105 | "text": [
1106 | "57.6084841409\n"
1107 | ]
1108 | }
1109 | ],
1110 | "source": [
1111 | "nb_clf = Pipeline([('vect', CountVectorizer(stop_words=stop_words)), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])\n",
1112 | "nb_clf = nb_clf.fit(X_train.content,y_train)\n",
1113 | "print_acc(nb_clf)"
1114 | ]
1115 | },
1116 | {
1117 | "cell_type": "markdown",
1118 | "metadata": {},
1119 | "source": [
1120 | "#### Lemmatization"
1121 | ]
1122 | },
1123 | {
1124 | "cell_type": "code",
1125 | "execution_count": 43,
1126 | "metadata": {},
1127 | "outputs": [
1128 | {
1129 | "name": "stderr",
1130 | "output_type": "stream",
1131 | "text": [
1132 | "c:\\program files\\python35\\lib\\site-packages\\pandas\\core\\indexing.py:601: SettingWithCopyWarning: \n",
1133 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
1134 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
1135 | "\n",
1136 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
1137 | " self.obj[item_labels[indexer[info_axis]]] = value\n"
1138 | ]
1139 | }
1140 | ],
1141 | "source": [
1142 | "w_tokenizer = nltk.tokenize.WhitespaceTokenizer()\n",
1143 | "lemmatizer = nltk.stem.WordNetLemmatizer()\n",
1144 | "\n",
1145 | "def lemmatize_text(text):\n",
1146 | " return ' '.join([lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)])\n",
1147 | "X_train.loc[:,'content'] = X_train['content'].apply(lemmatize_text)\n",
1148 | "X_test.loc[:,'content'] = X_test['content'].apply(lemmatize_text)"
1149 | ]
1150 | },
1151 | {
1152 | "cell_type": "code",
1153 | "execution_count": 44,
1154 | "metadata": {
1155 | "scrolled": true
1156 | },
1157 | "outputs": [
1158 | {
1159 | "name": "stdout",
1160 | "output_type": "stream",
1161 | "text": [
1162 | "57.4138937536\n"
1163 | ]
1164 | }
1165 | ],
1166 | "source": [
1167 | "nb_clf = Pipeline([('vect', CountVectorizer(stop_words=stop_words)), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])\n",
1168 | "nb_clf = nb_clf.fit(X_train.content,y_train)\n",
1169 | "print_acc(nb_clf)"
1170 | ]
1171 | },
1172 | {
1173 | "cell_type": "code",
1174 | "execution_count": 45,
1175 | "metadata": {},
1176 | "outputs": [],
1177 | "source": [
1178 | "#### Spell Correction with Flashtext"
1179 | ]
1180 | },
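  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a rough sketch, spell correction can be approximated with flashtext keyword replacement: a dictionary of known misspellings is swapped for corrections in a single pass over each tweet. The misspelling pairs below are purely illustrative."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rough sketch: replace known misspellings with flashtext (pip install flashtext).\n",
    "from flashtext import KeywordProcessor\n",
    "\n",
    "corrections = {'happpy': 'happy', 'luv': 'love', 'gr8': 'great'}  # illustrative pairs only\n",
    "kp = KeywordProcessor()\n",
    "for wrong, right in corrections.items():\n",
    "    kp.add_keyword(wrong, right)\n",
    "\n",
    "X_train.loc[:, 'content'] = X_train['content'].apply(kp.replace_keywords)\n",
    "X_test.loc[:, 'content'] = X_test['content'].apply(kp.replace_keywords)"
   ]
  },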
1181 | {
1182 | "cell_type": "markdown",
1183 | "metadata": {},
1184 | "source": [
1185 | "#### Do the same pipeline with NLTK, spacy and pytorch"
1186 | ]
1187 | }
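  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a rough sketch of the NLTK route (not a full spaCy/PyTorch pipeline), the same sklearn Pipeline can use an NLTK tokenizer through CountVectorizer's `tokenizer` argument. This assumes nltk's 'punkt' tokenizer data is available, e.g. via `nltk.download('punkt')`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rough sketch: the same pipeline, tokenized with NLTK (requires nltk.download('punkt')).\n",
    "nltk_clf = Pipeline([('vect', CountVectorizer(tokenizer=nltk.word_tokenize)),\n",
    "                     ('tfidf', TfidfTransformer()),\n",
    "                     ('clf', MultinomialNB())])\n",
    "nltk_clf = nltk_clf.fit(X_train.content, y_train)\n",
    "print_acc(nltk_clf)"
   ]
  }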
1188 | ],
1189 | "metadata": {
1190 | "kernelspec": {
1191 | "display_name": "Python 3",
1192 | "language": "python",
1193 | "name": "python3"
1194 | },
1195 | "language_info": {
1196 | "codemirror_mode": {
1197 | "name": "ipython",
1198 | "version": 3
1199 | },
1200 | "file_extension": ".py",
1201 | "mimetype": "text/x-python",
1202 | "name": "python",
1203 | "nbconvert_exporter": "python",
1204 | "pygments_lexer": "ipython3",
1205 | "version": "3.5.2"
1206 | }
1207 | },
1208 | "nbformat": 4,
1209 | "nbformat_minor": 2
1210 | }
1211 |
--------------------------------------------------------------------------------