├── README.md
├── LICENSE
├── .gitignore
└── main.py

/README.md:
--------------------------------------------------------------------------------
# Text-Emotion-Detection-Using-NLP

This code is part of my tutorial project on implementing various NLP techniques for text sentiment analysis. For the complete tutorial, visit https://medium.com/the-research-nest/applied-machine-learning-part-3-3fd405842a18

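## Setup

To run `main.py`, something like the following should work (assuming Python 3; the package list is inferred from the script's imports). Install pandas, scikit-learn, nltk, and textblob, then download the corpora that the stop word removal and lemmatisation steps rely on:

```python
import nltk

nltk.download('stopwords')  # used by the stop word removal step
nltk.download('wordnet')    # used by TextBlob's WordNet-backed lemmatiser
```

The script also expects the `text_emotion.csv` dataset used in the tutorial to be present in the working directory.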
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2019 aditya-xq

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import re

import pandas as pd
from nltk.corpus import stopwords
from textblob import Word
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier

data = pd.read_csv('text_emotion.csv')

data = data.drop('author', axis=1)

# Keeping only the 'happiness' and 'sadness' rows; dropping all other emotion
# labels (anger, boredom, enthusiasm, empty, fun, relief, surprise, love,
# hate, neutral, worry)
data = data[data.sentiment.isin(['happiness', 'sadness'])]

# Making all letters lowercase
data['content'] = data['content'].str.lower()

# Removing punctuation and symbols
data['content'] = data['content'].str.replace(r'[^\w\s]', ' ', regex=True)

# Removing stop words using NLTK
stop = set(stopwords.words('english'))
data['content'] = data['content'].apply(lambda x: " ".join(word for word in x.split() if word not in stop))

# Lemmatisation
data['content'] = data['content'].apply(lambda x: " ".join(Word(word).lemmatize() for word in x.split()))

# Correcting letter repetitions: any character repeated three or more times
# in a row is squeezed down to two, e.g. "happyyyyy" -> "happyy"
def de_repeat(text):
    pattern = re.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1", text)

data['content'] = data['content'].apply(lambda x: " ".join(de_repeat(word) for word in x.split()))
# Finding the 10,000 rarest words appearing in the data
freq = pd.Series(' '.join(data['content']).split()).value_counts()[-10000:]

# Removing those rarely appearing words from the data
rare = set(freq.index)
data['content'] = data['content'].apply(lambda x: " ".join(word for word in x.split() if word not in rare))

# Encoding the output labels: LabelEncoder assigns classes alphabetically,
# so 'happiness' becomes 0 and 'sadness' becomes 1
lbl_enc = preprocessing.LabelEncoder()
y = lbl_enc.fit_transform(data.sentiment.values)

# Splitting into training and validation data in a 90:10 ratio
X_train, X_val, y_train, y_val = train_test_split(data.content.values, y, stratify=y, random_state=42, test_size=0.1, shuffle=True)

# Extracting TF-IDF features. The vectoriser is fitted on the training data
# only and then applied to the validation data, so both share the same
# vocabulary and IDF weights.
tfidf = TfidfVectorizer(max_features=1000, analyzer='word', ngram_range=(1, 3))
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.transform(X_val)

# Extracting count vector features. (Note: the vocabulary here is fitted on
# the full corpus, so validation words leak in; fitting on X_train alone
# would be stricter.)
count_vect = CountVectorizer(analyzer='word')
count_vect.fit(data['content'])
X_train_count = count_vect.transform(X_train)
X_val_count = count_vect.transform(X_val)

# Model 1: Multinomial Naive Bayes classifier
nb = MultinomialNB()
nb.fit(X_train_tfidf, y_train)
y_pred = nb.predict(X_val_tfidf)
print('naive bayes tfidf accuracy %s' % accuracy_score(y_val, y_pred))
# naive bayes tfidf accuracy 0.5289017341040463

# Model 2: Linear SVM (SGDClassifier with its default hinge loss)
lsvm = SGDClassifier(alpha=0.001, random_state=5, max_iter=15, tol=None)
lsvm.fit(X_train_tfidf, y_train)
y_pred = lsvm.predict(X_val_tfidf)
print('svm using tfidf accuracy %s' % accuracy_score(y_val, y_pred))
# svm tfidf accuracy 0.5404624277456648

# Model 3: Logistic regression
logreg = LogisticRegression(C=1)
logreg.fit(X_train_tfidf, y_train)
y_pred = logreg.predict(X_val_tfidf)
print('log reg tfidf accuracy %s' % accuracy_score(y_val, y_pred))
# log reg tfidf accuracy 0.5443159922928709

# Model 4: Random forest classifier
rf = RandomForestClassifier(n_estimators=500)
rf.fit(X_train_tfidf, y_train)
y_pred = rf.predict(X_val_tfidf)
print('random forest tfidf accuracy %s' % accuracy_score(y_val, y_pred))
# random forest tfidf accuracy 0.5385356454720617

## Building the same four models using the count vector features
# Model 1: Multinomial Naive Bayes classifier
nb = MultinomialNB()
nb.fit(X_train_count, y_train)
y_pred = nb.predict(X_val_count)
print('naive bayes count vectors accuracy %s' % accuracy_score(y_val, y_pred))
# naive bayes count vectors accuracy 0.7764932562620424

# Model 2: Linear SVM
lsvm = SGDClassifier(alpha=0.001, random_state=5, max_iter=15, tol=None)
lsvm.fit(X_train_count, y_train)
y_pred = lsvm.predict(X_val_count)
print('lsvm using count vectors accuracy %s' % accuracy_score(y_val, y_pred))
# lsvm using count vectors accuracy 0.7928709055876686

# Model 3: Logistic regression
logreg = LogisticRegression(C=1)
logreg.fit(X_train_count, y_train)
y_pred = logreg.predict(X_val_count)
print('log reg count vectors accuracy %s' % accuracy_score(y_val, y_pred))
# log reg count vectors accuracy 0.7851637764932563

# Model 4: Random forest classifier
rf = RandomForestClassifier(n_estimators=500)
rf.fit(X_train_count, y_train)
y_pred = rf.predict(X_val_count)
print('random forest with count vectors accuracy %s' % accuracy_score(y_val, y_pred))
# random forest with count vectors accuracy 0.7524084778420038

# Below are 8 sample statements. The first 4 express happiness; the last 4 express sadness.
tweets = pd.DataFrame(['I am very happy today! The atmosphere looks cheerful',
                       'Things are looking great. It was such a good day',
                       'Success is right around the corner. Lets celebrate this victory',
                       'Everything is more beautiful when you experience them with a smile!',
                       'Now this is my worst, okay? But I am gonna get better.',
                       'I am tired, boss. Tired of being on the road, lonely as a sparrow in the rain. I am tired of all the pain I feel',
                       'This is quite depressing. I am filled with sorrow',
                       'His death broke my heart. It was a sad day'])

# Applying the same preprocessing to these tweets as was done to the training data
tweets[0] = tweets[0].str.lower()
tweets[0] = tweets[0].str.replace(r'[^\w\s]', ' ', regex=True)
tweets[0] = tweets[0].apply(lambda x: " ".join(word for word in x.split() if word not in stop))
tweets[0] = tweets[0].apply(lambda x: " ".join(Word(word).lemmatize() for word in x.split()))

# Extracting count vector features from the tweets
tweet_count = count_vect.transform(tweets[0])

# Predicting the emotion of each tweet using the already-trained linear SVM
tweet_pred = lsvm.predict(tweet_count)
print(tweet_pred)
## result
## [0 0 0 0 1 1 1 1]
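
# As a small follow-on sketch, the integer predictions can be mapped back to
# emotion names by inverting the label encoder fitted above; given the result
# recorded above, this should print 'happiness' for the first four statements
# and 'sadness' for the last four.
print(lbl_enc.inverse_transform(tweet_pred))
--------------------------------------------------------------------------------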