├── README.md
├── LICENSE
├── .gitignore
└── main.py

/README.md:
--------------------------------------------------------------------------------
# Text-Emotion-Detection-Using-NLP

This code is part of my tutorial project on implementing various NLP techniques for text sentiment analysis. For the complete tutorial, visit https://medium.com/the-research-nest/applied-machine-learning-part-3-3fd405842a18

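## Setup

To run `main.py`, something like the following should work (assuming Python 3; the package list is inferred from the script's imports). Install pandas, scikit-learn, nltk, and textblob, then download the corpora that the stop word removal and lemmatisation steps rely on:

```python
import nltk

nltk.download('stopwords')  # used by the stop word removal step
nltk.download('wordnet')    # used by TextBlob's WordNet-backed lemmatiser
```

The script also expects the `text_emotion.csv` dataset used in the tutorial to be present in the working directory.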
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2019 aditya-xq

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import re

import pandas as pd
from nltk.corpus import stopwords
from textblob import Word
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier

data = pd.read_csv('text_emotion.csv')

data = data.drop('author', axis=1)

# Keeping only the 'happiness' and 'sadness' rows; dropping all other emotion
# labels (anger, boredom, enthusiasm, empty, fun, relief, surprise, love,
# hate, neutral, worry)
data = data[data.sentiment.isin(['happiness', 'sadness'])]

# Making all letters lowercase
data['content'] = data['content'].str.lower()

# Removing punctuation and symbols
data['content'] = data['content'].str.replace(r'[^\w\s]', ' ', regex=True)

# Removing stop words using NLTK
stop = set(stopwords.words('english'))
data['content'] = data['content'].apply(lambda x: " ".join(word for word in x.split() if word not in stop))

# Lemmatisation
data['content'] = data['content'].apply(lambda x: " ".join(Word(word).lemmatize() for word in x.split()))

# Correcting letter repetitions: any character repeated three or more times
# in a row is squeezed down to two, e.g. "happyyyyy" -> "happyy"
def de_repeat(text):
    pattern = re.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1", text)

data['content'] = data['content'].apply(lambda x: " ".join(de_repeat(word) for word in x.split()))
# Finding the 10,000 rarest words appearing in the data
freq = pd.Series(' '.join(data['content']).split()).value_counts()[-10000:]

# Removing those rarely appearing words from the data
rare = set(freq.index)
data['content'] = data['content'].apply(lambda x: " ".join(word for word in x.split() if word not in rare))

# Encoding the output labels: LabelEncoder assigns classes alphabetically,
# so 'happiness' becomes 0 and 'sadness' becomes 1
lbl_enc = preprocessing.LabelEncoder()
y = lbl_enc.fit_transform(data.sentiment.values)

# Splitting into training and validation data in a 90:10 ratio
X_train, X_val, y_train, y_val = train_test_split(data.content.values, y, stratify=y, random_state=42, test_size=0.1, shuffle=True)

# Extracting TF-IDF features. The vectoriser is fitted on the training data
# only and then applied to the validation data, so both share the same
# vocabulary and IDF weights.
tfidf = TfidfVectorizer(max_features=1000, analyzer='word', ngram_range=(1, 3))
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.transform(X_val)

# Extracting count vector features. (Note: the vocabulary here is fitted on
# the full corpus, so validation words leak in; fitting on X_train alone
# would be stricter.)
count_vect = CountVectorizer(analyzer='word')
count_vect.fit(data['content'])
X_train_count = count_vect.transform(X_train)
X_val_count = count_vect.transform(X_val)

# Model 1: Multinomial Naive Bayes classifier
nb = MultinomialNB()
nb.fit(X_train_tfidf, y_train)
y_pred = nb.predict(X_val_tfidf)
print('naive bayes tfidf accuracy %s' % accuracy_score(y_val, y_pred))
# naive bayes tfidf accuracy 0.5289017341040463

# Model 2: Linear SVM (SGDClassifier with its default hinge loss)
lsvm = SGDClassifier(alpha=0.001, random_state=5, max_iter=15, tol=None)
lsvm.fit(X_train_tfidf, y_train)
y_pred = lsvm.predict(X_val_tfidf)
print('svm using tfidf accuracy %s' % accuracy_score(y_val, y_pred))
# svm tfidf accuracy 0.5404624277456648

# Model 3: Logistic regression
logreg = LogisticRegression(C=1)
logreg.fit(X_train_tfidf, y_train)
y_pred = logreg.predict(X_val_tfidf)
print('log reg tfidf accuracy %s' % accuracy_score(y_val, y_pred))
# log reg tfidf accuracy 0.5443159922928709

# Model 4: Random forest classifier
rf = RandomForestClassifier(n_estimators=500)
rf.fit(X_train_tfidf, y_train)
y_pred = rf.predict(X_val_tfidf)
print('random forest tfidf accuracy %s' % accuracy_score(y_val, y_pred))
# random forest tfidf accuracy 0.5385356454720617

## Building the same four models using the count vector features
# Model 1: Multinomial Naive Bayes classifier
nb = MultinomialNB()
nb.fit(X_train_count, y_train)
y_pred = nb.predict(X_val_count)
print('naive bayes count vectors accuracy %s' % accuracy_score(y_val, y_pred))
# naive bayes count vectors accuracy 0.7764932562620424

# Model 2: Linear SVM
lsvm = SGDClassifier(alpha=0.001, random_state=5, max_iter=15, tol=None)
lsvm.fit(X_train_count, y_train)
y_pred = lsvm.predict(X_val_count)
print('lsvm using count vectors accuracy %s' % accuracy_score(y_val, y_pred))
# lsvm using count vectors accuracy 0.7928709055876686

# Model 3: Logistic regression
logreg = LogisticRegression(C=1)
logreg.fit(X_train_count, y_train)
y_pred = logreg.predict(X_val_count)
print('log reg count vectors accuracy %s' % accuracy_score(y_val, y_pred))
# log reg count vectors accuracy 0.7851637764932563

# Model 4: Random forest classifier
rf = RandomForestClassifier(n_estimators=500)
rf.fit(X_train_count, y_train)
y_pred = rf.predict(X_val_count)
print('random forest with count vectors accuracy %s' % accuracy_score(y_val, y_pred))
# random forest with count vectors accuracy 0.7524084778420038

# Below are 8 sample statements. The first 4 express happiness; the last 4 express sadness.
tweets = pd.DataFrame(['I am very happy today! The atmosphere looks cheerful',
                       'Things are looking great. It was such a good day',
                       'Success is right around the corner. Lets celebrate this victory',
                       'Everything is more beautiful when you experience them with a smile!',
                       'Now this is my worst, okay? But I am gonna get better.',
                       'I am tired, boss. Tired of being on the road, lonely as a sparrow in the rain. I am tired of all the pain I feel',
                       'This is quite depressing. I am filled with sorrow',
                       'His death broke my heart. It was a sad day'])

# Applying the same preprocessing to these tweets as was done to the training data
tweets[0] = tweets[0].str.lower()
tweets[0] = tweets[0].str.replace(r'[^\w\s]', ' ', regex=True)
tweets[0] = tweets[0].apply(lambda x: " ".join(word for word in x.split() if word not in stop))
tweets[0] = tweets[0].apply(lambda x: " ".join(Word(word).lemmatize() for word in x.split()))

# Extracting count vector features from the tweets
tweet_count = count_vect.transform(tweets[0])

# Predicting the emotion of each tweet using the already-trained linear SVM
tweet_pred = lsvm.predict(tweet_count)
print(tweet_pred)
## result
## [0 0 0 0 1 1 1 1]
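
# As a small follow-on sketch, the integer predictions can be mapped back to
# emotion names by inverting the label encoder fitted above; given the result
# recorded above, this should print 'happiness' for the first four statements
# and 'sadness' for the last four.
print(lbl_enc.inverse_transform(tweet_pred))
--------------------------------------------------------------------------------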