├── README.md
├── emotion_recognizer_nbc.py
├── emotion_recognizer_svm.py
└── emotion_recognizer_ann.py

/README.md:
--------------------------------------------------------------------------------
# Emotion-recognition-from-tweets
A comprehensive approach to recognizing the emotion (sentiment) expressed in a tweet, using supervised machine learning.
The 'Untitled.ipynb' file is a Jupyter notebook of test code.
The remaining .py files contain the code corresponding to their names.

Find the complete explanation of the approach here: https://medium.com/@ankushraut/artificial-neural-network-for-text-classification-b7aa5994d985.

**Problem Statement**

-> Given a dataset mapping tweets to their associated emotions, build an emotion-recognizer algorithm.

-> Data source - https://www.crowdflower.com/wp-content/uploads/2016/07/text_emotion.csv

-> Libraries - Natural Language Toolkit (NLTK) and scikit-learn

**Pre - processing**

-> Removal of URLs and symbols using regular expressions (the 're' library), plus removal of @-mentions

-> Lexicon normalization (lemmatization) using WordNetLemmatizer from NLTK

-> Reduction of multi-letter ambiguities - repeated letters are collapsed, e.g. 'noooo' becomes 'noo'

-> (Optional) Removal of stop-words - this caused a decrease in F1-score as well as overall accuracy, so it is left commented out

> A look at the data before and after pre-processing

*before*

![whole_before](https://user-images.githubusercontent.com/26039458/29850795-c896e38a-8d4d-11e7-932a-e3e30a9ad675.png)

*after*

![whole_after](https://user-images.githubusercontent.com/26039458/29850797-cce512fe-8d4d-11e7-84c5-229422f0e142.png)


**Vectorization**

-> Term frequency - inverse document frequency (TfidfVectorizer) is used to convert the words to vectors (for the SVM and Naive Bayes models)

-> A bag-of-words representation is used as the input to the sigmoid-layers model

**Model - 1**

-> Support Vector Machine - hyperplanes separating all the classes, with a linear kernel.

**Model - 2**

-> Naive Bayes classifier - naively assumes that the words of a sentence are independent of one another.
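
Taken together, the vectorization and Model - 1 / Model - 2 steps boil down to a short vectorize-then-classify pipeline. The snippet below is a minimal sketch with made-up `texts`/`labels` placeholders; the actual scripts read `text_emotion.csv`, and `emotion_recognizer_nbc.py` uses TextBlob's `NaiveBayesClassifier` rather than the scikit-learn `MultinomialNB` substituted here:

```python
# Condensed sketch of the TF-IDF + classifier setup (toy data, default settings).
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm

texts = ["im so happy today", "what a great friday", "feel sad and empty", "worst day ever, so gloomy"]
labels = ["happiness", "happiness", "sadness", "sadness"]

x_train, x_test, y_train, y_test = train_test_split(texts, labels, test_size=0.25, random_state=0)

vectorizer = TfidfVectorizer()                      # TF-IDF vectors feed both models
train_vectors = vectorizer.fit_transform(x_train)
test_vectors = vectorizer.transform(x_test)

svm_model = svm.SVC(kernel='linear').fit(train_vectors, y_train)   # Model - 1
nb_model = MultinomialNB().fit(train_vectors, y_train)             # Model - 2 (stand-in)
print(svm_model.predict(test_vectors), nb_model.predict(test_vectors))
```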

**Model - 3**

-> Artificial Neural Network - a 3-layer neural network with sigmoid activations and gradient-descent optimization
--------------------------------------------------------------------------------
/emotion_recognizer_nbc.py:
--------------------------------------------------------------------------------
#important libraries
import pandas as pd
import numpy as np
import nltk
import re
#importing stopwords is optional, in this case it decreased accuracy
#from nltk.corpus import stopwords
import itertools
import time


start_time = time.time()

import os
os.chdir('/tmp/guest-pltjjp/Downloads')  #directory containing text_emotion.csv


data = pd.read_csv('text_emotion.csv')
data = data.iloc[:1000, :]  #work on a subset of the tweets


#stopset = set(stopwords.words('english'))

from nltk.stem.wordnet import WordNetLemmatizer
lem = WordNetLemmatizer()

#comprehensive cleaning
def cleaning(text):
    txt = str(text)
    #remove URLs
    txt = re.sub(r"http\S+", "", txt)
    if len(txt) == 0:
        return ['no', 'text']   #placeholder tokens for tweets left empty by cleaning
    #remove @-mentions
    txt = [w for w in txt.split() if not w.startswith('@')]
    if len(txt) == 0:
        return ['no', 'text']
    #remove non-word symbols
    txt = re.sub(r'[^\w]', ' ', ' '.join(txt))
    #collapse runs of a repeated character to at most two, e.g. 'noooo' -> 'noo'
    txt = ''.join(''.join(s)[:2] for _, s in itertools.groupby(txt))
    txt = txt.replace("'", "")
    txt = nltk.tokenize.word_tokenize(txt)
    #txt = [w for w in txt if not w in stopset]
    #lemmatize each token (verb mode)
    txt = [lem.lemmatize(w, "v") for w in txt]
    if len(txt) == 0:
        return ['no', 'text']
    return txt

data['content'] = data['content'].map(lambda x: cleaning(x))



#join the token lists back into space-separated strings
data = data.reset_index(drop=True)
for i in range(len(data)):
    data.content[i] = ' '.join(data.content[i])

x = int(np.round(len(data)*0.75))
train = data.iloc[:x, :].reset_index(drop=True)
test = data.iloc[x:, :].reset_index(drop=True)

from textblob.classifiers import NaiveBayesClassifier as NBC

training_corpus = []

for k in range(len(train)):
    training_corpus.append((train.content[k], train.sentiment[k]))

test_corpus = []

for l in range(len(test)):
    test_corpus.append((test.content[l], test.sentiment[l]))

model = NBC(training_corpus)

print(model.accuracy(test_corpus))

from sklearn.metrics import classification_report

predictions = []
for m in range(len(test)):
    predictions.append(model.classify(test.content[m]))
print(classification_report(test.sentiment, predictions))

predictions_df = pd.DataFrame({'Content': test.content, 'Emotion_predicted': predictions, 'Emotion_actual': test.sentiment})
predictions_df.to_csv('naive_emotion_recognizer.csv', index=False)

elapsed_time = time.time() - start_time
print("processing time:", elapsed_time, "seconds")
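
# --- Illustrative usage (added; not part of the original script) --------------
# A minimal sketch of querying the trained TextBlob classifier on a single new
# tweet; the tweet text is a made-up example and goes through the same cleaning
# step as the training data.
sample_tweet = "I cannot stop smiling today, everything feels great!"
sample_clean = ' '.join(cleaning(sample_tweet))
print(model.classify(sample_clean))            #most likely emotion label
probs = model.prob_classify(sample_clean)      #probability distribution over labels
print(probs.max(), probs.prob(probs.max()))
#model.show_informative_features(10)           #optional: most informative words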
--------------------------------------------------------------------------------
/emotion_recognizer_svm.py:
--------------------------------------------------------------------------------
#important libraries
import pandas as pd
import numpy as np
import nltk
import re
#importing stopwords is optional, in this case it decreased accuracy
#from nltk.corpus import stopwords
import itertools
import time


start_time = time.time()

import os
os.chdir('/tmp/guest-pltjjp/Downloads')  #directory containing text_emotion.csv


data = pd.read_csv('text_emotion.csv')
#data = data.iloc[:100,:]


#stopset = set(stopwords.words('english'))

from nltk.stem.wordnet import WordNetLemmatizer
lem = WordNetLemmatizer()

#comprehensive cleaning
def cleaning(text):
    txt = str(text)
    #remove URLs
    txt = re.sub(r"http\S+", "", txt)
    if len(txt) == 0:
        return ['no', 'text']   #placeholder tokens for tweets left empty by cleaning
    #remove @-mentions
    txt = [w for w in txt.split() if not w.startswith('@')]
    if len(txt) == 0:
        return ['no', 'text']
    #remove non-word symbols
    txt = re.sub(r'[^\w]', ' ', ' '.join(txt))
    #collapse runs of a repeated character to at most two, e.g. 'noooo' -> 'noo'
    txt = ''.join(''.join(s)[:2] for _, s in itertools.groupby(txt))
    txt = txt.replace("'", "")
    txt = nltk.tokenize.word_tokenize(txt)
    #txt = [w for w in txt if not w in stopset]
    #lemmatize each token (verb mode)
    txt = [lem.lemmatize(w, "v") for w in txt]
    if len(txt) == 0:
        return ['no', 'text']
    return txt

data['content'] = data['content'].map(lambda x: cleaning(x))



#join the token lists back into space-separated strings
data = data.reset_index(drop=True)
for i in range(len(data)):
    data.content[i] = ' '.join(data.content[i])

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn import svm
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(data.content, data.sentiment, test_size=0.25, random_state=0)

x_train = x_train.reset_index(drop=True)
x_test = x_test.reset_index(drop=True)

y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

vectorizer = TfidfVectorizer(min_df=3, max_df=0.9)

train_vectors = vectorizer.fit_transform(x_train)
test_vectors = vectorizer.transform(x_test)

model = svm.SVC(kernel='linear')
model.fit(train_vectors, y_train)
predicted_sentiment = model.predict(test_vectors)

print(classification_report(y_test, predicted_sentiment))

prediction_df = pd.DataFrame({'Content': x_test, 'Emotion_predicted': predicted_sentiment, 'Emotion_actual': y_test})
prediction_df.to_csv('emotion_recognizer_svm.csv', index=False)

elapsed_time = time.time() - start_time
print("processing time:", elapsed_time, "seconds")
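
# --- Illustrative usage (added; not part of the original script) --------------
# A minimal sketch of scoring a single new tweet with the fitted TF-IDF
# vectorizer and SVM; the tweet text is a made-up example and must be cleaned
# the same way as the training data.
new_tweet = "so bored at work, wish the weekend would come sooner"
new_vector = vectorizer.transform([' '.join(cleaning(new_tweet))])   #transform, not fit_transform
print(model.predict(new_vector)[0])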
--------------------------------------------------------------------------------
/emotion_recognizer_ann.py:
--------------------------------------------------------------------------------
#important libraries
import pandas as pd
import numpy as np
import nltk
import re
#importing stopwords is optional, in this case it decreased accuracy
#from nltk.corpus import stopwords
import itertools
import json
import time
import datetime


start_time = time.time()

import os
os.chdir('/tmp/guest-pltjjp/Downloads')  #directory containing text_emotion.csv


data = pd.read_csv('text_emotion.csv')
#data = data.iloc[:100,:]


#stopset = set(stopwords.words('english'))

from nltk.stem.wordnet import WordNetLemmatizer
lem = WordNetLemmatizer()

#comprehensive cleaning
def cleaning(text):
    txt = str(text)
    #remove URLs
    txt = re.sub(r"http\S+", "", txt)
    if len(txt) == 0:
        return ['no', 'text']   #placeholder tokens for tweets left empty by cleaning
    #remove @-mentions
    txt = [w for w in txt.split() if not w.startswith('@')]
    if len(txt) == 0:
        return ['no', 'text']
    #remove non-word symbols
    txt = re.sub(r'[^\w]', ' ', ' '.join(txt))
    #collapse runs of a repeated character to at most two, e.g. 'noooo' -> 'noo'
    txt = ''.join(''.join(s)[:2] for _, s in itertools.groupby(txt))
    txt = txt.replace("'", "")
    txt = nltk.tokenize.word_tokenize(txt)
    #txt = [w for w in txt if not w in stopset]
    #lemmatize each token (verb mode)
    txt = [lem.lemmatize(w, "v") for w in txt]
    if len(txt) == 0:
        return ['no', 'text']
    return txt

data['content'] = data['content'].map(lambda x: cleaning(x))



#join the token lists back into space-separated strings
data = data.reset_index(drop=True)
for i in range(len(data)):
    data.content[i] = ' '.join(data.content[i])

#75/25 train/validation split
le_train = data.iloc[:int(np.round(len(data)*.75)), :]
val = data.iloc[int(np.round(len(data)*.75)):, :].reset_index(drop=True)

training_data = []
for i in range(len(le_train)):
    training_data.append({"class": le_train.sentiment[i], "sentence": le_train.content[i]})

words = []
classes = []
documents = []

# loop through each sentence in our training data
for pattern in training_data:
    w = nltk.word_tokenize(pattern['sentence'].lower())   #lower-case to match classify(), which lower-cases its input
    words.extend(w)
    documents.append((w, pattern['class']))
    if pattern['class'] not in classes:
        classes.append(pattern['class'])

words = sorted(set(words))   #deduplicate the vocabulary
classes = list(set(classes))

# create our training data
training = []
output = []
# create an empty array for our output
output_empty = [0] * len(classes)

# training set, bag of words for each sentence
for doc in documents:
    bag = []
    pattern_words = doc[0]
    for w in words:
        bag.append(1 if w in pattern_words else 0)

    training.append(bag)
    # output is a '0' for each tag and '1' for current tag
    output_row = list(output_empty)
    output_row[classes.index(doc[1])] = 1
    output.append(output_row)

print(len(documents), "documents")
print(len(classes), "classes", classes)
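
# --- Illustration (added; not part of the original script) --------------------
# Each tweet above is encoded as a fixed-length 0/1 bag-of-words vector over
# `words`, and each label as a one-hot row over `classes`.  A toy example with
# a three-word vocabulary:
_toy_words = ['be', 'happy', 'sad']
_toy_tokens = ['be', 'happy']
assert [1 if w in _toy_tokens else 0 for w in _toy_words] == [1, 1, 0]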

# compute sigmoid nonlinearity
def sigmoid(x):
    output = 1/(1+np.exp(-x))
    return output

# convert output of sigmoid function to its derivative
def sigmoid_output_to_derivative(output):
    return output*(1-output)

def clean_up_sentence(sentence):
    sentence_words = nltk.word_tokenize(sentence)
    return sentence_words

def bow(sentence, words):
    sentence_words = clean_up_sentence(sentence)
    bag = [0]*len(words)
    for s in sentence_words:
        for i, w in enumerate(words):
            if w == s:
                bag[i] = 1
    return np.array(bag)

def model(sentence):
    x = bow(sentence.lower(), words)
    # input layer is our bag of words
    l0 = x
    # matrix multiplication of input and hidden layer
    l1 = sigmoid(np.dot(l0, synapse_0))
    # output layer
    l2 = sigmoid(np.dot(l1, synapse_1))
    return l2

def train(X, y, hidden_neurons=10, alpha=1, epochs=5000, dropout=False, dropout_percent=0.2):
    np.random.seed(1)

    last_mean_error = 1
    # randomly initialize our weights with mean 0
    synapse_0 = 2*np.random.random((len(X[0]), hidden_neurons)) - 1
    synapse_1 = 2*np.random.random((hidden_neurons, len(classes))) - 1

    prev_synapse_0_weight_update = np.zeros_like(synapse_0)
    prev_synapse_1_weight_update = np.zeros_like(synapse_1)

    synapse_0_direction_count = np.zeros_like(synapse_0)
    synapse_1_direction_count = np.zeros_like(synapse_1)

    for j in range(epochs+1):

        # Feed forward through layers 0, 1, and 2
        layer_0 = X
        layer_1 = sigmoid(np.dot(layer_0, synapse_0))

        if(dropout):
            layer_1 *= np.random.binomial([np.ones((len(X), hidden_neurons))], 1-dropout_percent)[0] * (1.0/(1-dropout_percent))

        layer_2 = sigmoid(np.dot(layer_1, synapse_1))
        layer_2_error = y - layer_2

        if (j % 1000) == 0 and j > 500:
            # if this 1k iteration's error is greater than the last iteration, break out
            if np.mean(np.abs(layer_2_error)) < last_mean_error:
                print("error after "+str(j)+" iterations:" + str(np.mean(np.abs(layer_2_error))))
                last_mean_error = np.mean(np.abs(layer_2_error))
            else:
                print("break:", np.mean(np.abs(layer_2_error)), ">", last_mean_error)
                break

        layer_2_delta = layer_2_error * sigmoid_output_to_derivative(layer_2)

        layer_1_error = layer_2_delta.dot(synapse_1.T)
        layer_1_delta = layer_1_error * sigmoid_output_to_derivative(layer_1)

        synapse_1_weight_update = (layer_1.T.dot(layer_2_delta))
        synapse_0_weight_update = (layer_0.T.dot(layer_1_delta))

        if(j > 0):
            synapse_0_direction_count += np.abs(((synapse_0_weight_update > 0)+0) - ((prev_synapse_0_weight_update > 0) + 0))
            synapse_1_direction_count += np.abs(((synapse_1_weight_update > 0)+0) - ((prev_synapse_1_weight_update > 0) + 0))

        synapse_1 += alpha * synapse_1_weight_update
        synapse_0 += alpha * synapse_0_weight_update

        prev_synapse_0_weight_update = synapse_0_weight_update
        prev_synapse_1_weight_update = synapse_1_weight_update

    now = datetime.datetime.now()

    # persist synapses
    synapse = {'synapse0': synapse_0.tolist(), 'synapse1': synapse_1.tolist(),
               'datetime': now.strftime("%Y-%m-%d %H:%M"),
               'words': words,
               'classes': classes
               }
    synapse_file = "synapses.json"

    with open(synapse_file, 'w') as outfile:
        json.dump(synapse, outfile, indent=4, sort_keys=True)


X = np.array(training)
y = np.array(output)

train(X, y, hidden_neurons=20, alpha=0.1, epochs=10000, dropout=False, dropout_percent=0.2)

# load our calculated synapse values
synapse_file = 'synapses.json'
with open(synapse_file) as data_file:
    synapse = json.load(data_file)
    synapse_0 = np.asarray(synapse['synapse0'])
    synapse_1 = np.asarray(synapse['synapse1'])
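
# --- Sanity check (added; not part of the original script) --------------------
# The persisted weight matrices should match the network dimensions set up in
# train(): synapse_0 maps the bag-of-words input to the hidden layer and
# synapse_1 maps the hidden layer to the class scores.
assert synapse_0.shape == (len(words), synapse_1.shape[0])
assert synapse_1.shape[1] == len(classes)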

def classify(sentence, show_details=False):
    results = model(sentence)
    # pick the class with the highest output activation
    emotion = classes[int(np.argmax(results))]
    return emotion


#validation

predictions = []
for i in range(len(val)):
    predictions.append(classify(val.content[i]))

from sklearn.metrics import classification_report

print(classification_report(val.sentiment, predictions))

prediction_df = pd.DataFrame({'content': val.content, 'sentiment_predicted': predictions, 'sentiment_actual': val.sentiment})

prediction_df.to_csv('emotion_recognizer.csv', index=False)
elapsed_time = time.time() - start_time
print("processing time:", elapsed_time, "seconds")
--------------------------------------------------------------------------------