├── README.md
├── Twitter sentiment analysis_live.ipynb
└── Twitter sentiment analysis_live.py

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Twitter-sentiment-analysis-using-Python-Machine-Learning-Project-8
This project walks you through building a Twitter sentiment analysis model in Python. Twitter sentiment analysis is performed to identify how people feel about a given topic; in this project we analyse public sentiment towards the Pfizer vaccine. We use a dataset of vaccination tweets available on Kaggle and apply machine learning to classify each tweet as positive, negative or neutral. We also compare different classifiers to see which one gives the best model accuracy.


To see the complete video explanation of this topic, check out the following link:
https://youtu.be/ng6L_wvREB4

--------------------------------------------------------------------------------
/Twitter sentiment analysis_live.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# coding: utf-8

# In[1]:


import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
from textblob import TextBlob
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
# requires the NLTK data packages 'punkt' and 'stopwords'
# (run nltk.download('punkt') and nltk.download('stopwords') once)
stop_words = set(stopwords.words('english'))
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay


# In[2]:


df = pd.read_csv('vaccination_tweets.csv')


# In[3]:


df.head()


# In[4]:


df.info()


# In[5]:


df.isnull().sum()


# In[6]:


df.columns


# In[7]:


# keep only the tweet text; the other metadata columns are not needed here
text_df = df.drop(['id', 'user_name', 'user_location', 'user_description', 'user_created',
                   'user_followers', 'user_friends', 'user_favourites', 'user_verified',
                   'date', 'hashtags', 'source', 'retweets', 'favorites',
                   'is_retweet'], axis=1)
text_df.head()


# In[8]:


# inspect the first few raw tweets
for i in range(5):
    print(text_df['text'].iloc[i], "\n")


# In[9]:


text_df.info()


# In[10]:


def data_processing(text):
    text = text.lower()
    text = re.sub(r"https?\S+|www\.\S+", '', text, flags=re.MULTILINE)  # strip URLs
    text = re.sub(r'@\w+|#', '', text)  # strip @mentions and '#' symbols
    text = re.sub(r'[^\w\s]', '', text)  # strip remaining punctuation
    text_tokens = word_tokenize(text)
    filtered_text = [w for w in text_tokens if w not in stop_words]
    return " ".join(filtered_text)


# In[11]:


text_df['text'] = text_df['text'].apply(data_processing)
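# In[ ]:


# Quick sanity check of data_processing on a made-up sample tweet (not from
# the dataset): URLs, @mentions, '#' symbols, punctuation and stopwords
# should all be stripped.
sample = "Just got my #PfizerBioNTech shot! https://t.co/abc123 @friend so relieved"
print(data_processing(sample))
# expected output along the lines of: got pfizerbiontech shot friend relieved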
# In[12]:


text_df = text_df.drop_duplicates(subset='text')


# In[13]:


stemmer = PorterStemmer()
def stemming(text):
    # reduce every word to its stem, e.g. "vaccines" -> "vaccin"
    return " ".join(stemmer.stem(word) for word in text.split())


# In[14]:


text_df['text'] = text_df['text'].apply(stemming)


# In[15]:


text_df.head()


# In[16]:


# inspect the same tweets after cleaning and stemming
for i in range(5):
    print(text_df['text'].iloc[i], "\n")


# In[17]:


text_df.info()


# In[18]:


def polarity(text):
    # TextBlob polarity ranges from -1 (most negative) to 1 (most positive)
    return TextBlob(text).sentiment.polarity


# In[19]:


text_df['polarity'] = text_df['text'].apply(polarity)


# In[20]:


text_df.head(10)


# In[21]:


def sentiment(label):
    if label < 0:
        return "Negative"
    elif label == 0:
        return "Neutral"
    else:
        return "Positive"


# In[22]:


text_df['sentiment'] = text_df['polarity'].apply(sentiment)


# In[23]:


text_df.head()


# In[24]:


fig = plt.figure(figsize=(5,5))
sns.countplot(x='sentiment', data=text_df)


# In[26]:


fig = plt.figure(figsize=(7,7))
colors = ("yellowgreen", "gold", "red")
wp = {'linewidth': 2, 'edgecolor': "black"}
tags = text_df['sentiment'].value_counts()
explode = (0.1, 0.1, 0.1)
tags.plot(kind='pie', autopct='%1.1f%%', shadow=True, colors=colors,
          startangle=90, wedgeprops=wp, explode=explode, label='')
plt.title('Distribution of sentiments')


# In[27]:


pos_tweets = text_df[text_df.sentiment == 'Positive']
pos_tweets = pos_tweets.sort_values(['polarity'], ascending=False)
pos_tweets.head()


# In[28]:


text = ' '.join(pos_tweets['text'])
plt.figure(figsize=(20,15), facecolor='None')
wordcloud = WordCloud(max_words=500, width=1600, height=800).generate(text)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.title('Most frequent words in positive tweets', fontsize=19)
plt.show()


# In[29]:


neg_tweets = text_df[text_df.sentiment == 'Negative']
neg_tweets = neg_tweets.sort_values(['polarity'], ascending=False)
neg_tweets.head()


# In[30]:


text = ' '.join(neg_tweets['text'])
plt.figure(figsize=(20,15), facecolor='None')
wordcloud = WordCloud(max_words=500, width=1600, height=800).generate(text)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.title('Most frequent words in negative tweets', fontsize=19)
plt.show()


# In[31]:


neutral_tweets = text_df[text_df.sentiment == 'Neutral']
neutral_tweets = neutral_tweets.sort_values(['polarity'], ascending=False)
neutral_tweets.head()


# In[32]:


text = ' '.join(neutral_tweets['text'])
plt.figure(figsize=(20,15), facecolor='None')
wordcloud = WordCloud(max_words=500, width=1600, height=800).generate(text)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.title('Most frequent words in neutral tweets', fontsize=19)
plt.show()
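# In[ ]:


# Toy illustration (made-up corpus) of what CountVectorizer with
# ngram_range=(1, 2) extracts before we vectorize the real tweets:
# every unigram and every adjacent bigram.
toy = ["vaccine works well", "vaccine side effects"]
toy_vect = CountVectorizer(ngram_range=(1, 2)).fit(toy)
print(toy_vect.get_feature_names_out())
# ['effects' 'side' 'side effects' 'vaccine' 'vaccine side' 'vaccine works'
#  'well' 'works' 'works well']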
# In[33]:


vect = CountVectorizer(ngram_range=(1,2)).fit(text_df['text'])


# In[34]:


feature_names = vect.get_feature_names_out()  # get_feature_names() was removed in scikit-learn 1.2
print("Number of features: {}\n".format(len(feature_names)))
print("First 20 features:\n {}".format(feature_names[:20]))


# In[35]:


X = text_df['text']
Y = text_df['sentiment']
X = vect.transform(X)


# In[36]:


x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)


# In[37]:


print("Shape of x_train:", x_train.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of x_test:", x_test.shape)
print("Shape of y_test:", y_test.shape)


# In[39]:


import warnings
warnings.filterwarnings('ignore')  # silence convergence warnings from the solvers


# In[40]:


logreg = LogisticRegression()
logreg.fit(x_train, y_train)
logreg_pred = logreg.predict(x_test)
logreg_acc = accuracy_score(y_test, logreg_pred)
print("Test accuracy: {:.2f}%".format(logreg_acc*100))


# In[41]:


print(confusion_matrix(y_test, logreg_pred))
print("\n")
print(classification_report(y_test, logreg_pred))


# In[43]:


style.use('classic')
cm = confusion_matrix(y_test, logreg_pred, labels=logreg.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=logreg.classes_)
disp.plot()


# In[44]:


from sklearn.model_selection import GridSearchCV


# In[45]:


# tune the inverse regularisation strength C
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}
grid = GridSearchCV(LogisticRegression(), param_grid)
grid.fit(x_train, y_train)


# In[46]:


print("Best parameters:", grid.best_params_)


# In[47]:


y_pred = grid.predict(x_test)


# In[48]:


logreg_acc = accuracy_score(y_test, y_pred)
print("Test accuracy: {:.2f}%".format(logreg_acc*100))


# In[50]:


print(confusion_matrix(y_test, y_pred))
print("\n")
print(classification_report(y_test, y_pred))


# In[51]:


from sklearn.svm import LinearSVC


# In[52]:


SVCmodel = LinearSVC()
SVCmodel.fit(x_train, y_train)


# In[53]:


svc_pred = SVCmodel.predict(x_test)
svc_acc = accuracy_score(y_test, svc_pred)
print("Test accuracy: {:.2f}%".format(svc_acc*100))


# In[54]:


print(confusion_matrix(y_test, svc_pred))
print("\n")
print(classification_report(y_test, svc_pred))


# In[55]:


# LinearSVC only exposes the C parameter; searching over kernel, degree and
# gamma would require sklearn.svm.SVC instead
param_grid_svc = {'C': [0.01, 0.1, 1, 10]}
grid = GridSearchCV(SVCmodel, param_grid_svc)
grid.fit(x_train, y_train)


# In[56]:


print("Best parameters:", grid.best_params_)


# In[57]:


y_pred = grid.predict(x_test)


# In[58]:


svc_grid_acc = accuracy_score(y_test, y_pred)
print("Test accuracy: {:.2f}%".format(svc_grid_acc*100))


# In[59]:


print(confusion_matrix(y_test, y_pred))
print("\n")
print(classification_report(y_test, y_pred))
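# In[ ]:


# Sketch: score a brand-new, made-up tweet with the fitted vectorizer and the
# logistic regression model trained above. The tweet text and the predicted
# label shown are purely illustrative.
new_tweet = "second dose went fine, just a sore arm but feeling good"
new_vec = vect.transform([stemming(data_processing(new_tweet))])
print(logreg.predict(new_vec))  # e.g. ['Positive']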
# Additional code to extract data from Twitter using the Twitter API

# In[ ]:


import tweepy  # to access the Twitter API
import pandas as pd  # for basic data operations


# In[ ]:


# API credentials from the Twitter developer portal (placeholders)
consumerKey = "xxxxxxxxxxxxxxxxxxxx"
consumerSecret = "xxxxxxxxxxxxxxxxxxxx"
accessToken = "xxxxxxxxxxxxxxxxxxxx"
accessTokenSecret = "xxxxxxxxxxxxxxxxxxxx"


# In[ ]:


# Establish the connection with the Twitter API
auth = tweepy.OAuthHandler(consumerKey, consumerSecret)
auth.set_access_token(accessToken, accessTokenSecret)
api = tweepy.API(auth)


# In[ ]:


# Search term and the number of tweets to fetch
searchTerm = input("Enter Keyword/Tag to search about: ")
NoOfTerms = int(input("Enter how many tweets to search: "))


# In[ ]:


# Fetch the requested number of tweets for the search term
# (Tweepy v4 renamed api.search to api.search_tweets)
tweets = tweepy.Cursor(api.search_tweets, q=searchTerm).items(NoOfTerms)
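# In[ ]:


# Sketch: gather the fetched tweets into a DataFrame and score them with the
# same pipeline built above (data_processing, stemming, polarity, sentiment);
# assumes the earlier notebook cells have been run so those functions exist.
live_df = pd.DataFrame([tweet.text for tweet in tweets], columns=['text'])
live_df['text'] = live_df['text'].apply(data_processing).apply(stemming)
live_df['polarity'] = live_df['text'].apply(polarity)
live_df['sentiment'] = live_df['polarity'].apply(sentiment)
print(live_df.head())

--------------------------------------------------------------------------------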