├── README.md
├── Twitter sentiment analysis_live.ipynb
└── Twitter sentiment analysis_live.py

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Twitter-sentiment-analysis-using-Python-Machine-Learning-Project-8
This project walks you through building a Twitter sentiment analysis model in Python. Twitter sentiment analysis is performed to identify how people feel about a given topic; in this project we analyse public sentiment towards the Pfizer vaccine. We use a dataset of vaccination tweets available on Kaggle and apply machine learning to classify each tweet as positive, negative or neutral. We also compare different classifiers to see which one gives the best model accuracy.


To see the complete video explanation of this topic, check out the following link:
https://youtu.be/ng6L_wvREB4

--------------------------------------------------------------------------------
/Twitter sentiment analysis_live.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# coding: utf-8

# In[1]:


import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
from textblob import TextBlob
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
# requires the NLTK data packages 'punkt' and 'stopwords'
# (run nltk.download('punkt') and nltk.download('stopwords') once)
stop_words = set(stopwords.words('english'))
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay


# In[2]:


df = pd.read_csv('vaccination_tweets.csv')


# In[3]:


df.head()


# In[4]:


df.info()


# In[5]:


df.isnull().sum()


# In[6]:


df.columns


# In[7]:


# keep only the tweet text; the other metadata columns are not needed here
text_df = df.drop(['id', 'user_name', 'user_location', 'user_description', 'user_created',
                   'user_followers', 'user_friends', 'user_favourites', 'user_verified',
                   'date', 'hashtags', 'source', 'retweets', 'favorites',
                   'is_retweet'], axis=1)
text_df.head()


# In[8]:


# inspect the first few raw tweets
for i in range(5):
    print(text_df['text'].iloc[i], "\n")


# In[9]:


text_df.info()


# In[10]:


def data_processing(text):
    text = text.lower()
    text = re.sub(r"https?\S+|www\.\S+", '', text, flags=re.MULTILINE)  # strip URLs
    text = re.sub(r'@\w+|#', '', text)  # strip @mentions and '#' symbols
    text = re.sub(r'[^\w\s]', '', text)  # strip remaining punctuation
    text_tokens = word_tokenize(text)
    filtered_text = [w for w in text_tokens if w not in stop_words]
    return " ".join(filtered_text)


# In[11]:


text_df['text'] = text_df['text'].apply(data_processing)
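# In[ ]:


# Quick sanity check of data_processing on a made-up sample tweet (not from
# the dataset): URLs, @mentions, '#' symbols, punctuation and stopwords
# should all be stripped.
sample = "Just got my #PfizerBioNTech shot! https://t.co/abc123 @friend so relieved"
print(data_processing(sample))
# expected output along the lines of: got pfizerbiontech shot friend relieved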
# In[12]:


text_df = text_df.drop_duplicates(subset='text')


# In[13]:


stemmer = PorterStemmer()
def stemming(text):
    # reduce every word to its stem, e.g. "vaccines" -> "vaccin"
    return " ".join(stemmer.stem(word) for word in text.split())


# In[14]:


text_df['text'] = text_df['text'].apply(stemming)


# In[15]:


text_df.head()


# In[16]:


# inspect the same tweets after cleaning and stemming
for i in range(5):
    print(text_df['text'].iloc[i], "\n")


# In[17]:


text_df.info()


# In[18]:


def polarity(text):
    # TextBlob polarity ranges from -1 (most negative) to 1 (most positive)
    return TextBlob(text).sentiment.polarity


# In[19]:


text_df['polarity'] = text_df['text'].apply(polarity)


# In[20]:


text_df.head(10)


# In[21]:


def sentiment(label):
    if label < 0:
        return "Negative"
    elif label == 0:
        return "Neutral"
    else:
        return "Positive"


# In[22]:


text_df['sentiment'] = text_df['polarity'].apply(sentiment)


# In[23]:


text_df.head()


# In[24]:


fig = plt.figure(figsize=(5,5))
sns.countplot(x='sentiment', data=text_df)


# In[26]:


fig = plt.figure(figsize=(7,7))
colors = ("yellowgreen", "gold", "red")
wp = {'linewidth': 2, 'edgecolor': "black"}
tags = text_df['sentiment'].value_counts()
explode = (0.1, 0.1, 0.1)
tags.plot(kind='pie', autopct='%1.1f%%', shadow=True, colors=colors,
          startangle=90, wedgeprops=wp, explode=explode, label='')
plt.title('Distribution of sentiments')


# In[27]:


pos_tweets = text_df[text_df.sentiment == 'Positive']
pos_tweets = pos_tweets.sort_values(['polarity'], ascending=False)
pos_tweets.head()


# In[28]:


text = ' '.join(pos_tweets['text'])
plt.figure(figsize=(20,15), facecolor='None')
wordcloud = WordCloud(max_words=500, width=1600, height=800).generate(text)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.title('Most frequent words in positive tweets', fontsize=19)
plt.show()


# In[29]:


neg_tweets = text_df[text_df.sentiment == 'Negative']
neg_tweets = neg_tweets.sort_values(['polarity'], ascending=False)
neg_tweets.head()


# In[30]:


text = ' '.join(neg_tweets['text'])
plt.figure(figsize=(20,15), facecolor='None')
wordcloud = WordCloud(max_words=500, width=1600, height=800).generate(text)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.title('Most frequent words in negative tweets', fontsize=19)
plt.show()


# In[31]:


neutral_tweets = text_df[text_df.sentiment == 'Neutral']
neutral_tweets = neutral_tweets.sort_values(['polarity'], ascending=False)
neutral_tweets.head()


# In[32]:


text = ' '.join(neutral_tweets['text'])
plt.figure(figsize=(20,15), facecolor='None')
wordcloud = WordCloud(max_words=500, width=1600, height=800).generate(text)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.title('Most frequent words in neutral tweets', fontsize=19)
plt.show()
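# In[ ]:


# Toy illustration (made-up corpus) of what CountVectorizer with
# ngram_range=(1, 2) extracts before we vectorize the real tweets:
# every unigram and every adjacent bigram.
toy = ["vaccine works well", "vaccine side effects"]
toy_vect = CountVectorizer(ngram_range=(1, 2)).fit(toy)
print(toy_vect.get_feature_names_out())
# ['effects' 'side' 'side effects' 'vaccine' 'vaccine side' 'vaccine works'
#  'well' 'works' 'works well']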
# In[33]:


vect = CountVectorizer(ngram_range=(1,2)).fit(text_df['text'])


# In[34]:


feature_names = vect.get_feature_names_out()  # get_feature_names() was removed in scikit-learn 1.2
print("Number of features: {}\n".format(len(feature_names)))
print("First 20 features:\n {}".format(feature_names[:20]))


# In[35]:


X = text_df['text']
Y = text_df['sentiment']
X = vect.transform(X)


# In[36]:


x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)


# In[37]:


print("Shape of x_train:", x_train.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of x_test:", x_test.shape)
print("Shape of y_test:", y_test.shape)


# In[39]:


import warnings
warnings.filterwarnings('ignore')  # silence convergence warnings from the solvers


# In[40]:


logreg = LogisticRegression()
logreg.fit(x_train, y_train)
logreg_pred = logreg.predict(x_test)
logreg_acc = accuracy_score(y_test, logreg_pred)
print("Test accuracy: {:.2f}%".format(logreg_acc*100))


# In[41]:


print(confusion_matrix(y_test, logreg_pred))
print("\n")
print(classification_report(y_test, logreg_pred))


# In[43]:


style.use('classic')
cm = confusion_matrix(y_test, logreg_pred, labels=logreg.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=logreg.classes_)
disp.plot()


# In[44]:


from sklearn.model_selection import GridSearchCV


# In[45]:


# tune the inverse regularisation strength C
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}
grid = GridSearchCV(LogisticRegression(), param_grid)
grid.fit(x_train, y_train)


# In[46]:


print("Best parameters:", grid.best_params_)


# In[47]:


y_pred = grid.predict(x_test)


# In[48]:


logreg_acc = accuracy_score(y_test, y_pred)
print("Test accuracy: {:.2f}%".format(logreg_acc*100))


# In[50]:


print(confusion_matrix(y_test, y_pred))
print("\n")
print(classification_report(y_test, y_pred))


# In[51]:


from sklearn.svm import LinearSVC


# In[52]:


SVCmodel = LinearSVC()
SVCmodel.fit(x_train, y_train)


# In[53]:


svc_pred = SVCmodel.predict(x_test)
svc_acc = accuracy_score(y_test, svc_pred)
print("Test accuracy: {:.2f}%".format(svc_acc*100))


# In[54]:


print(confusion_matrix(y_test, svc_pred))
print("\n")
print(classification_report(y_test, svc_pred))


# In[55]:


# LinearSVC only exposes the C parameter; searching over kernel, degree and
# gamma would require sklearn.svm.SVC instead
param_grid_svc = {'C': [0.01, 0.1, 1, 10]}
grid = GridSearchCV(SVCmodel, param_grid_svc)
grid.fit(x_train, y_train)


# In[56]:


print("Best parameters:", grid.best_params_)


# In[57]:


y_pred = grid.predict(x_test)


# In[58]:


svc_grid_acc = accuracy_score(y_test, y_pred)
print("Test accuracy: {:.2f}%".format(svc_grid_acc*100))


# In[59]:


print(confusion_matrix(y_test, y_pred))
print("\n")
print(classification_report(y_test, y_pred))
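# In[ ]:


# Sketch: score a brand-new, made-up tweet with the fitted vectorizer and the
# logistic regression model trained above. The tweet text and the predicted
# label shown are purely illustrative.
new_tweet = "second dose went fine, just a sore arm but feeling good"
new_vec = vect.transform([stemming(data_processing(new_tweet))])
print(logreg.predict(new_vec))  # e.g. ['Positive']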
# Additional code to extract data from Twitter using the Twitter API

# In[ ]:


import tweepy  # to access the Twitter API
import pandas as pd  # for basic data operations


# In[ ]:


# API credentials from the Twitter developer portal (placeholders)
consumerKey = "xxxxxxxxxxxxxxxxxxxx"
consumerSecret = "xxxxxxxxxxxxxxxxxxxx"
accessToken = "xxxxxxxxxxxxxxxxxxxx"
accessTokenSecret = "xxxxxxxxxxxxxxxxxxxx"


# In[ ]:


# Establish the connection with the Twitter API
auth = tweepy.OAuthHandler(consumerKey, consumerSecret)
auth.set_access_token(accessToken, accessTokenSecret)
api = tweepy.API(auth)


# In[ ]:


# Search term and the number of tweets to fetch
searchTerm = input("Enter Keyword/Tag to search about: ")
NoOfTerms = int(input("Enter how many tweets to search: "))


# In[ ]:


# Fetch the requested number of tweets for the search term
# (Tweepy v4 renamed api.search to api.search_tweets)
tweets = tweepy.Cursor(api.search_tweets, q=searchTerm).items(NoOfTerms)
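# In[ ]:


# Sketch: gather the fetched tweets into a DataFrame and score them with the
# same pipeline built above (data_processing, stemming, polarity, sentiment);
# assumes the earlier notebook cells have been run so those functions exist.
live_df = pd.DataFrame([tweet.text for tweet in tweets], columns=['text'])
live_df['text'] = live_df['text'].apply(data_processing).apply(stemming)
live_df['polarity'] = live_df['text'].apply(polarity)
live_df['sentiment'] = live_df['polarity'].apply(sentiment)
print(live_df.head())

--------------------------------------------------------------------------------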