├── README.md
└── gws.py

/README.md:
--------------------------------------------------------------------------------

# Google-Web-Scraper

This Python code scrapes Google search results, then applies sentiment analysis (using TextBlob and VADER in tandem), generates text summaries (four different methods) for each classification, and ranks stopword-scrubbed keywords per classification.
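
Each scraped page is classified by requiring the two sentiment tools to agree; if they disagree, the page is filed as "unknown". Below is a minimal sketch of that check, using the same ±0.05 thresholds as gws.py (the `classify` helper is illustrative; the script itself does this inline):

```python
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()  # create once and reuse

def classify(text):
    tb = TextBlob(text).sentiment.polarity      # TextBlob polarity, -1.0 to 1.0
    vs = sia.polarity_scores(text)['compound']  # VADER compound score, -1.0 to 1.0
    if tb < -0.05 and vs < -0.05:
        return "negative"
    if tb > 0.05 and vs > 0.05:
        return "positive"
    if -0.05 < tb < 0.05 and -0.05 < vs < 0.05:
        return "neutral"
    return "unknown"  # the two tools disagree
```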


Results are displayed on-screen and are also saved as text files.


By changing two URLs (the search engine homepage and the search engine results page), this code should work with Bing or any other search engine, as sketched below.
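
For example, to point the scraper at Bing, the two assignments in gws.py would look something like this (the results URL is illustrative; copy the actual results-page URL for your query from the browser's address bar):

```python
# homepage, used to collect the navigation links to ignore
URL = "https://www.bing.com/"

# results page for the query of interest (replace with the URL copied from the browser)
URL = "https://www.bing.com/search?q=chocolate"
```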


Click here to read the WordPress blog article about this project.

--------------------------------------------------------------------------------
/gws.py:
--------------------------------------------------------------------------------
# acknowledgements
# https://github.com/vprusso/youtube_tutorials/blob/master/web_scraping_and_automation/beautiful_soup/beautiful_soup_and_requests.py
# https://github.com/llSourcell/web_scraper_live_demo/blob/master/main.py
# https://www.crummy.com/software/BeautifulSoup/bs4/doc/
# https://stackoverflow.com/questions/5598524/can-i-remove-script-tags-with-beautifulsoup
# http://zetcode.com/python/beautifulsoup/
# https://github.com/AgentANAKIN/Dual-Twitter-Sentiment-Analysis-with-4-Text-Summary-Tools-and-Stopwords-Scrubbed-Keywords
# https://github.com/llSourcell/twitter_sentiment_challenge/blob/master/demo.py
# https://youtu.be/qTyj2R-wcks
# https://www.youtube.com/watch?v=8p9nHmtwk0o
# https://github.com/Jcharis/Natural-Language-Processing-Tutorials/blob/master/Text%20Summarization%20with%20Sumy%20Python%20.ipynb
# Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text. Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.
# https://codeburst.io/python-basics-11-word-count-filter-out-punctuation-dictionary-manipulation-and-sorting-lists-3f6c55420855
# https://pythonspot.com/nltk-stop-words/


# import dependencies: scrape the links off search results pages and the text off web pages
import requests
from bs4 import BeautifulSoup
import re
# import dependencies: sentiment analysis
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# import dependency: pie chart
import matplotlib.pyplot as plt
# import dependencies: text summarization
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
import nltk; nltk.download('punkt')
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
# import dependencies: keywords
import nltk; nltk.download('stopwords')
from nltk.corpus import stopwords


# scrape a search engine's homepage
URL = "https://www.google.com/"
result = requests.get(URL)
src = result.content
soup = BeautifulSoup(src, 'lxml')


# create a list of the links on the selected search engine's homepage;
# these are navigation links, not search results, so they will be excluded later
links = soup.find_all("a")
stopURLs = []
for link in links:
    href = link.attrs.get('href')
    if href is not None and href not in stopURLs:
        stopURLs.append(href)


# scrape the results page; unfortunately, this URL has to be updated manually
URL = "https://www.google.com/search?source=hp&ei=V6zsXN2RCcemoATqs7OoAw&q=chocolate&oq=chocolate&gs_l=mobile-gws-wiz-hp.3..0j46l2j0l5.3285.5362..5639...0.0..0.219.1763.0j5j4......0....1.......8..41j0i131j46i275.mNHGFzr_jX8"
result = requests.get(URL)
src = result.content
soup = BeautifulSoup(src, 'lxml')


# create a list of links that are not in the list of homepage links, because the homepage links are not results links
links = soup.find_all("a")
URL_List = []
# only add absolute hyperlinks (starting with "http")
pattern = r"http"
for link in links:
    href = link.attrs.get('href')
    if href is None:
        continue
    # skip homepage navigation links and duplicates
    if href not in stopURLs and href not in URL_List:
        if re.match(pattern, href) is not None:
            URL_List.append(href)


# create the analyzer once, outside the loop; this improves performance
SIA = SentimentIntensityAnalyzer()


# set the classification counters
negative = 0
positive = 0
neutral = 0
unknown = 0


# create text files to append the scraped page text to
txtNegative = open('negative.txt', 'a+')
txtPositive = open('positive.txt', 'a+')
txtNeutral = open('neutral.txt', 'a+')
txtUnknown = open('unknown.txt', 'a+')


i = 0
size_of_URL_List = len(URL_List)
while i < size_of_URL_List:
    URL = URL_List[i]
    result = requests.get(URL)
    src = result.text
    soup = BeautifulSoup(src, 'lxml')
    # strip markup that is not visible page text
    [s.extract() for s in soup(['iframe', 'script', 'style'])]
    text = soup.get_text()
    # classify each page as negative, positive, neutral, or unknown
    # TextBlob and VADER must agree, or the result is "unknown"
    analysisTB = TextBlob(text)
    analysisVS = SIA.polarity_scores(text)
    if ((analysisTB.sentiment.polarity < -0.05) and
            (analysisVS['compound'] < -0.05)):
        txtNegative.write(text)
        negative += 1
    elif ((analysisTB.sentiment.polarity > 0.05) and
            (analysisVS['compound'] > 0.05)):
        txtPositive.write(text)
        positive += 1
    elif ((analysisTB.sentiment.polarity > -0.05) and
            (analysisTB.sentiment.polarity < 0.05) and
            (analysisVS['compound'] > -0.05) and
            (analysisVS['compound'] < 0.05)):
        txtNeutral.write(text)
        neutral += 1
    else:
        txtUnknown.write(text)
        unknown += 1
    i += 1


# open file to append summaries to
txtSummary = open('summary.txt', 'a+')


# print totals on screen and to file
total = negative + positive + neutral + unknown
negative_pct = ((negative / total) * 100)
txtSummary.write("Negative: ")
txtSummary.write(str(negative))
txtSummary.write(" ")
txtSummary.write(str(negative_pct))
txtSummary.write("%")
print("Negative: " + str(negative))

positive_pct = ((positive / total) * 100)
txtSummary.write("\n\nPositive: ")
txtSummary.write(str(positive))
txtSummary.write(" ")
txtSummary.write(str(positive_pct))
txtSummary.write("%")
print("Positive: " + str(positive))

neutral_pct = ((neutral / total) * 100)
txtSummary.write("\n\nNeutral: ")
txtSummary.write(str(neutral))
txtSummary.write(" ")
txtSummary.write(str(neutral_pct))
txtSummary.write("%")
print("Neutral: " + str(neutral))

unknown_pct = ((unknown / total) * 100)
txtSummary.write("\n\nUnknown: ")
txtSummary.write(str(unknown))
txtSummary.write(" ")
txtSummary.write(str(unknown_pct))
txtSummary.write("%")
print("Unknown: " + str(unknown))


# display the results as a pie chart and explode the largest slice
labels = ['negative', 'positive', 'neutral', 'unknown']
sizes = [negative, positive, neutral, unknown]
colors = ['red', 'green', 'yellow', 'gray']
if ((negative > positive) and
        (negative > neutral) and
        (negative > unknown)):
    explode = [.1, 0, 0, 0]
elif ((positive > negative) and
        (positive > neutral) and
        (positive > unknown)):
    explode = [0, .1, 0, 0]
elif ((neutral > negative) and
        (neutral > positive) and
        (neutral > unknown)):
    explode = [0, 0, .1, 0]
else:
    explode = [0, 0, 0, .1]
plt.pie(sizes, explode=explode, labels=labels, colors=colors, autopct='%1.1f%%', shadow=True, startangle=140)
plt.axis('equal')
plt.show()


# close files and reopen for reading
txtNegative.close()
txtPositive.close()
txtNeutral.close()
txtUnknown.close()
txtNegative = open('negative.txt', 'r')
txtPositive = open('positive.txt', 'r')
txtNeutral = open('neutral.txt', 'r')
txtUnknown = open('unknown.txt', 'r')


# stop words are common English words, such as "the"; the lists of keywords will exclude these stop words
stopWords = set(stopwords.words('english'))


# summarize the scraped page text with LexRank, Luhn, LSA, and LSA with stop words
parser = PlaintextParser.from_file("negative.txt", Tokenizer("english"))
LRSummarizer = LexRankSummarizer()
summary = LRSummarizer(parser.document, 1)
txtSummary.write("\n\n*** LEXRANK NEGATIVE ***\n")
print("*** LEXRANK NEGATIVE ***")
for sentence in summary:
    txtSummary.write(str(sentence))
    print(sentence)

LSummarizer = LuhnSummarizer()
summary = LSummarizer(parser.document, 1)
txtSummary.write("\n\n*** LUHN NEGATIVE ***\n")
print("")
print("*** LUHN NEGATIVE ***")
for sentence in summary:
    txtSummary.write(str(sentence))
    print(sentence)

LSASummarizer = LsaSummarizer()
summary = LSASummarizer(parser.document, 1)
txtSummary.write("\n\n*** LSA NEGATIVE ***\n")
print("")
print("*** LSA NEGATIVE ***")
for sentence in summary:
    txtSummary.write(str(sentence))
    print(sentence)

LSA2Summarizer = LsaSummarizer(Stemmer("english"))
LSA2Summarizer.stop_words = get_stop_words("english")
txtSummary.write("\n\n*** LSA W/ STOP WORDS NEGATIVE ***\n")
print("")
print("*** LSA W/ STOP WORDS NEGATIVE ***")
for sentence in LSA2Summarizer(parser.document, 1):
    txtSummary.write(str(sentence))
    print(sentence)

# clean text and convert all words to lowercase
Text = open('negative.txt').read()
for char in '-.,:;?!\n':
    Text = Text.replace(char, ' ')
Text = Text.lower()
word_list = Text.split()
# initialize dictionary
d = {}
# count instances of each word
for word in word_list:
    if word not in d:
        d[word] = 0
    d[word] += 1
# reverse the keys and values so they can be sorted as tuples;
# discard common words and words that appear only once
word_freq = []
for key, value in d.items():
    if (value > 1) and (key not in stopWords):
        word_freq.append((value, key))
word_freq.sort(key=lambda tup: (-tup[0], tup[1]))
txtSummary.write("\n\n*** KEYWORDS NEGATIVE ***\n")
print("")
print("*** KEYWORDS NEGATIVE ***")
for word in word_freq:
    txtSummary.write(str(word))
    txtSummary.write("\n")
    print(word)

parser = PlaintextParser.from_file("positive.txt", Tokenizer("english"))
LRSummarizer = LexRankSummarizer()
summary = LRSummarizer(parser.document, 1)
txtSummary.write("\n\n*** LEXRANK POSITIVE ***\n")
print("")
print("*** LEXRANK POSITIVE ***")
for sentence in summary:
    txtSummary.write(str(sentence))
    print(sentence)

LSummarizer = LuhnSummarizer()
summary = LSummarizer(parser.document, 1)
txtSummary.write("\n\n*** LUHN POSITIVE ***\n")
print("")
print("*** LUHN POSITIVE ***")
for sentence in summary:
    txtSummary.write(str(sentence))
    print(sentence)

LSASummarizer = LsaSummarizer()
summary = LSASummarizer(parser.document, 1)
txtSummary.write("\n\n*** LSA POSITIVE ***\n")
print("")
print("*** LSA POSITIVE ***")
for sentence in summary:
    txtSummary.write(str(sentence))
    print(sentence)

LSA2Summarizer = LsaSummarizer(Stemmer("english"))
LSA2Summarizer.stop_words = get_stop_words("english")
txtSummary.write("\n\n*** LSA W/ STOP WORDS POSITIVE ***\n")
print("")
print("*** LSA W/ STOP WORDS POSITIVE ***")
for sentence in LSA2Summarizer(parser.document, 1):
    txtSummary.write(str(sentence))
    print(sentence)

# clean text and convert all words to lowercase
Text = open('positive.txt').read()
for char in '-.,:;?!\n':
    Text = Text.replace(char, ' ')
Text = Text.lower()
word_list = Text.split()
# initialize dictionary
d = {}
# count instances of each word
for word in word_list:
    if word not in d:
        d[word] = 0
    d[word] += 1
# reverse the keys and values so they can be sorted as tuples;
# discard common words and words that appear only once
word_freq = []
for key, value in d.items():
    if (value > 1) and (key not in stopWords):
        word_freq.append((value, key))
word_freq.sort(key=lambda tup: (-tup[0], tup[1]))
txtSummary.write("\n\n*** KEYWORDS POSITIVE ***\n")
print("")
print("*** KEYWORDS POSITIVE ***")
for word in word_freq:
    txtSummary.write(str(word))
    txtSummary.write("\n")
    print(word)

parser = PlaintextParser.from_file("neutral.txt", Tokenizer("english"))
LRSummarizer = LexRankSummarizer()
summary = LRSummarizer(parser.document, 1)
txtSummary.write("\n\n*** LEXRANK NEUTRAL ***\n")
print("")
print("*** LEXRANK NEUTRAL ***")
for sentence in summary:
    txtSummary.write(str(sentence))
    print(sentence)

LSummarizer = LuhnSummarizer()
summary = LSummarizer(parser.document, 1)
txtSummary.write("\n\n*** LUHN NEUTRAL ***\n")
print("")
print("*** LUHN NEUTRAL ***")
for sentence in summary:
    txtSummary.write(str(sentence))
    print(sentence)

LSASummarizer = LsaSummarizer()
summary = LSASummarizer(parser.document, 1)
txtSummary.write("\n\n*** LSA NEUTRAL ***\n")
print("")
print("*** LSA NEUTRAL ***")
for sentence in summary:
    txtSummary.write(str(sentence))
    print(sentence)

LSA2Summarizer = LsaSummarizer(Stemmer("english"))
LSA2Summarizer.stop_words = get_stop_words("english")
txtSummary.write("\n\n*** LSA W/ STOP WORDS NEUTRAL ***\n")
print("")
print("*** LSA W/ STOP WORDS NEUTRAL ***")
for sentence in LSA2Summarizer(parser.document, 1):
    txtSummary.write(str(sentence))
    print(sentence)

# clean text and convert all words to lowercase
Text = open('neutral.txt').read()
for char in '-.,:;?!\n':
    Text = Text.replace(char, ' ')
Text = Text.lower()
word_list = Text.split()
# initialize dictionary
d = {}
# count instances of each word
for word in word_list:
    if word not in d:
        d[word] = 0
    d[word] += 1
# reverse the keys and values so they can be sorted as tuples;
# discard common words and words that appear only once
word_freq = []
for key, value in d.items():
    if (value > 1) and (key not in stopWords):
        word_freq.append((value, key))
word_freq.sort(key=lambda tup: (-tup[0], tup[1]))
txtSummary.write("\n\n*** KEYWORDS NEUTRAL ***\n")
print("")
print("*** KEYWORDS NEUTRAL ***")
for word in word_freq:
    txtSummary.write(str(word))
    txtSummary.write("\n")
    print(word)

parser = PlaintextParser.from_file("unknown.txt", Tokenizer("english"))
LRSummarizer = LexRankSummarizer()
summary = LRSummarizer(parser.document, 1)
txtSummary.write("\n\n*** LEXRANK UNKNOWN ***\n")
print("")
print("*** LEXRANK UNKNOWN ***")
for sentence in summary:
    txtSummary.write(str(sentence))
    print(sentence)

LSummarizer = LuhnSummarizer()
summary = LSummarizer(parser.document, 1)
txtSummary.write("\n\n*** LUHN UNKNOWN ***\n")
print("")
print("*** LUHN UNKNOWN ***")
for sentence in summary:
    txtSummary.write(str(sentence))
    print(sentence)

LSASummarizer = LsaSummarizer()
summary = LSASummarizer(parser.document, 1)
txtSummary.write("\n\n*** LSA UNKNOWN ***\n")
print("")
print("*** LSA UNKNOWN ***")
for sentence in summary:
    txtSummary.write(str(sentence))
    print(sentence)

LSA2Summarizer = LsaSummarizer(Stemmer("english"))
LSA2Summarizer.stop_words = get_stop_words("english")
txtSummary.write("\n\n*** LSA W/ STOP WORDS UNKNOWN ***\n")
print("")
print("*** LSA W/ STOP WORDS UNKNOWN ***")
for sentence in LSA2Summarizer(parser.document, 1):
    txtSummary.write(str(sentence))
    print(sentence)

# clean text and convert all words to lowercase
Text = open('unknown.txt').read()
for char in '-.,:;?!\n':
    Text = Text.replace(char, ' ')
Text = Text.lower()
word_list = Text.split()
# initialize dictionary
d = {}
# count instances of each word
for word in word_list:
    if word not in d:
        d[word] = 0
    d[word] += 1
# reverse the keys and values so they can be sorted as tuples;
# discard common words and words that appear only once
word_freq = []
for key, value in d.items():
    if (value > 1) and (key not in stopWords):
        word_freq.append((value, key))
word_freq.sort(key=lambda tup: (-tup[0], tup[1]))
txtSummary.write("\n\n*** KEYWORDS UNKNOWN ***\n")
print("")
print("*** KEYWORDS UNKNOWN ***")
for word in word_freq:
    txtSummary.write(str(word))
    txtSummary.write("\n")
    print(word)


# close files
txtNegative.close()
txtPositive.close()
txtNeutral.close()
txtUnknown.close()
txtSummary.close()
--------------------------------------------------------------------------------