├── NewspaperScraper.py
├── README.md
└── RunScrapers.py

/NewspaperScraper.py:
--------------------------------------------------------------------------------
import re
import csv
import time
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
from pytz import timezone
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from dateutil.parser import parse
from newspaper import Article


# Base scraper: stores the search parameters, lets subclasses collect article
# links via get_pages(), and parses those links with newspaper's Article.
class NewspaperScraper:
    def __init__(self, newspaper, searchTerm, dateStart, dateEnd):
        self.newspaper = newspaper
        self.searchTerm = searchTerm
        self.dateStart = parse(dateStart)
        self.dateEnd = parse(dateEnd)
        self.links = []

    def get_newspaper_name(self):
        return self.newspaper

    def get_pages(self):
        # Each subclass implements its own crawler over the outlet's search results.
        print('Unimplemented for ' + self.newspaper + ' scraper')
        return

    def check_dates(self, date):
        page_date = parse(date)
        if self.dateStart <= page_date <= self.dateEnd:
            return True
        return False

    def newspaper_parser(self, sleep_time=0):
        print('running newspaper_parser()...')

        results = []
        count = 0

        for l in self.links:
            article = Article(url=l)
            try:
                article.build()
            except Exception:
                time.sleep(60)
                continue

            data = {
                'title': article.title,
                'date_published': article.publish_date,
                'news_outlet': self.newspaper,
                'authors': article.authors,
                'feature_img': article.top_image,
                'article_link': article.canonical_link,
                'keywords': article.keywords,
                'movies': article.movies,
                'summary': article.summary,
                'text': article.text,
                'html': article.html
            }

            print(data['title'])
            print(data['text'])
            print()
            print()
            results.append(data)

            count += 1
            print(count)
            time.sleep(sleep_time)

        return results

    def write_to_csv(self, data, file_name):
        print('writing to CSV...')

        keys = data[0].keys()
        with open(file_name, 'w', newline='', encoding='utf-8') as output_file:
            dict_writer = csv.DictWriter(output_file, keys)
            dict_writer.writeheader()
            dict_writer.writerows(data)

    def write_to_mongo(self, data, collection):
        print('writing to mongoDB...')
        count = 0

        for d in data:
            collection.insert_one(d)
            count += 1
            print(count)


# For paywalled outlets (NYT, WSJ): log in once with Selenium, then reuse the
# authenticated session cookies with requests when fetching article pages.
class NewspaperScraperWithAuthentication(NewspaperScraper):
    def __init__(self, newspaper, searchTerm, dateStart, dateEnd, userID, password):
        NewspaperScraper.__init__(self, newspaper, searchTerm, dateStart, dateEnd)
        self.userId = userID
        self.password = password

        if newspaper == 'New York Times':
            self.credentials = {
                'userid': userID,
                'password1': password
            }
            self.login_url = 'https://myaccount.nytimes.com/auth/login'
            self.submit_id = 'submit'
        elif newspaper == 'Wall Street Journal':
            self.credentials = {
                'username': userID,
                'password': password
            }
            self.login_url = 'https://id.wsj.com/access/pages/wsj/us/signin.html'
            self.submit_id = 'submitButton'

    def newspaper_parser(self, sleep_time=0):
        print('running newspaper_parser()...')
        results = []
        count = 0

        # Log in through the site's form with Selenium...
        profile = webdriver.FirefoxProfile()
        browser = webdriver.Firefox(profile)
        credential_names = list(self.credentials.keys())

        browser.get(self.login_url)
        cred1 = browser.find_element_by_id(credential_names[0])
        cred2 = browser.find_element_by_id(credential_names[1])
        cred1.send_keys(self.credentials[credential_names[0]])
        cred2.send_keys(self.credentials[credential_names[1]])
        browser.find_element_by_id(self.submit_id).click()
        time.sleep(15)

        cookies = browser.get_cookies()
        browser.close()

        # ...then carry the session cookies over to requests for the article fetches.
        s = requests.Session()
        for cookie in cookies:
            s.cookies.set(cookie['name'], cookie['value'])

        for l in self.links:
            page = s.get(l)
            soup = BeautifulSoup(page.content, 'html.parser')
            article = Article(url=l)
            article.set_html(str(soup))

            try:
                article.parse()
                article.nlp()
            except Exception:
                time.sleep(60)
                continue

            data = {
                'title': article.title,
                'date_published': article.publish_date,
                'news_outlet': self.newspaper,
                'authors': article.authors,
                'feature_img': article.top_image,
                'article_link': article.canonical_link,
                'keywords': article.keywords,
                'movies': article.movies,
                'summary': article.summary,
                'text': article.text,
                'html': article.html
            }

            print(data['title'])
            print(data['text'])
            print()
            print()
            results.append(data)
            time.sleep(sleep_time)

            count += 1
            print(count)

        return results


class ChicagoTribuneScraper(NewspaperScraper):
    def get_pages(self, sleep_time=3):
        print('running get_pages()...')

        profile = webdriver.FirefoxProfile()
        browser = webdriver.Firefox(profile)

        links = []
        stop = False
        index = 1

        while not stop:
            browser.get('http://www.chicagotribune.com/search/dispatcher.front?page='
                        + str(index)
                        + '&sortby=display_time%20descending&target=stories&spell=on&Query='
                        + self.searchTerm
                        + '#trb_search')

            soup = BeautifulSoup(browser.page_source, 'html.parser')

            if not soup.find('div', class_='trb_search_results'):
                stop = True

            for result in soup.find_all('div', class_="trb_search_result_wrapper"):
                pub_date = result.find('time', class_='trb_search_result_datetime').get('data-dt')
                if ':' in pub_date:
                    # Timestamps containing a clock time (rather than a date) are treated as today.
                    pub_date = str(datetime.now(timezone('America/Chicago')).date())

                if self.check_dates(pub_date):
                    link = result.find('a', class_='trb_search_result_title')
                    ltext = 'http://www.chicagotribune.com' + link.get('href')

                    if ltext not in links:
                        print(ltext)
                        links.append(ltext)

                else:
                    stop = True
                    break

            index += 1
            time.sleep(sleep_time)

        browser.close()
        self.links = links
        return links


class LaTimesScraper(NewspaperScraper):
    def get_pages(self, sleep_time=3):
        print('running get_pages()...')

        profile = webdriver.FirefoxProfile()
        browser = webdriver.Firefox(profile)

        links = []
        stop = False
        index = 1

        while not stop:
            browser.get('http://www.latimes.com/search/dispatcher.front?page='
                        + str(index)
                        + '&sortby=display_time%20descending&target=stories&spell=on&Query='
                        + self.searchTerm
                        + '#trb_search')

            soup = BeautifulSoup(browser.page_source, 'html.parser')

            if not soup.find('div', class_='trb_search_results'):
                stop = True

            for result in soup.find_all('div', class_="trb_search_result_wrapper"):
                pub_date = result.find('time', class_='trb_search_result_datetime').get('data-dt')
                if ':' in pub_date:
                    pub_date = str(datetime.now(timezone('US/Pacific')).date())

                if self.check_dates(pub_date):
                    link = result.find('a', class_='trb_search_result_title')
                    ltext = 'http://www.latimes.com' + link.get('href')

                    if ltext not in links:
                        print(ltext)
                        links.append(ltext)

                else:
                    stop = True
                    break

            index += 1
            time.sleep(sleep_time)

        browser.close()
        self.links = links
        return links


class WashPostScraper(NewspaperScraper):
    def get_pages(self, sleep_time=3):
        print('running get_pages()...')

        browser = webdriver.Chrome()

        links = []
        stop = False
        index = 0

        while not stop:
            browser.get('https://www.washingtonpost.com/newssearch/'
                        + '?utm_term=.94befa345ad6&query='
                        + self.searchTerm
                        + '&sort=Date&datefilter=12%20Months&contenttype=Article'
                        + '&spellcheck&startat=' + str(index) + '#top')

            soup = BeautifulSoup(browser.page_source, 'html.parser')
            if not soup.find_all('div', class_="pb-feed-item"):
                stop = True
                continue

            for result in soup.find_all('div', class_="pb-feed-item"):
                if self.check_dates(result.find('span', class_='pb-timestamp').get_text()):
                    link = result.find('a', class_="ng-binding")
                    ltext = link.get('href')

                    if ltext not in links:
                        print(ltext)
                        links.append(ltext)

                else:
                    stop = True
                    break

            index += 20
            time.sleep(sleep_time)

        browser.close()
        self.links = links
        return links


class SlateScraper(NewspaperScraper):
    def get_pages(self, sleep_time=3):
        print('running get_pages()...')

        browser = webdriver.Chrome()

        links = []
        stop = False

        browser.get('http://www.slate.com/search.html#search=' + self.searchTerm)

        while not stop:
            soup = BeautifulSoup(browser.page_source, 'html.parser')

            for result in soup.find_all('div', class_="full-width left-image"):
                if self.check_dates(result.find('span', class_='timestamp').get_text()):
                    ltext = result.find('a').get('href')
                    section = self.get_section(ltext)

                    if (section == 'articles' or section == 'blogs') and ltext not in links:
                        print(ltext)
                        links.append(ltext)

            # Stop once the range shown in the results header reaches the total result count.
            header = soup.find('header', class_="tag-header").get_text().split()
            if int(header[2].split('-')[1]) == int(header[4]):
                stop = True

            try:
                element = browser.find_element_by_xpath('//*[@id="search_content"]/p/a')
                ActionChains(browser).move_to_element(element) \
                    .click(element) \
                    .perform()
                element = WebDriverWait(browser, 10).until(
                    EC.presence_of_element_located((By.ID, "search_results")))

            except Exception:
                stop = True

            time.sleep(sleep_time)

        browser.close()
        self.links = links
        return links

    def get_section(self, href):
        href = href[20:]
        try:
            return re.search('/.*?/', href).group(0)[1:-1]
        except Exception:
            return 'error'


class FoxNewsScraper(NewspaperScraper):
    def get_pages(self, sleep_time=3):
        print('running get_pages()...')

        profile = webdriver.FirefoxProfile()
        browser = webdriver.Firefox(profile)

        links = []
        stop = False
        index = 0

        while not stop:
            browser.get('http://www.foxnews.com/search-results/search?q='
                        + self.searchTerm
                        + '&ss=fn&sort=latest&type=story'
                        + '&min_date=' + str(self.dateStart.date()) + '&max_date=' + str(self.dateEnd.date())
                        + '&start='
                        + str(index))

            soup = BeautifulSoup(browser.page_source, 'html.parser')
            if not soup.find_all('div', class_="search-info"):
                stop = True
                continue

            for result in soup.find_all('div', class_="search-info"):
                if self.check_dates(result.find('span', class_='search-date').get_text()):
                    link = result.find('h3').find('a')
                    ltext = link.get('href')
                    section = self.get_section(ltext)

                    if section != 'v' and ltext not in links:
                        print(ltext)
                        links.append(ltext)

                else:
                    stop = True
                    break

            index += 10
            time.sleep(sleep_time)

        browser.close()
        self.links = links
        return links

    def get_section(self, href):
        href = href[22:]
        try:
            section = re.search('/.*?/', href).group(0)[1:-1]
            if (section == 'politics' or section == 'us' or section == 'opinion' or section == 'v'):
                return section
            else:
                return 'other'
        except Exception:
            return 'error'


class PoliticoScraper(NewspaperScraper):
    def get_pages(self, sleep_time=3):
        print('running get_pages()...')

        links = []
        stop = False
        index = 1

        while not stop:
            page = requests.get('http://www.politico.com/search/' + str(index) + '?s=newest&q=' + self.searchTerm)
            soup = BeautifulSoup(page.content, 'html.parser')

            for result in soup.find_all('article', class_='story-frag format-ml'):
                pub_date = result.find('p', class_='timestamp')
                if pub_date is None:
                    continue

                if self.check_dates(pub_date.get_text().split()[0]):
                    try:
                        link = result.find('h3').find('a')
                        ltext = link.get('href')
                        section = self.get_section(ltext)
                    except Exception:
                        continue

                    if (section == 'story' or section == 'blogs') and ltext not in links:
                        print(ltext)
                        links.append(ltext)

                else:
                    stop = True
                    break

            index += 1
            time.sleep(sleep_time)

        self.links = links
        return links

    def get_section(self, href):
        href = href[23:]
        try:
            return re.search('/.*?/', href).group(0)[1:-1]
        except Exception:
            return 'error'


class WeeklyStandardScraper(NewspaperScraper):
    def get_pages(self, sleep_time=3):
        print('running get_pages()...')

        browser = webdriver.Chrome()

        links = []
        stop = False

        browser.get('http://www.weeklystandard.com/search?query=' + self.searchTerm)

        while not stop:
            soup = BeautifulSoup(browser.page_source, 'html.parser')

            for result in soup.find_all('div', class_="data-item"):
                if self.check_dates(result.find('div', class_='item-pubdate').get_text()):
                    link = result.find('div', class_="item-headline").find('a')
                    ltext = link.get('href')

                    if ltext not in links:
                        print(ltext)
                        links.append(ltext)

                else:
                    stop = True
                    break

            try:
                element = browser.find_element_by_xpath('//*[@id="resultdata"]/div[22]/a')
                ActionChains(browser).move_to_element(element) \
                    .click(element) \
                    .perform()
                element = WebDriverWait(browser, 10).until(
                    EC.presence_of_element_located((By.ID, "resultdata")))

            except Exception:
                stop = True

            time.sleep(sleep_time)

        browser.close()
        self.links = links
        return links


class BloombergScraper(NewspaperScraper):
    def get_pages(self, sleep_time=3):
        print('running get_pages()...')

        profile = webdriver.FirefoxProfile()
        browser = webdriver.Firefox(profile)

        links = []
        stop = False
        index = 1
        days = (self.dateEnd.date() - self.dateStart.date()).days + 1

        while not stop:
            browser.get('https://www.bloomberg.com/search?query='
                        + self.searchTerm
                        + '&startTime=-' + str(days) + 'd'
                        + '&sort=time:desc'
                        + '&endTime=' + str(self.dateEnd.date()) + 'T23:59:59.999Z'
                        + '&page=' + str(index))

            soup = BeautifulSoup(browser.page_source, 'html.parser')

            if soup.find('div', class_="search-result-story__container") is None:
                stop = True
                continue

            for result in soup.find_all('div', class_="search-result-story__container"):
                if self.check_dates(result.find('span', class_='metadata-timestamp').get_text()):
                    link = result.find('h1', class_="search-result-story__headline")
                    ltext = link.find('a').get('href')
                    section = self.get_section(ltext)

                    if section == 'articles' and ltext not in links:
                        print(ltext)
                        links.append(ltext)

            index += 1
            time.sleep(sleep_time)

        browser.close()
        self.links = links
        return links

    def get_section(self, href):
        href = href[25:]
        try:
            return re.search('/.*?/', href).group(0)[1:-1]
        except Exception:
            return 'error'


class TimeScraper(NewspaperScraper):
    def get_pages(self, sleep_time=3):
        print('running get_pages()...')

        profile = webdriver.FirefoxProfile()
        browser = webdriver.Firefox(profile)

        links = []
        stop = False
        index = 1

        while not stop:
            browser.get('http://search.time.com/?q=' + self.searchTerm + '&startIndex=' + str(index) + '&sort=Date')
            soup = BeautifulSoup(browser.page_source, 'html.parser')

            for result in soup.find_all('div', class_="content-right"):
                pub_date = result.find('div', class_='content-snippet').get_text().split('...')[0].strip()
                # Relative timestamps ("N hours ago" / "N days ago") are converted to a calendar date.
                if 'hour' in pub_date:
                    pub_date = str((datetime.now(timezone('EST')) - timedelta(hours=int(pub_date.split()[0]))).date())
                elif 'day' in pub_date:
                    pub_date = str((datetime.today() - timedelta(days=int(pub_date.split()[0]))).date())

                if self.check_dates(pub_date):
                    link = result.find('div', class_="content-title")
                    ltext = link.find('a').get('href')

                    if ltext not in links:
                        print(ltext)
                        links.append(ltext)

                else:
                    stop = True
                    break

            error_message = soup.find('div', class_="search-results-message")
            if error_message:
                if error_message.get_text() == 'Error getting Search Results':
                    stop = True

            index += 10
            time.sleep(sleep_time)

        browser.close()
        self.links = links
        return links


class CNNScraper(NewspaperScraper):
    def get_pages(self, sleep_time=3):
        print('running get_pages()...')

        browser = webdriver.Chrome()

        links = []
        index = 0

        browser.get('http://www.cnn.com/search/?text=' + self.searchTerm)
        soup = BeautifulSoup(browser.page_source, 'html.parser')
        search_results = int(soup.find('div', class_='cn cn--idx-0 search-results_msg').get_text().split()[4])

        while index < search_results:
            soup = BeautifulSoup(browser.page_source, 'html.parser')

            for result in soup.find_all('article',
                                        class_="cd cd--card cd--idx-0 cd--large cd--horizontal cd--has-media"):
                pub_date = result.find('span', class_='cd__timestamp').get_text()
                if not pub_date:
                    continue
                if ':' in pub_date:
                    pub_date = pub_date.split(',')
                    pub_date = (pub_date[1] + ',' + pub_date[2]).strip()

                if self.check_dates(pub_date):
                    link = result.find('h3', class_="cd__headline").find('a')
                    ltext = link.get('href')

                    if 'http://' not in ltext:
                        ltext = 'http://www.cnn.com' + ltext

                    if ltext not in links:
                        print(ltext)
                        links.append(ltext)

            index += 10
            time.sleep(sleep_time)

            try:
                element = browser.find_element_by_xpath('//*[@id="cnnSearchPagination"]/div/div[3]/a/span[1]')
                ActionChains(browser).move_to_element(element) \
                    .click(element) \
                    .perform()
                element = WebDriverWait(browser, 10).until(
                    EC.presence_of_element_located((By.ID, "textResultsContainer")))

            except Exception:
                continue

        browser.close()
        self.links = links
        return links


class CNBCScraper(NewspaperScraper):
    def get_pages(self, sleep_time=3):
        print('running get_pages()...')

        links = []
        stop = False
        index = 1
        days = (self.dateEnd.date() - self.dateStart.date()).days + 1

        while not stop:
            page = requests.get('http://search.cnbc.com/rs/search/view.html?partnerId=2000'
                                + '&keywords=' + self.searchTerm
                                + '&sort=date&type=news&source=CNBC.com'
                                + '&pubtime=' + str(days) + '&pubfreq=d'
                                + '&page=' + str(index))
            soup = BeautifulSoup(page.content, 'html.parser')

            if soup.find('div', class_="SearchResultCard") is None:
                stop = True
                continue

            for result in soup.find_all('div', class_="SearchResultCard"):
                # The <time> tag carries an epoch timestamp in milliseconds; convert it to a date.
                seconds_since_epoch = float(re.findall(r'\d+', result.find('time').get_text())[0])
                pub_date = str(datetime.fromtimestamp(seconds_since_epoch / 1000).replace(hour=0, minute=0,
                                                                                          second=0, microsecond=0))

                if self.check_dates(pub_date):
                    link = result.find('h3', class_="title")
                    ltext = link.find('a').get('href')
                    if ltext not in links:
                        print(ltext)
                        links.append(ltext)

            index += 1
            time.sleep(sleep_time)

        self.links = links
        return links


class USATodayScraper(NewspaperScraper):
    def get_pages(self, sleep_time=5):
        print('running get_pages()...')

        browser = webdriver.Chrome()
        browser.get('http://www.usatoday.com/search/' + self.searchTerm + '/')

        links = []
        stop = False
        index = 1

        element = browser.find_element_by_xpath('/html/body/div[2]/div[1]/div/div[3]/span[2]')
        ActionChains(browser).move_to_element(element) \
            .click(element) \
            .perform()

        # The results page uses infinite scroll: keep scrolling until the page height stops growing.
        lastHeight = browser.execute_script("return document.body.scrollHeight")
        tries = 0

        while not stop:
            soup = BeautifulSoup(browser.page_source, 'html.parser')
            last_search_item = soup.find_all('li', class_=' search-result-item')[-1]
            link = last_search_item.find('a', class_='search-result-item-link').get('href')
            date_match = re.search('([0-9]{4}/[0-9]{2}/[0-9]{2})', link)
            if date_match is not None:
                print(date_match.group(1))

            browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(sleep_time)

            newHeight = browser.execute_script("return document.body.scrollHeight")
            if newHeight == lastHeight:
                tries += 1
                time.sleep(5)
                if tries >= 5:
                    stop = True
            else:
                tries = 0

            lastHeight = newHeight

        soup = BeautifulSoup(browser.page_source, 'html.parser')

        for result in soup.find_all('li', class_=' search-result-item'):
            link = result.find('a', class_='search-result-item-link').get('href')
            date_match = re.search('([0-9]{4}/[0-9]{2}/[0-9]{2})', link)

            if date_match is not None:
                if self.check_dates(date_match.group(1)):
                    ltext = 'http://www.usatoday.com/' + link

                    if ltext not in links:
                        print(ltext)
                        links.append(ltext)
            else:
                continue

            index += 1

        browser.close()
        self.links = links
        return links


class WSJScraper(NewspaperScraperWithAuthentication):
    def get_pages(self, sleep_time=3):
        print('running get_pages()...')

        links = []
        stop = False
        index = 1

        while not stop:
            page = requests.get('http://www.wsj.com/search/term.html?KEYWORDS='
                                + self.searchTerm
                                + '&min-date=' + str(self.dateStart.date()).replace('-', '/')
                                + '&max-date=' + str(self.dateEnd.date()).replace('-', '/')
                                + '&page=' + str(index)
                                + '&isAdvanced=true&daysback=4y&andor=AND&sort=date-desc&source=wsjarticle,wsjblogs,sitesearch')
            soup = BeautifulSoup(page.content, 'html.parser')

            if soup.find('div', class_="headline-item") is None:
                stop = True
                continue

            for result in soup.find_all('div', class_="headline-item"):
                pub_date = result.find('time', class_='date-stamp-container').get_text()
                # Relative timestamps ("N min ago" / "N hours ago") are converted to a calendar date.
                if 'min' in pub_date:
                    pub_date = str((datetime.now(timezone('EST')) - timedelta(minutes=int(pub_date.split()[0]))).date())
                elif 'hour' in pub_date:
                    pub_date = str((datetime.now(timezone('EST')) - timedelta(hours=int(pub_date.split()[0]))).date())
                else:
                    pub_date = pub_date.split()
                    pub_date = pub_date[0] + ' ' + pub_date[1] + ' ' + pub_date[2]

                if self.check_dates(pub_date):
                    link = result.find('h3', class_="headline")
                    ltext = link.find('a').get('href')
                    if 'http://' not in ltext:
                        ltext = 'http://www.wsj.com' + ltext

                    if ltext not in links and 'video' not in ltext:
                        print(ltext)
                        links.append(ltext)

            index += 1
            time.sleep(sleep_time)

        self.links = links
        return links


class NYTScraper(NewspaperScraperWithAuthentication):
    def get_pages(self, sleep_time=5):
        print('running get_pages()...')

        profile = webdriver.FirefoxProfile()
        browser = webdriver.Firefox(profile)

        links = []
        stop = False
        index = 1
        # Search one week at a time, newest first, walking backwards from dateEnd.
        current_start = (self.dateEnd - timedelta(days=6)).date()
        current_end = self.dateEnd.date()

        while not stop:
            while True:
                browser.get('http://query.nytimes.com/search/sitesearch/?action=click&contentCollection'
                            + '&region=TopBar&WT.nav=searchWidget&module=SearchSubmit&pgtype=Homepage#/'
                            + self.searchTerm
                            + '/from' + str(current_start).replace('-', '') + 'to' + str(current_end).replace('-', '')
                            + '/allresults/'
                            + str(index)
                            + '/allauthors/newest/')

                time.sleep(sleep_time)
                soup = BeautifulSoup(browser.page_source, 'html.parser')

                for result in soup.find_all('li', class_="story"):
                    pub_div = result.find('span', class_='dateline')
                    if pub_div is None:
                        continue

                    if self.check_dates(pub_div.get_text()):
                        link = result.find('div', class_='element2')
                        ltext = link.find('a').get('href')
                        section = self.get_section(ltext)

                        if section != 'video' and ltext not in links:
                            print(pub_div.get_text())
                            links.append(ltext)

                    else:
                        stop = True
                        break

                next_page = soup.find('a', class_="stepToPage next")
                if not next_page and index == 1:
                    continue
                elif not next_page or stop is True:
                    break

                index += 1

            current_start = current_start - timedelta(days=7)
            current_end = current_end - timedelta(days=7)
            index = 1

        browser.close()
        self.links = links
        return links

    def get_section(self, href):
        href = href[22:]
        try:
            return re.search('/.*?/', href).group(0)[1:-1]
        except Exception:
            return 'error'

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Newspaper Scrapers

## A quick precursor:
We used these scripts to collect data for two projects on The DataFace's website: [34 Percent of Articles about Trump Now Mention His Twitter Activity](http://thedataface.com/trumps-twitter-activity/) and [Trump and the Media: A Text Analysis](http://thedataface.com/trump-media-analysis/).

NewspaperScraper.py provides support for scraping the websites of 14 major media outlets:
* New York Times
* Washington Post
* Wall Street Journal
* USA Today
* CNN
* Fox News
* Politico
* Slate
* CNBC
* Bloomberg
* TIME
* The Weekly Standard
* LA Times
* Chicago Tribune

You can extend the library to support other websites by creating new classes in NewspaperScraper.py. Just make sure your class inherits from NewspaperScraper, then write your own version of get_pages() specific to the new site (see the short sketch at the end of the usage section below).

## Dependencies:
This project is indebted to the great work of Lucas Ou-Yang and his [Newspaper library](http://newspaper.readthedocs.io/en/latest/).

Here are the rest of the project's dependencies. Be sure to install these before proceeding:
* [requests](http://docs.python-requests.org/en/master/)
* [selenium](http://selenium-python.readthedocs.io/)
* [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/)
* MongoDB + [pyMongo](https://api.mongodb.com/python/current/)
* [pytz](http://pythonhosted.org/pytz/)

## Here's how to use this thing...
A NewspaperScraper object expects four inputs at a minimum: the scraper's name, a search term, a start date, and an end date. After initializing a scraper, the intended workflow is as follows:
* First, run get_pages() to find the URLs of all articles matching the search term within the relevant time period.
* Then, run newspaper_parser() to grab metadata about each article returned by get_pages().
* Finally, store the data using either write_to_mongo() or write_to_csv().

If you have MongoDB installed, you can get started quickly with RunScrapers.py, which takes those four inputs as command-line arguments.

Note 1: The NYT and WSJ scrapers require the credentials of a subscribed user to work. Those can be passed as command-line arguments as well (see RunScrapers.py).

Note 2: Some scrapers work better than others. We had some glitches gathering data from NYT and CNN in particular (oops), so feel free to fork + improve!
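
To make the workflow above concrete, here's a minimal end-to-end sketch using one of the bundled scrapers. The search term, date range, and output filename are illustrative placeholders, not project defaults:

```python
# A minimal sketch of the intended workflow; swap in any of the scraper classes.
from NewspaperScraper import PoliticoScraper

scraper = PoliticoScraper('Politico', 'fake news', '2016-11-01', '2017-02-01')
scraper.get_pages()                    # collect matching article URLs
data = scraper.newspaper_parser()      # download and parse each article
scraper.write_to_csv(data, 'politico_articles.csv')   # or write_to_mongo(data, collection)
```

The same four inputs map directly onto the command-line arguments RunScrapers.py expects (outlet name, search term, start date, end date), e.g. `python RunScrapers.py 'Politico' 'fake news' '2016-11-01' '2017-02-01'`.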
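
If you want to add support for another outlet, the sketch below shows the general shape of a new scraper class, assuming it lives in NewspaperScraper.py alongside the others (so the module's imports are available). The class name, search URL, and CSS selectors are made-up placeholders; replace them with whatever the target site actually uses:

```python
# Hypothetical example only: 'example-news.com' and its markup don't exist.
# A new scraper subclasses NewspaperScraper and implements get_pages(),
# filling self.links with article URLs that fall inside the date range.
class ExampleNewsScraper(NewspaperScraper):
    def get_pages(self, sleep_time=3):
        print('running get_pages()...')

        links = []
        index = 1

        while True:
            page = requests.get('http://www.example-news.com/search?q='
                                + self.searchTerm + '&page=' + str(index))
            soup = BeautifulSoup(page.content, 'html.parser')

            results = soup.find_all('div', class_='search-result')
            if not results:
                break

            for result in results:
                pub_date = result.find('time').get_text()
                if self.check_dates(pub_date):
                    links.append(result.find('a').get('href'))

            index += 1
            time.sleep(sleep_time)

        self.links = links
        return links
```

Everything else (newspaper_parser(), write_to_csv(), write_to_mongo()) is inherited from the base class.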

## What you'll end up with:
A database (or file) that contains the following pieces of metadata about each article:
* title
* date_published
* news_outlet
* authors
* feature_img
* article_link
* keywords
* movies
* summary
* text
* html

--------------------------------------------------------------------------------
/RunScrapers.py:
--------------------------------------------------------------------------------
import sys
from pymongo import MongoClient
from NewspaperScraper import *

client = MongoClient()
db = client.News_database


def run_scraper(scraper):
    scraper.get_pages()
    data = scraper.newspaper_parser()
    scraper.write_to_mongo(data, db.articles_about_fake_news_rerun)


def initialize_scraper(args):
    # args: [script name, outlet name, search term, start date, end date, (user ID, password for NYT/WSJ)]
    if args[1] == 'Chicago Tribune':
        run_scraper(ChicagoTribuneScraper(args[1], args[2], args[3], args[4]))
    elif args[1] == 'Los Angeles Times':
        run_scraper(LaTimesScraper(args[1], args[2], args[3], args[4]))
    elif args[1] == 'Washington Post':
        run_scraper(WashPostScraper(args[1], args[2], args[3], args[4]))
    elif args[1] == 'Slate':
        run_scraper(SlateScraper(args[1], args[2], args[3], args[4]))
    elif args[1] == 'Politico':
        run_scraper(PoliticoScraper(args[1], args[2], args[3], args[4]))
    elif args[1] == 'Fox News':
        run_scraper(FoxNewsScraper(args[1], args[2], args[3], args[4]))
    elif args[1] == 'The Weekly Standard':
        run_scraper(WeeklyStandardScraper(args[1], args[2], args[3], args[4]))
    elif args[1] == 'Bloomberg':
        run_scraper(BloombergScraper(args[1], args[2], args[3], args[4]))
    elif args[1] == 'TIME':
        run_scraper(TimeScraper(args[1], args[2], args[3], args[4]))
    elif args[1] == 'Wall Street Journal':
        run_scraper(WSJScraper(args[1], args[2], args[3], args[4], args[5], args[6]))
    elif args[1] == 'New York Times':
        run_scraper(NYTScraper(args[1], args[2], args[3], args[4], args[5], args[6]))
    elif args[1] == 'CNN':
        run_scraper(CNNScraper(args[1], args[2], args[3], args[4]))
    elif args[1] == 'USA Today':
        run_scraper(USATodayScraper(args[1], args[2], args[3], args[4]))
    elif args[1] == 'CNBC':
        run_scraper(CNBCScraper(args[1], args[2], args[3], args[4]))


if __name__ == "__main__":
    initialize_scraper(sys.argv)
--------------------------------------------------------------------------------