├── NewspaperScraper.py
├── README.md
└── RunScrapers.py

/NewspaperScraper.py:
--------------------------------------------------------------------------------
import re
import csv
import time
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
from pytz import timezone
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from dateutil.parser import parse
from newspaper import Article


# Base scraper: stores the search parameters, lets subclasses collect article
# links via get_pages(), and parses those links with newspaper's Article.
class NewspaperScraper:
    def __init__(self, newspaper, searchTerm, dateStart, dateEnd):
        self.newspaper = newspaper
        self.searchTerm = searchTerm
        self.dateStart = parse(dateStart)
        self.dateEnd = parse(dateEnd)
        self.links = []

    def get_newspaper_name(self):
        return self.newspaper

    def get_pages(self):
        # Each subclass implements its own crawler over the outlet's search results.
        print('Unimplemented for ' + self.newspaper + ' scraper')
        return

    def check_dates(self, date):
        page_date = parse(date)
        if self.dateStart <= page_date <= self.dateEnd:
            return True
        return False

    def newspaper_parser(self, sleep_time=0):
        print('running newspaper_parser()...')

        results = []
        count = 0

        for l in self.links:
            article = Article(url=l)
            try:
                article.build()
            except Exception:
                time.sleep(60)
                continue

            data = {
                'title': article.title,
                'date_published': article.publish_date,
                'news_outlet': self.newspaper,
                'authors': article.authors,
                'feature_img': article.top_image,
                'article_link': article.canonical_link,
                'keywords': article.keywords,
                'movies': article.movies,
                'summary': article.summary,
                'text': article.text,
                'html': article.html
            }

            print(data['title'])
            print(data['text'])
            print()
            print()
            results.append(data)

            count += 1
            print(count)
            time.sleep(sleep_time)

        return results

    def write_to_csv(self, data, file_name):
        print('writing to CSV...')

        keys = data[0].keys()
        with open(file_name, 'w', newline='', encoding='utf-8') as output_file:
            dict_writer = csv.DictWriter(output_file, keys)
            dict_writer.writeheader()
            dict_writer.writerows(data)

    def write_to_mongo(self, data, collection):
        print('writing to mongoDB...')
        count = 0

        for d in data:
            collection.insert_one(d)
            count += 1
            print(count)


# For paywalled outlets (NYT, WSJ): log in once with Selenium, then reuse the
# authenticated session cookies with requests when fetching article pages.
class NewspaperScraperWithAuthentication(NewspaperScraper):
    def __init__(self, newspaper, searchTerm, dateStart, dateEnd, userID, password):
        NewspaperScraper.__init__(self, newspaper, searchTerm, dateStart, dateEnd)
        self.userId = userID
        self.password = password

        if newspaper == 'New York Times':
            self.credentials = {
                'userid': userID,
                'password1': password
            }
            self.login_url = 'https://myaccount.nytimes.com/auth/login'
            self.submit_id = 'submit'
        elif newspaper == 'Wall Street Journal':
            self.credentials = {
                'username': userID,
                'password': password
            }
            self.login_url = 'https://id.wsj.com/access/pages/wsj/us/signin.html'
            self.submit_id = 'submitButton'

    def newspaper_parser(self, sleep_time=0):
        print('running newspaper_parser()...')
        results = []
        count = 0

        # Log in through the site's form with Selenium...
        profile = webdriver.FirefoxProfile()
        browser = webdriver.Firefox(profile)
        credential_names = list(self.credentials.keys())

        browser.get(self.login_url)
        cred1 = browser.find_element_by_id(credential_names[0])
        cred2 = browser.find_element_by_id(credential_names[1])
        cred1.send_keys(self.credentials[credential_names[0]])
        cred2.send_keys(self.credentials[credential_names[1]])
        browser.find_element_by_id(self.submit_id).click()
        time.sleep(15)

        cookies = browser.get_cookies()
        browser.close()

        # ...then carry the session cookies over to requests for the article fetches.
        s = requests.Session()
        for cookie in cookies:
            s.cookies.set(cookie['name'], cookie['value'])

        for l in self.links:
            page = s.get(l)
            soup = BeautifulSoup(page.content, 'html.parser')
            article = Article(url=l)
            article.set_html(str(soup))

            try:
                article.parse()
                article.nlp()
            except Exception:
                time.sleep(60)
                continue

            data = {
                'title': article.title,
                'date_published': article.publish_date,
                'news_outlet': self.newspaper,
                'authors': article.authors,
                'feature_img': article.top_image,
                'article_link': article.canonical_link,
                'keywords': article.keywords,
                'movies': article.movies,
                'summary': article.summary,
                'text': article.text,
                'html': article.html
            }

            print(data['title'])
            print(data['text'])
            print()
            print()
            results.append(data)
            time.sleep(sleep_time)

            count += 1
            print(count)

        return results


class ChicagoTribuneScraper(NewspaperScraper):
    def get_pages(self, sleep_time=3):
        print('running get_pages()...')

        profile = webdriver.FirefoxProfile()
        browser = webdriver.Firefox(profile)

        links = []
        stop = False
        index = 1

        while not stop:
            browser.get('http://www.chicagotribune.com/search/dispatcher.front?page='
                        + str(index)
                        + '&sortby=display_time%20descending&target=stories&spell=on&Query='
                        + self.searchTerm
                        + '#trb_search')

            soup = BeautifulSoup(browser.page_source, 'html.parser')

            if not soup.find('div', class_='trb_search_results'):
                stop = True

            for result in soup.find_all('div', class_="trb_search_result_wrapper"):
                pub_date = result.find('time', class_='trb_search_result_datetime').get('data-dt')
                if ':' in pub_date:
                    # Timestamps containing a clock time (rather than a date) are treated as today.
                    pub_date = str(datetime.now(timezone('America/Chicago')).date())

                if self.check_dates(pub_date):
                    link = result.find('a', class_='trb_search_result_title')
                    ltext = 'http://www.chicagotribune.com' + link.get('href')

                    if ltext not in links:
                        print(ltext)
                        links.append(ltext)

                else:
                    stop = True
                    break

            index += 1
            time.sleep(sleep_time)

        browser.close()
        self.links = links
        return links


class LaTimesScraper(NewspaperScraper):
    def get_pages(self, sleep_time=3):
        print('running get_pages()...')

        profile = webdriver.FirefoxProfile()
        browser = webdriver.Firefox(profile)

        links = []
        stop = False
        index = 1

        while not stop:
            browser.get('http://www.latimes.com/search/dispatcher.front?page='
                        + str(index)
                        + '&sortby=display_time%20descending&target=stories&spell=on&Query='
                        + self.searchTerm
                        + '#trb_search')

            soup = BeautifulSoup(browser.page_source, 'html.parser')

            if not soup.find('div', class_='trb_search_results'):
                stop = True

            for result in soup.find_all('div', class_="trb_search_result_wrapper"):
                pub_date = result.find('time', class_='trb_search_result_datetime').get('data-dt')
                if ':' in pub_date:
                    pub_date = str(datetime.now(timezone('US/Pacific')).date())

                if self.check_dates(pub_date):
                    link = result.find('a', class_='trb_search_result_title')
                    ltext = 'http://www.latimes.com' + link.get('href')

                    if ltext not in links:
                        print(ltext)
                        links.append(ltext)

                else:
                    stop = True
                    break

            index += 1
            time.sleep(sleep_time)

        browser.close()
        self.links = links
        return links


class WashPostScraper(NewspaperScraper):
    def get_pages(self, sleep_time=3):
        print('running get_pages()...')

        browser = webdriver.Chrome()

        links = []
        stop = False
        index = 0

        while not stop:
            browser.get('https://www.washingtonpost.com/newssearch/'
                        + '?utm_term=.94befa345ad6&query='
                        + self.searchTerm
                        + '&sort=Date&datefilter=12%20Months&contenttype=Article'
                        + '&spellcheck&startat=' + str(index) + '#top')

            soup = BeautifulSoup(browser.page_source, 'html.parser')
            if not soup.find_all('div', class_="pb-feed-item"):
                stop = True
                continue

            for result in soup.find_all('div', class_="pb-feed-item"):
                if self.check_dates(result.find('span', class_='pb-timestamp').get_text()):
                    link = result.find('a', class_="ng-binding")
                    ltext = link.get('href')

                    if ltext not in links:
                        print(ltext)
                        links.append(ltext)

                else:
                    stop = True
                    break

            index += 20
            time.sleep(sleep_time)

        browser.close()
        self.links = links
        return links


class SlateScraper(NewspaperScraper):
    def get_pages(self, sleep_time=3):
        print('running get_pages()...')

        browser = webdriver.Chrome()

        links = []
        stop = False

        browser.get('http://www.slate.com/search.html#search=' + self.searchTerm)

        while not stop:
            soup = BeautifulSoup(browser.page_source, 'html.parser')

            for result in soup.find_all('div', class_="full-width left-image"):
                if self.check_dates(result.find('span', class_='timestamp').get_text()):
                    ltext = result.find('a').get('href')
                    section = self.get_section(ltext)

                    if (section == 'articles' or section == 'blogs') and ltext not in links:
                        print(ltext)
                        links.append(ltext)

            # Stop once the range shown in the results header reaches the total result count.
            header = soup.find('header', class_="tag-header").get_text().split()
            if int(header[2].split('-')[1]) == int(header[4]):
                stop = True

            try:
                element = browser.find_element_by_xpath('//*[@id="search_content"]/p/a')
                ActionChains(browser).move_to_element(element) \
                    .click(element) \
                    .perform()
                element = WebDriverWait(browser, 10).until(
                    EC.presence_of_element_located((By.ID, "search_results")))

            except Exception:
                stop = True

            time.sleep(sleep_time)

        browser.close()
        self.links = links
        return links

    def get_section(self, href):
        href = href[20:]
        try:
            return re.search('/.*?/', href).group(0)[1:-1]
        except Exception:
            return 'error'


class FoxNewsScraper(NewspaperScraper):
    def get_pages(self, sleep_time=3):
        print('running get_pages()...')

        profile = webdriver.FirefoxProfile()
        browser = webdriver.Firefox(profile)

        links = []
        stop = False
        index = 0

        while not stop:
            browser.get('http://www.foxnews.com/search-results/search?q='
                        + self.searchTerm
                        + '&ss=fn&sort=latest&type=story'
                        + '&min_date=' + str(self.dateStart.date()) + '&max_date=' + str(self.dateEnd.date())
                        + '&start='
                        + str(index))

            soup = BeautifulSoup(browser.page_source, 'html.parser')
            if not soup.find_all('div', class_="search-info"):
                stop = True
                continue

            for result in soup.find_all('div', class_="search-info"):
                if self.check_dates(result.find('span', class_='search-date').get_text()):
                    link = result.find('h3').find('a')
                    ltext = link.get('href')
                    section = self.get_section(ltext)

                    if section != 'v' and ltext not in links:
                        print(ltext)
                        links.append(ltext)

                else:
                    stop = True
                    break

            index += 10
            time.sleep(sleep_time)

        browser.close()
        self.links = links
        return links

    def get_section(self, href):
        href = href[22:]
        try:
            section = re.search('/.*?/', href).group(0)[1:-1]
            if (section == 'politics' or section == 'us' or section == 'opinion' or section == 'v'):
                return section
            else:
                return 'other'
        except Exception:
            return 'error'


class PoliticoScraper(NewspaperScraper):
    def get_pages(self, sleep_time=3):
        print('running get_pages()...')

        links = []
        stop = False
        index = 1

        while not stop:
            page = requests.get('http://www.politico.com/search/' + str(index) + '?s=newest&q=' + self.searchTerm)
            soup = BeautifulSoup(page.content, 'html.parser')

            for result in soup.find_all('article', class_='story-frag format-ml'):
                pub_date = result.find('p', class_='timestamp')
                if pub_date is None:
                    continue

                if self.check_dates(pub_date.get_text().split()[0]):
                    try:
                        link = result.find('h3').find('a')
                        ltext = link.get('href')
                        section = self.get_section(ltext)
                    except Exception:
                        continue

                    if (section == 'story' or section == 'blogs') and ltext not in links:
                        print(ltext)
                        links.append(ltext)

                else:
                    stop = True
                    break

            index += 1
            time.sleep(sleep_time)

        self.links = links
        return links

    def get_section(self, href):
        href = href[23:]
        try:
            return re.search('/.*?/', href).group(0)[1:-1]
        except Exception:
            return 'error'


class WeeklyStandardScraper(NewspaperScraper):
    def get_pages(self, sleep_time=3):
        print('running get_pages()...')

        browser = webdriver.Chrome()

        links = []
        stop = False

        browser.get('http://www.weeklystandard.com/search?query=' + self.searchTerm)

        while not stop:
            soup = BeautifulSoup(browser.page_source, 'html.parser')

            for result in soup.find_all('div', class_="data-item"):
                if self.check_dates(result.find('div', class_='item-pubdate').get_text()):
                    link = result.find('div', class_="item-headline").find('a')
                    ltext = link.get('href')

                    if ltext not in links:
                        print(ltext)
                        links.append(ltext)

                else:
                    stop = True
                    break

            try:
                element = browser.find_element_by_xpath('//*[@id="resultdata"]/div[22]/a')
                ActionChains(browser).move_to_element(element) \
                    .click(element) \
                    .perform()
                element = WebDriverWait(browser, 10).until(
                    EC.presence_of_element_located((By.ID, "resultdata")))

            except Exception:
                stop = True

            time.sleep(sleep_time)

        browser.close()
        self.links = links
        return links


class BloombergScraper(NewspaperScraper):
    def get_pages(self, sleep_time=3):
        print('running get_pages()...')

        profile = webdriver.FirefoxProfile()
        browser = webdriver.Firefox(profile)

        links = []
        stop = False
        index = 1
        days = (self.dateEnd.date() - self.dateStart.date()).days + 1

        while not stop:
            browser.get('https://www.bloomberg.com/search?query='
                        + self.searchTerm
                        + '&startTime=-' + str(days) + 'd'
                        + '&sort=time:desc'
                        + '&endTime=' + str(self.dateEnd.date()) + 'T23:59:59.999Z'
                        + '&page=' + str(index))

            soup = BeautifulSoup(browser.page_source, 'html.parser')

            if soup.find('div', class_="search-result-story__container") is None:
                stop = True
                continue

            for result in soup.find_all('div', class_="search-result-story__container"):
                if self.check_dates(result.find('span', class_='metadata-timestamp').get_text()):
                    link = result.find('h1', class_="search-result-story__headline")
                    ltext = link.find('a').get('href')
                    section = self.get_section(ltext)

                    if section == 'articles' and ltext not in links:
                        print(ltext)
                        links.append(ltext)

            index += 1
            time.sleep(sleep_time)

        browser.close()
        self.links = links
        return links

    def get_section(self, href):
        href = href[25:]
        try:
            return re.search('/.*?/', href).group(0)[1:-1]
        except Exception:
            return 'error'


class TimeScraper(NewspaperScraper):
    def get_pages(self, sleep_time=3):
        print('running get_pages()...')

        profile = webdriver.FirefoxProfile()
        browser = webdriver.Firefox(profile)

        links = []
        stop = False
        index = 1

        while not stop:
            browser.get('http://search.time.com/?q=' + self.searchTerm + '&startIndex=' + str(index) + '&sort=Date')
            soup = BeautifulSoup(browser.page_source, 'html.parser')

            for result in soup.find_all('div', class_="content-right"):
                pub_date = result.find('div', class_='content-snippet').get_text().split('...')[0].strip()
                # Relative timestamps ("N hours ago" / "N days ago") are converted to a calendar date.
                if 'hour' in pub_date:
                    pub_date = str((datetime.now(timezone('EST')) - timedelta(hours=int(pub_date.split()[0]))).date())
                elif 'day' in pub_date:
                    pub_date = str((datetime.today() - timedelta(days=int(pub_date.split()[0]))).date())

                if self.check_dates(pub_date):
                    link = result.find('div', class_="content-title")
                    ltext = link.find('a').get('href')

                    if ltext not in links:
                        print(ltext)
                        links.append(ltext)

                else:
                    stop = True
                    break

            error_message = soup.find('div', class_="search-results-message")
            if error_message:
                if error_message.get_text() == 'Error getting Search Results':
                    stop = True

            index += 10
            time.sleep(sleep_time)

        browser.close()
        self.links = links
        return links


class CNNScraper(NewspaperScraper):
    def get_pages(self, sleep_time=3):
        print('running get_pages()...')

        browser = webdriver.Chrome()

        links = []
        index = 0

        browser.get('http://www.cnn.com/search/?text=' + self.searchTerm)
        soup = BeautifulSoup(browser.page_source, 'html.parser')
        search_results = int(soup.find('div', class_='cn cn--idx-0 search-results_msg').get_text().split()[4])

        while index < search_results:
            soup = BeautifulSoup(browser.page_source, 'html.parser')

            for result in soup.find_all('article',
                                        class_="cd cd--card cd--idx-0 cd--large cd--horizontal cd--has-media"):
                pub_date = result.find('span', class_='cd__timestamp').get_text()
                if not pub_date:
                    continue
                if ':' in pub_date:
                    pub_date = pub_date.split(',')
                    pub_date = (pub_date[1] + ',' + pub_date[2]).strip()

                if self.check_dates(pub_date):
                    link = result.find('h3', class_="cd__headline").find('a')
                    ltext = link.get('href')

                    if 'http://' not in ltext:
                        ltext = 'http://www.cnn.com' + ltext

                    if ltext not in links:
                        print(ltext)
                        links.append(ltext)

            index += 10
            time.sleep(sleep_time)

            try:
                element = browser.find_element_by_xpath('//*[@id="cnnSearchPagination"]/div/div[3]/a/span[1]')
                ActionChains(browser).move_to_element(element) \
                    .click(element) \
                    .perform()
                element = WebDriverWait(browser, 10).until(
                    EC.presence_of_element_located((By.ID, "textResultsContainer")))

            except Exception:
                continue

        browser.close()
        self.links = links
        return links


class CNBCScraper(NewspaperScraper):
    def get_pages(self, sleep_time=3):
        print('running get_pages()...')

        links = []
        stop = False
        index = 1
        days = (self.dateEnd.date() - self.dateStart.date()).days + 1

        while not stop:
            page = requests.get('http://search.cnbc.com/rs/search/view.html?partnerId=2000'
                                + '&keywords=' + self.searchTerm
                                + '&sort=date&type=news&source=CNBC.com'
                                + '&pubtime=' + str(days) + '&pubfreq=d'
                                + '&page=' + str(index))
            soup = BeautifulSoup(page.content, 'html.parser')

            if soup.find('div', class_="SearchResultCard") is None:
                stop = True
                continue

            for result in soup.find_all('div', class_="SearchResultCard"):
                # The <time> tag carries an epoch timestamp in milliseconds; convert it to a date.
                seconds_since_epoch = float(re.findall(r'\d+', result.find('time').get_text())[0])
                pub_date = str(datetime.fromtimestamp(seconds_since_epoch / 1000).replace(hour=0, minute=0,
                                                                                          second=0, microsecond=0))

                if self.check_dates(pub_date):
                    link = result.find('h3', class_="title")
                    ltext = link.find('a').get('href')
                    if ltext not in links:
                        print(ltext)
                        links.append(ltext)

            index += 1
            time.sleep(sleep_time)

        self.links = links
        return links


class USATodayScraper(NewspaperScraper):
    def get_pages(self, sleep_time=5):
        print('running get_pages()...')

        browser = webdriver.Chrome()
        browser.get('http://www.usatoday.com/search/' + self.searchTerm + '/')

        links = []
        stop = False
        index = 1

        element = browser.find_element_by_xpath('/html/body/div[2]/div[1]/div/div[3]/span[2]')
        ActionChains(browser).move_to_element(element) \
            .click(element) \
            .perform()

        # The results page uses infinite scroll: keep scrolling until the page height stops growing.
        lastHeight = browser.execute_script("return document.body.scrollHeight")
        tries = 0

        while not stop:
            soup = BeautifulSoup(browser.page_source, 'html.parser')
            last_search_item = soup.find_all('li', class_=' search-result-item')[-1]
            link = last_search_item.find('a', class_='search-result-item-link').get('href')
            date_match = re.search('([0-9]{4}/[0-9]{2}/[0-9]{2})', link)
            if date_match is not None:
                print(date_match.group(1))

            browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(sleep_time)

            newHeight = browser.execute_script("return document.body.scrollHeight")
            if newHeight == lastHeight:
                tries += 1
                time.sleep(5)
                if tries >= 5:
                    stop = True
            else:
                tries = 0

            lastHeight = newHeight

        soup = BeautifulSoup(browser.page_source, 'html.parser')

        for result in soup.find_all('li', class_=' search-result-item'):
            link = result.find('a', class_='search-result-item-link').get('href')
            date_match = re.search('([0-9]{4}/[0-9]{2}/[0-9]{2})', link)

            if date_match is not None:
                if self.check_dates(date_match.group(1)):
                    ltext = 'http://www.usatoday.com/' + link

                    if ltext not in links:
                        print(ltext)
                        links.append(ltext)
            else:
                continue

            index += 1

        browser.close()
        self.links = links
        return links


class WSJScraper(NewspaperScraperWithAuthentication):
    def get_pages(self, sleep_time=3):
        print('running get_pages()...')

        links = []
        stop = False
        index = 1

        while not stop:
            page = requests.get('http://www.wsj.com/search/term.html?KEYWORDS='
                                + self.searchTerm
                                + '&min-date=' + str(self.dateStart.date()).replace('-', '/')
                                + '&max-date=' + str(self.dateEnd.date()).replace('-', '/')
                                + '&page=' + str(index)
                                + '&isAdvanced=true&daysback=4y&andor=AND&sort=date-desc&source=wsjarticle,wsjblogs,sitesearch')
            soup = BeautifulSoup(page.content, 'html.parser')

            if soup.find('div', class_="headline-item") is None:
                stop = True
                continue

            for result in soup.find_all('div', class_="headline-item"):
                pub_date = result.find('time', class_='date-stamp-container').get_text()
                # Relative timestamps ("N min ago" / "N hours ago") are converted to a calendar date.
                if 'min' in pub_date:
                    pub_date = str((datetime.now(timezone('EST')) - timedelta(minutes=int(pub_date.split()[0]))).date())
                elif 'hour' in pub_date:
                    pub_date = str((datetime.now(timezone('EST')) - timedelta(hours=int(pub_date.split()[0]))).date())
                else:
                    pub_date = pub_date.split()
                    pub_date = pub_date[0] + ' ' + pub_date[1] + ' ' + pub_date[2]

                if self.check_dates(pub_date):
                    link = result.find('h3', class_="headline")
                    ltext = link.find('a').get('href')
                    if 'http://' not in ltext:
                        ltext = 'http://www.wsj.com' + ltext

                    if ltext not in links and 'video' not in ltext:
                        print(ltext)
                        links.append(ltext)

            index += 1
            time.sleep(sleep_time)

        self.links = links
        return links


class NYTScraper(NewspaperScraperWithAuthentication):
    def get_pages(self, sleep_time=5):
        print('running get_pages()...')

        profile = webdriver.FirefoxProfile()
        browser = webdriver.Firefox(profile)

        links = []
        stop = False
        index = 1
        # Search one week at a time, newest first, walking backwards from dateEnd.
        current_start = (self.dateEnd - timedelta(days=6)).date()
        current_end = self.dateEnd.date()

        while not stop:
            while True:
                browser.get('http://query.nytimes.com/search/sitesearch/?action=click&contentCollection'
                            + '&region=TopBar&WT.nav=searchWidget&module=SearchSubmit&pgtype=Homepage#/'
                            + self.searchTerm
                            + '/from' + str(current_start).replace('-', '') + 'to' + str(current_end).replace('-', '')
                            + '/allresults/'
                            + str(index)
                            + '/allauthors/newest/')

                time.sleep(sleep_time)
                soup = BeautifulSoup(browser.page_source, 'html.parser')

                for result in soup.find_all('li', class_="story"):
                    pub_div = result.find('span', class_='dateline')
                    if pub_div is None:
                        continue

                    if self.check_dates(pub_div.get_text()):
                        link = result.find('div', class_='element2')
                        ltext = link.find('a').get('href')
                        section = self.get_section(ltext)

                        if section != 'video' and ltext not in links:
                            print(pub_div.get_text())
                            links.append(ltext)

                    else:
                        stop = True
                        break

                next_page = soup.find('a', class_="stepToPage next")
                if not next_page and index == 1:
                    continue
                elif not next_page or stop is True:
                    break

                index += 1

            current_start = current_start - timedelta(days=7)
            current_end = current_end - timedelta(days=7)
            index = 1

        browser.close()
        self.links = links
        return links

    def get_section(self, href):
        href = href[22:]
        try:
            return re.search('/.*?/', href).group(0)[1:-1]
        except Exception:
            return 'error'

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Newspaper Scrapers

## A quick precursor:
We used these scripts to collect data for two projects on The DataFace's website: [34 Percent of Articles about Trump Now Mention His Twitter Activity](http://thedataface.com/trumps-twitter-activity/) and [Trump and the Media: A Text Analysis](http://thedataface.com/trump-media-analysis/).

NewspaperScraper.py provides support for scraping the websites of 14 major media outlets:
* New York Times
* Washington Post
* Wall Street Journal
* USA Today
* CNN
* Fox News
* Politico
* Slate
* CNBC
* Bloomberg
* TIME
* The Weekly Standard
* LA Times
* Chicago Tribune

You can extend the library to support other websites by creating new classes in NewspaperScraper.py. Just make sure your class inherits from NewspaperScraper, then write your own version of get_pages() specific to the new site (see the short sketch at the end of the usage section below).

## Dependencies:
This project is indebted to the great work of Lucas Ou-Yang and his [Newspaper library](http://newspaper.readthedocs.io/en/latest/).

Here are the rest of the project's dependencies. Be sure to install these before proceeding:
* [requests](http://docs.python-requests.org/en/master/)
* [selenium](http://selenium-python.readthedocs.io/)
* [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/)
* MongoDB + [pyMongo](https://api.mongodb.com/python/current/)
* [pytz](http://pythonhosted.org/pytz/)

## Here's how to use this thing...
A NewspaperScraper object expects four inputs at a minimum: the scraper's name, a search term, a start date, and an end date. After initializing a scraper, the intended workflow is as follows:
* First, run get_pages() to find the URLs of all articles matching the search term within the relevant time period.
* Then, run newspaper_parser() to grab metadata about each article returned by get_pages().
* Finally, store the data using either write_to_mongo() or write_to_csv().

If you have MongoDB installed, you can get started quickly with RunScrapers.py, which takes those four inputs as command-line arguments.

Note 1: The NYT and WSJ scrapers require the credentials of a subscribed user to work. Those can be passed as command-line arguments as well (see RunScrapers.py).

Note 2: Some scrapers work better than others. We had some glitches gathering data from NYT and CNN in particular (oops), so feel free to fork + improve!
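
To make the workflow above concrete, here's a minimal end-to-end sketch using one of the bundled scrapers. The search term, date range, and output filename are illustrative placeholders, not project defaults:

```python
# A minimal sketch of the intended workflow; swap in any of the scraper classes.
from NewspaperScraper import PoliticoScraper

scraper = PoliticoScraper('Politico', 'fake news', '2016-11-01', '2017-02-01')
scraper.get_pages()                    # collect matching article URLs
data = scraper.newspaper_parser()      # download and parse each article
scraper.write_to_csv(data, 'politico_articles.csv')   # or write_to_mongo(data, collection)
```

The same four inputs map directly onto the command-line arguments RunScrapers.py expects (outlet name, search term, start date, end date), e.g. `python RunScrapers.py 'Politico' 'fake news' '2016-11-01' '2017-02-01'`.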
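
If you want to add support for another outlet, the sketch below shows the general shape of a new scraper class, assuming it lives in NewspaperScraper.py alongside the others (so the module's imports are available). The class name, search URL, and CSS selectors are made-up placeholders; replace them with whatever the target site actually uses:

```python
# Hypothetical example only: 'example-news.com' and its markup don't exist.
# A new scraper subclasses NewspaperScraper and implements get_pages(),
# filling self.links with article URLs that fall inside the date range.
class ExampleNewsScraper(NewspaperScraper):
    def get_pages(self, sleep_time=3):
        print('running get_pages()...')

        links = []
        index = 1

        while True:
            page = requests.get('http://www.example-news.com/search?q='
                                + self.searchTerm + '&page=' + str(index))
            soup = BeautifulSoup(page.content, 'html.parser')

            results = soup.find_all('div', class_='search-result')
            if not results:
                break

            for result in results:
                pub_date = result.find('time').get_text()
                if self.check_dates(pub_date):
                    links.append(result.find('a').get('href'))

            index += 1
            time.sleep(sleep_time)

        self.links = links
        return links
```

Everything else (newspaper_parser(), write_to_csv(), write_to_mongo()) is inherited from the base class.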

## What you'll end up with:
A database (or file) that contains the following pieces of metadata about each article:
* title
* date_published
* news_outlet
* authors
* feature_img
* article_link
* keywords
* movies
* summary
* text
* html

--------------------------------------------------------------------------------
/RunScrapers.py:
--------------------------------------------------------------------------------
import sys
from pymongo import MongoClient
from NewspaperScraper import *

client = MongoClient()
db = client.News_database


def run_scraper(scraper):
    scraper.get_pages()
    data = scraper.newspaper_parser()
    scraper.write_to_mongo(data, db.articles_about_fake_news_rerun)


def initialize_scraper(args):
    # args: [script name, outlet name, search term, start date, end date, (user ID, password for NYT/WSJ)]
    if args[1] == 'Chicago Tribune':
        run_scraper(ChicagoTribuneScraper(args[1], args[2], args[3], args[4]))
    elif args[1] == 'Los Angeles Times':
        run_scraper(LaTimesScraper(args[1], args[2], args[3], args[4]))
    elif args[1] == 'Washington Post':
        run_scraper(WashPostScraper(args[1], args[2], args[3], args[4]))
    elif args[1] == 'Slate':
        run_scraper(SlateScraper(args[1], args[2], args[3], args[4]))
    elif args[1] == 'Politico':
        run_scraper(PoliticoScraper(args[1], args[2], args[3], args[4]))
    elif args[1] == 'Fox News':
        run_scraper(FoxNewsScraper(args[1], args[2], args[3], args[4]))
    elif args[1] == 'The Weekly Standard':
        run_scraper(WeeklyStandardScraper(args[1], args[2], args[3], args[4]))
    elif args[1] == 'Bloomberg':
        run_scraper(BloombergScraper(args[1], args[2], args[3], args[4]))
    elif args[1] == 'TIME':
        run_scraper(TimeScraper(args[1], args[2], args[3], args[4]))
    elif args[1] == 'Wall Street Journal':
        run_scraper(WSJScraper(args[1], args[2], args[3], args[4], args[5], args[6]))
    elif args[1] == 'New York Times':
        run_scraper(NYTScraper(args[1], args[2], args[3], args[4], args[5], args[6]))
    elif args[1] == 'CNN':
        run_scraper(CNNScraper(args[1], args[2], args[3], args[4]))
    elif args[1] == 'USA Today':
        run_scraper(USATodayScraper(args[1], args[2], args[3], args[4]))
    elif args[1] == 'CNBC':
        run_scraper(CNBCScraper(args[1], args[2], args[3], args[4]))


if __name__ == "__main__":
    initialize_scraper(sys.argv)
--------------------------------------------------------------------------------