├── .gitignore
├── source_list.csv
├── cloacina
│   ├── __init__.py
│   ├── construct_page_list.py
│   ├── mongo_connection.py
│   ├── extract_from_b64.py
│   ├── get_source_day_total.py
│   ├── authenticate.py
│   ├── download_day_source.py
│   ├── utilities.py
│   └── get_results.py
├── default_config.ini
├── README.md
├── LICENSE
├── source_name_id.json
└── bbc_test_download.py
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc 2 | *.swp 3 | default_config.ini 4 |
--------------------------------------------------------------------------------
/source_list.csv:
--------------------------------------------------------------------------------
1 | New York Times;1981-01-01;1983-12-31 2 |
--------------------------------------------------------------------------------
/cloacina/__init__.py:
--------------------------------------------------------------------------------
1 | from authenticate import authenticate 2 | from construct_page_list import construct_page_list 3 | from download_day_source import download_day_source 4 | from extract_from_b64 import extract_from_b64 5 | from get_results import get_results 6 | from get_source_day_total import get_source_day_total 7 | from utilities import parse_config 8 | import mongo_connection 9 |
--------------------------------------------------------------------------------
/default_config.ini:
--------------------------------------------------------------------------------
1 | [LexisNexis] 2 | user = fool 3 | password = highentropy 4 | 5 | [Database] 6 | collection_list = stories 7 | 8 | [URLS] 9 | file = source_list.csv 10 | 11 | [Processes] 12 | pool_size = 30 13 | 14 | [Logging] 15 | log_file = scraper_log.log 16 | #Can be set to debug, info, or warning. If debug, there will be a lot of information. 17 | #If info, there will be entries about new additions to the database and errors. 18 | #If warning, only errors thrown by the scraper will be logged.
19 | level = info 20 | 21 | [Auth] 22 | auth_db = db_name 23 | auth_user = username 24 | auth_pass = password 25 | db_host = 127.0.0.1 26 |
--------------------------------------------------------------------------------
/cloacina/construct_page_list.py:
--------------------------------------------------------------------------------
1 | def construct_page_list(total_results): 2 | total_results = int(total_results) 3 | base_iter = total_results / 10 4 | remainder = total_results % 10 5 | 6 | iter_list = [] 7 | 8 | if total_results == 0: 9 | return [(0, 0)] 10 | 11 | if total_results < 10: 12 | return [(1, total_results)] 13 | 14 | for i in range(base_iter): 15 | iter_list.append((i * 10 + 1, i * 10 + 10)) 16 | 17 | if remainder: 18 | iter_list.append((base_iter * 10 + 1, base_iter * 10 + remainder)) 19 | 20 | return iter_list 21 |
--------------------------------------------------------------------------------
/cloacina/mongo_connection.py:
--------------------------------------------------------------------------------
1 | import datetime 2 | 3 | 4 | def add_entry(collection, news_source, article_title, publication_date_raw, article_body, lang, doc_id): 5 | toInsert = {"news_source": news_source, 6 | "article_title": article_title, 7 | "publication_date_raw": publication_date_raw, 8 | "date_added": datetime.datetime.utcnow(), 9 | "article_body": article_body, 10 | "stanford": 0, 11 | "language": lang, 12 | "doc_id" : doc_id} 13 | object_id = collection.insert(toInsert) 14 | return object_id 15 | 16 | 17 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | cloacina 2 | ======= 3 | 4 | Tools for downloading from the LexisNexis API 5 | 6 | Currently, `cloacina` includes high-level interfaces for several LexisNexis 7 | endpoints and operations, including: 8 | 9 | - authentication 10 | 11 | - getting the number of results available for a given source for a given day 12 | 13 | - downloading and formatting all articles for a given source for a given day 14 | 15 | 16 | Usage 17 | ----- 18 | 19 | Run a test to download some days of BBC Monitoring articles like this: 20 | `python bbc_test_download.py` 21 | 22 | To modify the sources and dates downloaded, modify the `source_list.csv` file, 23 | which is a semicolon-separated file with rows of the form source;startdate;enddate. 24 | 25 | The Mongo collection the stories go into is specified in the 26 | `default_config.ini` file. 27 | 28 | If you're a member of the Open Event Data Alliance, get in touch with me for 29 | the test username/password. 30 |
--------------------------------------------------------------------------------
/cloacina/extract_from_b64.py:
--------------------------------------------------------------------------------
1 | from bs4 import BeautifulSoup 2 | import json 3 | import re 4 | 5 | def extract_from_b64(encoded_doc): 6 | #doc = base64.urlsafe_b64decode(encoded_doc) 7 | doc = encoded_doc.decode("base64") 8 | doc = doc.decode('utf-8') 9 | doc = re.sub("<br>", " ", doc) 10 | doc = re.sub('<br/>', " ", doc) 11 | soup = BeautifulSoup(doc) 12 | news_source = soup.find("meta", {"name":"sourceName"})['content'] 13 | article_title = soup.find("title").text.strip() 14 | try: 15 | publication_date = soup.find("div", {"class":"PUB-DATE"}).text.strip() 16 | except AttributeError: 17 | publication_date = soup.find("div", {"class":"DATE"}).text.strip() 18 | article_body = soup.find("div", {"class":"BODY"}).text.strip() 19 | doc_id = soup.find("meta", {"name":"documentToken"})['content'] 20 | 21 | data = {"news_source" : news_source, 22 | "publication_date_raw" : publication_date, 23 | "article_title" : article_title, 24 | "article_body" : article_body, 25 | "doc_id" : doc_id} 26 | 27 | return data 28 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Andy Halterman 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 |
--------------------------------------------------------------------------------
/cloacina/get_source_day_total.py:
--------------------------------------------------------------------------------
1 | import re 2 | from get_results import get_results 3 | 4 | # logging is a hassle in multiprocessing, so use print for now. Sorry! 5 | 6 | def get_source_day_total(source_name, date, authToken): 7 | try: 8 | t = get_results(source_name, date, 1, 10, authToken) 9 | if t.status_code == 500: 10 | #print "There was an error. Check the log file" 11 | print "Error 500 from server on getting source-day total for {0} on {1}: {2}".format(source_name, date, t.text) 12 | return 0 13 | c = re.findall('">(\d+?)
10 | 11 | 12 | {0} 13 | {1} 14 | 15 | 16 | 17 | """.format(username, password) 18 | 19 | headers = {"Host": "www.lexisnexis.com", 20 | "Content-Type": "text/xml; charset=UTF-8", 21 | "Content-Length": len(request), 22 | "SOAPAction": "Authenticate"} 23 | 24 | t = requests.post(url="https://www.lexisnexis.com/wsapi/v1/services/Authentication", 25 | headers = headers, 26 | data = request) 27 | 28 | t = t.text 29 | p = ET.fromstring(t) 30 | p = p[0][0] 31 | for i in p.findall('{http://authenticate.authentication.services.v1.wsapi.lexisnexis.com}binarySecurityToken'): 32 | return i.text 33 | 34 | -------------------------------------------------------------------------------- /cloacina/download_day_source.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from bs4 import BeautifulSoup 3 | from construct_page_list import construct_page_list 4 | from extract_from_b64 import extract_from_b64 5 | from get_results import get_results 6 | 7 | def download_day_source(source_name, date, source_day_total, authToken): 8 | 9 | iter_list = construct_page_list(source_day_total) 10 | 11 | results_list = [] 12 | 13 | # if source_day_total is 0, just pass? 14 | 15 | for p in iter_list: 16 | t = get_results(source_name, date, p[0], p[1], authToken) 17 | results_list.append(t.text) 18 | 19 | output_list = [] # keep outside the iter 20 | junk_list = [] 21 | 22 | for t in results_list: 23 | soup = BeautifulSoup(t) 24 | 25 | for num, i in enumerate(soup.findAll("ns1:document")): 26 | try: 27 | t = i.text 28 | # try: 29 | d = extract_from_b64(t) 30 | output_list.append(d) 31 | except: 32 | junk_list.append(t) # error handling ¯\_(ツ)_/¯ 33 | if junk_list: 34 | print "There were problems getting text from the base 64 in download_day_source. {0}".format(len(junk_list)) 35 | 36 | 37 | output = {"stories" : output_list, 38 | "junk" : junk_list} 39 | return output 40 | -------------------------------------------------------------------------------- /cloacina/utilities.py: -------------------------------------------------------------------------------- 1 | from ConfigParser import ConfigParser 2 | import glob 3 | import os 4 | 5 | # thanks, OEDA! https://github.com/openeventdata/scraper/blob/master/scraper.py 6 | 7 | def _parse_config(parser): 8 | try: 9 | if 'Auth' in parser.sections(): 10 | auth_db = parser.get('Auth', 'auth_db') 11 | auth_user = parser.get('Auth', 'auth_user') 12 | auth_pass = parser.get('Auth', 'auth_pass') 13 | db_host = parser.get('Auth', 'db_host') 14 | else: 15 | # Try env vars too 16 | auth_db = os.getenv('MONGO_AUTH_DB') or '' 17 | auth_user = os.getenv('MONGO_AUTH_USER') or '' 18 | auth_pass = os.getenv('MONGO_AUTH_PASS') or '' 19 | db_host = os.getenv('MONGO_HOST') or '' 20 | 21 | ln_user = parser.get('LexisNexis', 'user') 22 | ln_password = parser.get('LexisNexis', 'password') 23 | log_dir = parser.get('Logging', 'log_file') 24 | log_level = parser.get('Logging', 'level') 25 | collection = parser.get('Database', 'collection_list') 26 | whitelist = parser.get('URLS', 'file') 27 | pool_size = int(parser.get('Processes', 'pool_size')) 28 | return ln_user, ln_password, collection, whitelist, pool_size, log_dir, log_level, auth_db, auth_user, \ 29 | auth_pass, db_host 30 | except Exception, e: 31 | print 'Problem parsing config file. 
{}'.format(e) 32 | 33 | def parse_config(): 34 | """Function to parse the config file.""" 35 | config_file = glob.glob('default_config.ini') 36 | parser = ConfigParser() 37 | if config_file: 38 | parser.read(config_file) 39 | else: 40 | cwd = os.path.abspath(os.path.dirname(__file__)) 41 | config_file = os.path.join(cwd, 'default_config.ini') 42 | parser.read(config_file) 43 | return _parse_config(parser) 44 | -------------------------------------------------------------------------------- /source_name_id.json: -------------------------------------------------------------------------------- 1 | { 2 | "New York Times":"6742", 3 | "BBC Monitoring":"10962", 4 | "AFP":"10903", 5 | "AllAfrica":"361826", 6 | "Australian Associated Press":"160586", 7 | "Times of India" : "362639", 8 | "The Times (South Africa)" : "345265", 9 | "Today's Zaman" : "410344", 10 | "The Toronto Star" : "8286", 11 | "Al-Akhbar English" : "399153", 12 | "Al-Ahram" : "402861", 13 | "The Christian Science Monitor" : "7945", 14 | "Asharq Alawsat (English)" : "372173", 15 | "Asian News International" : "344861", 16 | "The Sydney Morning Herald (Australia)" : "314237", 17 | "The Bangkok Post (Thailand)" : "410348", 18 | "South China Morning Post" : "11314", 19 | "The Sofia Echo" : "390461", 20 | "The China Post" : "41669", 21 | "Cyprus Mail" : "338764", 22 | "International New York Times" : "8357", 23 | "Dawn (Pakistan)" : "410347", 24 | "Dar Al Hayat (Lebanon)" : "335150", 25 | "Ghana News Agency" : "411388", 26 | "Ghanaian Chronicle (Accra)" : "361857", 27 | "GHN News Agency" : "404368", 28 | "The Globe and Mail" : "303830", 29 | "The Guardian" : "138620", 30 | "Indian Express" : "364854", 31 | "International New York Times" : "8357", 32 | "Iran News" : "399241", 33 | "The Japan News" : "145202", 34 | "The Japan Times" : "169018", 35 | "The Jerusalem Post" : "10911", 36 | "The Jordan Times" : "357154", 37 | "The Washington Post" : 167603, 38 | "BBC Monitoring International Reports" : 10962, 39 | "Big News Network" : 407666, 40 | 41 | "ITAR-TASS" : "384326", 42 | "AAP Newsfeed" : "160586", 43 | "Associated Press International" : 138211, 44 | "ABC Premium News" : "300224", 45 | "African Press Organization" : "408854", 46 | "Agence Marocaine De Presse" : "356948", 47 | "All Iraq News Agency" : "416751", 48 | "The Associated Press" : "7911", 49 | "Baltic News Service" : "172030", 50 | "Azeri-Press news agency" : "400825", 51 | "Central Asia General Newswire" : "294963", 52 | "CNN Wire" : "385157", 53 | "FARS News Agency" : "356949", 54 | "Inter Press Service" : "8001", 55 | "Interfax News Agency" : "167603", 56 | 57 | "Al Jazeera (Arabic)" : "416311", 58 | "Asharq Alawsat (Arabic)" : "383738", 59 | "Al Alam (Arabic)" : "416311", 60 | "Al Arabiya (Arabic)" : "367153", 61 | "CNN (Arabic)" : "407866" 62 | } 63 | -------------------------------------------------------------------------------- /cloacina/get_results.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import requests 3 | import re 4 | import json 5 | 6 | with open('source_name_id.json') as source_file: 7 | source_dict = json.load(source_file) 8 | 9 | def get_results(source_name, date, start_result, end_result, authToken): 10 | #searchterm = "a OR an OR the" 11 | searchterm = "a AND NOT playoff! OR teammate! OR NFL OR halftime OR NBA OR quarterback OR goalie OR NHL OR postseason OR head coach OR N.F.L. OR N.B.A. OR field goal! OR playoff!" 
12 | if re.search("Arabic", source_name): 13 | searchterm = u"أن OR من OR هذا OR أن OR يا" 14 | print searchterm 15 | 16 | source = source_dict[source_name] 17 | 18 | req = """ 21 | 22 | 23 | {authToken} 24 | 25 | 26 | {source} 27 | 28 | 29 | {searchterm} 30 | 8412 31 | 32 | 33 | {date} 34 | {date} 35 | 36 | 37 | 38 | FullText 39 | Display 40 | 41 | {start_result} 42 | {end_result} 43 | 44 | 45 | 46 | 47 | """.format(authToken = authToken, date = date, source = source, searchterm = searchterm, start_result = start_result, end_result = end_result) 48 | 49 | headers = {"Host": "www.lexisnexis.com", 50 | "Content-Type": "text/xml; charset=UTF-8", 51 | "Content-Length": len(req), 52 | "Origin" : "http://www.lexisnexis.com", 53 | "User-Agent" : "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.132 Safari/537.36", 54 | "SOAPAction": "Search"} 55 | 56 | try: 57 | t = requests.post(url = "http://www.lexisnexis.com/wsapi/v1/services/Search", 58 | headers = headers, 59 | data = req) 60 | return t 61 | 62 | except Exception as e: 63 | print "Problem in `get_results` for {0} on {1}: {2}".format(source_name, date, e) 64 | 65 | if __name__ == "__main__": 66 | auth = "" # put in fresh authToken before using 67 | t = get_results("New York Times", "2015-09-01", 1, 10, auth) 68 | print t.text 69 | -------------------------------------------------------------------------------- /bbc_test_download.py: -------------------------------------------------------------------------------- 1 | import cloacina 2 | from cloacina import mongo_connection 3 | import json 4 | import glob 5 | import csv 6 | import datetime 7 | from multiprocessing import Pool 8 | from pymongo import MongoClient 9 | import logging 10 | 11 | logger = logging.getLogger(__name__) 12 | logger.setLevel(logging.INFO) 13 | 14 | handler = logging.FileHandler("cloacina_run.log") 15 | handler.setLevel(logging.INFO) 16 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') 17 | handler.setFormatter(formatter) 18 | logger.addHandler(handler) 19 | logger.info("Writing logs to {0}".format("cloacina_run.log")) 20 | 21 | ln_user, ln_password, db_collection, whitelist_file, pool_size, log_dir, log_level, auth_db, auth_user, auth_pass, db_host = cloacina.parse_config() 22 | 23 | if db_host: 24 | connection = MongoClient(host=db_host) 25 | else: 26 | connection = MongoClient() 27 | 28 | db = connection.lexisnexis 29 | collection = db[db_collection] 30 | 31 | 32 | authToken = cloacina.authenticate(ln_user, ln_password) 33 | if not authToken: 34 | logger.error("No auth token generated") 35 | print authToken 36 | 37 | #big_stories = [] 38 | #big_junk = [] 39 | 40 | try: 41 | sourcefile = open(whitelist_file, 'r').read().splitlines() 42 | sourcelist = [line.split(';') for line in sourcefile] 43 | print sourcelist 44 | # Filtering based on list of sources from the config file 45 | # to_scrape = {listing[0]: [listing[1], listing[2]] for listing in sourcelist} <-- leave as list for now. 46 | except IOError: 47 | print 'There was an error. Check the log file for more information.' 
48 | logger.warning('Could not open URL whitelist file.') 49 | raise 50 | 51 | with open('source_name_id.json') as source_file: 52 | source_dict = json.load(source_file) 53 | 54 | print "Sourcelist:", 55 | print sourcelist 56 | print "Scraping from source number {0}".format(source_dict[sourcelist[0][0]]) 57 | 58 | def make_date_source_list(source): 59 | if len(source) != 3: 60 | logger.warning("Source is not of length 3. Formatting problem? {0}".format(source)) 61 | start = datetime.datetime.strptime(source[1], '%Y-%m-%d') 62 | end = datetime.datetime.strptime(source[2], '%Y-%m-%d') 63 | date_generated = [start + datetime.timedelta(days=x) for x in range(0, (end-start).days)] 64 | date_list = [i.strftime("%Y-%m-%d") for i in date_generated] 65 | source_list = [[source[0], date] for date in date_list] 66 | return source_list 67 | 68 | print "Un-nesting source list." 69 | sourcelist = [make_date_source_list(source) for source in sourcelist] # apply to each source 70 | sourcelist = [item for sublist in sourcelist for item in sublist] # flatten list of lists. there has to be a neater way. 71 | 72 | if len(sourcelist) < 30: 73 | print sourcelist 74 | if len(sourcelist) > 30: 75 | print sourcelist[0:30] 76 | #logger.info(sourcelist) 77 | 78 | def download_wrapper(source): 79 | # there's some global ugliness going on here. specifically, authToken 80 | try: 81 | output = cloacina.download_day_source(source[0], source[1], source[2], authToken) 82 | lang = 'english' 83 | 84 | mongo_error = [] 85 | for result in output['stories']: 86 | try: 87 | entry_id = mongo_connection.add_entry(collection, result['news_source'], 88 | result['article_title'], result['publication_date_raw'], 89 | result['article_body'], lang, result['doc_id']) 90 | except Exception as e: 91 | mongo_error.append(e) 92 | if mongo_error: 93 | logger.warning("There were error(s) in the Mongo loading {0}".format(mongo_error)) 94 | except Exception as e: 95 | logger.warning("Error downloading {0}: {1}".format(source, e)) 96 | 97 | pool = Pool(pool_size) 98 | logger.info("Using {0} workers to get source-day totals.".format(pool_size)) 99 | 100 | totals = [pool.apply_async(cloacina.get_source_day_total, (source[0], source[1], authToken)) for source in sourcelist] 101 | totals = [r.get(9999999) for r in totals] 102 | 103 | logger.info("Here are the totals:\n{0}".format(totals)) 104 | print totals 105 | 106 | try: 107 | print sum(totals) 108 | except Exception: 109 | print "Error printing sum of totals." 110 | 111 | # add the totals in a third "column" to the sourcelist 112 | # maybe a better way to do this is to have the totals function take in a list 113 | # and add the totals in the same function. 114 | for i, source in enumerate(sourcelist): 115 | source.append(totals[i]) 116 | 117 | # logger.info(sourcelist) 118 | 119 | print "Sending the source list to the pool of workers for downloading" 120 | pool.map(download_wrapper, sourcelist) 121 | 122 | logger.info("Process complete") 123 | 124 | --------------------------------------------------------------------------------
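The repository's own driver, bbc_test_download.py above, fans the work out over a multiprocessing pool and writes to MongoDB. For orientation, here is a minimal sketch of the same call sequence for a single source-day, distilled from the modules above: parse the config, authenticate, count the results, download and decode the stories, and insert them into Mongo. It assumes working LexisNexis credentials in default_config.ini, a reachable local MongoDB instance, and that it is run from the repository root (get_results.py reads source_name_id.json from the working directory); the file name single_day_example.py and the example source and date are illustrative only and not part of the repository.

# single_day_example.py (hypothetical) -- minimal single source-day run, Python 2 like the rest of the repo
import cloacina
from cloacina import mongo_connection
from pymongo import MongoClient

# Credentials and settings come from default_config.ini via cloacina/utilities.py.
(ln_user, ln_password, db_collection, whitelist_file, pool_size,
 log_dir, log_level, auth_db, auth_user, auth_pass, db_host) = cloacina.parse_config()

# Same database/collection layout that bbc_test_download.py uses.
collection = MongoClient()["lexisnexis"][db_collection]

# 1. Authenticate against the LexisNexis WSAPI; returns the binary security token string.
authToken = cloacina.authenticate(ln_user, ln_password)

source_name = "BBC Monitoring"   # must be a key in source_name_id.json
date = "2015-09-01"

# 2. Ask how many articles this source has on this day.
total = cloacina.get_source_day_total(source_name, date, authToken)
print "{0} results for {1} on {2}".format(total, source_name, date)

# 3. Page through the results (construct_page_list turns the total into (start, end)
#    pairs of ten, e.g. 23 -> [(1, 10), (11, 20), (21, 23)]), decode each base64
#    document, and extract the fields to store.
output = cloacina.download_day_source(source_name, date, total, authToken)

# 4. Insert each extracted story, mirroring the fields used in bbc_test_download.py.
for story in output["stories"]:
    mongo_connection.add_entry(collection, story["news_source"], story["article_title"],
                               story["publication_date_raw"], story["article_body"],
                               "english", story["doc_id"])

bbc_test_download.py follows the same sequence, but first expands source_list.csv into per-day (source, date) pairs, fetches all the source-day totals with pool.apply_async, appends each total to its pair, and then hands the enriched list to pool.map(download_wrapper, sourcelist).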