├── .gitignore
├── source_list.csv
├── cloacina
│   ├── __init__.py
│   ├── construct_page_list.py
│   ├── mongo_connection.py
│   ├── extract_from_b64.py
│   ├── get_source_day_total.py
│   ├── authenticate.py
│   ├── download_day_source.py
│   ├── utilities.py
│   └── get_results.py
├── default_config.ini
├── README.md
├── LICENSE
├── source_name_id.json
└── bbc_test_download.py
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | *.swp
3 | default_config.ini
4 |
--------------------------------------------------------------------------------
/source_list.csv:
--------------------------------------------------------------------------------
1 | New York Times;1981-01-01;1983-12-31
2 |
--------------------------------------------------------------------------------
/cloacina/__init__.py:
--------------------------------------------------------------------------------
1 | from authenticate import authenticate
2 | from construct_page_list import construct_page_list
3 | from download_day_source import download_day_source
4 | from extract_from_b64 import extract_from_b64
5 | from get_results import get_results
6 | from get_source_day_total import get_source_day_total
7 | from utilities import parse_config
8 | import mongo_connection
9 |
--------------------------------------------------------------------------------
/default_config.ini:
--------------------------------------------------------------------------------
1 | [LexisNexis]
2 | user = fool
3 | password = highentropy
4 |
5 | [Database]
6 | collection_list = stories
7 |
8 | [URLS]
9 | file = source_list.csv
10 |
11 | [Processes]
12 | pool_size = 30
13 |
14 | [Logging]
15 | log_file = scraper_log.log
16 | # Can be set to debug, info, or warning. debug logs a large amount of detail;
17 | # info logs new additions to the database plus any errors;
18 | # warning logs only errors thrown by the scraper.
19 | level = info
20 |
21 | [Auth]
22 | auth_db = db_name
23 | auth_user = username
24 | auth_pass = password
25 | db_host = 127.0.0.1
26 |
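27 | # If the [Auth] section is removed, parse_config (cloacina/utilities.py) falls
28 | # back to the MONGO_AUTH_DB, MONGO_AUTH_USER, MONGO_AUTH_PASS, and MONGO_HOST
29 | # environment variables for the Mongo connection.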
--------------------------------------------------------------------------------
/cloacina/construct_page_list.py:
--------------------------------------------------------------------------------
1 | def construct_page_list(total_results):
2 | total_results = int(total_results)
3 | base_iter = total_results / 10
4 | remainder = total_results % 10
5 |
6 | iter_list = []
7 |
8 | if total_results == 0:
9 | return [(0, 0)]
10 |
11 | if total_results < 10:
12 | return [(1, total_results)]
13 |
14 | for i in range(base_iter):
15 | iter_list.append((i * 10 + 1, i * 10 + 10))
16 |
17 | if remainder:
18 | iter_list.append((range(base_iter)[-1] * 10 + 10 + 1, range(base_iter)[-1] * 10 + 10 + remainder))
19 |
20 | return iter_list
21 |
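22 | # Example: construct_page_list(25) -> [(1, 10), (11, 20), (21, 25)],
23 | # i.e. two full ten-result pages plus a final partial page of five.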
--------------------------------------------------------------------------------
/cloacina/mongo_connection.py:
--------------------------------------------------------------------------------
1 | import datetime
2 |
3 |
4 | def add_entry(collection, news_source, article_title, publication_date_raw, article_body, lang, doc_id):
5 | toInsert = {"news_source": news_source,
6 | "article_title": article_title,
7 | "publication_date_raw": publication_date_raw,
8 | "date_added": datetime.datetime.utcnow(),
9 | "article_body": article_body,
10 | "stanford": 0,
11 | "language": lang,
12 | "doc_id" : doc_id}
13 | object_id = collection.insert(toInsert)
14 | return object_id
15 |
16 |
17 |
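18 | # Example with hypothetical values (collection as set up in bbc_test_download.py):
19 | # add_entry(collection, "BBC Monitoring", "Example headline",
20 | #           "September 1, 2015 Tuesday", "Article body text...", "english", "some-doc-token")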
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | cloacina
2 | ========
3 |
4 | Tools for downloading from the LexisNexis API
5 |
6 | Currently, `cloacina` provides high-level interfaces for several LexisNexis
7 | endpoints and operations, including:
8 |
9 | - authentication
10 |
11 | - getting the number of results available for a given source for a given day
12 |
13 | - downloading and formatting all articles for a given source for a given day
14 |
15 |
16 | Usage
17 | -----
18 |
19 | Run a test to download some days of BBC Monitoring articles like this:
20 | `python bbc_test_download.py`
21 |
22 | To change which sources and dates are downloaded, edit the `source_list.csv` file,
23 | a semicolon-separated file with one `source;startdate;enddate` line per source.
24 |
25 | The Mongo collection the stories go into is specified in the
26 | `default_config.ini` file.
27 |
28 | If you're a member of the Open Event Data Alliance, get in touch with me for
29 | the test username/password.
30 |
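31 | The pieces can also be called directly from Python. Here is a minimal sketch,
32 | assuming a valid LexisNexis username/password in `default_config.ini` and that
33 | you run from the repository root (`source_name_id.json` is loaded by a relative
34 | path):
35 | 
36 | ```python
37 | import cloacina
38 | 
39 | (ln_user, ln_password, collection, whitelist, pool_size, log_dir, log_level,
40 |  auth_db, auth_user, auth_pass, db_host) = cloacina.parse_config()
41 | 
42 | token = cloacina.authenticate(ln_user, ln_password)
43 | total = cloacina.get_source_day_total("BBC Monitoring", "2015-09-01", token)
44 | output = cloacina.download_day_source("BBC Monitoring", "2015-09-01", total, token)
45 | print "Got {0} stories and {1} junk documents".format(len(output["stories"]),
46 |                                                       len(output["junk"]))
47 | ```
48 | 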
--------------------------------------------------------------------------------
/cloacina/extract_from_b64.py:
--------------------------------------------------------------------------------
1 | from bs4 import BeautifulSoup
2 | import json
3 | import re
4 |
5 | def extract_from_b64(encoded_doc):
6 | #doc = base64.urlsafe_b64decode(encoded_doc)
7 | doc = encoded_doc.decode("base64")
8 | doc = doc.decode('utf-8')
9 | doc = re.sub('\r\n', " ", doc)
10 | doc = re.sub('\n', " ", doc)
11 | soup = BeautifulSoup(doc)
12 | news_source = soup.find("meta", {"name":"sourceName"})['content']
13 | article_title = soup.find("title").text.strip()
14 | try:
15 | publication_date = soup.find("div", {"class":"PUB-DATE"}).text.strip()
16 | except AttributeError:
17 | publication_date = soup.find("div", {"class":"DATE"}).text.strip()
18 | article_body = soup.find("div", {"class":"BODY"}).text.strip()
19 | doc_id = soup.find("meta", {"name":"documentToken"})['content']
20 |
21 | data = {"news_source" : news_source,
22 | "publication_date_raw" : publication_date,
23 | "article_title" : article_title,
24 | "article_body" : article_body,
25 | "doc_id" : doc_id}
26 |
27 | return data
28 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2015 Andy Halterman
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
23 |
--------------------------------------------------------------------------------
/cloacina/get_source_day_total.py:
--------------------------------------------------------------------------------
1 | import re
2 | from get_results import get_results
3 |
4 | # logging is a hassle in multiprocessing, so use print for now. Sorry!
5 |
6 | def get_source_day_total(source_name, date, authToken):
7 | try:
8 | t = get_results(source_name, date, 1, 10, authToken)
9 | if t.status_code == 500:
10 | #print "There was an error. Check the log file"
11 | print "Error 500 from server on getting source-day total for {0} on {1}: {2}".format(source_name, date, t.text)
12 | return 0
13 | c = re.findall('">(\d+?)<', t.text)
14 | total = int(c[0])
15 | return total
16 | except Exception as e:
17 | print "Problem getting source-day total for {0} on {1}: {2}".format(source_name, date, e)
18 | return 0
19 | 
--------------------------------------------------------------------------------
/cloacina/authenticate.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import xml.etree.ElementTree as ET
3 | 
4 | 
5 | def authenticate(username, password):
6 | # Build the SOAP request for the WSK Authentication endpoint; {0} and {1}
7 | # below are filled with the LexisNexis username and password.
8 | request = """
9 | 
10 |
11 |
12 | {0}
13 | {1}
14 |
15 |
16 |
17 | """.format(username, password)
18 |
19 | headers = {"Host": "www.lexisnexis.com",
20 | "Content-Type": "text/xml; charset=UTF-8",
21 | "Content-Length": len(request),
22 | "SOAPAction": "Authenticate"}
23 |
24 | t = requests.post(url="https://www.lexisnexis.com/wsapi/v1/services/Authentication",
25 | headers = headers,
26 | data = request)
27 |
28 | t = t.text
29 | p = ET.fromstring(t)
30 | p = p[0][0]
31 | for i in p.findall('{http://authenticate.authentication.services.v1.wsapi.lexisnexis.com}binarySecurityToken'):
32 | return i.text
33 |
34 |
--------------------------------------------------------------------------------
/cloacina/download_day_source.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from bs4 import BeautifulSoup
3 | from construct_page_list import construct_page_list
4 | from extract_from_b64 import extract_from_b64
5 | from get_results import get_results
6 |
7 | def download_day_source(source_name, date, source_day_total, authToken):
8 |
9 | iter_list = construct_page_list(source_day_total)
10 |
11 | results_list = []
12 |
13 | # if source_day_total is 0, just pass?
14 |
15 | for p in iter_list:
16 | t = get_results(source_name, date, p[0], p[1], authToken)
17 | results_list.append(t.text)
18 |
19 | output_list = [] # keep outside the iter
20 | junk_list = []
21 |
22 | for t in results_list:
23 | soup = BeautifulSoup(t)
24 |
25 | for num, i in enumerate(soup.findAll("ns1:document")):
26 | try:
27 | t = i.text
28 | # try:
29 | d = extract_from_b64(t)
30 | output_list.append(d)
31 | except Exception:
32 | junk_list.append(t) # error handling ¯\_(ツ)_/¯
33 | if junk_list:
34 | print "There were problems getting text from the base 64 in download_day_source. {0}".format(len(junk_list))
35 |
36 |
37 | output = {"stories" : output_list,
38 | "junk" : junk_list}
39 | return output
40 |
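41 | # The returned dict has the shape:
42 | # {"stories": [<dicts from extract_from_b64>, ...],
43 | #  "junk": [<raw base64 strings that could not be parsed>, ...]}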
--------------------------------------------------------------------------------
/cloacina/utilities.py:
--------------------------------------------------------------------------------
1 | from ConfigParser import ConfigParser
2 | import glob
3 | import os
4 |
5 | # thanks, OEDA! https://github.com/openeventdata/scraper/blob/master/scraper.py
6 |
7 | def _parse_config(parser):
8 | try:
9 | if 'Auth' in parser.sections():
10 | auth_db = parser.get('Auth', 'auth_db')
11 | auth_user = parser.get('Auth', 'auth_user')
12 | auth_pass = parser.get('Auth', 'auth_pass')
13 | db_host = parser.get('Auth', 'db_host')
14 | else:
15 | # Try env vars too
16 | auth_db = os.getenv('MONGO_AUTH_DB') or ''
17 | auth_user = os.getenv('MONGO_AUTH_USER') or ''
18 | auth_pass = os.getenv('MONGO_AUTH_PASS') or ''
19 | db_host = os.getenv('MONGO_HOST') or ''
20 |
21 | ln_user = parser.get('LexisNexis', 'user')
22 | ln_password = parser.get('LexisNexis', 'password')
23 | log_dir = parser.get('Logging', 'log_file')
24 | log_level = parser.get('Logging', 'level')
25 | collection = parser.get('Database', 'collection_list')
26 | whitelist = parser.get('URLS', 'file')
27 | pool_size = int(parser.get('Processes', 'pool_size'))
28 | return ln_user, ln_password, collection, whitelist, pool_size, log_dir, log_level, auth_db, auth_user, \
29 | auth_pass, db_host
30 | except Exception as e:
31 | print 'Problem parsing config file. {}'.format(e)
32 |
33 | def parse_config():
34 | """Function to parse the config file."""
35 | config_file = glob.glob('default_config.ini')
36 | parser = ConfigParser()
37 | if config_file:
38 | parser.read(config_file)
39 | else:
40 | cwd = os.path.abspath(os.path.dirname(__file__))
41 | config_file = os.path.join(cwd, 'default_config.ini')
42 | parser.read(config_file)
43 | return _parse_config(parser)
44 |
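45 | # Example usage (as in bbc_test_download.py):
46 | # ln_user, ln_password, db_collection, whitelist_file, pool_size, log_dir, \
47 | #     log_level, auth_db, auth_user, auth_pass, db_host = parse_config()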
--------------------------------------------------------------------------------
/source_name_id.json:
--------------------------------------------------------------------------------
1 | {
2 | "New York Times":"6742",
3 | "BBC Monitoring":"10962",
4 | "AFP":"10903",
5 | "AllAfrica":"361826",
6 | "Australian Associated Press":"160586",
7 | "Times of India" : "362639",
8 | "The Times (South Africa)" : "345265",
9 | "Today's Zaman" : "410344",
10 | "The Toronto Star" : "8286",
11 | "Al-Akhbar English" : "399153",
12 | "Al-Ahram" : "402861",
13 | "The Christian Science Monitor" : "7945",
14 | "Asharq Alawsat (English)" : "372173",
15 | "Asian News International" : "344861",
16 | "The Sydney Morning Herald (Australia)" : "314237",
17 | "The Bangkok Post (Thailand)" : "410348",
18 | "South China Morning Post" : "11314",
19 | "The Sofia Echo" : "390461",
20 | "The China Post" : "41669",
21 | "Cyprus Mail" : "338764",
22 | "International New York Times" : "8357",
23 | "Dawn (Pakistan)" : "410347",
24 | "Dar Al Hayat (Lebanon)" : "335150",
25 | "Ghana News Agency" : "411388",
26 | "Ghanaian Chronicle (Accra)" : "361857",
27 | "GHN News Agency" : "404368",
28 | "The Globe and Mail" : "303830",
29 | "The Guardian" : "138620",
30 | "Indian Express" : "364854",
32 | "Iran News" : "399241",
33 | "The Japan News" : "145202",
34 | "The Japan Times" : "169018",
35 | "The Jerusalem Post" : "10911",
36 | "The Jordan Times" : "357154",
37 | "The Washington Post" : "167603",
38 | "BBC Monitoring International Reports" : "10962",
39 | "Big News Network" : "407666",
40 |
41 | "ITAR-TASS" : "384326",
42 | "AAP Newsfeed" : "160586",
43 | "Associated Press International" : "138211",
44 | "ABC Premium News" : "300224",
45 | "African Press Organization" : "408854",
46 | "Agence Marocaine De Presse" : "356948",
47 | "All Iraq News Agency" : "416751",
48 | "The Associated Press" : "7911",
49 | "Baltic News Service" : "172030",
50 | "Azeri-Press news agency" : "400825",
51 | "Central Asia General Newswire" : "294963",
52 | "CNN Wire" : "385157",
53 | "FARS News Agency" : "356949",
54 | "Inter Press Service" : "8001",
55 | "Interfax News Agency" : "167603",
56 |
57 | "Al Jazeera (Arabic)" : "416311",
58 | "Asharq Alawsat (Arabic)" : "383738",
59 | "Al Alam (Arabic)" : "416311",
60 | "Al Arabiya (Arabic)" : "367153",
61 | "CNN (Arabic)" : "407866"
62 | }
63 |
--------------------------------------------------------------------------------
/cloacina/get_results.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import requests
3 | import re
4 | import json
5 |
6 | with open('source_name_id.json') as source_file:
7 | source_dict = json.load(source_file)
8 |
9 | def get_results(source_name, date, start_result, end_result, authToken):
10 | #searchterm = "a OR an OR the"
11 | searchterm = "a AND NOT playoff! OR teammate! OR NFL OR halftime OR NBA OR quarterback OR goalie OR NHL OR postseason OR head coach OR N.F.L. OR N.B.A. OR field goal! OR playoff!"
12 | if re.search("Arabic", source_name):
13 | searchterm = u"أن OR من OR هذا OR أن OR يا"
14 | print searchterm
15 |
16 | source = source_dict[source_name]
17 |
18 | req = """
21 |
22 |
23 | {authToken}
24 |
25 |
26 | {source}
27 |
28 |
29 | {searchterm}
30 | 8412
31 |
32 |
33 | {date}
34 | {date}
35 |
36 |
37 |
38 | FullText
39 | Display
40 |
41 | {start_result}
42 | {end_result}
43 |
44 |
45 |
46 |
47 | """.format(authToken = authToken, date = date, source = source, searchterm = searchterm, start_result = start_result, end_result = end_result)
48 |
49 | headers = {"Host": "www.lexisnexis.com",
50 | "Content-Type": "text/xml; charset=UTF-8",
51 | "Content-Length": len(req),
52 | "Origin" : "http://www.lexisnexis.com",
53 | "User-Agent" : "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.132 Safari/537.36",
54 | "SOAPAction": "Search"}
55 |
56 | try:
57 | t = requests.post(url = "http://www.lexisnexis.com/wsapi/v1/services/Search",
58 | headers = headers,
59 | data = req)
60 | return t
61 |
62 | except Exception as e:
63 | print "Problem in `get_results` for {0} on {1}: {2}".format(source_name, date, e)
64 |
65 | if __name__ == "__main__":
66 | auth = "" # put in fresh authToken before using
67 | t = get_results("New York Times", "2015-09-01", 1, 10, auth)
68 | print t.text
69 |
--------------------------------------------------------------------------------
/bbc_test_download.py:
--------------------------------------------------------------------------------
1 | import cloacina
2 | from cloacina import mongo_connection
3 | import json
4 | import glob
5 | import csv
6 | import datetime
7 | from multiprocessing import Pool
8 | from pymongo import MongoClient
9 | import logging
10 |
11 | logger = logging.getLogger(__name__)
12 | logger.setLevel(logging.INFO)
13 |
14 | handler = logging.FileHandler("cloacina_run.log")
15 | handler.setLevel(logging.INFO)
16 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
17 | handler.setFormatter(formatter)
18 | logger.addHandler(handler)
19 | logger.info("Writing logs to {0}".format("cloacina_run.log"))
20 |
21 | ln_user, ln_password, db_collection, whitelist_file, pool_size, log_dir, log_level, auth_db, auth_user, auth_pass, db_host = cloacina.parse_config()
22 |
23 | if db_host:
24 | connection = MongoClient(host=db_host)
25 | else:
26 | connection = MongoClient()
27 |
28 | db = connection.lexisnexis
29 | collection = db[db_collection]
30 |
31 |
32 | authToken = cloacina.authenticate(ln_user, ln_password)
33 | if not authToken:
34 | logger.error("No auth token generated")
35 | print authToken
36 |
37 | #big_stories = []
38 | #big_junk = []
39 |
40 | try:
41 | sourcefile = open(whitelist_file, 'r').read().splitlines()
42 | sourcelist = [line.split(';') for line in sourcefile]
43 | print sourcelist
44 | # Filtering based on list of sources from the config file
45 | # to_scrape = {listing[0]: [listing[1], listing[2]] for listing in sourcelist} <-- leave as list for now.
46 | except IOError:
47 | print 'There was an error. Check the log file for more information.'
48 | logger.warning('Could not open URL whitelist file.')
49 | raise
50 |
51 | with open('source_name_id.json') as source_file:
52 | source_dict = json.load(source_file)
53 |
54 | print "Sourcelist:",
55 | print sourcelist
56 | print "Scraping from source number {0}".format(source_dict[sourcelist[0][0]])
57 |
58 | def make_date_source_list(source):
59 | if len(source) != 3:
60 | logger.warning("Source is not of length 3. Formatting problem? {0}".format(source))
61 | start = datetime.datetime.strptime(source[1], '%Y-%m-%d')
62 | end = datetime.datetime.strptime(source[2], '%Y-%m-%d')
63 | date_generated = [start + datetime.timedelta(days=x) for x in range(0, (end-start).days)]
64 | date_list = [i.strftime("%Y-%m-%d") for i in date_generated]
65 | source_list = [[source[0], date] for date in date_list]
66 | return source_list
67 |
68 | print "Un-nesting source list."
69 | sourcelist = [make_date_source_list(source) for source in sourcelist] # apply to each source
70 | sourcelist = [item for sublist in sourcelist for item in sublist] # flatten list of lists. there has to be a neater way.
71 |
72 | if len(sourcelist) <= 30:
73 | print sourcelist
74 | else:
75 | print sourcelist[0:30]
76 | #logger.info(sourcelist)
77 |
78 | def download_wrapper(source):
79 | # there's some global ugliness going on here. specifically, authToken
80 | try:
81 | output = cloacina.download_day_source(source[0], source[1], source[2], authToken)
82 | lang = 'english'
83 |
84 | mongo_error = []
85 | for result in output['stories']:
86 | try:
87 | entry_id = mongo_connection.add_entry(collection, result['news_source'],
88 | result['article_title'], result['publication_date_raw'],
89 | result['article_body'], lang, result['doc_id'])
90 | except Exception as e:
91 | mongo_error.append(e)
92 | if mongo_error:
93 | logger.warning("There were error(s) in the Mongo loading {0}".format(mongo_error))
94 | except Exception as e:
95 | logger.warning("Error downloading {0}: {1}".format(source, e))
96 |
97 | pool = Pool(pool_size)
98 | logger.info("Using {0} workers to get source-day totals.".format(pool_size))
99 |
100 | totals = [pool.apply_async(cloacina.get_source_day_total, (source[0], source[1], authToken)) for source in sourcelist]
101 | totals = [r.get(9999999) for r in totals]
102 |
103 | logger.info("Here are the totals:\n{0}".format(totals))
104 | print totals
105 |
106 | try:
107 | print sum(totals)
108 | except Exception:
109 | print "Error printing sum of totals."
110 |
111 | # add the totals in a third "column" to the sourcelist
112 | # maybe a better way to do this is to have the totals function take in a list
113 | # and add the totals in the same function.
114 | for i, source in enumerate(sourcelist):
115 | source.append(totals[i])
116 |
117 | # logger.info(sourcelist)
118 |
119 | print "Sending the source list to the pool of workers for downloading"
120 | pool.map(download_wrapper, sourcelist)
121 |
122 | logger.info("Process complete")
123 |
124 |
--------------------------------------------------------------------------------