├── .gitignore
├── README.md
├── abstracts.md
├── dump.sql
├── main.py
├── requirements.txt
└── src
    ├── __init__.py
    ├── arxiv_etl.py
    ├── db.py
    ├── google_etl.py
    ├── nips_etl.py
    └── utils.py

/.gitignore:
--------------------------------------------------------------------------------
keys
db.sqlite
*.pyc
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# NIPScraper - scraper for NIPS 2017 paper abstracts

Like most people, I'm just trying to figure out how best to spend my time at this conference. I figured that collecting and reading [all of the abstracts](https://nips.cc/Conferences/2017/AcceptedPapersInitial) (680 of them!) would help me find the most relevant talks/posters/workshops for my work. I'm hoping that reading all of this will also make it easier to identify some promising research directions.

Most of the papers don't show up on the first page of Google results, which I take to mean that they haven't been published yet, and I'm also working around a daily limit of a few hundred searches on Google's Custom Search API. Current count: 297/680.

Watch or star this repo; I'll be updating it frequently (roughly daily)! Contributions and feature requests are welcome, of course.

This program scrapes arXiv for paper abstracts, authors, and categories, dumps them into a [Postgres database](https://github.com/JasonBenn/nips-scraper/blob/master/dump.sql), and exports them to [abstracts.md](https://github.com/JasonBenn/nips-scraper/blob/master/abstracts.md) (which is nicer for reading on a Kindle).


## Related...

I also made some simple JS one-liners that copy NIPS workshop and tutorial information to your clipboard so that you can paste it into a spreadsheet. Find the spreadsheet [here](https://docs.google.com/spreadsheets/d/1gQpSSjoypqtTSPaJdLvT8UsGEgjJXZSZc0KkLlSDLFk/edit?usp=sharing) (the snippets are saved as comments).


## Contributing

Pull requests are welcome, of course.

If you'd like to run it yourself, you'll need a Google Custom Search API key attached to a billing account and Postgres 9.6+. The code reads credentials from `keys/google_search_api_key` and `keys/search_engine_id`, and connects to a local `nips_scraper` database (the connection string is hard-coded in `src/db.py`).
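If you're starting from scratch, a sketch along the following lines gets those prerequisites in place. It isn't part of the repo: the key file paths and database name are what the code expects, but the API key, search engine ID, and local Postgres setup are placeholders you'll have to supply yourself.

```python
# Hypothetical setup helper (not part of this repo): writes the key files that
# src/google_etl.py reads and creates the Postgres database that src/db.py expects.
import os
import subprocess

if not os.path.isdir("keys"):
    os.makedirs("keys")
with open("keys/google_search_api_key", "w") as f:
    f.write("YOUR_API_KEY\n")
with open("keys/search_engine_id", "w") as f:
    f.write("YOUR_SEARCH_ENGINE_ID\n")

# src/db.py connects with "dbname=nips_scraper user=jasonbenn"; adjust that string
# (or this createdb call) to match your own Postgres setup.
subprocess.check_call(["createdb", "nips_scraper"])
```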
Then run it:
```
python main.py
```

Also: wow, I really should have used https://docs.scrapy.org/en/latest/.
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

from argparse import ArgumentParser

from src.db import DB
from src.arxiv_etl import ArxivETL
from src.google_etl import GoogleETL
from src.nips_etl import NipsETL
from src.utils import RateLimitError


NUM_NIPS_17_PAPERS = 680

def scrape(start_index):
    db = DB()
    nips = NipsETL(db)
    google = GoogleETL(db)
    arxiv = ArxivETL(db)

    # Fetch the accepted-paper titles from nips.cc if we don't have all of them yet.
    titles = db.all('nips_papers')
    print "found %s nips_papers" % len(titles)
    if len(titles) < NUM_NIPS_17_PAPERS:
        print "fetching..."
        response = nips.extract()
        titles = nips.transform(response)
        nips.load(titles)

    all_nips_papers_missing_abstracts = db.all_nips_papers_missing_abstracts()
    print "found %i nips papers missing abstracts" % len(all_nips_papers_missing_abstracts)

    # For each paper still missing an abstract: find its arXiv page via Google, then
    # scrape the abstract. Stop for the day once Google's daily quota is hit.
    for record in all_nips_papers_missing_abstracts[start_index:]:
        print "fetching #%d: %s" % (record['id'], record['title'])
        try:
            google_response = google.extract(record["title"])
        except RateLimitError:
            break
        search_result = google.transform(record['id'], google_response)
        google.load(search_result)

        if search_result["abstract_url"]:
            print "found search result!"
            arxiv_response = arxiv.extract(search_result["abstract_url"])
            abstract = arxiv.transform(arxiv_response)
            arxiv.load(record["id"], abstract)

    db.to_md("abstracts.md")

if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument("--start-index", type=int, default=0,
                        help="skip this many papers, e.g. to resume a partial run")
    args = parser.parse_args()
    scrape(args.start_index)

# Scratch notes for the upsert in src/db.py:
#
# INSERT INTO nips_papers (title) VALUES ('sup');
#
# INSERT INTO google_search_results (nips_paper_id, abstract_url, pdf_url, fetch_attempts)
# VALUES (5, 'abs_url', 'pdf_url', 1)
# ON CONFLICT (nips_paper_id) DO UPDATE SET fetch_attempts =
#   (SELECT fetch_attempts FROM google_search_results WHERE nips_paper_id = 1) + 1;
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
beautifulsoup4
requests
psycopg2
--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JasonBenn/nips-scraper/262960527abcef3758f92aaca3c862359e1c5ef9/src/__init__.py
--------------------------------------------------------------------------------
/src/arxiv_etl.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import requests
from bs4 import BeautifulSoup

from .utils import strip_text


class ArxivETL:
    def __init__(self, db):
        self.db = db

    def extract(self, abstract_url):
        return requests.get(abstract_url)

    def transform(self, response):
        # Pull the abstract, author list, and subject category out of an arXiv /abs/ page.
        arxiv_abstract_html = BeautifulSoup(response.text, "html.parser")
        abstract_text = arxiv_abstract_html.select_one('.abstract').text.replace('Abstract:', '')
        authors_text = arxiv_abstract_html.select_one('.authors').text.replace('Authors:', '')
        arxiv_category = arxiv_abstract_html.select_one('.subheader').text
        return {
            "authors": strip_text(authors_text),
            "abstract": strip_text(abstract_text),
            "category": strip_text(arxiv_category)
        }

    def load(self, nips_paper_id, abstract_record):
        abstract_record["nips_paper_id"] = nips_paper_id
        self.db.insert_abstract(abstract_record)
--------------------------------------------------------------------------------
/src/db.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import io

import psycopg2
from psycopg2 import sql
from psycopg2.extras import RealDictCursor


class DB:
    def __init__(self):
        # Connection string is hard-coded; adjust for your own machine.
        self.conn = psycopg2.connect("dbname=nips_scraper user=jasonbenn")
        self.cursor = self.conn.cursor(cursor_factory=RealDictCursor)
        self.load_schema()

    def load_schema(self):
        self.cursor.execute('''
            CREATE TABLE IF NOT EXISTS nips_papers (
              id SERIAL PRIMARY KEY,
              title TEXT UNIQUE
            );

            CREATE TABLE IF NOT EXISTS google_search_results (
              id SERIAL PRIMARY KEY,
              nips_paper_id INTEGER UNIQUE REFERENCES nips_papers,
              abstract_url TEXT,
              pdf_url TEXT,
              fetch_attempts INTEGER
            );

            CREATE TABLE IF NOT EXISTS abstracts (
              id SERIAL PRIMARY KEY,
              nips_paper_id INTEGER UNIQUE REFERENCES nips_papers,
              abstract TEXT,
              authors TEXT,
              category TEXT
            );
        ''')
        self.conn.commit()

    def all(self, table):
        select_all = sql.SQL("SELECT * FROM {};").format(sql.Identifier(table))
        self.cursor.execute(select_all)
        return self.cursor.fetchall()

    def all_nips_papers_missing_abstracts(self):
        # Least-attempted papers first; never-attempted papers (NULL) come before all others.
        self.cursor.execute('''
            SELECT abstract, abstract_url, authors, category, nips_papers.id, pdf_url, title FROM nips_papers
            LEFT JOIN abstracts ON abstracts.nips_paper_id=nips_papers.id
            LEFT JOIN google_search_results ON google_search_results.nips_paper_id=nips_papers.id
            WHERE abstract IS NULL
            ORDER BY fetch_attempts ASC NULLS FIRST;
        ''')
        return self.cursor.fetchall()

    def insert_nips_paper(self, title):
        self.cursor.execute("INSERT INTO nips_papers (title) VALUES (%s) ON CONFLICT DO NOTHING;", (title,))
        self.conn.commit()

    def insert_abstract(self, record):
        self.cursor.execute('''
            INSERT INTO abstracts (nips_paper_id, abstract, authors, category)
            VALUES (%(nips_paper_id)s, %(abstract)s, %(authors)s, %(category)s) ON CONFLICT DO NOTHING;
        ''', record)
        self.conn.commit()

    def upsert_search_result(self, search_result):
        # Insert the search result, or just bump fetch_attempts if we've already tried this paper.
        try:
            self.cursor.execute('''
                INSERT INTO google_search_results (nips_paper_id, abstract_url, pdf_url, fetch_attempts)
                VALUES (%(nips_paper_id)s, %(abstract_url)s, %(pdf_url)s, 1)
                ON CONFLICT (nips_paper_id) DO UPDATE SET fetch_attempts = (
                  (SELECT fetch_attempts FROM google_search_results WHERE nips_paper_id = %(nips_paper_id)s) + 1
                )
            ''', search_result)
        except KeyError:
            print search_result
        self.conn.commit()

    def to_md(self, filename):
        print "dumping db to md"

        self.cursor.execute('''
            SELECT abstract, abstract_url, authors, category, nips_papers.id, pdf_url, title FROM nips_papers
            LEFT JOIN abstracts ON abstracts.nips_paper_id=nips_papers.id
            LEFT JOIN google_search_results ON google_search_results.nips_paper_id=nips_papers.id
            ORDER BY nips_papers.id ASC;
        ''')
        abstracts = self.cursor.fetchall()

        print "found %i abstracts" % len(abstracts)

        with io.open(filename, 'w', encoding="utf8") as f:
            for a in abstracts:
                f.write(u"### #%s: %s\n" % (a["id"], a["title"].decode('utf8', 'ignore')))
                if a["authors"]:
                    f.write(u"_%s_\n\n" % a["authors"].decode('utf8', 'ignore'))
                if a["abstract"]:
                    f.write(u"%s\n" % a["abstract"].decode('utf8', 'ignore'))
                if a["abstract_url"]:
                    f.write(u"[Abstract](%s), [PDF](%s)\n\n" % (a["abstract_url"], a["pdf_url"]))
                f.write(u"\n")
--------------------------------------------------------------------------------
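An aside on `src/db.py`: the `DB` helper is also convenient on its own for poking at the data outside the full pipeline. A minimal usage sketch, assuming the local `nips_scraper` database from the README exists (the paper title below is just a placeholder):

```python
# Minimal usage sketch of src/db.py; assumes the nips_scraper Postgres database exists.
from src.db import DB

db = DB()                                            # connects and creates the tables if needed
db.insert_nips_paper("A Hypothetical NIPS Paper")    # idempotent: ON CONFLICT DO NOTHING
missing = db.all_nips_papers_missing_abstracts()
print "%d papers still missing abstracts" % len(missing)
db.to_md("abstracts.md")                             # regenerate the Kindle-friendly export
```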
/src/google_etl.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import json
import requests

from .utils import ascii_alphafy, RateLimitError


class GoogleETL:
    def __init__(self, db):
        self.db = db
        self.google_search_api_key = open('keys/google_search_api_key').read().strip()
        self.search_engine_id = open('keys/search_engine_id').read().strip()

    def extract(self, title):
        urlified_title = "+".join(ascii_alphafy(title).split(" "))
        url = "https://www.googleapis.com/customsearch/v1?q={}&cx={}&key={}".format(urlified_title, self.search_engine_id, self.google_search_api_key)
        response = requests.get(url)
        if response.status_code >= 400:
            # Almost always the daily quota; bail out and try again tomorrow.
            print response.status_code
            print response.text
            raise RateLimitError
        return response

    def transform(self, nips_paper_id, response):
        body = json.loads(response.text)
        results = body.get('items') or []
        truncated_search_query = body['queries']['request'][0]['searchTerms'][:40]  # Long titles are truncated in results

        abstract = {
            "pdf_url": None,
            "abstract_url": None,
            "nips_paper_id": nips_paper_id
        }

        try:
            matches = [result for result in results if truncated_search_query in ascii_alphafy(result['title'])]
        except UnicodeEncodeError:
            return abstract

        if matches:
            match = matches[0]
            url = match["link"]
            assert "arxiv.org" in url
            # arXiv abstract and PDF URLs differ only in the /abs/ vs /pdf/ path segment.
            if "pdf" in url:
                abstract["pdf_url"] = url
                abstract["abstract_url"] = url.replace("pdf", "abs")
            else:
                abstract["abstract_url"] = url
                abstract["pdf_url"] = url.replace("abs", "pdf")
        else:
            print "----not found in %s results----" % len(results)
            # print "----not found in----\n%s\n" % "\n".join(["\t" + r['title'] for r in results])

        return abstract

    def load(self, record):
        self.db.upsert_search_result(record)
--------------------------------------------------------------------------------
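To make the URL pairing in `GoogleETL.transform` concrete, here's a sketch that feeds it a canned Custom Search response. The response body, title, and arXiv ID are made up, and the `keys/` files still need to exist because the constructor reads them; only the matching and abs/pdf logic is exercised:

```python
# Sketch: run GoogleETL.transform on a fake Custom Search response (hypothetical data).
import json

from src.google_etl import GoogleETL

class FakeResponse(object):
    def __init__(self, body):
        self.text = json.dumps(body)

body = {
    "queries": {"request": [{"searchTerms": "A Hypothetical NIPS Paper"}]},
    "items": [{"title": "A Hypothetical NIPS Paper", "link": "https://arxiv.org/abs/1711.00001"}],
}

etl = GoogleETL(None)  # transform() never touches the db; the keys/ files must still exist
print etl.transform(42, FakeResponse(body))
# -> {'pdf_url': 'https://arxiv.org/pdf/1711.00001',
#     'abstract_url': 'https://arxiv.org/abs/1711.00001',
#     'nips_paper_id': 42}
```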
/src/nips_etl.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import requests
from bs4 import BeautifulSoup


class NipsETL:
    def __init__(self, db):
        self.db = db

    def extract(self):
        response = requests.get("https://nips.cc/Conferences/2017/AcceptedPapersInitial")
        response.raise_for_status()
        return response

    def transform(self, response):
        # Paper titles are the bolded text inside each paragraph on the accepted-papers page.
        parsed_html = BeautifulSoup(response.text, "html.parser")
        return [title.text for title in parsed_html.select("p > b")]

    def load(self, titles):
        for title in titles:
            self.db.insert_nips_paper(title)
--------------------------------------------------------------------------------
/src/utils.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

def ascii_alphafy(string):
    """Strip colons, commas, and non-ASCII characters so a title can go in a search URL."""
    return string.replace(':', '').replace(',', '').decode('utf8', 'ignore').encode('ascii', errors='ignore')

def strip_text(string):
    """Collapse newlines and trim whitespace, returning a UTF-8 encoded string."""
    return string.replace('\n', ' ').strip().encode("utf8")

class RateLimitError(ValueError):
    """For Google rate limits"""
--------------------------------------------------------------------------------
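For reference, here's roughly how those two helpers behave on made-up inputs (Python 2 semantics, matching the rest of the repo):

```python
# Quick illustration of src/utils.py on hypothetical inputs.
from src.utils import ascii_alphafy, strip_text

print ascii_alphafy("Deep Nets, Revisited: A Hypothetical Title")
# -> "Deep Nets Revisited A Hypothetical Title" (commas, colons, and non-ASCII dropped)

print strip_text(u"\n  We propose a hypothetical method for reading 680 abstracts.\n")
# -> "We propose a hypothetical method for reading 680 abstracts."
```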