├── .gitignore
├── README.md
├── config.py.example
├── example.py
├── extract.py
├── filters.py
├── paper.py
├── requirements.txt
├── scraper.py
├── selector.py
├── utils.py
└── venue.py
/.gitignore:
--------------------------------------------------------------------------------
1 | venv
2 | config.py
3 | __pycache__
4 | example.csv
5 | papers.pkl
6 | run.py
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # OpenReview Scraper
2 | Scrape papers from top conferences such as ICML, ICLR, and NeurIPS using the OpenReview API, by searching for specific keywords in the title, abstract, or keywords of submissions, and save the results to a CSV file.
3 | Automation brings the time needed to gather papers down from several hours to a few minutes.
4 |
5 | ## Installation
6 | ```bash
7 | git clone https://github.com/pranftw/openreview_scraper.git && cd openreview_scraper # clone repo and enter the directory
8 | python -m venv venv # create virtual environment
9 | source venv/bin/activate # activate virtual environment
10 | pip install -r requirements.txt # install requirements
11 | cp config.py.example config.py # enter your OpenReview credentials in config.py
12 | ```
13 |
14 | ## Example
15 | ```python
16 | from scraper import Scraper
17 | from extract import Extractor
18 | from filters import title_filter, keywords_filter, abstract_filter
19 | from selector import Selector
20 | from utils import save_papers, load_papers
21 |
22 |
23 | years = [
24 | '2024'
25 | ]
26 | conferences = [
27 | 'ICLR'
28 | ]
29 | keywords = [
30 | 'generalization'
31 | ]
32 |
33 | def modify_paper(paper):
34 | paper.forum = f"https://openreview.net/forum?id={paper.forum}"
35 | paper.content['pdf'] = f"https://openreview.net{paper.content['pdf']}"
36 | return paper
37 |
38 | # what fields to extract
39 | extractor = Extractor(fields=['forum'], subfields={'content':['title', 'keywords', 'abstract', 'pdf', 'match']})
40 |
41 | # if you want to select papers manually among the scraped papers
42 | # selector = Selector()
43 |
44 | # select all scraped papers
45 | selector = None
46 |
47 | scraper = Scraper(conferences=conferences, years=years, keywords=keywords, extractor=extractor, fpath='example.csv', fns=[modify_paper], selector=selector)
48 |
49 | # adding filters to filter on
50 | scraper.add_filter(title_filter)
51 | scraper.add_filter(keywords_filter)
52 | scraper.add_filter(abstract_filter)
53 |
54 | scraper()
55 |
56 | # if you want to save scraped papers as OpenReview objects using pickle
57 | save_papers(scraper.papers, fpath='papers.pkl')
58 | saved_papers = load_papers(fpath='papers.pkl')
59 | ```
60 |
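61 | ## Filter thresholds
62 | Each filter uses fuzzy string matching from `thefuzz` with a default threshold of 85, and `add_filter` forwards any extra keyword arguments to the filter, so the threshold can be tuned per filter. A minimal sketch (the values below are illustrative, not recommendations):
63 | ```python
64 | # stricter matching on titles, looser matching on abstracts (illustrative values)
65 | scraper.add_filter(title_filter, threshold=95)
66 | scraper.add_filter(abstract_filter, threshold=75)
67 | ```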
--------------------------------------------------------------------------------
/config.py.example:
--------------------------------------------------------------------------------
1 | # OPENREVIEW CREDENTIALS
2 |
3 | EMAIL = ""
4 | PASSWORD = ""
5 |
6 |
--------------------------------------------------------------------------------
/example.py:
--------------------------------------------------------------------------------
1 | from scraper import Scraper
2 | from extract import Extractor
3 | from filters import title_filter, keywords_filter, abstract_filter
4 | from selector import Selector
5 | from utils import save_papers, load_papers
6 |
7 |
8 | years = [
9 | '2024'
10 | ]
11 | conferences = [
12 | 'ICLR'
13 | ]
14 | keywords = [
15 | 'generalization'
16 | ]
17 |
18 | def modify_paper(paper):
19 | paper.forum = f"https://openreview.net/forum?id={paper.forum}"
20 | paper.content['pdf'] = f"https://openreview.net{paper.content['pdf']}"
21 | return paper
22 |
23 | extractor = Extractor(fields=['forum'], subfields={'content':['title', 'keywords', 'abstract', 'pdf', 'match']})
24 | selector = Selector()
25 | scraper = Scraper(conferences=conferences, years=years, keywords=keywords, extractor=extractor, fpath='example.csv', fns=[modify_paper], selector=selector)
26 |
27 | scraper.add_filter(title_filter)
28 | scraper.add_filter(keywords_filter)
29 | scraper.add_filter(abstract_filter)
30 |
31 | scraper()
32 |
33 | save_papers(scraper.papers, fpath='papers.pkl')
34 | saved_papers = load_papers(fpath='papers.pkl')
--------------------------------------------------------------------------------
/extract.py:
--------------------------------------------------------------------------------
1 | class Extractor:
2 | def __init__(self, fields, subfields, include_subfield=False):
3 | self.fields = fields
4 | self.subfields = subfields
5 | self.include_subfield = include_subfield
6 |
7 | def __call__(self, paper):
8 | return self.extract(paper)
9 |
10 | def extract(self, paper):
11 | trimmed_paper = {}
12 | for field in self.fields:
13 | trimmed_paper[field] = getattr(paper, field)
14 | for subfield, fields in self.subfields.items():
15 | if self.include_subfield:
16 | trimmed_paper[subfield] = {}
17 | for field in fields:
18 | field_value = getattr(paper, subfield)[field]
19 | if self.include_subfield:
20 | trimmed_paper[subfield][field] = field_value
21 | else:
22 | trimmed_paper[field] = field_value
23 | return trimmed_paper
--------------------------------------------------------------------------------
/filters.py:
--------------------------------------------------------------------------------
1 | from thefuzz import fuzz
2 |
3 |
4 | def check_keywords_with_keywords(keywords, paper_keywords, threshold):
5 | if not paper_keywords:
6 | return None, False
7 |
8 | # Ensure paper_keywords is a list
9 | if not isinstance(paper_keywords, list):
10 | if isinstance(paper_keywords, str):
11 | paper_keywords = [paper_keywords]
12 | else:
13 | try:
14 | paper_keywords = list(paper_keywords)
15 | except Exception:
16 | paper_keywords = [str(paper_keywords)]
17 |
18 | for keyword in keywords:
19 | if keyword is None:
20 | continue
21 |
22 | # Ensure keyword is a string
23 | keyword = str(keyword)
24 |
25 | if not keyword.strip():
26 | continue
27 |
28 | for paper_keyword in paper_keywords:
29 | if paper_keyword is None:
30 | continue
31 |
32 | # Ensure paper_keyword is a string
33 | paper_keyword = str(paper_keyword)
34 |
35 | if not paper_keyword.strip():
36 | continue
37 |
38 | try:
39 | if fuzz.ratio(keyword, paper_keyword) >= threshold:
40 | return keyword, True
41 | except Exception as e:
42 | print(f"Error comparing '{keyword}' with '{paper_keyword}': {e}")
43 | continue
44 |
45 | return None, False
46 |
47 |
48 | def check_keywords_with_text(keywords, text, threshold):
49 | if text is None:
50 | return None, False
51 |
52 | # Ensure text is a string
53 | text = str(text)
54 |
55 | for keyword in keywords:
56 | if keyword is None:
57 | continue
58 |
59 | # Ensure keyword is a string
60 | keyword = str(keyword)
61 |
62 | # Skip empty strings
63 | if not keyword.strip() or not text.strip():
64 | continue
65 |
66 | try:
67 | if fuzz.partial_ratio(keyword, text) >= threshold:
68 | return keyword, True
69 | except Exception as e:
70 | print(f"Error comparing '{keyword}' with text: {e}")
71 | continue
72 |
73 | return None, False
74 |
75 |
76 | def satisfies_any_filters(paper, keywords, filters):
77 | for filter_, args, kwargs in filters:
78 | matched_keyword, matched = filter_(paper, keywords=keywords, *args, **kwargs)
79 | if matched:
80 | filter_type = filter_.__name__
81 | return matched_keyword, filter_type, True
82 | return None, None, False
83 |
84 |
85 | def keywords_filter(paper, keywords, threshold=85):
86 | paper_keywords = paper.content.get('keywords')
87 | if paper_keywords is not None:
88 | return check_keywords_with_keywords(keywords, paper_keywords, threshold)
89 | return None, False
90 |
91 |
92 | def title_filter(paper, keywords, threshold=85):
93 | paper_title = paper.content.get('title')
94 | if paper_title is not None:
95 | return check_keywords_with_text(keywords, paper_title, threshold)
96 | return None, False
97 |
98 |
99 | def abstract_filter(paper, keywords, threshold=85):
100 | paper_abstract = paper.content.get('abstract')
101 | if paper_abstract is not None:
102 | return check_keywords_with_text(keywords, paper_abstract, threshold)
103 | return None, False
--------------------------------------------------------------------------------
/paper.py:
--------------------------------------------------------------------------------
1 | def get_grouped_venue_papers(clients, grouped_venue, only_accepted):
2 | """
3 | Get papers from both API v1 and API v2 clients and merge the results.
4 |
5 | Args:
6 | clients: Tuple of (client_v1, client_v2)
7 | grouped_venue: List of venue IDs
8 | only_accepted: Boolean to filter only accepted papers
9 |
10 | Returns:
11 | Dictionary of papers by venue
12 | """
13 | client_v1, client_v2 = clients
14 | papers = {}
15 |
16 | for venue in grouped_venue:
17 | papers[venue] = []
18 |
19 | # Get papers from API v1
20 | submissions_v1 = []
21 | try:
22 | if only_accepted:
23 | submissions_v1 = client_v1.get_all_notes(content={'venueid': venue}, details='directReplies')
24 | else:
25 | single_blind_submissions = client_v1.get_all_notes(invitation=f'{venue}/-/Submission', details='directReplies')
26 | double_blind_submissions = client_v1.get_all_notes(invitation=f'{venue}/-/Blind_Submission', details='directReplies')
27 | submissions_v1 = single_blind_submissions + double_blind_submissions
28 | except Exception as e:
29 | print(f"Error getting papers from API v1 for venue {venue}: {e}")
30 |
31 | # Get papers from API v2
32 | submissions_v2 = []
33 | try:
34 | if only_accepted:
35 | submissions_v2 = client_v2.get_all_notes(content={'venueid': venue}, details='directReplies')
36 | else:
37 | single_blind_submissions = client_v2.get_all_notes(invitation=f'{venue}/-/Submission', details='directReplies')
38 | double_blind_submissions = client_v2.get_all_notes(invitation=f'{venue}/-/Blind_Submission', details='directReplies')
39 | submissions_v2 = single_blind_submissions + double_blind_submissions
40 | except Exception as e:
41 | print(f"Error getting papers from API v2 for venue {venue}: {e}")
42 |
43 | # Merge submissions from both APIs
44 | # Use forum IDs to avoid duplicates
45 | forum_ids = set()
46 | merged_submissions = []
47 |
48 | for submission in submissions_v1 + submissions_v2:
49 | if hasattr(submission, 'forum') and submission.forum not in forum_ids:
50 | forum_ids.add(submission.forum)
51 | merged_submissions.append(submission)
52 |
53 | papers[venue] += merged_submissions
54 |
55 | print(venue)
56 | print(f'Number of papers: {len(merged_submissions)}')
57 |
58 | return papers
59 |
60 |
61 | def get_papers(clients, grouped_venues, only_accepted):
62 | """
63 | Get papers for all grouped venues.
64 |
65 | Args:
66 | clients: Tuple of (client_v1, client_v2)
67 | grouped_venues: Dictionary of venue IDs by group
68 | only_accepted: Boolean to filter only accepted papers
69 |
70 | Returns:
71 | Dictionary of papers by group and venue
72 | """
73 | papers = {}
74 | for group, grouped_venue in grouped_venues.items():
75 | papers[group] = get_grouped_venue_papers(clients, grouped_venue, only_accepted)
76 | return papers
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | openreview-py
2 | thefuzz[speedup]
3 | dill
--------------------------------------------------------------------------------
/scraper.py:
--------------------------------------------------------------------------------
1 | from utils import get_client, to_csv, papers_to_list
2 | from venue import get_venues, group_venues
3 | from paper import get_papers
4 | from filters import satisfies_any_filters
5 |
6 |
7 | class Scraper:
8 | def __init__(self, conferences, years, keywords, extractor, fpath, selector=None, fns=[], groups=['conference'], only_accepted=True):
9 | # fns is a list of user-specified functions, each taking a single paper object and returning the modified paper
10 | self.confs = conferences
11 | self.years = years
12 | self.keywords = keywords
13 | self.extractor = extractor
14 | self.fpath = fpath
15 | self.fns = fns
16 | self.groups = groups
17 | self.only_accepted = only_accepted
18 | self.selector = selector
19 | self.filters = []
20 | # Get both API v1 and API v2 clients
21 | self.clients = get_client()
22 | self.papers = None # will hold the raw scraped papers (OpenReview note objects), set in scrape() before filtering
23 |
24 | def __call__(self):
25 | self.scrape()
26 |
27 | def scrape(self):
28 | print("Getting venues...")
29 | venues = get_venues(self.clients, self.confs, self.years)
30 | print("Getting papers...\n")
31 | papers = get_papers(self.clients, group_venues(venues, self.groups), self.only_accepted)
32 | self.papers = papers
33 | print("\nFiltering papers...")
34 | papers = self.apply_on_papers(papers)
35 | if self.selector is not None:
36 | papers_list = self.selector(papers)
37 | else:
38 | papers_list = papers_to_list(papers)
39 | print("Saving as CSV...")
40 | to_csv(papers_list, self.fpath)
41 | print(f"Saved at {self.fpath}")
42 |
43 | def apply_on_papers(self, papers):
44 | modified_papers = {}
45 | for group, grouped_venues in papers.items():
46 | modified_papers[group] = {}
47 | for venue, venue_papers in grouped_venues.items():
48 | modified_papers[group][venue] = []
49 | venue_split = venue.split('/')
50 | venue_name, venue_year, venue_type = venue_split[0], venue_split[1], venue_split[2]
51 | for paper in venue_papers:
52 | # FILTERS
53 | satisfying_keyword, satisfying_filter_type, satisfies = satisfies_any_filters(paper, self.keywords, self.filters)
54 | if satisfies:
55 | # creating a new field(key) in content attr which is a dict
56 | paper.content['match'] = {satisfying_filter_type: satisfying_keyword}
57 | paper.content['group'] = group
58 | # Execute some custom functions
59 | for fn in self.fns:
60 | paper = fn(paper)
61 | # FIELD EXTRACTION here paper object will be converted into a dict
62 | extracted_paper = self.extractor(paper)
63 | # add some extra fields
64 | extracted_paper['venue'] = venue_name
65 | extracted_paper['year'] = venue_year
66 | extracted_paper['type'] = venue_type
67 | modified_papers[group][venue].append(extracted_paper)
68 | return modified_papers
69 |
70 | def add_filter(self, filter_, *args, **kwargs):
71 | self.filters.append((filter_, args, kwargs))
--------------------------------------------------------------------------------
/selector.py:
--------------------------------------------------------------------------------
1 | import os
2 | from utils import papers_to_list
3 |
4 |
5 | class Selector:
6 | def __init__(self, fields=None, options=None, start_idx=0):
7 | self.idx = start_idx
8 | self.fields = fields if fields is not None else ['title', 'abstract']
9 | if options is None:
10 | self.options = {
11 | 'y':{
12 | 'desc':'yes',
13 | 'fn':lambda paper, selected_papers:selected_papers.append(paper)
14 | },
15 | 'n':{
16 | 'desc':'no'
17 | }
18 | }
19 | else:
20 | self.options = options
21 | self.options['e'] = {'desc':'exit'}
22 |
23 | def __call__(self, papers):
24 | return self.select(papers)
25 |
26 | def select(self, papers):
27 | os.system('clear') # only supports Unix-based systems
28 | papers_list = papers_to_list(papers)
29 | selected_papers = []
30 | while self.idx < len(papers_list):
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
44 | if len(papers_list) > 0:
45 | field_names = list(papers_list[0].keys()) # choose one of the papers, get all the keys as they'll be same for rest of them
46 | write_csv()
47 |
48 |
49 | def save_papers(papers, fpath):
50 | with open(fpath, 'wb') as fp:
51 | dill.dump(papers, fp)
52 | print(f'Papers saved at: {fpath}')
53 |
54 |
55 | def load_papers(fpath):
56 | with open(fpath, 'rb') as fp:
57 | papers = dill.load(fp)
58 | print(f'Papers loaded from: {fpath}')
59 | return papers
--------------------------------------------------------------------------------
/venue.py:
--------------------------------------------------------------------------------
1 | def get_venues(clients, confs, years):
2 | """
3 | Get venues from both API v1 and API v2 clients and merge the results.
4 |
5 | Args:
6 | clients: Tuple of (client_v1, client_v2)
7 | confs: List of conference names
8 | years: List of years
9 |
10 | Returns:
11 | List of venue IDs
12 | """
13 | client_v1, client_v2 = clients
14 |
15 | def filter_year(venue):
16 | if venue is None:
17 | return None
18 | for year in years:
19 | if year in venue:
20 | return venue
21 | return None
22 |
23 | # Get venues from API v1
24 | venues_v1 = []
25 | try:
26 | venues_v1 = client_v1.get_group(id='venues').members
27 | except Exception as e:
28 | print(f"Error getting venues from API v1: {e}")
29 |
30 | # Get venues from API v2
31 | venues_v2 = []
32 | try:
33 | venues_v2 = client_v2.get_group(id='venues').members
34 | except Exception as e:
35 | print(f"Error getting venues from API v2: {e}")
36 |
37 | # Merge venues from both APIs
38 | venues = list(set(venues_v1 + venues_v2))
39 |
40 | venues = list(map(filter_year, venues))
41 | venues = filter(lambda venue: venue is not None, venues)
42 | reqd_venues = []
43 | for venue in venues:
44 | for conf in confs:
45 | if conf.lower() in venue.lower():
46 | reqd_venues.append(venue)
47 | break
48 | reqd_venues = map(filter_year, reqd_venues)
49 | reqd_venues = list(filter(lambda venue: venue is not None, reqd_venues))
50 | return reqd_venues
51 |
52 |
53 | def group_venues(venues, bins):
54 | def get_bins_dict():
55 | bins_dict = {bin:[] for bin in bins}
56 | return bins_dict
57 |
58 | bins_dict = get_bins_dict()
59 | for venue in venues:
60 | for bin in bins:
61 | if bin.lower() in venue.lower():
62 | bins_dict[bin].append(venue)
63 | break
64 |
65 | return bins_dict
--------------------------------------------------------------------------------