├── __init__.py
├── base
│   ├── __init__.py
│   ├── general_utils.py
│   ├── file_download.py
│   └── pdf_extract.py
├── db
│   ├── __init__.py
│   ├── csv.py
│   ├── endnote_html.py
│   ├── ris.py
│   ├── bibtex.py
│   ├── rayyan.py
│   ├── ref_utils.py
│   └── data.py
├── requirements.txt
├── search
│   ├── __init__.py
│   ├── other_search.py
│   ├── base_search.py
│   ├── google_scholar.py
│   └── metadata_harvest.py
├── import_metadata.py
├── bib_to_csv.py
├── bulk_download.py
├── titles_and_bibs.py
├── export_rayyan_results.py
├── export_to_ris.py
├── .gitignore
├── import_from_endnote.py
├── gather_metadata.py
├── add_abstracts_from_pdf.py
├── search_to_file.py
├── bib_diff.py
├── reasons_for_exclusion.py
├── README.md
├── snowball_citations.py
└── filter_results.py
/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/base/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/db/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/db/csv.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 |
4 | def readCSVFile(filename):
5 | df = pd.read_csv(filename)
6 | return df.to_dict(orient='records')
7 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | langdetect
2 | pandas
3 | beautifulsoup4
4 | lxml
5 | scholarly
6 | tqdm
7 | bibtexparser
8 | requests
9 | strsimpy
10 | RISparser==0.4.3
11 | tika
12 | pyenchant
13 |
--------------------------------------------------------------------------------
/search/__init__.py:
--------------------------------------------------------------------------------
1 | from .base_search import MAX_RESULTS, Searcher, SearchResult, getSearchResultsFromBib
2 | from .google_scholar import GScholarSearcher
3 | from .metadata_harvest import enrichMetadata, enrichAndUpdateMetadata
4 | from .other_search import PubMedSearcher, SemanticScholarSearcher
--------------------------------------------------------------------------------
/import_metadata.py:
--------------------------------------------------------------------------------
1 | from argparse import ArgumentParser
2 | from base.general_utils import loadEntriesAndSetUp, writeOutputBib
3 | import pandas as pd
4 |
5 | def main(conf):
6 | paperstore, papers_to_add, papers_existing, all_papers = loadEntriesAndSetUp(conf.input, True)
7 |
8 | paperstore.addPapers(papers_to_add)
9 | if conf.force:
10 | paperstore.updatePapers(papers_existing)
11 |
12 |
13 | if __name__ == '__main__':
14 | parser = ArgumentParser(description='Import metadata from bib file')
15 | parser.add_argument('-i', '--input', type=str,
16 | help='Input bib file name')
17 | parser.add_argument('-f', '--force', type=bool, default=False,
18 | help='Force updating of existing paper records')
19 |
20 | conf = parser.parse_args()
21 |
22 | main(conf)
23 |
--------------------------------------------------------------------------------
/search/other_search.py:
--------------------------------------------------------------------------------
1 | from .base_search import Searcher, MAX_RESULTS
2 | from .metadata_harvest import SemanticScholarScraper, PubMedScraper
3 |
4 |
5 | class SemanticScholarSearcher(Searcher):
6 | def __init__(self, paperstore):
7 | super().__init__(paperstore)
8 | self.scraper = SemanticScholarScraper()
9 |
10 | def search(self, query, min_year=None, max_year=None, max_results=MAX_RESULTS):
11 | res = self.scraper.search(query, identity='', min_year=min_year, max_year=max_year)
12 | return res
13 |
14 |
15 | class PubMedSearcher(Searcher):
16 | def __init__(self, paperstore):
17 | super().__init__(paperstore)
18 | self.scraper = PubMedScraper()
19 |
20 | def search(self, query, min_year=None, max_year=None, max_results=MAX_RESULTS):
21 |         return self.scraper.search(query, identity='')
22 |
--------------------------------------------------------------------------------
/bib_to_csv.py:
--------------------------------------------------------------------------------
1 | from argparse import ArgumentParser
2 | from base.general_utils import loadEntriesAndSetUp
3 |
4 | import pandas as pd
5 |
6 | def dataframeFromPapers(papers):
7 | report = []
8 |
9 | for paper in papers:
10 | report.append(paper.asDict())
11 |
12 | df = pd.DataFrame(report, columns=['id', 'year', 'title', 'authors', 'venue', 'abstract', 'doi', 'pmid', ])
13 | return df
14 |
15 |
16 | def main(conf):
17 | paperstore, papers_to_add, papers_existing, all_papers = loadEntriesAndSetUp(conf.input, conf.cache)
18 |
19 |
20 | df = dataframeFromPapers(all_papers)
21 | df.to_csv(conf.output)
22 |
23 |
24 | if __name__ == '__main__':
25 |     parser = ArgumentParser(description='Export a bib/RIS/CSV bibliography to a CSV file')
26 |
27 | parser.add_argument('-i', '--input', type=str,
28 | help='Input bib file name')
29 | parser.add_argument('-o', '--output', type=str,
30 | help='Output csv file name')
31 | parser.add_argument('-c', '--cache', type=bool, default=True,
32 | help='Use local cache for results')
33 |
34 | conf = parser.parse_args()
35 |
36 | main(conf)
37 |
--------------------------------------------------------------------------------
/bulk_download.py:
--------------------------------------------------------------------------------
1 | from argparse import ArgumentParser
2 |
3 | from base.file_download import bulkDownload
4 | from base.general_utils import loadEntriesAndSetUp
5 |
6 |
7 | def main(conf):
8 | paperstore, papers_to_add, papers_existing, all_papers = loadEntriesAndSetUp(conf.input, conf.cache, conf.max)
9 |
10 | bulkDownload(all_papers, conf.dir, conf.report_path, do_not_download_just_list=False)
11 |
12 |
13 | if __name__ == '__main__':
14 |     parser = ArgumentParser(description='Bulk-download the PDF for each paper in a bibliography')
15 |
16 | parser.add_argument('-i', '--input', type=str,
17 | help='Input bib/RIS/CSV file name')
18 | parser.add_argument('-d', '--dir', type=str,
19 | help='Directory where to store the output')
20 | parser.add_argument('-c', '--cache', type=bool, default=True,
21 | help='Use local cache for results')
22 | parser.add_argument('-m', '--max', type=int, default=100,
23 | help='Maximum number of results to process')
24 | parser.add_argument('-r', '--report-path', type=str, default='results_report.csv',
25 | help='Path to CSV file with a download report')
26 |
27 | conf = parser.parse_args()
28 |
29 | main(conf)
30 |
--------------------------------------------------------------------------------
/search/base_search.py:
--------------------------------------------------------------------------------
1 | import re
2 | from db.data import Paper
3 |
4 | MAX_RESULTS = 100
5 |
6 |
7 | class Searcher:
8 | def __init__(self, paperstore):
9 | self.paperstore = paperstore
10 |
11 | def search(self, query, min_year=None, max_year=None, max_results=MAX_RESULTS):
12 | pass
13 |
14 |
15 | class SearchResult(Paper):
16 | def __init__(self, index, bib, source, extra_data):
17 | super().__init__(bib, extra_data)
18 | self.index = index
19 | self.source = source
20 | self.paper = None
21 |
22 | def __getitem__(self, item):
23 | return self.extra_data.get(item, self.bib.get(item))
24 |
25 | def __repr__(self):
26 |         return "<#%d: %s - %s - %s> \n %s" % (
27 | self.index, self.bib.get("title", ""),
28 | self.bib.get("author", ""),
29 | self.bib.get("year", ""), str(self.bib))
30 |
31 |
32 | def getSearchResultsFromBib(bib_entries, max_results=100000000):
33 | results = []
34 | for index, bib in enumerate(bib_entries[:max_results]):
35 | res = SearchResult(index, bib, 'bibfile', {})
36 | if bib.get('note'):
37 | match = re.search('(\d+)\scites:\s.+?scholar\?cites\=(\d+)', bib['note'])
38 | if match:
39 | res.source = 'scholar'
40 | res.extra_data['scholarid'] = match.group(2)
41 | res.extra_data['citedby'] = match.group(1)
42 | results.append(res)
43 |
44 | return results
45 |
--------------------------------------------------------------------------------
/titles_and_bibs.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | import pandas as pd
4 |
5 | from db.data import PaperStore
6 | from db.rayyan import loadRayyan, computeReviewerOverlap
7 | from db.rayyan import selectPapersToReview
8 |
9 |
10 | if __name__ == "__main__":
11 |
12 | parser = argparse.ArgumentParser(description='Gather metadata such as'
13 | 'reason for exclusion + bib information')
14 | parser.add_argument('-o', '--outfile', type=str,
15 | help='Output pandas csv filename')
16 |
17 | args = parser.parse_args()
18 |
19 | paper_store = PaperStore()
20 |
21 | # sysreview articles
22 | sysreviewdf = pd.read_excel(os.path.join('reasons_for_exclusion', 'sysreview-15-09-2020.xlsx'))
23 |
24 | bibs = []
25 |
26 | # Add bib files to the dataframe for those that have a bib entry
27 | for title in sysreviewdf.title:
28 | paper = paper_store.findPapersByTitle(title)
29 | if paper:
30 | bibs.append(paper[0].bib)
31 | else:
32 | bibs.append(None)
33 |
34 | sysreviewdf['bib'] = bibs
35 |
36 | # Only keep titles and bibs
37 | sysreviewdf = sysreviewdf[['title', 'bib']]
38 |
39 | print(sysreviewdf)
40 | print('Writing results to %s' % args.outfile)
41 | sysreviewdf.to_csv(args.outfile, index=False)
42 |
43 | # notes = joined.notes.str.split('|').str[1]
44 | # notes = notes.str.split(':').str[-1]
45 | # notes = notes.str.split(',')
46 | # print(notes.isna().sum())
47 | #
48 | # # Extract reasons from the notes section
49 | # pass
50 |
--------------------------------------------------------------------------------
/export_rayyan_results.py:
--------------------------------------------------------------------------------
1 | from argparse import ArgumentParser
2 | from db.rayyan import loadRayyan, computeReviewerOverlap
3 | from db.rayyan import selectPapersToReview, selectPapersToFilter
4 |
5 |
6 | def main(conf):
7 | df = loadRayyan(conf.input)
8 | computeReviewerOverlap(df)
9 | # If we want exact include/exclude - call filter
10 | if (conf.num_included, conf.num_excluded) != (None, None):
11 | to_filter = selectPapersToFilter(df,
12 | include_count=conf.num_included,
13 | exclude_count=conf.num_excluded)
14 | print('\nTotal selected for filtering', len(to_filter))
15 | to_filter.to_csv(conf.output)
16 | else:
17 | to_review = selectPapersToReview(df, conf.min_votes)
18 | print('\nTotal selected for review', len(to_review))
19 | to_review.to_csv(conf.output)
20 |
21 |
22 | if __name__ == '__main__':
23 |     parser = ArgumentParser(description='Export screening results from a Rayyan archive to a CSV report')
24 |
25 | parser.add_argument('-i', '--input', type=str,
26 | help='Input .zip file downloaded from Rayyan')
27 | parser.add_argument('-o', '--output', type=str,
28 | help='Path to output report CSV')
29 | parser.add_argument('-v', '--min-votes', type=int, default=1,
30 | help='Minimum votes for inclusion')
31 | parser.add_argument('--num_included', type=int,
32 | help='Exact number of inclusion votes')
33 | parser.add_argument('--num_excluded', type=int,
34 | help='Exact number of exclusion votes')
35 |
36 | conf = parser.parse_args()
37 |
38 | main(conf)
39 |
--------------------------------------------------------------------------------
/db/endnote_html.py:
--------------------------------------------------------------------------------
1 | import re
2 | from db.ref_utils import isPDFURL
3 |
4 | mapping = [
5 | # ('Reference Type: ', 'ENTRYTYPE'),
6 | ('Title', 'title'),
7 | ('Journal', 'journal'),
8 | ('DOI', 'doi'),
9 | ('Author Address', 'address'),
10 | ('Author', 'author'),
11 |     ('Volume', 'volume'),
12 | ]
13 |
14 | type_mapping = {
15 | 'Journal Article': 'article',
16 | 'Thesis': 'thesis',
17 | 'Book': 'book',
18 | }
19 |
20 |
21 | def loadRefsFromHTML(filename):
22 | with open(filename) as f:
23 | html = f.read()
24 |
25 |     html = html[html.find('<br />') + 6:]
26 | # html = re.sub('.+', '', html, flags=re.DOTALL)
27 |     entries = re.split('(\n<br \/>\n<br \/>)', html)
28 | res = []
29 |
30 | for entry in entries:
31 | lines = entry.split('\n')
32 | new_bib = {}
33 |
34 | for line in lines:
35 |             match = re.search('<b>Reference Type: <\/b> (.+?)<br \/>', line)
36 | if match:
37 | if match.group(1) in type_mapping:
38 | new_bib['ENTRYTYPE'] = type_mapping[match.group(1)]
39 | else:
40 | new_bib['ENTRYTYPE'] = 'article'
41 |
42 | for bib_map in mapping:
43 |                 match = re.search('<b>' + bib_map[0] + ':<\/b> (.+?)<br \/>', line)
44 | if match:
45 | new_bib[bib_map[1]] = match.group(1)
46 |
47 |         for match in re.finditer('<a href="(.+?)">', entry):
48 | if isPDFURL(match.group(1)):
49 | new_bib['eprint'] = match.group(1)
50 | else:
51 | new_bib['url'] = match.group(1)
52 |
53 | res.append(new_bib)
54 |
55 | return res
56 |
--------------------------------------------------------------------------------
/export_to_ris.py:
--------------------------------------------------------------------------------
1 | from base.general_utils import loadEntriesAndSetUp
2 |
3 | from argparse import ArgumentParser
4 | from db.ris import writeBibToRISFile
5 |
6 |
7 | def main(conf):
8 | paperstore, papers_to_add, papers_existing, all_papers = loadEntriesAndSetUp(conf.input, conf.cache)
9 |
10 | if conf.missing_abstract:
11 | all_bibs = []
12 | for paper in all_papers:
13 | if not paper.has_pdf and not paper.has_abstract:
14 | all_bibs.append(paper.bib)
15 | elif conf.missing_pdf:
16 | all_bibs = []
17 | for paper in all_papers:
18 | if not paper.has_pdf:
19 | all_bibs.append(paper.bib)
20 | else:
21 | all_bibs = [p.bib for p in all_papers]
22 |
23 | writeBibToRISFile(all_bibs, conf.output)
24 |
25 |
26 | if __name__ == '__main__':
27 | parser = ArgumentParser(
28 | description='Exports a bibliography to RIS (EndNote) for further gathering of PDFs')
29 |
30 | parser.add_argument('-i', '--input', type=str,
31 | help='Input Bibtex file with the previously cached search results')
32 | parser.add_argument('-o', '--output', type=str,
33 | help='Output RIS file')
34 | parser.add_argument('-x', '--missing-pdf', type=bool, default=False,
35 | help='Export *only* papers missing a PDF')
36 | parser.add_argument('-a', '--missing-abstract', type=bool, default=False,
37 | help='Export *only* papers that are also missing an abstract')
38 | parser.add_argument('-c', '--cache', type=bool, default=True,
39 | help='Use local cache for results')
40 |
41 | conf = parser.parse_args()
42 |
43 | main(conf)
44 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
105 |
--------------------------------------------------------------------------------
/base/general_utils.py:
--------------------------------------------------------------------------------
1 | from db.bibtex import readBibtexFile
2 | from db.data import PaperStore, Paper
3 | from search import getSearchResultsFromBib
4 | from db.ref_utils import simpleResultDeDupe
5 | from db.bibtex import writeBibtex
6 | from db.ris import writeRIS, readRIS
7 | from db.csv import readCSVFile
8 | from search.metadata_harvest import mergeResultData
9 |
10 | def loadEntriesAndSetUp(input, use_cache=True, max_results=10000000):
11 | if use_cache:
12 | paperstore = PaperStore()
13 | else:
14 | paperstore = None
15 |
16 | bib_entries = readInputBib(input)
17 | results = getSearchResultsFromBib(bib_entries, max_results)
18 |
19 | results = simpleResultDeDupe(results)
20 |
21 | if paperstore:
22 | found, missing = paperstore.matchResultsWithPapers(results)
23 | else:
24 | found = []
25 | missing = results
26 |
27 | papers_to_add = [Paper(res.bib, res.extra_data) for res in missing]
28 | papers_existing = [mergeResultData(res, res.paper) for res in found]
29 |
30 | all_papers = papers_to_add + papers_existing
31 |
32 | # FIXME: a second dedupe is needed because it seems I'm matching the wrong paper
33 | # a total of 5 records suffer from this so it's no big deal
34 | all_papers = simpleResultDeDupe(all_papers)
35 |
36 | return paperstore, papers_to_add, papers_existing, all_papers
37 |
38 | def readInputBib(filename):
39 | if filename.endswith('.bib'):
40 | return readBibtexFile(filename)
41 | elif filename.endswith('.csv'):
42 | return readCSVFile(filename)
43 | elif filename.endswith('.ris'):
44 | return readRIS(filename)
45 |
46 | def writeOutputBib(bib, filename):
47 | if filename.endswith('.ris'):
48 | writeRIS(bib, filename)
49 | else:
50 | writeBibtex(bib, filename)
--------------------------------------------------------------------------------
/import_from_endnote.py:
--------------------------------------------------------------------------------
1 | from argparse import ArgumentParser
2 |
3 | from db.data import PaperStore, Paper
4 | from db.endnote_html import loadRefsFromHTML
5 | from search import getSearchResultsFromBib
6 | from db.ref_utils import addUrlIfNewWithType
7 |
8 |
9 | def main(conf):
10 | if conf.cache:
11 | paperstore = PaperStore()
12 | else:
13 | paperstore = None
14 |
15 | bib_entries = loadRefsFromHTML(conf.input)
16 |
17 | results = getSearchResultsFromBib(bib_entries)
18 |
19 | if paperstore:
20 | found, missing = paperstore.matchResultsWithPapers(results)
21 | else:
22 | found = []
23 | missing = results
24 |
25 | papers_to_add = [Paper(res.bib, res.extra_data) for res in missing]
26 |
27 | counter = 0
28 |
29 | for res in found:
30 | if res.bib.get('url'):
31 | if addUrlIfNewWithType(res.paper, res['url'], 'endnote'):
32 | counter += 1
33 | if res.bib.get('eprint'):
34 | if addUrlIfNewWithType(res.paper, res['eprint'], 'endnote'):
35 | counter += 1
36 |
37 | papers_existing = [res.paper for res in found]
38 | paperstore.updatePapers(papers_existing)
39 |
40 | print('Papers found', len(papers_existing))
41 | print('Papers not found', len(papers_to_add))
42 | print('Added', counter, 'urls')
43 |
44 | if __name__ == '__main__':
45 | parser = ArgumentParser(
46 |         description='Imports references from an EndNote HTML export and adds their URLs to cached papers')
47 |
48 | parser.add_argument('-i', '--input', type=str,
49 | help='Input EndNote HTML file')
50 | parser.add_argument('-c', '--cache', type=bool, default=True,
51 | help='Use local cache for results')
52 |
53 | conf = parser.parse_args()
54 |
55 | main(conf)
56 |
--------------------------------------------------------------------------------
/gather_metadata.py:
--------------------------------------------------------------------------------
1 | from base.general_utils import loadEntriesAndSetUp, writeOutputBib
2 |
3 | from search import enrichAndUpdateMetadata
4 | from argparse import ArgumentParser
5 | from db.bibtex import writeBibtex
6 |
7 |
8 | def main(conf):
9 | paperstore, papers_to_add, papers_existing, all_papers = loadEntriesAndSetUp(conf.input, conf.cache, conf.max)
10 |
11 | if conf.cache:
12 | successful, unsuccessful = enrichAndUpdateMetadata(papers_to_add, paperstore, conf.email)
13 |
14 | if conf.force and conf.cache:
15 | enrichAndUpdateMetadata(papers_existing, paperstore, conf.email)
16 |
17 | all_papers = papers_to_add + papers_existing
18 | writeOutputBib(all_papers, conf.output)
19 |
20 |
21 | if __name__ == '__main__':
22 | parser = ArgumentParser(
23 | description='Gathers metadata, including the abstract, on a list of search results by searching on Crossref, PubMed, arXiv, Semantic Scholar and Unpaywall')
24 |
25 | parser.add_argument('-i', '--input', type=str,
26 | help='Input BIB/RIS file with the previously cached search results')
27 | parser.add_argument('-o', '--output', type=str,
28 | help='Output BIB/RIS file into which to update the new, augmented results')
29 | parser.add_argument('-m', '--max', type=int, default=100,
30 | help='Maximum number of results to process')
31 | parser.add_argument('-em', '--email', type=str,
32 | help='Email to serve as identity to API endpoints')
33 | parser.add_argument('-c', '--cache', type=bool, default=True,
34 | help='Use local cache for results')
35 | parser.add_argument('-f', '--force', type=bool, default=False,
36 | help='Force updating metadata for cached results')
37 |
38 | conf = parser.parse_args()
39 |
40 | main(conf)
41 |
--------------------------------------------------------------------------------
/add_abstracts_from_pdf.py:
--------------------------------------------------------------------------------
1 | import os
2 | from base.general_utils import loadEntriesAndSetUp
3 | from base.file_download import bulkDownload
4 | from base.pdf_extract import getAbstractFromPDF
5 | from argparse import ArgumentParser
6 | from db.bibtex import writeBibtex
7 |
8 |
9 | def main(conf):
10 | paperstore, papers_to_add, papers_existing, all_papers = loadEntriesAndSetUp(conf.input, conf.cache, conf.max)
11 |
12 | no_abstract_but_pdf = [p for p in all_papers if not p.has_abstract and p.has_pdf]
13 | bulkDownload(no_abstract_but_pdf, conf.dir, 'results_report.csv', do_not_download_just_list=True)
14 |
15 | successful = []
16 | for paper in no_abstract_but_pdf:
17 | if not os.path.exists(paper.pdf_filename):
18 | continue
19 |
20 | abstract = getAbstractFromPDF(paper.pdf_filename)
21 |
22 | if abstract:
23 | print(abstract)
24 | paper.bib['abstract'] = abstract
25 | paperstore.updatePapers([paper])
26 | successful.append(paper)
27 |
28 | print('Generated',len(successful), 'new abstracts')
29 | writeBibtex(successful, conf.output)
30 |
31 |
32 | if __name__ == '__main__':
33 | parser = ArgumentParser(
34 | description='Tries to download the PDF for each file and extract the abstract from it')
35 |
36 | parser.add_argument('-i', '--input', type=str,
37 | help='Input Bibtex file with the previously cached search results')
38 | parser.add_argument('-o', '--output', type=str,
39 |                         help='Output Bibtex file into which to update the new, augmented results')
40 | parser.add_argument('-d', '--dir', type=str,
41 | help='Directory where to store the downloaded PDFs')
42 | parser.add_argument('-m', '--max', type=int, default=100,
43 | help='Maximum number of results to process')
44 | parser.add_argument('-em', '--email', type=str,
45 | help='Email to serve as identity to API endpoints')
46 | parser.add_argument('-c', '--cache', type=bool, default=True,
47 | help='Use local cache for results')
48 |
49 | conf = parser.parse_args()
50 |
51 | main(conf)
52 |
--------------------------------------------------------------------------------
/search_to_file.py:
--------------------------------------------------------------------------------
1 | from db.data import PaperStore, Paper
2 |
3 | from search import GScholarSearcher, enrichAndUpdateMetadata
4 | from argparse import ArgumentParser
5 | from db.bibtex import writeBibtex
6 |
7 |
8 | def main(conf):
9 | if conf.cache:
10 | paperstore = PaperStore()
11 | else:
12 | paperstore = None
13 |
14 | if conf.engine == "scholar":
15 | searcher = GScholarSearcher(paperstore)
16 | # elif conf.engine == "pubmed":
17 | # searcher = PubMedSearcher(paperstore)
18 | else:
19 | raise ValueError
20 |
21 | if conf.query_file:
22 | with open(conf.query_file, 'r') as f:
23 | query = f.read()
24 | else:
25 | query = conf.query
26 |
27 | print("Query:", query)
28 |
29 | results = searcher.search(query, min_year=conf.year_start, max_results=conf.max)
30 |
31 | if conf.cache:
32 | found, missing = paperstore.matchResultsWithPapers(results)
33 |
34 | papers_to_add = [Paper(res.bib, res.extra_data) for res in missing]
35 | paperstore.updatePapers(papers_to_add)
36 |
37 | writeBibtex([Paper(res.bib, res.extra_data) for res in results], conf.file)
38 |
39 |
40 | if __name__ == '__main__':
41 | parser = ArgumentParser(description='Searches an engine and saves results to a file')
42 |
43 | parser.add_argument('-q', '--query', type=str,
44 | help='The query to use to retrieve the articles')
45 | parser.add_argument('-qf', '--query-file', type=str,
46 | help='Text file containing the query to use to retrieve the articles')
47 |     parser.add_argument('-hf', '--headers-file', type=str,
48 | help='YAML file containing the headers to use for requests, particularly to Google Scholar')
49 | parser.add_argument('-ys', '--year-start', type=int,
50 | help='The minimum year for results')
51 | parser.add_argument('-ye', '--year-end', type=int,
52 | help='The maximum year for results')
53 | parser.add_argument('-f', '--file', type=str,
54 | help='Filename to dump the results to')
55 | parser.add_argument('-m', '--max', type=int, default=100,
56 | help='Maximum number of results to retrieve')
57 | parser.add_argument('-e', '--engine', type=str, default="scholar",
58 | help='Which search engine to use. Currently only "scholar" (Google Scholar) available ')
59 | parser.add_argument('-em', '--email', type=str,
60 | help='Email to serve as identity to API endpoints')
61 | parser.add_argument('-c', '--cache', type=bool, default=True,
62 | help='Use local cache for results')
63 |
64 | conf = parser.parse_args()
65 |
66 | main(conf)
67 |
--------------------------------------------------------------------------------
/db/ris.py:
--------------------------------------------------------------------------------
1 | from db.bibtex import fixBibData
2 | from db.ref_utils import parseBibAuthors, authorListFromListOfAuthors
3 | from RISparser import readris
4 |
5 | mapping = [
6 | ('address', 'AD'),
7 | ('abstract', 'AB'),
8 | ('doi', 'DO'),
9 | ('eprint', 'LK'),
10 | ('editor', 'ED'),
11 | ('issue', 'IS'),
12 | ('journal', 'JF'),
13 | ('publisher', 'PB'),
14 | ('title', 'TI'),
15 | ('url', 'UR'),
16 | ('volume', 'VL'),
17 | ]
18 |
19 | type_mapping = {
20 | 'inproceedings': 'CONF',
21 | 'article': 'JOUR',
22 | 'thesis': 'THES',
23 | 'book': 'BOOK',
24 | }
25 |
26 | reverse_type_mapping = {b: a for a, b in type_mapping.items()}
27 |
28 |
29 | def exportBibToRIS(entries):
30 | lines = []
31 | for entry in entries:
32 | authors = parseBibAuthors(entry['author'])
33 |
34 | if entry['ENTRYTYPE'].lower() in type_mapping:
35 | ris_type = type_mapping[entry['ENTRYTYPE'].lower()]
36 | else:
37 | ris_type = 'JOUR'
38 |
39 | lines.append('TY - ' + ris_type)
40 |
41 | for author in authors:
42 | au_line = 'AU - %s, %s' % (author['family'], author['given'])
43 | if author.get('middle'):
44 | au_line += ' ' + author['middle']
45 | lines.append(au_line)
46 |
47 | # lines.append('PY - %s/%s/%s/' % (entry['year'], entry['month'], entry['day']))
48 | lines.append('PY - %s' % (entry.get('year', ''),))
49 |
50 | pages = entry.get('pages')
51 | if pages:
52 | bits = pages.split('-')
53 |
54 | lines.append('SP - ' + bits[0])
55 | lines.append('EP - ' + bits[-1])
56 |
57 | for eq in mapping:
58 | if entry.get(eq[0]):
59 | lines.append(str(eq[1]) + ' - ' + str(entry[eq[0]]))
60 |
61 | lines.append('ER - ')
62 |
63 | return '\n'.join(lines)
64 |
65 |
66 | def writeBibToRISFile(entries, filename):
67 | with open(filename, 'w') as f:
68 | text = exportBibToRIS(entries)
69 | f.write(text)
70 |
71 |
72 | def writeRIS(papers, filename):
73 | bibs = [paper.bib for paper in papers]
74 | writeBibToRISFile(bibs, filename)
75 |
76 |
77 | def readRIS(filename):
78 | with open(filename, 'r') as f:
79 |         entries = list(readris(f))
80 |
81 | res = []
82 |
83 | for entry in entries:
84 | entry['author'] = authorListFromListOfAuthors(entry.get('authors', []))
85 | if 'authors' in entry:
86 | del entry['authors']
87 |
88 | new_type = 'article'
89 | if entry.get('type_of_reference'):
90 | if entry['type_of_reference'] in reverse_type_mapping:
91 | new_type = reverse_type_mapping[entry['type_of_reference']]
92 |
93 | entry['ENTRYTYPE'] = new_type
94 | entry = fixBibData(entry, 0)
95 | res.append(entry)
96 |
97 | return res
98 |
--------------------------------------------------------------------------------
/bib_diff.py:
--------------------------------------------------------------------------------
1 | from argparse import ArgumentParser
2 |
3 | from db.data import Paper
4 | from db.ref_utils import normalizeTitle
5 | from base.general_utils import readInputBib, writeOutputBib
6 |
7 |
8 | def merge_two_dicts(x, y):
9 | z = x.copy() # start with x's keys and values
10 | z.update(y) # modifies z with y's keys and values & returns None
11 | return z
12 |
13 |
14 | def buildHashTable(bib):
15 | res = {}
16 | for entry in bib:
17 | norm_title = normalizeTitle(entry['title'])
18 | res[norm_title] = entry
19 | return res
20 |
21 |
22 | def set_sub(a, b):
23 | res = set(a.keys()) - set(b.keys())
24 | res_list = [value for key, value in a.items() if key in res]
25 | return [Paper(x, {}) for x in res_list]
26 |
27 |
28 | def set_intersect(a, b):
29 | res = set(a.keys()) & set(b.keys())
30 | res_list = [value for key, value in a.items() if key in res]
31 | return [Paper(x, {}) for x in res_list]
32 |
33 |
34 | def set_union(a, b):
35 | res = set(a.keys()) | set(b.keys())
36 | full_dict = merge_two_dicts(a, b)
37 | res_list = [value for key, value in full_dict.items() if key in res]
38 | return [Paper(x, {}) for x in res_list]
39 |
40 |
41 | def main(conf):
42 | bib1 = readInputBib(conf.input1)
43 | bib2 = readInputBib(conf.input2)
44 |
45 | s1 = buildHashTable(bib1)
46 | s2 = buildHashTable(bib2)
47 |
48 | list_sub1 = set_sub(s1, s2)
49 | list_sub2 = set_sub(s2, s1)
50 | list_and = set_intersect(s1, s2)
51 | list_or = set_union(s1, s2)
52 |
53 | output_format = conf.format.lower()
54 |
55 | writeOutputBib(list_sub1, conf.output + '_a-b.' + output_format)
56 | writeOutputBib(list_sub2, conf.output + '_b-a.' + output_format)
57 | writeOutputBib(list_and, conf.output + '_a_and_b.' + output_format)
58 | writeOutputBib(list_or, conf.output + '_a_or_b.' + output_format)
59 |
60 | print('A - B:', len(list_sub1))
61 | print('B - A:', len(list_sub2))
62 | print('B & A:', len(list_and))
63 | print('B | A:', len(list_or))
64 |
65 |
66 | if __name__ == '__main__':
67 | parser = ArgumentParser(
68 |         description='Compute the diff between two bib lists, A and B. Outputs four files: papers only in A, papers only in B, papers in both, and the union of A and B')
69 |
70 | parser.add_argument('-i1', '--input1', type=str,
71 | help='Input BIB/RIS/CSV file name (set A)')
72 | parser.add_argument('-i2', '--input2', type=str,
73 | help='Input BIB/RIS/CSV file name (set B)')
74 | parser.add_argument('-o', '--output', type=str,
75 | help='Beginning of output filename')
76 | parser.add_argument('-f', '--format', type=str, default='bib',
77 | help='Output format: bib, ris, csv')
78 |
79 | conf = parser.parse_args()
80 |
81 | main(conf)
82 |
--------------------------------------------------------------------------------
/search/google_scholar.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import scholarly
3 | from time import sleep
4 | from .base_search import Searcher, MAX_RESULTS, SearchResult
5 | import bibtexparser
6 | from tqdm import tqdm
7 | from random import random
8 | from db.bibtex import fixBibData
9 | from db.ref_utils import isPDFURL, getDOIfromURL, addUrlIfNew, addUrlIfNewWithType
10 |
11 |
12 | class GScholarSearcher(Searcher):
13 | """
14 | Retrieves results and bibtex data from Google Scholar
15 | """
16 |
17 | def __init__(self, paperstore):
18 | super().__init__(paperstore)
19 | self.min_delay_between_requests = 0.1
20 |
21 | def randomSleep(self):
22 | sleep(self.min_delay_between_requests + random() / 10) # random sleep so we don't get blocked
23 |
24 | def search(self, query, min_year=None, max_year=None, max_results=MAX_RESULTS):
25 | # TODO implement max year
26 | if min_year:
27 | scholarly.scholarly._PUBSEARCH = '/scholar?as_ylo=' + str(min_year) + '&q={0}'
28 |
29 | query = scholarly.search_pubs_query(query)
30 | results = []
31 | index = 0
32 | for result in tqdm(query, desc="Getting results", total=max_results):
33 | bib = fixBibData(result.bib, index)
34 |
35 | extra_data = {}
36 |
37 | for field in ['citedby', 'url_scholarbib']:
38 | if hasattr(result, field):
39 | extra_data[field] = getattr(result, field)
40 |
41 | if hasattr(result, 'id_scholarcitedby'):
42 | extra_data['scholarid'] = result.id_scholarcitedby
43 |
44 | for field in ['url', 'eprint']:
45 |
46 | if hasattr(result, field):
47 | bib[field] = getattr(result, field)
48 |
49 |             if hasattr(result, 'url'): addUrlIfNewWithType(result, result.url, 'scholar')
50 |
51 | doi = getDOIfromURL(bib.get('url'))
52 | if not doi:
53 | doi = getDOIfromURL(bib.get('eprint', ''))
54 |
55 | if doi:
56 | bib['doi'] = doi
57 |
58 | result = SearchResult(index, bib, result.source, extra_data)
59 | results.append(result)
60 | index += 1
61 |
62 | if len(results) == max_results:
63 | break
64 |
65 | if len(results) % 10 == 0:
66 | self.randomSleep()
67 | return results
68 |
69 | def getScholarBibForResults(self, results):
70 | res = []
71 | for result in tqdm(results, desc="Getting Scholar bib data"):
72 | if result.get("url_scholarbib"):
73 | bib = result["bib"]
74 | try:
75 | r = requests.get(result["url_scholarbib"])
76 | # print(r)
77 | db = bibtexparser.loads(r.text)
78 | bib = db.entries[0]
79 |
80 | except Exception as e:
81 | print(e)
82 |
83 | bib['abstract'] = result["bib"]['abstract']
84 | for key in ['abstract', 'eprint', 'url']:
85 | if key in result["bib"]:
86 | bib[key] = result["bib"][key]
87 | result["bib"] = bib
88 |
89 | self.randomSleep()
90 |
--------------------------------------------------------------------------------
/base/file_download.py:
--------------------------------------------------------------------------------
1 | import os
2 | from multiprocessing.pool import ThreadPool
3 |
4 | import pandas as pd
5 | import requests
6 |
7 | from db.ref_utils import parseBibAuthors, isPDFURL
8 |
9 |
10 | def fetch_url(entry):
11 | result = {'id': entry['id'],
12 | 'file_exists': False,
13 | 'return_code': None}
14 |
15 | if not os.path.exists(entry['filename']):
16 | print("Get %s - %s" % (entry['id'][:30], entry['url']))
17 | try:
18 | r = requests.get(entry['url'], stream=True)
19 | result['return_code'] = r.status_code
20 | if r.status_code == 200:
21 | with open(entry['filename'], 'wb') as f:
22 | for chunk in r:
23 | f.write(chunk)
24 | except Exception as e:
25 | print(e.__class__.__name__, e)
26 |             result['return_code'] = e.__class__.__name__
27 |
28 | else:
29 | result['file_exists'] = True
30 |
31 | return result
32 |
33 |
34 | def generateFilename(paper):
35 | res = ''
36 | authors = parseBibAuthors(paper.authors)
37 | if not authors:
38 | print(paper.authors)
39 | print()
40 |
41 | if authors and authors[0].get('family'):
42 | res += authors[0]['family'] + ' '
43 | if paper.year:
44 | res += '(%s)' % paper.year
45 |
46 | if len(res) > 0:
47 | res += ' - '
48 | res += paper.norm_title.title()
49 | return res
50 |
51 |
52 | def bulkDownload(papers, root_dir, report_path, do_not_download_just_list=False):
53 | root_dir = os.path.abspath(root_dir)
54 |
55 | if not os.path.exists(root_dir):
56 | os.makedirs(root_dir)
57 |
58 | download_tasks = []
59 |
60 | for paper in papers:
61 | # if not paper.year:
62 | # print("missing year", paper)
63 |
64 | filename = os.path.join(root_dir, generateFilename(paper)) + '.pdf'
65 | paper.pdf_filename = filename
66 |
67 | task_record = {'id': paper.id,
68 | 'doi': paper.doi,
69 | 'filename': filename,
70 | 'abstract': paper.abstract
71 | }
72 | url = None
73 | url_source = None
74 |
75 | for url_rec in paper.extra_data.get('urls', []):
76 | if url_rec['type'] == 'pdf':
77 | url = url_rec['url']
78 | url_source = url_rec['source']
79 | break
80 |
81 | if not url:
82 | if paper.bib.get('eprint'):
83 | url = paper.bib['eprint']
84 | url_source = 'search'
85 | elif paper.bib.get('url') and isPDFURL(paper.bib['url']):
86 | url = paper.bib['url']
87 | url_source = 'search'
88 |
89 | if url:
90 | task_record['url'] = url
91 | task_record['url_source'] = url_source
92 | download_tasks.append(task_record)
93 | else:
94 | print(paper.extra_data)
95 | print(paper.bib)
96 | print()
97 |
98 | df = pd.DataFrame(download_tasks)
99 | df.to_csv('download_tasks.csv')
100 |
101 | if do_not_download_just_list:
102 | return
103 |
104 | results = ThreadPool(8).imap_unordered(fetch_url, download_tasks)
105 |
106 | df = pd.DataFrame(results)
107 | df.to_csv(report_path)
108 |
--------------------------------------------------------------------------------
/db/bibtex.py:
--------------------------------------------------------------------------------
1 | import bibtexparser
2 | import re
3 | import random
4 |
5 | import requests
6 |
7 | from db.ref_utils import parseBibAuthors, normalizeTitle
8 |
9 |
10 | def fixBibData(bib, index):
11 | """
12 | Add mandatory missing fields to bibtex data
13 |
14 | :param bib:
15 | :param index:
16 | :return:
17 | """
18 | if "ENTRYTYPE" not in bib:
19 | bib["ENTRYTYPE"] = "ARTICLE"
20 | if "ID" not in bib:
21 | authors = parseBibAuthors(bib["author"])
22 | if not authors:
23 | bib['ID'] = 'id' + str(random.randint(1000, 9000))
24 | else:
25 | bib["ID"] = authors[0]["family"]
26 |
27 | bib['ID'] += str(bib.get("year", "YEAR")) + bib["title"].split()[0].lower()
28 |
29 | return bib
30 |
31 |
32 | def readBibtexString(bibstr):
33 | return bibtexparser.loads(bibstr).entries
34 |
35 |
36 | def readBibtexFile(filename):
37 | return bibtexparser.load(open(filename, 'r')).entries
38 |
39 |
40 | def writeBibtex(results: list, filename: str):
41 | """
42 | Exports the list of results to a BibTeX file.
43 |
44 | :param results: a list of either SearchResult or Paper objects, with a .bib dict property
45 | :param filename: file to export the bibtex to
46 | """
47 | db = bibtexparser.bibdatabase.BibDatabase()
48 |
49 | for index, result in enumerate(results):
50 | db.entries.append(fixBibData(result.bib, index))
51 |
52 | with open(filename, 'w') as bibtex_file:
53 | bibtexparser.dump(db, bibtex_file)
54 |
55 |
56 | def getBibtextFromDOI(doi: str):
57 | assert doi
58 | headers = {'Accept': 'text/bibliography; style=bibtex'}
59 | url = 'http://doi.org/' + doi
60 | r = requests.get(url, headers=headers)
61 | text = r.content.decode('utf-8')
62 | bib = readBibtexString(text)
63 | return bib
64 |
65 |
66 | def generateUniqueID(paper):
67 | """
68 | Returns a simple string id that is the mashup of the title and authors
69 |
70 | :param paper:
71 | :return:
72 | """
73 | author_bit = ''
74 | if paper.extra_data.get('xref_author'):
75 | authors = paper.extra_data['xref_author']
76 | else:
77 | try:
78 | authors = parseBibAuthors(paper.authors)
79 | except:
80 | print("Failed to parse authors string", paper.authors)
81 | authors = [{'given': '', 'family': ''}]
82 |
83 | for author in authors:
84 | if isinstance(author, str):
85 | author_bit += author
86 | else:
87 | if author.get('family'):
88 | author_bit += author.get('family', '_')[0] + author.get('given', '_')[0]
89 |
90 | title_bit = normalizeTitle(paper.title)
91 | title_bit = re.sub("\s+", "", title_bit)
92 | full_id = title_bit + "_" + author_bit
93 | full_id = full_id.lower()
94 |
95 | return full_id
96 |
97 |
98 | def test():
99 | bibtex = """@ARTICLE{Cesar2013,
100 | author = {Jean César},
101 | title = {An amazing title},
102 | year = {2013},
103 | volume = {12},
104 | pages = {12--23},
105 | journal = {Nice Journal},
106 | abstract = {This is an abstract. This line should be long enough to test
107 | multilines...},
108 | comments = {A comment},
109 | keywords = {keyword1, keyword2}
110 | }
111 | """
112 |
113 | with open('bibtex.bib', 'w') as bibfile:
114 | bibfile.write(bibtex)
115 |
116 | with open("bibtex.bib") as bibtex_file:
117 | bib_database = bibtexparser.load(bibtex_file)
118 |
119 | print(bib_database.entries)
120 |
121 |
122 | if __name__ == '__main__':
123 | test()
124 |
--------------------------------------------------------------------------------
/reasons_for_exclusion.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | import pandas as pd
4 |
5 | from itertools import chain
6 | from collections import Counter
7 |
8 | from db.data import PaperStore
9 | from db.rayyan import loadRayyan, computeReviewerOverlap
10 | from db.rayyan import selectPapersToReview
11 |
12 |
13 | EXCLUSION_PRECEDENCE = [
14 | 'foreign language',
15 | 'is review',
16 | 'uses images',
17 | 'not radiology',
18 | 'not nlp',
19 | 'wrong publication type',
20 | 'not peer reviewed',
21 | 'cannot find fulltext',
22 | 'conference',
23 | 'too short'
24 | ]
25 |
26 |
27 | def fix_reasons(r):
28 | if r == 'not radiology report':
29 | return 'not radiology'
30 | if r == 'not radiology reports':
31 | return 'not radiology'
32 | if r == 'review':
33 | return 'is review'
34 | if r == 'with_images':
35 | return 'uses images'
36 | if '_' in r:
37 | return r.replace('_', ' ')
38 | return r.strip()
39 |
40 |
41 | def get_main_reason(reasons):
42 | reasons = set(map(fix_reasons, reasons))
43 | for r in EXCLUSION_PRECEDENCE:
44 | if r in reasons:
45 | return r
46 | print(reasons)
47 | return None
48 |
49 |
50 | if __name__ == "__main__":
51 |
52 |     parser = argparse.ArgumentParser(description='Gather metadata such as '
53 | 'reason for exclusion + bib information')
54 |
55 | args = parser.parse_args()
56 |
57 | paper_store = PaperStore()
58 |
59 | columns = ['title', 'abstract', 'authors', 'url']
60 |
61 | # 220 articles - original query
62 | querydf = loadRayyan(os.path.join('reasons_for_exclusion', 'rayyan-old-query.zip'))
63 | # Include all
64 | querydf = selectPapersToReview(querydf, 0)
65 | querydf['rayyan_source'] = 'old_query'
66 |
67 | # 397 articles, follow up snowballing and new query
68 | snowdf = loadRayyan(os.path.join('reasons_for_exclusion', 'rayyan-snowball.zip'))
69 | # Include all
70 | snowdf = selectPapersToReview(snowdf, 0)
71 | snowdf['rayyan_source'] = 'snowball'
72 |
73 | # sysreview articles
74 | sysreviewdf = pd.read_excel(os.path.join('reasons_for_exclusion', 'sysreview-15-09-2020.xlsx'))
75 | sysreviewdf['rayyan_source'] = 'combined'
76 | # Only keep columns we care about
77 | sysreviewdf = sysreviewdf[columns]
78 | # The last paper was added by Hang
79 | sysreviewdf = sysreviewdf.head(274)
80 |
81 | # Join on title - unsure if there is a better join to do
82 |
83 | joined = pd.concat([querydf, snowdf], ignore_index=True, sort=True)
84 | joined['lower_title'] = joined['title'].str.strip().str.lower()
85 | # Keep the snowballing entry if duplicate exists
86 | joined = joined.drop_duplicates(subset='lower_title', keep='last')
87 |
88 | joined = pd.concat([sysreviewdf, joined], ignore_index=True, sort=True)
89 | joined['lower_title'] = joined['title'].str.strip().str.lower()
90 |
91 | # Drop all duplicates (hence only keep entries that didn't make
92 | # it past Rayyan filtering)
93 | joined = joined.drop_duplicates(subset='lower_title', keep=False)
94 |
95 | joined = joined.reset_index(drop=True)
96 | del joined['lower_title']
97 |
98 | print(joined)
99 |
100 | possible_exclusion_reasons = set(map(fix_reasons, chain(*joined['exclusion_reasons'].tolist())))
101 | print('Possible exclusion reasons')
102 | print(possible_exclusion_reasons)
103 |
104 | exclusion_reasons = joined['exclusion_reasons']
105 |
106 | main_reasons = [get_main_reason(r) for r in exclusion_reasons]
107 | counts = Counter(main_reasons)
108 | print()
109 | for k, v in counts.most_common():
110 | print('%s: %d' % (k.ljust(25), v))
111 | print()
112 | print('Excluded %d articles' % sum(counts.values()))
113 |
--------------------------------------------------------------------------------
/base/pdf_extract.py:
--------------------------------------------------------------------------------
1 | import os
2 | import tika
3 | import enchant
4 |
5 | d1 = enchant.Dict("en_US")
6 | d2 = enchant.Dict("en_GB")
7 |
8 | import re
9 |
10 | if not os.environ.get('TIKA_PATH'):
11 | os.environ['TIKA_PATH'] = '~/'
12 |
13 | tika.initVM()
14 | from tika import parser
15 |
16 |
17 | def dehyphenate(text):
18 | """
19 | Removes hyphens from text intelligently, checking plausible spelling
20 |
21 | :param text: hyphenated text
22 | :return: text: de-hyphenated text
23 | """
24 |
25 | def rep_func(match):
26 | full_word = match.group(1) + match.group(2)
27 | if d1.check(full_word) or d2.check(full_word):
28 | return full_word
29 | else:
30 | return match.group(1) + '-' + match.group(2)
31 |
32 | text = re.sub('(\w+)-\n(\w+)', rep_func, text)
33 | return text
34 |
35 |
36 | def cleanUpTikaText(text):
37 | text = re.sub('\n+', '\n', text)
38 | return text
39 |
40 |
41 | def findHeaders(strings, text, default):
42 | str_start = -1
43 |
44 | for str_string in strings:
45 | str_start = text.find(str_string)
46 | if str_start != -1:
47 | break
48 |
49 | if str_start == -1:
50 | str_start = default
51 |
52 | return str_start
53 |
54 |
55 | # def getAbstract(text):
56 | # abs_start = findHeaders(['Abstract', 'ABSTRACT'], text, 0)
57 | # abs_end = findHeaders(["Keywords:", "Keywords :", "KEYWORDS:", 'Related Work', 'Previous Work'], text[abs_start:],
58 | # len(text))
59 | #
60 | # abstract = text[abs_start:abs_end]
61 | # return abstract
62 |
63 | regex_abstract = re.compile('(^Abstract[\:\—\-\s\n]*.+?)^(\d*\.?\s*Introduction|Keywords\s*\:?|Previous work)',
64 | re.MULTILINE | re.IGNORECASE | re.DOTALL)
65 |
66 | regex_summary = re.compile(
67 | '(^(Abstract|Summary)\s*\:?\n.+?)^(\d*\.?\s*Introduction|Keywords\s*\:?|Previous work|Table of contents)',
68 | re.MULTILINE | re.IGNORECASE | re.DOTALL)
69 |
70 | regex_thesis = re.compile('I.+?declare that.+?(dissertation|thesis)', re.MULTILINE | re.DOTALL)
71 |
72 |
73 | def getAbstractFromPDF(filename):
74 | parsed = readPDF(filename)
75 |
76 | if parsed.get('error'):
77 | print(parsed['error'])
78 | return None
79 |
80 | if parsed.get('status', 200) == 422:
81 | print('Tika:: Unprocessable entity', filename)
82 | return None
83 |
84 | text = parsed['content']
85 | if not text:
86 | print('Tika:: No text in file', filename)
87 | return None
88 |
89 | text = cleanUpTikaText(text)
90 |
91 | if regex_thesis.search(text):
92 | match = regex_summary.search(text)
93 | else:
94 | match = regex_abstract.search(text)
95 |
96 | if match:
97 | abstract = match.group(1)
98 | else:
99 | print('[[[[[[Could not find the abstract]]]]]]')
100 | print(text[:1000])
101 | print('\n\n')
102 | return None
103 |
104 | abstract = dehyphenate(abstract)
105 | abstract = cleanUpTikaText(abstract)
106 |
107 | return abstract
108 |
109 |
110 | def readPDF(filename, to_xml=False):
111 | try:
112 | parsed = parser.from_file(filename, xmlContent=to_xml)
113 | except UnicodeEncodeError as e:
114 | print(e.__class__.__name__, e)
115 | return {'error': e.__class__.__name__ + ': ' + e.__str__()}
116 | return parsed
117 |
118 |
119 | def getStructuredArticle(xml):
120 | pass
121 |
122 |
123 | def test():
124 | parsed = readPDF(
125 | '/Users/masterman/Downloads/Towards dataset creation and establishing baselines for sentence-level neural clinical paraphrase generation and simplification.pdf',
126 | to_xml=True)
127 | print(parsed['content'])
128 |
129 |
130 | def test2():
131 | parsed = readPDF(
132 | '/Users/masterman/Downloads/Towards dataset creation and establishing baselines for sentence-level neural clinical paraphrase generation and simplification.pdf',
133 | to_xml=False)
134 | full_text = cleanUpTikaText(parsed['content'])
135 | abstract = getAbstractFromPDF(full_text)
136 | clean_abstract = dehyphenate(abstract)
137 | print(clean_abstract)
138 | print()
139 |
140 |
141 | if __name__ == '__main__':
142 | test()
143 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ReviewBuilder
2 | A collection of tools for automating parts of a [systematic review](https://consumers.cochrane.org/what-systematic-review) of scientific literature.
3 |
4 | Currently supports one use case: creating a bibtex file with the results of a Google Scholar search and augmenting the metadata for each result by retrieving its abstract and finding [Open Access](https://en.wikipedia.org/wiki/Open_access) versions of the paper on the web, including preprints.
5 |
6 | - All results are cached locally in a SQLite database, aiming to make iterating over queries for obtaining papers for a review less painful.
7 | - All data ingestion is _nice_ :), with rate limiting enforced locally, both from the known requirements of each service and by parsing the `X-Rate-Limit-Limit` and `X-Rate-Limit-Interval` headers where provided in the response (a minimal sketch of the idea follows this list).
8 | - Implemented: [Google Scholar](https://scholar.google.com), [Crossref](https://www.crossref.org/services/metadata-delivery/rest-api/), [SemanticScholar (metadata)](https://api.semanticscholar.org/), [PubMed](https://www.ncbi.nlm.nih.gov/home/develop/api/), [arXiv](https://arxiv.org/help/api), [Unpaywall](https://unpaywall.org/products/api).
9 | - Not yet implemented: [Microsoft Academic](https://academic.microsoft.com), Semantic Scholar (search), [Web of Science](https://developer.clarivate.com/apis/wos)
10 | - Coming very soon:
11 | - locally filtering results (i.e. "selecting articles for inclusion") based on keywords and the detected language the paper is written in
12 | - automatic downloading of PDFs
13 |
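A minimal sketch of that throttling idea, for illustration only (the `polite_get` helper below is hypothetical and not part of this repo; the header names are the ones Crossref sends):

```python
import time
import requests

_last_call = 0.0
_min_interval = 1.0  # seconds between requests; adjusted from response headers


def polite_get(url, **kwargs):
    """Illustrative throttled GET, not part of this repo: waits between calls
    and, when the service advertises X-Rate-Limit-Limit / X-Rate-Limit-Interval
    headers (as Crossref does), adjusts the delay to stay within that budget."""
    global _last_call, _min_interval
    wait = _last_call + _min_interval - time.time()
    if wait > 0:
        time.sleep(wait)
    response = requests.get(url, **kwargs)
    _last_call = time.time()

    limit = response.headers.get('X-Rate-Limit-Limit')        # e.g. "50"
    interval = response.headers.get('X-Rate-Limit-Interval')  # e.g. "1s"
    if limit and interval:
        _min_interval = float(interval.rstrip('s')) / float(limit)
    return response
```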
14 | ## Installation
15 |
16 | Tested on Python 3.7 only. May work with earlier versions of Python 3, but not 2.
17 |
18 | > pip install -r requirements.txt
19 |
20 | ## Example usage
21 |
22 | > python search_to_file.py -q "OR \"natural language\" OR \"radiology reports\" OR lstm OR rnn OR bert OR elmo OR word2vec" -m 100 -f test.bib -ys 2015
23 |
24 | This will send the supplied query to Google Scholar, and set the minimum year (--year-start) to 2015, retrieve a maximum of 100 results and save them in the file `test.bib`.
25 |
26 | Alternatively, we can save the query in a text file and pass that as a parameter:
27 |
28 | > python search_to_file.py -qf query1.txt -m 100 -f test.bib -ys 2015
29 |
30 | Bibtex does not store everything we are interested in, so by default, extra data from Scholar such as the link to the "related articles", number of citations and other tidbits will be directly saved to the local SQLite cache (see below).
31 |
32 | Google Scholar offers perhaps the best coverage (recall) over all fields of science and does a great job at surfacing relevant articles. What it does not do, however, is make it easy to scrape, or connect these results to anything else useful. It does not provide any useful identifier for the results ([DOI](http://www.doi.org/), [PMID](https://www.ncbi.nlm.nih.gov/pmc/pmctopmid/), etc) or the abstract of the paper, and a lot of information is mangled in the results, including authors' names. To get high quality data, we need to use other services.
33 |
34 | Once we have the list of results, we can collect extra data, such as the abstract of the paper and locations on the web where we may find it in open access, whether in HTML or PDF.
35 |
36 | > python gather_metadata.py -i test.bib -o test_plus.bib --max 20
37 |
38 | This will process a maximum of 20 entries from the `test.bib` file (as set by `--max`), and output an "enriched" version to `test_plus.bib`. For each entry it will try to:
39 | 1. match it with an entry in the local cache. If it can't be found go to step 2.
40 | 1. attempt to match the paper with its DOI via the [Crossref](http://www.crossref.org/) API.
41 | 1. once we have a DOI, check [SemanticScholar](http://www.semanticscholar.org/) for metadata and abstract for the paper
42 | 1. if we don't have a DOI or abstract, search [PubMed](http://www.ncbi.nlm.nih.gov/pubmed/) for its PubMed ID (PMID) and retrieve the abstract from there, if available
43 | 1. search [arXiv](http://arxiv.org) for a preprint of the paper
44 | 1. search [Unpaywall](http://unpaywall.org) for available open access versions of the paper if we are missing a PDF link from the above
45 |
46 | Many of these steps require approximate matching, both for the local cache and the results from the remote APIs. Often a preprint version of a paper will have a slightly different title or will be missing an author or two. This repo implements several heuristics for dealing with this.
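As a flavour of what such a heuristic can look like, here is an illustrative sketch (not the repo's actual code in `db/ref_utils.py`) that normalizes titles and compares them with `strsimpy`, which is already in `requirements.txt`:

```python
import re
from strsimpy.normalized_levenshtein import NormalizedLevenshtein

# Illustrative only: the real matching lives in db/ref_utils.py.
_similarity = NormalizedLevenshtein()


def normalize_title(title):
    """Lowercase and strip punctuation so formatting differences between a
    preprint and the published version don't break the match."""
    return re.sub(r'[^a-z0-9 ]+', '', title.lower()).strip()


def looks_like_same_paper(title_a, title_b, threshold=0.9):
    """Treat two records as the same paper when their normalized title
    similarity (1.0 = identical) clears the threshold."""
    a, b = normalize_title(title_a), normalize_title(title_b)
    return _similarity.similarity(a, b) >= threshold
```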
47 |
48 | A SQLite database cache is automatically created in `papers.sqlite` in the /db directory.
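To peek inside the cache, the standard library is enough. The snippet below is just a convenience for inspection and makes no assumption about the exact table layout `PaperStore` creates; it lists whatever tables exist:

```python
import sqlite3

conn = sqlite3.connect('db/papers.sqlite')
# List every table PaperStore has created and count its rows.
tables = [row[0] for row in conn.execute(
    "SELECT name FROM sqlite_master WHERE type='table'")]
for table in tables:
    count = conn.execute('SELECT COUNT(*) FROM "%s"' % table).fetchone()[0]
    print(table, count)
conn.close()
```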
49 |
50 |
51 |
--------------------------------------------------------------------------------
/snowball_citations.py:
--------------------------------------------------------------------------------
1 | from base.general_utils import loadEntriesAndSetUp, writeOutputBib
2 | from argparse import ArgumentParser
3 | from filter_results import filterPapers, printReport, filterOnePaper
4 | from search.metadata_harvest import semanticscholarmetadata, enrichAndUpdateMetadata
5 | import pandas as pd
6 |
7 |
8 | def getCitingPapers(paper):
9 | try:
10 | paper, citing_papers = semanticscholarmetadata.getMetadata(paper, get_citing_papers=True)
11 | except Exception as e:
12 | print(e.__class__.__name__, e)
13 | return []
14 |
15 | return citing_papers
16 |
17 |
18 | def deDupePaperList():
19 | pass
20 |
21 |
22 | def snowballCitations(paperstore, all_papers):
23 | newfound_paper_list = []
24 | report = []
25 |
26 | all_titles_ever_seen = {}
27 | search_nodes = all_papers
28 |
29 | while len(search_nodes) > 0:
30 | paper = search_nodes.pop(0)
31 | new_papers = getCitingPapers(paper)
32 | for new_paper in new_papers:
33 | if new_paper.title in all_titles_ever_seen:
34 | print('[Skipping] already seen paper', new_paper.title)
35 | all_titles_ever_seen[new_paper.title] += 1
36 | continue
37 |
38 | semanticscholarmetadata.getMetadata(new_paper)
39 | new_paper.extra_data['done_semanticscholar'] = True
40 | paperstore.updatePapers([new_paper])
41 |
42 | all_titles_ever_seen[new_paper.title] = 1
43 | # year = new_paper.bib.get('year', 0)
44 | # if year and int(year) >= 2015:
45 | # newfound_paper_list.append(Paper(paper.bib, paper.extra_data))
46 | # else:
47 | # print(new_paper)
48 | if not new_paper.has_abstract:
49 | record = {
50 |                     'title': new_paper.title,
51 |                     'year': new_paper.year,
52 |                     'authors': new_paper.authors,
53 |                     'venue': new_paper.venue,
54 |                     'abstract': new_paper.abstract,
55 | 'excluded': False,
56 | 'exclude_reason': None
57 | }
58 | paper_add = new_paper
59 | else:
60 | paper_add, record = filterOnePaper(new_paper, exclude_rules={'no_pdf': False,
61 | 'year': False,
62 | 'is_review': False})
63 | report.append(record)
64 |
65 | if paper_add:
66 | newfound_paper_list.append(paper_add)
67 | print('Adding new seed paper', paper_add.bib['title'])
68 | search_nodes.append(new_paper)
69 | else:
70 | print('[Excluded]:', record['exclude_reason'], new_paper.bib['title'])
71 |
72 | df = pd.DataFrame(report, columns=['id', 'year', 'title', 'excluded', 'exclude_reason', 'language', 'abstract'])
73 |
74 | return newfound_paper_list, df
75 |
76 |
77 | def main(conf):
78 | paperstore, papers_to_add, papers_existing, all_papers = loadEntriesAndSetUp(conf.input, conf.cache)
79 |
80 | # successful, unsuccessful = enrichAndUpdateMetadata(papers_to_add, paperstore, conf.email)
81 |
82 | snowballed_papers, df = snowballCitations(paperstore, all_papers)
83 | print('Number of snowballed papers:', len(snowballed_papers))
84 |     printReport(df)
85 |     df.to_csv(conf.report_path)
86 | successful, unsuccessful = enrichAndUpdateMetadata(snowballed_papers, paperstore, conf.email)
87 |
88 | # included, df = filterPapers(snowballed_papers)
89 | # printReport(df)
90 |
91 | writeOutputBib(snowballed_papers, conf.output)
92 |
93 |
94 | if __name__ == '__main__':
95 |     parser = ArgumentParser(description='Snowball citations from seed papers')
96 |
97 | parser.add_argument('-i', '--input', type=str,
98 | help='Input bib file name with seed papers')
99 | parser.add_argument('-o', '--output', type=str,
100 | help='Output bib file name with snowballed papers')
101 | parser.add_argument('-r', '--report-path', type=str, default='filter_report.csv',
102 | help='Path to output report CSV')
103 | parser.add_argument('-c', '--cache', type=bool, default=True,
104 | help='Use local cache for results')
105 | parser.add_argument('-em', '--email', type=str,
106 | help='Email to serve as identity to API endpoints')
107 |
108 | conf = parser.parse_args()
109 |
110 | main(conf)
111 |
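112 | # Example invocation (file names are illustrative):
113 | #   python snowball_citations.py -i seed_papers.bib -o snowballed.bib -em you@example.com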
--------------------------------------------------------------------------------
/db/rayyan.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import re
3 | from zipfile import ZipFile
4 | from itertools import combinations
5 | import numpy as np
6 | from io import BytesIO
7 |
8 |
9 | DROP_FIELDS = ['key',
10 | 'issn',
11 | 'volume',
12 | 'pages',
13 | 'issue',
14 | 'language',
15 | 'location',
16 | 'notes',
17 | 'journal',
18 | 'day',
19 | 'month',
20 | 'maybe_count']
21 |
22 |
23 | def parseInclusion(text):
24 | reviewers = {}
25 | exclusion_reasons = []
26 | labels = []
27 |
28 | for match in re.findall('\"([\w\s\.]+?)\"=>\"([\w\s]+?)\"', text):
29 | reviewers[match[0]] = match[1]
30 |
31 | if match[1].lower() == 'excluded':
32 | exclusion_reasons = []
33 | match = re.search('RAYYAN-EXCLUSION-REASONS: ([\w\s,]+)', text)
34 | if match:
35 | exclusion_reasons.extend(match.group(1).split(','))
36 |
37 | match = re.search('RAYYAN-LABELS: ([\w\s,]+)', text)
38 | if match:
39 | labels.extend(match.group(1).split(','))
40 |
41 | return reviewers, exclusion_reasons, labels
42 |
43 |
44 | def loadRayyan(filename):
45 | with ZipFile(filename, 'r') as zip:
46 | data = zip.read('articles.csv')
47 | data = BytesIO(data)
48 | df = pd.read_csv(data)
49 |
50 | column_labels = []
51 | column_exclusion_reasons = []
52 | column_reviewers = []
53 | all_unique_reviewers = set()
54 |
55 | for index, row in df.iterrows():
56 | reviewers, exclusion_reasons, labels = parseInclusion(row['notes'])
57 | column_labels.append(labels)
58 | column_exclusion_reasons.append(exclusion_reasons)
59 | column_reviewers.append(reviewers)
60 |
61 | all_unique_reviewers = all_unique_reviewers | set(reviewers.keys())
62 |
63 | reviewer_titles = []
64 |
65 | for reviewer in all_unique_reviewers:
66 | reviewer_column_title = 'reviewer_' + reviewer
67 | reviewer_titles.append('reviewer_' + reviewer)
68 | reviewer_column_data = [r.get(reviewer) for r in column_reviewers]
69 | df.insert(len(df.columns), reviewer_column_title, reviewer_column_data)
70 |
71 | df.insert(len(df.columns), 'exclusion_reasons', column_exclusion_reasons)
72 | df.insert(len(df.columns), 'labels', column_labels)
73 |
74 | for index, row in df.iterrows():
75 | match = re.search(r'PY - (\d+)\/+?', row['authors'])
76 | if match:
77 | df.at[index, 'year'] = match.group(1)
78 | df.at[index, 'authors'] = df.iloc[index]['authors'][:match.start()]
79 |
80 | included_counts = []
81 | excluded_counts = []
82 | maybe_counts = []
83 |
84 | for index, row in df.iterrows():
85 | included_count = 0
86 | excluded_count = 0
87 | maybe_count = 0
88 | for reviewer in reviewer_titles:
89 | if row.get(reviewer) == 'Included':
90 | included_count += 1
91 | elif row.get(reviewer) == 'Excluded':
92 | excluded_count += 1
93 | elif row.get(reviewer) == 'Maybe':
94 | maybe_count += 1
95 | included_counts.append(included_count)
96 | excluded_counts.append(excluded_count)
97 | maybe_counts.append(maybe_count)
98 |
99 | df.insert(len(df.columns), 'included_count', included_counts)
100 | df.insert(len(df.columns), 'excluded_count', excluded_counts)
101 | df.insert(len(df.columns), 'maybe_count', maybe_counts)
102 |
103 | return df
104 |
105 |
106 | def computeOverlap(df):
107 | reviewer_columns = [c for c in df.columns if c.startswith('reviewer_')]
108 | df = df[reviewer_columns]
109 |
110 | a = df.values
111 | d = {(i, j): np.mean(a[:, i] == a[:, j]) for i, j in combinations(range(a.shape[1]), 2)}
112 |
113 | res, c, vals = np.zeros((a.shape[1], a.shape[1])), \
114 | list(map(list, zip(*d.keys()))), list(d.values())
115 |
116 | res[c[0], c[1]] = vals
117 |
118 | return pd.DataFrame(res, columns=df.columns, index=df.columns)
119 |
120 |
121 | # def compute_agreement(vals, vala, valb):
122 | # # Use to compute TP/TN/FP/FN
123 | # d = {(i, j): np.sum((vals[:, i] == vala) & (vals[:, j] == valb))
124 | # for i, j in combinations(range(vals.shape[1]), 2)}
125 | # df, c, vals = np.zeros((vals.shape[1], vals.shape[1])), \
126 | # list(map(list, zip(*d.keys()))), list(d.values())
127 | # df[c[0], c[1]] = vals
128 | # return df
129 |
130 |
131 | # def computeStats(df):
132 | # reviewer_columns = [c for c in df.columns if c.startswith('reviewer_')]
133 | # df = df[reviewer_columns]
134 | #
135 | # a = df.values
136 | # TP = compute_agreement(a, 'Included', 'Included')
137 | # TN = compute_agreement(a, 'Excluded', 'Excluded')
138 | # FP = compute_agreement(a, 'Included', 'Excluded')
139 | # FN = compute_agreement(a, 'Excluded', 'Included')
140 | #
141 | # print('TP', TP)
142 | # print('TN', TN)
143 | # print('FP', FP)
144 | # print('FN', FN)
145 | #
146 | # print('Total', TP+TN+FP+FN)
147 |
148 |
149 | def computeFleiss(df):
150 | reviewer_columns = [c for c in df.columns if c.startswith('reviewer_')]
151 | df = df[reviewer_columns]
152 |
153 | a = df.values
154 | classes = set(a.ravel())
155 |
156 | # rows are instances/examples
157 | # columns are classes
158 | # values are number of annotators assigned instance to class
159 | # so sum of each rows = num annotators
160 | P = np.hstack([np.sum(a == c, axis=1, keepdims=True)
161 | for c in classes])
162 | # Below is wikipedia example - expected kappa: 0.210
163 | # P = np.array([[0, 0, 0, 0, 14],
164 | # [0, 2, 6, 4, 2],
165 | # [0, 0, 3, 5, 6],
166 | # [0, 3, 9, 2, 0],
167 | # [2, 2, 8, 1, 1],
168 | # [7, 7, 0, 0, 0],
169 | # [3, 2, 6, 3, 0],
170 | # [2, 5, 3, 2, 2],
171 | # [6, 5, 2, 1, 0],
172 | # [0, 2, 2, 3, 7]])
173 |
174 | # N: number examples, k = number classes
175 | N, k = P.shape
176 | # n: number of annotators
177 | n = P.sum(axis=1)[0]
178 | assert(np.all(P.sum(axis=1) == n))
179 | # P_j..
180 | pee_jays = np.sum(P, axis=0) / (N * n)
181 | assert np.isclose(pee_jays.sum(), 1.), 'P_j calculation is wrong'
182 |
183 | # P_is
184 | pee_eye = np.sum(P * (P - 1), axis=1) / (n * (n - 1))
185 |
186 | pee_tilde = pee_eye.mean()
187 | pee_ee = np.sum(pee_jays ** 2)
188 |
189 | # Fleiss' kappa
190 | fleiss = (pee_tilde - pee_ee) / (1 - pee_ee)
191 | return fleiss
192 |
193 |
194 | def computeOverlap3(df):
195 |     res = pd.DataFrame(columns=df.columns, index=df.columns)
196 |     res = res.stack(dropna=False).to_frame().apply(lambda x: (df[x.name[0]] == df[x.name[1]]).mean(),
197 |                                                    axis=1).unstack()
198 |     res = res.where(np.triu(np.ones(res.shape), 1).astype(bool))
199 |     return res
200 |
201 |
202 | # def computeOverlap(df):
203 | # pd.crosstab(df.columns, df.columns, )
204 |
205 | def filterDFForInclusion(df, screen='Included'):
206 | if screen == 'Included':
207 | return df[df['included_count'] > 0]
208 | elif screen == 'Excluded':
209 | return df[df['excluded_count'] > 0]
210 | elif screen == 'Maybe':
211 | return df[df['maybe_count'] > 0]
212 |
213 |
214 | def computeReviewerOverlap(df):
215 | # df.at[df['reviewer_agrivas'] == 'Maybe', 'reviewer_agrivas'] = 'Included'
216 | # df.at[df['reviewer_Daniel'] == 'Maybe', 'reviewer_Daniel'] = 'Included'
217 |
218 | print('Total overlap')
219 | print(computeOverlap(df))
220 | print("Fleiss' kappa: %.2f" % computeFleiss(df))
221 |
222 | print('\nIncluded overlap')
223 | print(computeOverlap(filterDFForInclusion(df, 'Included')))
224 |
225 | print('\nExcluded overlap')
226 | print(computeOverlap(filterDFForInclusion(df, 'Excluded')))
227 |
228 |
229 | def selectPapersToReview(df, min_agreement=1):
230 | res = df[df['included_count'] >= min_agreement]
231 | res.drop(DROP_FIELDS, axis=1, inplace=True)
232 | return res
233 |
234 |
235 | def selectPapersToFilter(df, include_count, exclude_count):
236 | res = df[(df['included_count'] == include_count) & (df['excluded_count'] == exclude_count)]
237 | res.drop(DROP_FIELDS, axis=1, inplace=True)
238 | return res
239 |
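240 | # Usage sketch (the export file name is illustrative): load a Rayyan export,
241 | # report inter-reviewer agreement, then keep papers that at least one reviewer included.
242 | #
243 | #   df = loadRayyan('rayyan_export.zip')
244 | #   computeReviewerOverlap(df)
245 | #   to_review = selectPapersToReview(df, min_agreement=1)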
--------------------------------------------------------------------------------
/filter_results.py:
--------------------------------------------------------------------------------
1 | from argparse import ArgumentParser
2 | from base.general_utils import loadEntriesAndSetUp, writeOutputBib
3 | import pandas as pd
4 | from langdetect import detect
5 | from langdetect import DetectorFactory
6 |
7 | DetectorFactory.seed = 0
8 | import re
9 |
10 |
11 | def getPaperText(paper):
12 |     res = paper.title
13 | 
14 |     abstract = paper.bib.get('abstract')
15 |     if abstract:
16 |         abstract = re.sub(r'\s+', ' ', abstract)
17 | 
18 |         res += " " + abstract
19 | 
20 |     return res
21 |
22 |
23 | def isPatent(paper):
24 | url = paper.bib.get('url', paper.bib.get('eprint'))
25 | return 'patent' in paper.bib.get('journal', '') or (url and 'patent' in url.lower())
26 |
27 |
28 | def oneKeywordInText(keywords, text):
29 | text_lower = text.lower()
30 | for kw in keywords:
31 | kw = kw.lower()
32 | if kw in text_lower:
33 | return True
34 |
35 | return False
36 |
37 |
38 | def allKeywordsInText(keywords, text):
39 | text_lower = text.lower()
40 |
41 | in_text = 0
42 |
43 | for kw in keywords:
44 | kw = kw.lower()
45 | if kw in text_lower:
46 | in_text += 1
47 |
48 | return in_text == len(keywords)
49 |
50 |
51 | def oneKeywordNotInText(keywords, text):
52 | text_lower = text.lower()
53 | for kw in keywords:
54 | kw = kw.lower()
55 | if kw not in text_lower:
56 | return True
57 |
58 | return False
59 |
60 |
61 | def allKeywordsNotInText(keywords, text):
62 | text_lower = text.lower()
63 | not_in_text = 0
64 |
65 | for kw in keywords:
66 | kw = kw.lower()
67 | if kw not in text_lower:
68 | not_in_text += 1
69 |
70 | return not_in_text == len(keywords)
71 |
72 |
73 | def printReport(df):
74 | print(df)
75 | print('Included papers', len(df[df['excluded'] == False]))
76 | print('Excluded papers', len(df[df['excluded'] == True]))
77 | print('Excluded because of')
78 | print(' language', len(df[df['exclude_reason'] == 'language']))
79 | print(' is a patent', len(df[df['exclude_reason'] == 'is_patent']))
80 | print(' year out of range', len(df[df['exclude_reason'] == 'year']))
81 | print(' is a review', len(df[df['exclude_reason'] == 'is_review']))
82 | print(' using images', len(df[df['exclude_reason'] == 'uses_images']))
83 | # print(' full text not available', len(df[df['exclude_reason'] == 'no_pdf']))
84 | print(' not radiology', len(df[df['exclude_reason'] == 'not_radiology']))
85 | print(' not NLP', len(df[df['exclude_reason'] == 'not_nlp']))
86 |
87 |
88 | def collectStats(papers):
89 | results = []
90 | for paper in papers:
91 | res = {
92 | # 'id': paper.id,
93 | 'has_year': bool(paper.year),
94 | 'has_title': bool(paper.title),
95 | # 'authors': paper.authors,
96 | 'has_doi': bool(paper.doi),
97 | 'has_arxivid': bool(paper.arxivid),
98 | 'has_pmid': bool(paper.pmid),
99 | 'has_ssid': bool(paper.extra_data.get('ss_id')),
100 | 'has_valid_id': paper.has_valid_id,
101 | 'has_abstract': paper.has_abstract,
102 | 'has_full_abstract': paper.has_full_abstract,
103 | 'has_pdf': paper.has_pdf_link,
104 | 'not_abstract_but_pdf': not paper.has_abstract and paper.has_pdf
105 | }
106 | results.append(res)
107 |
108 | df = pd.DataFrame(results)
109 | for field in df.columns:
110 | print(field, len(df[df[field] == True]))
111 | print()
112 |
113 |
114 | def filterOnePaper(paper, exclude_rules={}):
115 | record = {
116 | 'title': paper.title,
117 | # 'year': int(paper.year) if paper.year else None,
118 | 'year': paper.year,
119 | 'authors': paper.authors,
120 | 'venue': paper.venue,
121 | 'abstract': paper.abstract,
122 | 'excluded': False,
123 | 'exclude_reason': None
124 | }
125 | accept = True
126 |
127 | text = getPaperText(paper)
128 | language = paper.extra_data.get('language')
129 |
130 | if not language:
131 | if len(text) < 62 or text.isupper():
132 | language = 'en'
133 | else:
134 | language = detect(text)
135 |
136 | # if language != 'en':
137 | # print(text)
138 | # print("Lang:", language)
139 | # print()
140 |
141 | language = language.lower()
142 | record['language'] = language
143 |
144 | lower_text = text.lower()
145 |
148 |
149 | try:
150 | if paper.bib.get('year') is None or paper.bib.get('year') == '':
151 | paper.bib['year'] = 0
152 | else:
153 | paper.bib['year'] = int(paper.bib['year'])
154 |     except (ValueError, TypeError):
155 | paper.bib['year'] = 0
156 |
157 | if exclude_rules.get('language', True) and not language.startswith('en'):
158 | record['excluded'] = True
159 | record['exclude_reason'] = 'language'
160 | accept = False
161 | elif exclude_rules.get('is_patent', True) and isPatent(paper):
162 | record['excluded'] = True
163 | record['exclude_reason'] = 'is_patent'
164 | accept = False
165 | elif exclude_rules.get('year', True) and int(paper.bib.get('year', 0)) < 2015:
166 | record['excluded'] = True
167 | record['exclude_reason'] = 'year'
168 | accept = False
169 |     elif exclude_rules.get('is_review', True) and (oneKeywordInText(['review', 'overview'],
170 |                                                                      paper.title.lower()) or oneKeywordInText(
171 |             ['this review', 'this chapter'], lower_text)):
172 | record['excluded'] = True
173 | record['exclude_reason'] = 'is_review'
174 | accept = False
175 | elif exclude_rules.get('uses_images', True) and oneKeywordInText(
176 | ['images', 'visual', 'chest x-ray', 'segmentation'], lower_text):
177 | record['excluded'] = True
178 | record['exclude_reason'] = 'uses_images'
179 | accept = False
180 | # elif exclude_rules.get('no_pdf', True) and not paper.has_pdf:
181 | # record['excluded'] = True
182 | # record['exclude_reason'] = 'no_pdf'
183 | # accept = False
184 | elif exclude_rules.get('not_radiology', True) and allKeywordsNotInText(
185 | ['radiolo', 'imaging report', ' CT', ',CT', ':CT', 'MRI'], lower_text):
186 | record['excluded'] = True
187 | record['exclude_reason'] = 'not_radiology'
188 | accept = False
189 | elif exclude_rules.get('not_nlp', True) and allKeywordsNotInText(
190 | ['text', 'langu', 'lingu', 'nlp', 'synta', 'embedding', 'information extraction',
191 | 'text mining', 'words',
192 | 'deep learning', 'deep neural',
193 | 'machine learning', 'artificial intelligence', 'document classification', ],
194 | lower_text):
195 | record['excluded'] = True
196 | record['exclude_reason'] = 'not_nlp'
197 | accept = False
198 |
199 | if accept:
200 | return paper, record
201 | else:
202 | return None, record
203 |
204 |
205 | def filterPapers(papers):
206 | included = []
207 | report = []
208 |
209 | for paper in papers:
210 | paper, record = filterOnePaper(paper)
211 | if paper:
212 | included.append(paper)
213 | report.append(record)
214 |
215 | df = pd.DataFrame(report, columns=['id', 'year', 'title', 'excluded', 'exclude_reason', 'language', 'abstract'])
216 | return included, df
217 |
218 |
219 | def main(conf):
220 | paperstore, papers_to_add, papers_existing, all_papers = loadEntriesAndSetUp(conf.input, conf.cache)
221 |
222 | collectStats(all_papers)
223 | included, df = filterPapers(all_papers)
224 |
225 | printReport(df)
226 |
227 | df.to_csv(conf.report_path)
228 |
229 | writeOutputBib(included, conf.output)
230 |
231 | return df
232 |
233 |
234 | if __name__ == '__main__':
235 | parser = ArgumentParser(description='Filter results ')
236 |
237 | parser.add_argument('-i', '--input', type=str,
238 | help='Input bib file name')
239 | parser.add_argument('-o', '--output', type=str,
240 | help='Output bib file name')
241 | parser.add_argument('-r', '--report-path', type=str, default='filter_report.csv',
242 | help='Path to output report CSV')
243 | parser.add_argument('-c', '--cache', type=bool, default=True,
244 | help='Use local cache for results')
245 |
246 | conf = parser.parse_args()
247 |
248 | df = main(conf)
249 |
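250 | # Example invocation (file names are illustrative):
251 | #   python filter_results.py -i metadata_enriched.bib -o filtered.bib -r filter_report.csv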
--------------------------------------------------------------------------------
/db/ref_utils.py:
--------------------------------------------------------------------------------
1 | import re
2 | import unicodedata
3 |
4 |
5 | def isPDFURL(url):
6 | return ('pdf' in url or 'openreview' in url)
7 |
8 |
9 | def getDOIfromURL(url):
10 | if not url:
11 | return None
12 |
13 | match = re.search('(10\.\d+\/[a-zA-Z\.\d\-\_]+)\.pdf', url)
14 | if match:
15 | return match.group(1)
16 |
17 | match = re.search('(10\.\d+\/[a-zA-Z\.\d\-\_]+)/', url)
18 | if match:
19 | return match.group(1)
20 |
21 | match = re.search('(10\.\d+\/[a-zA-Z\.\d\-\_]+)\?', url)
22 | if match:
23 | return match.group(1)
24 |
25 | match = re.search('(10\.\d+\/[a-zA-Z\.\d\-\_]+)', url)
26 | if match:
27 | return match.group(1)
28 |
29 | return None
30 |
31 |
32 | def unicodeToASCII(input_str):
33 | nfkd_form = unicodedata.normalize('NFKD', input_str)
34 | only_ascii = nfkd_form.encode('ASCII', 'ignore').decode("utf-8")
35 | return only_ascii
36 |
37 |
38 | pLu = "[A-Z\u00C0-\u00D6\u00D8-\u00DE\u0100\u0102\u0104\u0106\u0108\u010A\u010C\u010E\u0110\u0112\u0114\u0116\u0118\u011A\u011C\u011E\u0120\u0122\u0124\u0126\u0128\u012A\u012C\u012E\u0130\u0132\u0134\u0136\u0139\u013B\u013D\u013F\u0141\u0143\u0145\u0147\u014A\u014C\u014E\u0150\u0152\u0154\u0156\u0158\u015A\u015C\u015E\u0160\u0162\u0164\u0166\u0168\u016A\u016C\u016E\u0170\u0172\u0174\u0176\u0178\u0179\u017B\u017D\u0181\u0182\u0184\u0186\u0187\u0189-\u018B\u018E-\u0191\u0193\u0194\u0196-\u0198\u019C\u019D\u019F\u01A0\u01A2\u01A4\u01A6\u01A7\u01A9\u01AC\u01AE\u01AF\u01B1-\u01B3\u01B5\u01B7\u01B8\u01BC\u01C4\u01C7\u01CA\u01CD\u01CF\u01D1\u01D3\u01D5\u01D7\u01D9\u01DB\u01DE\u01E0\u01E2\u01E4\u01E6\u01E8\u01EA\u01EC\u01EE\u01F1\u01F4\u01F6-\u01F8\u01FA\u01FC\u01FE\u0200\u0202\u0204\u0206\u0208\u020A\u020C\u020E\u0210\u0212\u0214\u0216\u0218\u021A\u021C\u021E\u0220\u0222\u0224\u0226\u0228\u022A\u022C\u022E\u0230\u0232\u023A\u023B\u023D\u023E\u0241\u0243-\u0246\u0248\u024A\u024C\u024E\u0370\u0372\u0376\u037F\u0386\u0388-\u038A\u038C\u038E\u038F\u0391-\u03A1\u03A3-\u03AB\u03CF\u03D2-\u03D4\u03D8\u03DA\u03DC\u03DE\u03E0\u03E2\u03E4\u03E6\u03E8\u03EA\u03EC\u03EE\u03F4\u03F7\u03F9\u03FA\u03FD-\u042F\u0460\u0462\u0464\u0466\u0468\u046A\u046C\u046E\u0470\u0472\u0474\u0476\u0478\u047A\u047C\u047E\u0480\u048A\u048C\u048E\u0490\u0492\u0494\u0496\u0498\u049A\u049C\u049E\u04A0\u04A2\u04A4\u04A6\u04A8\u04AA\u04AC\u04AE\u04B0\u04B2\u04B4\u04B6\u04B8\u04BA\u04BC\u04BE\u04C0\u04C1\u04C3\u04C5\u04C7\u04C9\u04CB\u04CD\u04D0\u04D2\u04D4\u04D6\u04D8\u04DA\u04DC\u04DE\u04E0\u04E2\u04E4\u04E6\u04E8\u04EA\u04EC\u04EE\u04F0\u04F2\u04F4\u04F6\u04F8\u04FA\u04FC\u04FE\u0500\u0502\u0504\u0506\u0508\u050A\u050C\u050E\u0510\u0512\u0514\u0516\u0518\u051A\u051C\u051E\u0520\u0522\u0524\u0526\u0528\u052A\u052C\u052E\u0531-\u0556\u10A0-\u10C5\u10C7\u10CD\u13A0-\u13F5\u1E00\u1E02\u1E04\u1E06\u1E08\u1E0A\u1E0C\u1E0E\u1E10\u1E12\u1E14\u1E16\u1E18\u1E1A\u1E1C\u1E1E\u1E20\u1E22\u1E24\u1E26\u1E28\u1E2A\u1E2C\u1E2E\u1E30\u1E32\u1E34\u1E36\u1E38\u1E3A\u1E3C\u1E3E\u1E40\u1E42\u1E44\u1E46\u1E48\u1E4A\u1E4C\u1E4E\u1E50\u1E52\u1E54\u1E56\u1E58\u1E5A\u1E5C\u1E5E\u1E60\u1E62\u1E64\u1E66\u1E68\u1E6A\u1E6C\u1E6E\u1E70\u1E72\u1E74\u1E76\u1E78\u1E7A\u1E7C\u1E7E\u1E80\u1E82\u1E84\u1E86\u1E88\u1E8A\u1E8C\u1E8E\u1E90\u1E92\u1E94\u1E9E\u1EA0\u1EA2\u1EA4\u1EA6\u1EA8\u1EAA\u1EAC\u1EAE\u1EB0\u1EB2\u1EB4\u1EB6\u1EB8\u1EBA\u1EBC\u1EBE\u1EC0\u1EC2\u1EC4\u1EC6\u1EC8\u1ECA\u1ECC\u1ECE\u1ED0\u1ED2\u1ED4\u1ED6\u1ED8\u1EDA\u1EDC\u1EDE\u1EE0\u1EE2\u1EE4\u1EE6\u1EE8\u1EEA\u1EEC\u1EEE\u1EF0\u1EF2\u1EF4\u1EF6\u1EF8\u1EFA\u1EFC\u1EFE\u1F08-\u1F0F\u1F18-\u1F1D\u1F28-\u1F2F\u1F38-\u1F3F\u1F48-\u1F4D\u1F59\u1F5B\u1F5D\u1F5F\u1F68-\u1F6F\u1FB8-\u1FBB\u1FC8-\u1FCB\u1FD8-\u1FDB\u1FE8-\u1FEC\u1FF8-\u1FFB\u2102\u2107\u210B-\u210D\u2110-\u2112\u2115\u2119-\u211D\u2124\u2126\u2128\u212A-\u212D\u2130-\u2133\u213E\u213F\u2145\u2160-\u216F\u2183\u24B6-\u24CF\u2C00-\u2C2E\u2C60\u2C62-\u2C64\u2C67\u2C69\u2C6B\u2C6D-\u2C70\u2C72\u2C75\u2C7E-\u2C80\u2C82\u2C84\u2C86\u2C88\u2C8A\u2C8C\u2C8E\u2C90\u2C92\u2C94\u2C96\u2C98\u2C9A\u2C9C\u2C9E\u2CA0\u2CA2\u2CA4\u2CA6\u2CA8\u2CAA\u2CAC\u2CAE\u2CB0\u2CB2\u2CB4\u2CB6\u2CB8\u2CBA\u2CBC\u2CBE\u2CC0\u2CC2\u2CC4\u2CC6\u2CC8\u2CCA\u2CCC\u2CCE\u2CD0\u2CD2\u2CD4\u2CD6\u2CD8\u2CDA\u2CDC\u2CDE\u2CE0\u2CE2\u2CEB\u2CED\u2CF2\uA640\uA642\uA644\uA646\uA648\uA64A\uA64C\uA64E\uA650\uA652\uA654\uA656\uA658\uA65A\uA65C\uA65E\uA660\uA662\uA664\uA666\uA668\uA66A\uA66C\uA680\uA682\uA684\uA686\uA688\uA68A\uA68C\uA68E\uA690\uA692\uA694\uA696\uA698\uA69A\uA722\uA
724\uA726\uA728\uA72A\uA72C\uA72E\uA732\uA734\uA736\uA738\uA73A\uA73C\uA73E\uA740\uA742\uA744\uA746\uA748\uA74A\uA74C\uA74E\uA750\uA752\uA754\uA756\uA758\uA75A\uA75C\uA75E\uA760\uA762\uA764\uA766\uA768\uA76A\uA76C\uA76E\uA779\uA77B\uA77D\uA77E\uA780\uA782\uA784\uA786\uA78B\uA78D\uA790\uA792\uA796\uA798\uA79A\uA79C\uA79E\uA7A0\uA7A2\uA7A4\uA7A6\uA7A8\uA7AA-\uA7AE\uA7B0-\uA7B4\uA7B6\uFF21-\uFF3A\U00010400-\U00010427\U000104B0-\U000104D3\U00010C80-\U00010CB2\U000118A0-\U000118BF\U0001D400-\U0001D419\U0001D434-\U0001D44D\U0001D468-\U0001D481\U0001D49C\U0001D49E\U0001D49F\U0001D4A2\U0001D4A5\U0001D4A6\U0001D4A9-\U0001D4AC\U0001D4AE-\U0001D4B5\U0001D4D0-\U0001D4E9\U0001D504\U0001D505\U0001D507-\U0001D50A\U0001D50D-\U0001D514\U0001D516-\U0001D51C\U0001D538\U0001D539\U0001D53B-\U0001D53E\U0001D540-\U0001D544\U0001D546\U0001D54A-\U0001D550\U0001D56C-\U0001D585\U0001D5A0-\U0001D5B9\U0001D5D4-\U0001D5ED\U0001D608-\U0001D621\U0001D63C-\U0001D655\U0001D670-\U0001D689\U0001D6A8-\U0001D6C0\U0001D6E2-\U0001D6FA\U0001D71C-\U0001D734\U0001D756-\U0001D76E\U0001D790-\U0001D7A8\U0001D7CA\U0001E900-\U0001E921\U0001F130-\U0001F149\U0001F150-\U0001F169\U0001F170-\U0001F189]"
39 | author_regex1 = re.compile("^(" + pLu + "+)\s+([\w\-]+)")
40 | author_regex2 = re.compile("^(" + pLu + "[\w\-]+)\s*,\s*(" + pLu + "\w*)")
41 | author_regex3 = re.compile("^(" + pLu + "[\w\-]+)\s+([\w\.\s\-]+),\s([\w\-]+)")
42 | author_regex4 = re.compile("^(" + pLu + "[\w\-]+)\s+([\w\.\s\-]+)(van|von|dos|del|de la),\s([\w\-]+)")
43 |
44 |
45 | def parseBibAuthors(authors):
46 | if not authors:
47 | return [{"given": '', "family": ''}]
48 |
49 |     bits = authors.split(' and ')
50 | authors = []
51 | for bit in bits:
52 | bit = bit.strip()
53 | # matches 'SKC Chiang', 'S Chiang'
54 | match = author_regex1.search(bit)
55 | if match:
56 | author = {"given": match.group(1)[0], "family": match.group(2)}
57 | if len(match.group(1)) > 1:
58 | author['middle'] = match.group(1)[1:]
59 | authors.append(author)
60 | else:
61 | # matches 'Smith, Bob'
62 | match = author_regex2.search(bit)
63 | if match:
64 | author = {"given": match.group(2), "family": match.group(1)}
65 | authors.append(author)
66 | else:
67 |                 # matches 'Otmani Abdeldjallal, Nassim'
68 | match = author_regex3.search(bit)
69 | if match:
70 | author = {"given": match.group(3), "family": match.group(1)}
71 | if match.group(2):
72 | author['middle'] = match.group(2)
73 | authors.append(author)
74 | else:
75 | # special for Dutch/German/Portuguese/Spanish surnames - van/von/dos/del/de la Blah
76 | match = author_regex4.search(bit)
77 | if match:
78 |                         author = {"given": match.group(4),
79 |                                   "family": match.group(3) + " " + match.group(1)}
80 |                         if match.group(2):
81 |                             author['middle'] = match.group(2)
82 |                         authors.append(author)
83 | 
84 |             if not match:  # fallback: keep the whole string as the family name
85 |                 authors.append({"given": '', "family": bit})
86 | return authors
87 |
88 |
89 | def authorListFromDict(authors):
90 | authorstrings = []
91 | for author in authors:
92 | authorstring = author.get('family', '')
93 | if author.get('middle', ''):
94 | authorstring += ' ' + author.get('middle')
95 | authorstring += ', ' + author.get('given', '')
96 | authorstrings.append(authorstring)
97 |
98 | authors_string = " and ".join(authorstrings)
99 | return authors_string
100 |
101 |
102 | def authorListFromListOfAuthors(authors):
103 | authors_string = " and ".join(authors)
104 | return authors_string
105 |
106 |
107 | def normalizeURL(url: str):
108 | return url.replace('https:', 'http:')
109 |
110 |
111 | def addUrlIfNew(paper, url: str, type: str, source: str):
112 | paper.extra_data['urls'] = paper.extra_data.get('urls', [])
113 |
114 | existing_urls = [normalizeURL(u['url']).lower() for u in paper.extra_data['urls']]
115 | norm_url = normalizeURL(url)
116 |
117 | if norm_url.lower() not in existing_urls:
118 | paper.extra_data['urls'].append({'url': norm_url,
119 | 'type': type,
120 | 'source': source})
121 | return True
122 | return False
123 |
124 |
125 | def addUrlIfNewWithType(paper, url: str, source: str):
126 | if isPDFURL(url):
127 | type = 'pdf'
128 | else:
129 | type = 'main'
130 |
131 | return addUrlIfNew(paper, url, type, source)
132 |
133 |
134 | def simpleResultDeDupe(results):
135 | from collections import OrderedDict
136 |
137 | duplicates = []
138 |
139 | unique_entries = OrderedDict()
140 | for result in results:
141 |
142 | if result.bib['ID'] in unique_entries:
143 | if normalizeTitle(result.bib['title']) == normalizeTitle(unique_entries[result.bib['ID']].bib['title']):
144 | # print(unique_entries[result.bib['ID']], '\n\n', result.bib, '\n---------------')
145 | duplicates.append(result)
146 | continue
147 | else:
148 | result.bib['ID'] += "_2"
149 |
150 | unique_entries[result.bib['ID']] = result
151 |
152 | print('Duplicates found:', len(duplicates))
153 | return [v for k, v in unique_entries.items()]
154 |
155 |
156 | def normalizeTitle(title):
157 | """
158 | Returns a "normalized" title for easy matching
159 | """
160 | title = title.lower()
161 | title = re.sub(r"–", " ", title)
162 | title = unicodeToASCII(title)
163 | title = title.replace("- ", "").replace("- ", "")
164 | title = re.sub(r"[\"\#\$\%\&\\\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\¿\!\¡\@\[\]\^\_\`\{\|\}\~]", " ", title)
165 | title = re.sub(r"\s+", " ", title)
166 | title = title.strip()
167 | title = title[:200]
168 | return title
169 |
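170 | # Illustrative examples of the helpers above (shown as comments, not a test suite):
171 | #   parseBibAuthors('Smith, Bob and J Doe')
172 | #     -> [{'given': 'Bob', 'family': 'Smith'}, {'given': 'J', 'family': 'Doe'}]
173 | #   normalizeTitle('Deep Learning for Radiology: A Review!')
174 | #     -> 'deep learning for radiology a review'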
--------------------------------------------------------------------------------
/db/data.py:
--------------------------------------------------------------------------------
1 | import sqlite3
2 | import os, re, json
3 | import pandas as pd
4 | import bibtexparser
5 |
6 | from strsimpy import NormalizedLevenshtein
7 |
8 | stopwords = set(["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself",
9 | "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its",
10 | "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this",
11 | "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has",
12 | "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or",
13 | "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between",
14 | "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down",
15 | "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there",
16 | "when", "where", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"])
17 |
18 | from db.bibtex import generateUniqueID
19 | from db.ref_utils import parseBibAuthors, normalizeTitle
20 |
21 | current_dir = os.path.dirname(os.path.realpath(__file__))
22 |
23 | CACHE_FILE = os.path.join(current_dir, "papers.sqlite")
24 |
25 |
26 | class Paper:
27 | """
28 | A Paper consists of 2 dicts: .bib and .extra_data
29 | - .bib is simply a bibtex dict
30 | - .extra_data stores everything else we can't properly store in a BibTeX file
31 | """
32 |
33 |     def __init__(self, bib: dict = None, extra_data: dict = None):
34 |         self.bib = bib if bib is not None else {}
35 |         self.extra_data = extra_data if extra_data is not None else {}
36 | 
37 |         for field in self.bib:
38 |             if self.bib[field] is None:
39 |                 self.bib[field] = ''
40 |
41 | @classmethod
42 | def fromRecord(cls, paper_record):
43 | res = Paper(json.loads(paper_record["bib"]),
44 | json.loads(paper_record["extra_data"]))
45 |
46 | res.pmid = paper_record["pmid"]
47 | res.scholarid = paper_record["scholarid"]
48 | res.arxivid = paper_record["arxivid"]
49 | return res
50 |
51 | @property
52 | def id(self):
53 | return generateUniqueID(self)
54 |
55 | @property
56 | def doi(self):
57 | return self.bib.get("doi")
58 |
59 | @doi.setter
60 | def doi(self, doi):
61 | self.bib["doi"] = doi
62 |
63 | @property
64 | def arxivid(self):
65 | return self.extra_data.get("arxivid")
66 |
67 | @arxivid.setter
68 | def arxivid(self, arxivid):
69 | self.extra_data["arxivid"] = arxivid
70 |
71 | @property
72 | def pmid(self):
73 | return self.extra_data.get("pmid")
74 |
75 | @pmid.setter
76 | def pmid(self, pmid):
77 | self.extra_data["pmid"] = pmid
78 |
79 | @property
80 | def scholarid(self):
81 | return self.extra_data.get("scholarid")
82 |
83 | @scholarid.setter
84 | def scholarid(self, scholarid):
85 | self.extra_data["scholarid"] = scholarid
86 |
87 | @property
88 | def title(self):
89 | return self.bib.get("title")
90 |
91 | @title.setter
92 | def title(self, title):
93 | self.bib["title"] = title
94 |
95 | @property
96 | def norm_title(self):
97 | return normalizeTitle(self.title)
98 |
99 | @property
100 | def abstract(self):
101 | return self.bib.get("abstract")
102 |
103 | @property
104 | def year(self):
105 | return self.bib.get("year")
106 |
107 | @property
108 | def authors(self):
109 | return self.bib.get("author")
110 |
111 | @authors.setter
112 | def authors(self, authors):
113 | self.bib["author"] = authors
114 |
115 | @property
116 | def entrytype(self):
117 |         return self.bib.get("ENTRYTYPE", "misc").lower()
118 |
119 | @property
120 | def venue(self):
121 | entrytype = self.entrytype
122 | if entrytype == "article":
123 | return self.bib.get("journal", "")
124 | elif entrytype in ["book", "booklet", "manual", "proceedings"]:
125 | # return self.bib.get("title", "")
126 | return ""
127 | elif entrytype in ["conference", "inproceedings", "incollection"]:
128 | return self.bib.get("booktitle", "")
129 | elif entrytype in ["mastersthesis", "phdthesis"]:
130 | return self.bib.get("school", "")
131 | elif entrytype in ["techreport"]:
132 | return self.bib.get("institution", "")
133 | elif entrytype in ["misc", "unpublished"]:
134 | return ""
135 |
136 | @property
137 | def has_pdf(self):
138 | for url in self.extra_data.get('urls', []):
139 | if url['type'] == 'pdf':
140 | return True
141 | return False
142 |
143 | @property
144 | def has_full_abstract(self):
145 | if not self.abstract:
146 | return False
147 |
148 | if self.abstract.endswith('…'):
149 | return False
150 |
151 | return True
152 |
153 | @property
154 | def has_abstract(self):
155 | return self.abstract is not None and self.abstract != ''
156 |
157 | @property
158 | def has_valid_id(self):
159 | return any([self.doi, self.pmid, self.arxivid, self.extra_data.get('ss_id')])
160 |
161 | @property
162 | def has_pdf_link(self):
163 | for url in self.extra_data.get('urls', []):
164 | if url.get('type') == 'pdf' or 'pdf' in url.get('url', ''):
165 | return True
166 |
167 | return False
168 |
169 | def asDict(self):
170 | return {
171 | "id": self.id,
172 | "title": self.title,
173 | "norm_title": self.norm_title,
174 | "authors": self.authors,
175 | "year": self.year,
176 | "venue": self.venue,
177 | "bib": json.dumps(self.bib),
178 | "doi": self.doi,
179 | "arxivid": self.arxivid,
180 | "scholarid": self.scholarid,
181 | "pmid": self.pmid,
182 | "extra_data": json.dumps(self.extra_data)
183 | }
184 |
185 | def __repr__(self):
186 |         return "<%s - %s - %s> \n %s" % (
187 | self.bib.get("title", ""),
188 | self.bib.get("author", ""),
189 | self.bib.get("year", ""), str(self.bib))
190 |
191 |
192 | class PaperStore:
193 | def __init__(self):
194 | self.conn = sqlite3.connect(CACHE_FILE)
195 | self.conn.row_factory = sqlite3.Row
196 | self.initaliseDB()
197 |
198 | def initaliseDB(self):
199 | self.conn.execute("""CREATE TABLE IF NOT EXISTS "papers" (
200 | "id" text primary key,
201 | "doi" text unique,
202 | "pmid" text unique,
203 | "scholarid" text unique,
204 | "arxivid" text unique,
205 | "authors" text,
206 | "year" integer,
207 | "title" text,
208 | "norm_title" text,
209 | "venue" text,
210 | "bib" text,
211 | "extra_data" text
212 | )
213 | """)
214 |
215 | self.conn.execute(
216 | """CREATE UNIQUE INDEX IF NOT EXISTS idx_papers_ids ON papers(id, doi)""")
217 |
218 | self.conn.execute(
219 | """CREATE INDEX IF NOT EXISTS idx_papers_otherids ON papers(pmid, scholarid, arxivid)""")
220 |
221 | self.conn.execute(
222 | """CREATE INDEX IF NOT EXISTS idx_papers_title ON papers(title, norm_title)""")
223 |
224 | self.conn.commit()
225 |
226 | # def runSelectStatement(self, sql, parameters):
227 | # """
228 | #
229 | # :param sql: SQL string to run
230 | # :return: Cursor to the results
231 | # """
232 | # c = self.conn.cursor()
233 | # c.execute(sql, parameters)
234 | # return c
235 |
236 | def getPaper(self, id_string, id_type="doi"):
237 | """
238 | Looks for a paper given an id.
239 |
240 | :param id_string: the actual id
241 | :param id_type: the type of id (doi, arxivid, pmid, scholarid)
242 | :return: paper if found, or None
243 | """
244 | c = self.conn.cursor()
245 |
246 | c.execute("SELECT * FROM papers WHERE %s=?" % id_type, (id_string,))
247 | paper_record = c.fetchone()
248 | if not paper_record:
249 | return None
250 |
251 | res = Paper.fromRecord(paper_record)
252 | return res
253 |
254 | def findPapersByTitle(self, title):
255 | """
256 | Looks for a paper given a title.
257 |
258 |         :param title: title to look up (normalised before matching)
259 |         :return: list of matching Paper objects, or None if nothing matches
260 | """
261 | c = self.conn.cursor()
262 | norm_title = normalizeTitle(title)
263 |
264 | c.execute("SELECT * FROM papers WHERE norm_title=?", (norm_title,))
265 | paper_records = c.fetchall()
266 | if not paper_records:
267 | return None
268 |
269 | res = []
270 | for paper_record in paper_records:
271 | res.append(Paper.fromRecord(paper_record))
272 | return res
273 |
274 | def findPaperByApproximateTitle(self, paper, ok_title_distance=0.35, ok_author_distance=0.1):
275 |         """
276 |         Approximate title matching: candidates retrieved with an FTS5 keyword query,
277 |         then re-ranked by normalised Levenshtein title distance and author overlap.
278 |         :param paper: Paper to look up in the cache
279 |         :return: best-matching cached Paper, or None if no candidate is close enough
280 |         """
281 | c = self.conn.cursor()
282 |
283 | self.createVirtualTable()
284 |
285 | norm_title = normalizeTitle(paper.title)
286 |
287 | bits = norm_title.split()
288 | bits = [b for b in bits if b not in stopwords]
289 |
290 | query_string = " OR ".join(bits)
291 |
292 | c.execute('SELECT id, norm_title FROM papers_search WHERE norm_title MATCH ?', (query_string,))
293 | paper_ids = c.fetchall()
294 | if not paper_ids:
295 | return None
296 |
297 | paper_id_list = [res['id'] for res in paper_ids]
298 | id_query_string = ",".join(['"%s"' % res['id'] for res in paper_ids])
299 |
300 | c.execute('SELECT * FROM papers WHERE id IN (%s)' % id_query_string)
301 | paper_records = c.fetchall()
302 | if not paper_records:
303 | return None
304 |
305 | results = [Paper.fromRecord(r) for r in paper_records]
306 |
307 | sorted_results = rerankByTitleSimilarity(results, paper.title)
308 |
309 | top_res = sorted_results[0][1]
310 |
311 | title_distance = dist.distance(top_res.title.lower(), paper.title.lower())
312 | author_distance = computeAuthorDistance(paper, top_res)
313 |
314 |
315 | if title_distance <= ok_title_distance and author_distance <= ok_author_distance:
316 | print('\n[matched] ', paper.title)
317 | print('Best match:', top_res.title)
318 | else:
319 | print('\n[skipped] ', paper.title)
320 | print('Options:\n' + '\n'.join([r[1].title for r in sorted_results[:5]]), '\n')
321 | return None
322 |
323 | print('title distance:', title_distance, 'author distance:', author_distance)
324 |
325 | new_paper = top_res
326 | # new_paper.title = paper.title
327 |
328 | return new_paper
329 |
330 | def addPaper(self, paper: Paper):
331 | self.addPapers([paper])
332 |
333 | def addPapers(self, papers: list):
334 | to_add = [paper.asDict() for paper in papers]
335 |
336 | df = pd.DataFrame(to_add)
337 | df.to_sql("papers", self.conn, if_exists="append", index=False)
338 |
339 | def updatePapers(self, papers: list):
340 | for paper in papers:
341 | values = paper.asDict()
342 | try:
343 | self.conn.execute(
344 | """REPLACE INTO papers (id, doi, pmid, scholarid, arxivid, authors, year, title, norm_title, venue, bib, extra_data) values (?,?,?,?,?,?,?,?,?,?,?,?)""",
345 | (values['id'], values['doi'], values['pmid'], values['scholarid'],
346 | values['arxivid'], values['authors'], values['year'],
347 | values['title'], values['norm_title'], values['venue'],
348 | values['bib'], values['extra_data']))
349 | except Exception as e:
350 | print(e.__class__.__name__, e)
351 | self.conn.commit()
352 |
353 | def createVirtualTable(self):
354 | self.conn.execute(
355 | """CREATE VIRTUAL TABLE IF NOT EXISTS papers_search USING fts5(id, norm_title, title);""")
356 | self.conn.execute(
357 | """REPLACE INTO papers_search (id, norm_title, title) SELECT id, norm_title, title from papers""")
358 |
359 | self.conn.commit()
360 |
361 | def deleteVirtualTable(self):
362 | self.conn.execute("DROP TABLE papers_search")
363 | self.conn.commit()
364 |
365 | def matchResultsWithPapers(self, results):
366 | """
367 | Tries to match each result with a paper already in the db.
368 |
369 |         :param results: list of SearchResult objects to match against the cache
370 |         :return: (found, missing) lists of results, with .paper set on each found result
371 | """
372 | found = []
373 | missing = []
374 | self.createVirtualTable()
375 | for result in results:
376 | paper = Paper(result.bib, result.extra_data)
377 |
378 | paper_found = False
379 | for id_type in ["doi", "pmid", "arxivid", "scholarid"]:
380 | id_string = getattr(paper, id_type)
381 | if id_string:
382 | paper_record = self.getPaper(id_string, id_type=id_type)
383 | if paper_record:
384 | result.paper = paper_record
385 | found.append(result)
386 | paper_found = True
387 | break
388 |
389 | if not paper_found and paper.title:
390 | paper_records = self.findPapersByTitle(paper.title)
391 | if paper_records:
392 | result.paper = paper_records[0]
393 | found.append(result)
394 | paper_found = True
395 |
396 | if not paper_found and paper.title:
397 | paper_record = self.findPaperByApproximateTitle(paper)
398 | if paper_record:
399 | result.paper = paper_record
400 | found.append(result)
401 | paper_found = True
402 |
403 | if not paper_found:
404 | missing.append(result)
405 |
406 | self.deleteVirtualTable()
407 | return found, missing
408 |
409 |
410 | def computeAuthorDistance(paper1, paper2):
411 | """
412 | Returns a measure of how much the authors of papers overlap
413 |
414 | :param paper1:
415 | :param paper2:
416 | :return:
417 | """
418 | if not paper1.bib.get('author') or not paper2.bib.get('author'):
419 | return 1
420 |
421 | authors1 = paper1.extra_data.get('x_authors', parseBibAuthors(paper1.bib.get('author')))
422 | authors2 = paper2.extra_data.get('x_authors', parseBibAuthors(paper2.bib.get('author')))
423 |
424 | score = 0
425 | if len(authors1) >= len(authors2):
426 | a_short = authors2
427 | a_long = authors1
428 | else:
429 | a_short = authors1
430 | a_long = authors2
431 |
432 | max_score = 0
433 |
434 | for index, author in enumerate(a_short):
435 | factor = (len(a_long) - index) ** 2
436 | if author['family'].lower() == a_long[index]['family'].lower():
437 | score += factor
438 |
439 | max_score += factor
440 |
441 | if max_score == 0:
442 | return 1
443 |
444 | distance = 1 - (score / max_score)
445 | return distance
446 |
447 |
448 | def basicTitleCleaning(title):
449 | return re.sub(r'\s+', ' ', title, flags=re.MULTILINE)
450 |
451 |
452 | def rerankByTitleSimilarity(results: list, title):
453 | scores = []
454 | for res in results:
455 | res.bib['title'] = basicTitleCleaning(res.bib['title'])
456 | scores.append((dist.distance(res.bib['title'].lower(), title.lower()), res))
457 |
458 | return sorted(scores, key=lambda x: x[0], reverse=False)
459 |
460 | def removeListWrapper(value):
461 | while isinstance(value, list):
462 | value = value[0]
463 | return value
464 |
465 | def test1():
466 | bibstr = """@ARTICLE{Cesar2013,
467 | author = {Jean César},
468 | title = {An amazing title},
469 | year = {2013},
470 | volume = {12},
471 | pages = {12--23},
472 | journal = {Nice Journal},
473 | abstract = {This is an abstract. This line should be long enough to test
474 | multilines...},
475 | comments = {A comment},
476 | keywords = {keyword1, keyword2}
477 | }
478 | """
479 | bib = bibtexparser.loads(bibstr)
480 | paper = Paper(bib.entries[0])
481 | paperstore = PaperStore()
482 | paperstore.addPapers([paper])
483 |
484 |
485 | def test2():
486 | paperstore = PaperStore()
487 | paper = paperstore.getPaper('10.1148/radiol.2018171093')
488 | paper.arxivid = None
489 | paperstore.updatePapers([paper])
490 |
491 |
492 | dist = NormalizedLevenshtein()
493 | 
494 | 
495 | if __name__ == '__main__':
496 |     test2()
497 | 
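498 | # Usage sketch (the DOI below is the one used in test2; `p` stands for any Paper object):
499 | # look up a paper in the cache by DOI, falling back to approximate title matching.
500 | #
501 | #   store = PaperStore()
502 | #   cached = store.getPaper('10.1148/radiol.2018171093', id_type='doi')
503 | #   if not cached:
504 | #       cached = store.findPaperByApproximateTitle(p)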
--------------------------------------------------------------------------------
/search/metadata_harvest.py:
--------------------------------------------------------------------------------
1 | import warnings
2 |
3 | warnings.filterwarnings("ignore")
4 |
5 | import requests
6 | import re, json
7 | import urllib.parse
8 | from db.bibtex import readBibtexString, fixBibData, getBibtextFromDOI
9 | from db.ref_utils import isPDFURL, getDOIfromURL, authorListFromDict, addUrlIfNew
10 | from db.data import Paper, computeAuthorDistance, rerankByTitleSimilarity, basicTitleCleaning, dist, removeListWrapper
11 | from .base_search import SearchResult
12 | from tqdm import tqdm
13 | import datetime
14 | from time import sleep
15 | from datetime import timedelta
16 | from io import StringIO, BytesIO
17 | from lxml import etree
18 | 
19 |
20 | BIB_FIELDS_TRANSFER = ['abstract', 'address', 'annote', 'author', 'booktitle', 'chapter',
21 | 'crossref', 'doi', 'edition', 'editor',
22 | 'howpublished', 'institution', 'issue', 'journal', 'key',
23 | 'month', 'note', 'number', 'organization',
24 | 'pages', 'publisher', 'school', 'series', 'type', 'volume', 'year']
25 |
26 | interval_regex = re.compile(r'((?P<hours>\d+?)hr)?((?P<minutes>\d+?)m)?((?P<seconds>\d+?)s)?')
27 |
28 |
29 | def parse_time(time_str):
30 | parts = interval_regex.match(time_str)
31 | if not parts:
32 | return
33 | parts = parts.groupdict()
34 | time_params = {}
35 | for (name, param) in parts.items():
36 | if param:
37 | time_params[name] = int(param)
38 | return timedelta(**time_params)
39 |
40 |
41 | def refreshDOIfromURLs(paper):
42 | """
43 | If paper has no DOI, it tries to find one in any URLs stored in the bib or extra_data dicts
44 |
45 | :param paper: Paper or SearchResult
46 | """
47 | if paper.doi:
48 | return
49 |
50 | doi = getDOIfromURL(paper.bib.get('url', ''))
51 | if doi:
52 | paper.bib['doi'] = doi
53 | else:
54 | for url_dict in paper.extra_data.get('urls', []):
55 | doi = getDOIfromURL(url_dict['url'])
56 | if doi:
57 | paper.bib['doi'] = doi
58 | break
59 |
60 |
61 | def mergeResultData(result1, result2):
62 | """
63 | Merges bibtex and extra_data dictionaries for a SearchResult and/or a Paper
64 |
65 | :param result1:
66 | :param result2:
67 | :return:
68 | """
69 | # if there's no year we should update the ID after getting the year
70 | to_update_id = not result1.bib.get('year') or not 'ID' in result1.bib
71 |
72 | for field in BIB_FIELDS_TRANSFER:
73 | if len(str(result2.bib.get(field, ''))) > len(str(result1.bib.get(field, ''))):
74 | result1.bib[field] = str(result2.bib[field])
75 |
76 | for field in ['ID', 'ENTRYTYPE']:
77 | if field in result2.bib:
78 | result1.bib[field] = str(result2.bib[field])
79 |
80 | if 'ID' not in result2.bib and to_update_id:
81 | if 'ID' in result1.bib:
82 | del result1.bib['ID']
83 | fixBibData(result1.bib, 1)
84 |
85 | for field in result2.extra_data:
86 | if field not in result1.extra_data:
87 | result1.extra_data[field] = result2.extra_data[field]
88 |
89 | if 'urls' in result2.extra_data:
90 | for url in result2.extra_data['urls']:
91 | addUrlIfNew(result1, url['url'], url['type'], url['source'])
92 |
93 | refreshDOIfromURLs(result1)
94 | return result1
95 |
96 |
97 | class NiceScraper:
98 | def __init__(self, basic_delay=0., rate_limit=None, rate_interval=None):
99 | self.response_times = []
100 | self.request_times = []
101 | self.avg_response_time = 0
102 | self.basic_delay = basic_delay
103 | self.delay = 0.0
104 | self.rate_limit = rate_limit
105 | if isinstance(rate_interval, str):
106 | self.rate_interval = parse_time(rate_interval)
107 | else:
108 | self.rate_interval = rate_interval
109 |
110 | def playNice(self):
111 |
112 | if self.rate_limit and len(self.request_times) >= self.rate_limit:
113 | now = datetime.datetime.now()
114 |
115 | diff = now - self.request_times[-self.rate_limit]
116 | if diff < self.rate_interval:
117 | print('Waiting for the rate limit')
118 |                 sleep((self.rate_interval - diff).total_seconds())
119 |
120 | if len(self.response_times) > 0:
121 | self.avg_response_time = sum(self.response_times[-10:]) / len(self.response_times[-10:])
122 | if self.response_times[-1] > self.avg_response_time:
123 | self.delay += 0.1
124 | else:
125 | self.delay -= 0.1
126 | self.delay = max(self.delay, 0)
127 | else:
128 | self.avg_response_time = 0
129 |
130 | if self.delay:
131 | sleep(self.delay)
132 |
133 | def request(self, url, headers=None, data=None, post=False):
134 | """
135 | Makes a nice request, enforcing rate limits and adjusting the wait time
136 | between requests based on latency
137 |
138 | :param url: url to fetch
139 | :param headers: headers to pass
140 | :return: request object
141 | """
142 | class_name = self.__class__.__name__.split('.')[-1]
143 | status_code = 0
144 | retries = 0
145 |
146 | while status_code != 200 and retries < 2:
147 | self.playNice()
148 |
149 | self.request_times.append(datetime.datetime.now())
150 | before = datetime.datetime.now()
151 |
152 | if post:
153 | r = requests.post(url, json=data, headers=headers)
154 | else:
155 | r = requests.get(url, headers=headers)
156 |
157 | if r.status_code == 429:
158 | print(class_name, ': Status code 429: waiting and retrying')
159 | sleep(30)
160 |
161 | status_code = r.status_code
162 | retries += 1
163 |
164 | duration = datetime.datetime.now() - before
165 |
166 | self.setRateLimitsFromHeaders(r)
167 |
168 | self.response_times.append(duration.total_seconds())
169 | print(class_name, "request took", self.response_times[-1])
170 |
171 | return r
172 |
173 | def setRateLimitsFromHeaders(self, request):
174 | if request.headers.get('X-Rate-Limit-Limit'):
175 | self.rate_limit = int(request.headers.get('X-Rate-Limit-Limit'))
176 | if 'X-Rate-Limit-Interval' in request.headers:
177 | try:
178 | self.rate_interval = parse_time(request.headers['X-Rate-Limit-Interval'])
179 | except:
180 | print("Failed to parse X-Rate-Limit-Interval string",
181 | request.headers['X-Rate-Limit-Interval'])
182 | self.rate_interval = None
183 |
184 | def search(self, title, identity, max_results=5):
185 | raise NotImplementedError
186 |
187 | def matchPaperFromResults(self, paper, identity, ok_title_distance=0.1, ok_author_distance=0.1):
188 | """
189 | Tries to match a paper with a DOI and retrieves its metadata if successful
190 |
191 | :param paper:
192 | :param identity:
193 | :return:
194 | """
195 | class_name = self.__class__.__name__.split('.')[-1]
196 |
197 | try:
198 | results = self.search(paper.title, identity, max_results=5)
199 | except Exception as e:
200 | print('Error during %s.matchPaperFromResults()' % class_name, e)
201 | results = None
202 |
203 | if not results:
204 | return False
205 |
206 | sorted_results = rerankByTitleSimilarity(results, paper.title)
207 |
208 | top_res = sorted_results[0][1]
209 |
210 | title_distance = dist.distance(top_res['title'].lower(), paper.title.lower())
211 | author_distance = computeAuthorDistance(paper, top_res)
212 |
213 | if title_distance > 0.1:
214 | if title_distance <= ok_title_distance and author_distance <= ok_author_distance:
215 | print('\n[matched] Title distance is above 0.1, but within settings')
216 | print('Title:', paper.title)
217 | print('Best match:', top_res['title'])
218 | print('title distance:', title_distance, 'author distance:', author_distance)
219 | else:
220 | print('\n[skipped] Distance is too great \n')
221 | print('Title:', paper.title)
222 | print('title distance:', title_distance, 'author distance:', author_distance)
223 | print('Options:\n' + '\n'.join([r[1]['title'] for r in sorted_results]), '\n')
224 | return False
225 |
226 | try:
227 | mergeResultData(paper, top_res)
228 | return True
229 | except Exception as e:
230 | print('Error during %s.matchPaperFromResults() mergeResultData()' % class_name, e)
231 | return False
232 |
233 |
234 | class CrossrefScraper(NiceScraper):
235 |
236 | def bulkSearchCrossref(self, papers):
237 | pass
238 | # r = requests.get("https://doi.crossref.org/simpleTextQuery")
239 |
240 | def search(self, title, identity, year=None, max_results=1):
241 | """
242 |         Searches and returns a number of results from Crossref
243 |
244 | :param title: article title
245 | :param identity: email address to provide to Crossref
246 | :param year: publication year
247 | :param max_results:
248 | :return: list of Crossref JSON data results
249 | """
250 |         quoted_title = urllib.parse.quote(title, safe='')
251 |         headers = {'User-Agent': 'ReviewBuilder(mailto:%s)' % identity}
252 |         # changed because of https://status.crossref.org/incidents/4y45gj63jsp4
253 |         url = 'https://api.crossref.org/works?rows={}&query.bibliographic={}'.format(max_results, quoted_title)
254 | if year:
255 | url += '&query.published=' + str(year)
256 |
257 | r = self.request(url, headers)
258 |
259 | d = r.json()
260 | if d['status'] != 'ok':
261 | raise ValueError('Error in request:' + d.get('status', 'NO STATUS') + str(d.get('message', 'NO MESSAGE')))
262 |
263 | results = []
264 | for index, item in enumerate(d['message']['items']):
265 | # print(item.get('type'))
266 | new_bib = {'doi': item['DOI'],
267 | 'title': basicTitleCleaning(removeListWrapper(item['title']))}
268 |
269 | if 'container-title' in item:
270 | # reference-entry, book
271 |
272 | if item.get('type') in ['journal-article', 'reference-entry']:
273 | new_bib['journal'] = removeListWrapper(item['container-title'])
274 | new_bib['ENTRYTYPE'] = 'article'
275 | elif item.get('type') in ['book-chapter']:
276 | new_bib['ENTRYTYPE'] = 'inbook'
277 | new_bib['booktitle'] = removeListWrapper(item['container-title'])
278 | elif item.get('type') in ['proceedings-article']:
279 | new_bib['ENTRYTYPE'] = 'inproceedings'
280 | new_bib['booktitle'] = removeListWrapper(item['container-title'])
281 |
282 | if item.get('type') in ['book']:
283 | new_bib['ENTRYTYPE'] = 'book'
284 |
285 | if item.get('type') not in ['journal-article', 'reference-entry', 'book', 'book-chapter',
286 | 'proceedings-article']:
287 | print(json.dumps(item, indent=3))
288 |
289 | for field in [('publisher-location', 'address'),
290 | ('publisher', 'publisher'),
291 | ('issue', 'issue'),
292 | ('volume', 'volume'),
293 | ('page', 'pages'),
294 | ]:
295 | if field[0] in item:
296 | new_bib[field[1]] = str(item[field[0]])
297 |
298 | if 'URL' in item:
299 | new_bib['url'] = item['URL']
300 |
301 | if "issued" in item:
302 | date_parts = item['issued']['date-parts'][0]
303 | new_bib['year'] = str(date_parts[0])
304 | if len(date_parts) > 1:
305 | new_bib['month'] = str(date_parts[1])
306 | if len(date_parts) > 2:
307 | new_bib['day'] = str(date_parts[2])
308 |
309 | authors = []
310 | for author in item.get('author', []):
311 | authors.append({'given': author.get('given', ''), 'family': author.get('family', '')})
312 |
313 | if item.get('author'):
314 | new_bib['author'] = authorListFromDict(authors)
315 |
316 | new_extra = {'x_authors': authors,
317 | 'language': item.get('language')
318 | }
319 |
320 | new_res = SearchResult(index, new_bib, 'crossref', new_extra)
321 |
322 | addUrlIfNew(new_res, item['URL'], 'main', 'crossref')
323 |
324 | if 'link' in item:
325 | for link in item['link']:
326 | if isPDFURL(link['URL']):
327 | addUrlIfNew(new_res, link['URL'], 'pdf', 'crossref')
328 |
329 | results.append(new_res)
330 |
331 | return results
332 |
333 |
334 | class UnpaywallScraper(NiceScraper):
335 |
336 | def getMetadata(self, paper, identity):
337 | if not paper.doi:
338 | raise ValueError("Paper has no DOI")
339 |
340 | url = 'https://api.unpaywall.org/v2/%s?email=%s' % (paper.doi, identity)
341 |
342 | r = self.request(url)
343 |
344 | data = r.json()
345 | if data.get('error') == 'true':
346 | return
347 |
348 | top_url = data.get('best_oa_location')
349 | if not top_url:
350 | return
351 |
352 |         if top_url.get('url_for_pdf'):
353 | addUrlIfNew(paper, top_url['url_for_pdf'], 'pdf', 'unpaywall')
354 | if top_url.get('url_for_landing_page'):
355 | addUrlIfNew(paper, top_url['url_for_landing_page'], 'main', 'unpaywall')
356 | if top_url.get('url'):
357 | url = top_url['url']
358 | if isPDFURL(url):
359 | type = 'pdf'
360 | else:
361 | type = 'main'
362 |
363 | addUrlIfNew(paper, url, type, 'unpaywall')
364 |
365 | paper.extra_data['done_unpaywall'] = True
366 |
367 |
368 | class PubMedScraper(NiceScraper):
369 | def search(self, title, identity, max_results=5):
370 | url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&retmode=json&retmax={max_results}&sort=relevance&term='
371 | url += urllib.parse.quote(title)
372 |
373 | r = self.request(url)
374 | d = r.json()
375 | id_list = d['esearchresult']['idlist']
376 |
377 | try:
378 | result = self.getMetadata(id_list)
379 | except Exception as e:
380 | print('Error during %s.getMetadata()' % self.__class__.__name__.split('.')[-1], e)
381 | result = None
382 |
383 | return result
384 |
385 | def getMetadata(self, pmids: list):
386 | """
387 | Returns a dict with metadata extracted from PubMed from a PMID
388 |
389 | rettype = {NULL = xml, abstract, medline, uilist, docsum}
390 | retmode = {xml, text}
391 |
392 | :param pmids: list of PMID to get
393 | :return: dict with metadata from XML returned
394 | """
395 | assert isinstance(pmids, list)
396 |
397 | if not pmids:
398 | return []
399 |
400 | if len(pmids) > 1:
401 | pmids = ','.join(pmids)
402 | else:
403 | pmids = pmids[0]
404 |
405 | url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id=' + pmids
406 | r = self.request(url)
407 |
408 | text = StringIO(r.content.decode('utf-8'))
409 | tree = etree.parse(text)
410 |
411 | results = []
412 |
413 | for index, article_node in enumerate(tree.xpath('/PubmedArticleSet/PubmedArticle')):
414 | new_bib = {}
415 |
416 | article = article_node.xpath('MedlineCitation/Article')[0]
417 |
418 | doi = article.xpath('ELocationID[@EIdType="doi"]')
419 | if doi:
420 | new_bib['doi'] = doi[0].text
421 |
422 | new_bib['title'] = article.xpath('ArticleTitle')[0].text
423 |
424 | abstract = ""
425 | for abs_piece in article.xpath('Abstract/AbstractText'):
426 | if 'Label' in abs_piece.keys():
427 | abstract += abs_piece.get('Label') + "\n"
428 |
429 |             abstract += (abs_piece.text or '') + '\n'
430 | new_bib['abstract'] = abstract
431 |
432 | authors = []
433 | for author in article.xpath('AuthorList/Author'):
434 | new_author = {'given': author.xpath('ForeName')[0].text,
435 | 'family': author.xpath('LastName')[0].text, }
436 | authors.append(new_author)
437 |
438 | new_bib['author'] = authorListFromDict(authors)
439 | if article.xpath('ArticleDate'):
440 | date_node = article.xpath('ArticleDate')[0]
441 | elif article_node.xpath('PubmedData/History/PubMedPubDate[@PubStatus="pubmed"]'):
442 | date_node = article_node.xpath('PubmedData/History/PubMedPubDate[@PubStatus="pubmed"]')[0]
443 |
444 | new_bib['year'] = date_node.xpath('Year')[0].text
445 | new_bib['month'] = date_node.xpath('Month')[0].text
446 | new_bib['day'] = date_node.xpath('Day')[0].text
447 |
448 | new_extra = {'pmid': article_node.xpath('MedlineCitation/PMID')[0].text,
449 | 'x_authors': authors,
450 | 'language': article.xpath('Language')[0].text}
451 |
452 | new_res = SearchResult(index, new_bib, 'pubmed', new_extra)
453 | results.append(new_res)
454 |
455 | return results
456 |
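    # Example (illustrative sketch; the PMIDs and email address are placeholders):
    #
    #   scraper = PubMedScraper()
    #   hits = scraper.search('negation detection in radiology reports', 'researcher@example.org')
    #   records = scraper.getMetadata(['28985342', '29036464'])
    #   print(len(records), 'PubMed records parsed')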
457 | def getAlternateIDs(self, pmids: list):
458 | """
459 | Gets DOI and PMCID for a list of PMIDs
460 |
461 |         :param pmids: list of PMIDs to resolve
462 |         :return: dict mapping each PMID to a dict with 'pmcid' and/or 'doi' where available
463 | """
464 | if isinstance(pmids, list):
465 | if len(pmids) > 1:
466 | pmids = ','.join([str(p) for p in pmids])
467 | else:
468 | pmids = pmids[0]
469 |
470 | res = {}
471 |
472 | r = requests.get(
473 | 'https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?tool=my_tool&email=my_email@example.com&ids=' + str(
474 | pmids))
475 |
476 | text = StringIO(r.content.decode('utf-8'))
477 | tree = etree.parse(text)
478 | for record in tree.xpath('/pmcids/record'):
479 | new_res = {}
480 | if 'pmcid' in record.keys():
481 | new_res['pmcid'] = record.get('pmcid')
482 | if 'doi' in record.keys():
483 | new_res['doi'] = record.get('doi')
484 | res[record.get('pmid')] = new_res
485 | return res
486 |
487 | def enrichWithMetadata(self, paper):
488 | if not paper.pmid:
489 | return
490 |
491 | if not paper.doi:
492 |             ids = self.getAlternateIDs(paper.pmid).get(str(paper.pmid), {})
493 |             if 'doi' in ids:
494 |                 paper.doi = ids['doi']
495 |             if 'pmcid' in ids:
496 |                 paper.pmcid = ids['pmcid']
497 |
498 | res = self.getMetadata([paper.pmid])[0]
499 |
500 | mergeResultData(paper, res)
501 |
502 | paper.extra_data['done_pubmed'] = True
503 |
504 |
505 | class arXivSearcher(NiceScraper):
506 | def search(self, title, identity, max_results=5):
507 | url = 'http://export.arxiv.org/api/query?search_query=title:{}&start=0&max_results={}'.format(
508 | urllib.parse.quote(title), max_results)
509 | r = self.request(url)
510 |
511 | text = BytesIO(r.content)
512 | tree = etree.parse(text)
513 |
514 | ns_map = {'ns': 'http://www.w3.org/2005/Atom',
515 | 'arxiv': 'http://arxiv.org/schemas/atom'}
516 |
517 | results = []
518 | for index, entry in enumerate(tree.xpath('/ns:feed/ns:entry', namespaces=ns_map)):
519 | new_bib = {'arxivid': entry.xpath('ns:id', namespaces=ns_map)[0].text.split('/')[-1],
520 | 'title': entry.xpath('ns:title', namespaces=ns_map)[0].text,
521 | 'abstract': entry.xpath('ns:summary', namespaces=ns_map)[0].text,
522 | }
523 |
524 | published = entry.xpath('ns:published', namespaces=ns_map)[0].text
525 | match = re.search(r"(\d{4})-(\d{2})-(\d{2})", published)
526 |
527 | new_bib['year'] = match.group(1)
528 | new_bib['month'] = str(int(match.group(2)))
529 |             new_bib['day'] = str(int(match.group(3)))
530 |
531 | authors = []
532 | for author in entry.xpath('ns:author', namespaces=ns_map):
533 | bits = author.xpath('ns:name', namespaces=ns_map)[0].text.split()
534 | authors.append({'given': bits[0], 'family': bits[-1]})
535 |
536 | new_bib['author'] = authorListFromDict(authors)
537 | new_extra = {
538 | 'x_authors': authors,
539 | 'ax_main_category': entry.xpath('arxiv:primary_category', namespaces=ns_map)[0].get('term'),
540 |
541 | }
542 |
543 | categories = []
544 | for cat in entry.xpath('ns:category', namespaces=ns_map):
545 | categories.append(cat.get('term'))
546 |
547 | new_extra['ax_categories'] = categories
548 |
549 | new_res = SearchResult(index, new_bib, 'arxiv', new_extra)
550 |
551 | for link in entry.xpath('ns:link', namespaces=ns_map):
552 | if link.get('title') == 'pdf':
553 | addUrlIfNew(new_res, link.get('href'), 'pdf', 'arxiv')
554 | elif 'arxiv.org/abs/' in link.get('href'):
555 | addUrlIfNew(new_res, link.get('href'), 'main', 'arxiv')
556 |
557 | results.append(new_res)
558 |
559 | return results
560 |
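# Example (illustrative sketch; the title and email address are placeholders): each SearchResult
# returned by the arXiv searcher carries the arXiv id, title, abstract, date fields and any pdf/abs
# links found in the Atom feed.
#
#   arxiv = arXivSearcher()
#   hits = arxiv.search('Attention is all you need', 'researcher@example.org', max_results=3)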
561 |
562 | class GScholarScraper(NiceScraper):
563 | def getBibtex(self, paper):
564 | if paper.extra_data.get("url_scholarbib"):
565 | bib = paper.bib
566 | url = paper.extra_data.get("url_scholarbib")
567 | try:
568 | r = self.request(url)
569 |
570 | # print(r)
571 | text = r.content.decode('utf-8')
572 | bib = readBibtexString(text)[0]
573 |
574 | except Exception as e:
575 | print(e.__class__.__name__, e)
576 |
577 | bib['abstract'] = paper.abstract
578 | for key in ['abstract', 'eprint', 'url']:
579 | if key in paper.bib:
580 | bib[key] = paper.bib[key]
581 | paper.bib = bib
582 |
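# getBibtex() above expects a 'url_scholarbib' entry in paper.extra_data (presumably stored when the
# record originally came from a Google Scholar search); it re-downloads that bibtex record and merges
# it into paper.bib, keeping any abstract/eprint/url values already present on the paper.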
583 |
584 | class SemanticScholarScraper(NiceScraper):
585 |
586 | @classmethod
587 |     def loadSSAuthors(cls, authors_dict):
588 | authors = []
589 | for author in authors_dict:
590 | bits = author['name'].split()
591 | new_author = {'given': bits[0], 'family': bits[-1]}
592 | if len(bits) > 2:
593 | new_author['middle'] = " ".join(bits[1:len(bits) - 1])
594 | authors.append(new_author)
595 | return authors
596 |
597 | def search(self, title, identity, max_results=5, min_year=None, max_year=None):
598 | url = 'https://www.semanticscholar.org/api/1/search'
599 |
600 | yearFilter = None
601 |
602 | if min_year or max_year:
603 | yearFilter = {}
604 | if not max_year:
605 | now = datetime.datetime.now()
606 | max_year = now.year
607 |
608 | if min_year:
609 | yearFilter['min'] = int(min_year)
610 | if max_year:
611 | yearFilter['max'] = int(max_year)
612 |
613 | results_left = max_results
614 | page_num = 1
615 |
616 | return_results = []
617 |
618 | while results_left > 0:
619 | data = {"queryString": title,
620 | "page": page_num,
621 | "pageSize": 10,
622 | "sort": "relevance",
623 | "authors": [],
624 | "coAuthors": [],
625 | "venues": [],
626 | "yearFilter": yearFilter,
627 | "requireViewablePdf": False,
628 | "publicationTypes": [],
629 | "externalContentTypes": []
630 | }
631 |
632 | r = self.request(url, data=data, post=True)
633 |
634 | results_dict = r.json()
635 |
636 |             if page_num == 1 and results_dict.get('totalResults'):
637 |                 max_results = min(max_results, results_dict['totalResults'])
638 |                 results_left = max_results
639 |
640 | if 'results' in results_dict:
641 | results = results_dict['results']
642 | else:
643 | results = []
644 |
645 |             if not results: break  # stop if the API returns no further results
646 |             page_results, results_left = results[:results_left], results_left - len(results)
647 |             for index, res in enumerate(page_results):
648 |
649 | res_title = res['title']['text']
650 |
651 | authors_processed = []
652 | for author_list in res['authors']:
653 | for author_dict in author_list:
654 | if 'name' in author_dict:
655 | authors_processed.append(author_dict)
656 |
657 | authors = self.loadSSAuthors(authors_processed)
658 |
659 | bib = {'title': res_title,
660 | 'abstract': res['paperAbstract']['text'],
661 | 'year': res['year']['text'],
662 | 'url': 'https://www.semanticscholar.org/paper/{}/{}'.format(res['slug'],
663 | res['id']),
664 | 'author': authorListFromDict(authors),
665 | }
666 |
667 | if res.get('doiInfo'):
668 | bib['doi'] = res['doiInfo'].get('doi')
669 |
670 | extra_data = {
671 | 'ss_id': res['id'],
672 | 'x_authors': authors
673 | }
674 |
675 |                 new_res = SearchResult(index, bib, 'semanticscholar', extra_data)
676 |
677 | for link in res.get('links', []):
678 | if isPDFURL(link['url']):
679 | bib['eprint'] = link['url']
680 | addUrlIfNew(new_res, link['url'], 'pdf', 'semanticscholar')
681 |
682 | venue = res['venue'].get('text')
683 | extra_data['venue'] = venue
684 | return_results.append(new_res)
685 |             page_num += 1
686 | return return_results
687 |
688 | def getMetadata(self, paper, get_citing_papers=False):
689 | if not paper.doi and not paper.extra_data.get('ss_id'):
690 | raise ValueError('paper has no DOI or SSID')
691 |
692 | if paper.extra_data.get('ss_id'):
693 | unique_id = paper.extra_data.get('ss_id')
694 | else:
695 | unique_id = paper.doi
696 |
697 | url = 'https://api.semanticscholar.org/v1/paper/' + unique_id
698 |
699 | r = self.request(url)
700 | d = r.json()
701 |
702 | if 'error' in d:
703 | print("SemanticScholar error:", d['error'])
704 | return
705 |
706 | for field in ['abstract', 'year', 'venue']:
707 | if d.get(field):
708 | paper.bib[field] = str(d[field])
709 |
710 | if d.get('arxivId'):
711 | paper.arxivid = d['arxivId']
712 |
713 | for topic in d['topics']:
714 | # we really don't need to store the url, it's just
715 | # https://www.semanticscholar.org/topic/{topicId}
716 | del topic['url']
717 |
718 | authors = self.loadSSAuthors(d['authors'])
719 | paper.bib['author'] = authorListFromDict(authors)
720 |
721 | paper.extra_data['ss_topics'] = d['topics']
722 | paper.extra_data['ss_authors'] = d['authors']
723 | paper.extra_data['ss_id'] = d['paperId']
724 |
725 | if get_citing_papers:
726 | citing_papers = []
727 | for index, citation in enumerate(d['citations']):
728 | ss_authors = semanticscholarmetadata.loadSSAuthors(citation['authors'])
729 | authors = authorListFromDict(ss_authors)
730 |
731 | bib = {
732 | 'title': citation['title'],
733 | 'author': authors,
734 | 'year': citation['year'],
735 |                     'doi': citation.get('doi'),
736 | }
737 | bib = fixBibData(bib, index)
738 |
739 | extra_data = {
740 | 'ss_id': citation['paperId'],
741 | 'ss_influential': citation['isInfluential'],
742 | 'ss_authors': ss_authors
743 | }
744 | if citation.get('arxivId'):
745 | extra_data['arxivid'] = citation.get('arxivId')
746 |
747 | new_paper = Paper(bib, extra_data)
748 | citing_papers.append(new_paper)
749 | return paper, citing_papers
750 | return paper
751 |
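# Example (illustrative sketch): fetching SemanticScholar metadata for a paper that already has a DOI
# or a stored ss_id, optionally collecting the papers that cite it.
#
#   ss = SemanticScholarScraper()
#   paper, citing = ss.getMetadata(paper, get_citing_papers=True)
#   print(len(citing), 'citing papers found')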
752 |
753 | crossref_scraper = CrossrefScraper()
754 | scholar_scraper = GScholarScraper(basic_delay=0.1)
755 | unpaywall_scraper = UnpaywallScraper(rate_limit=100000, rate_interval='24h')
756 | pubmed_scraper = PubMedScraper()
757 | arxiv_scraper = arXivSearcher()
758 | semanticscholarmetadata = SemanticScholarScraper()
759 |
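# These module-level instances are shared by the helper functions below so that each service's request
# throttling state (the delay / rate-limit arguments above, presumably handled by the NiceScraper base
# class) lives in a single place.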
760 |
761 | def enrichAndUpdateMetadata(papers, paperstore, identity):
762 | successful = []
763 | unsuccessful = []
764 |
765 | for paper in tqdm(papers, desc='Enriching metadata'):
766 | try:
767 | enrichMetadata(paper, identity)
768 | successful.append(paper)
769 | except Exception as e:
770 | print(e.__class__.__name__, e)
771 | unsuccessful.append(paper)
772 |
773 | paperstore.updatePapers([paper])
774 |
775 | return successful, unsuccessful
776 |
777 |
778 | def enrichMetadata(paper: Paper, identity):
779 | """
780 | Tries to retrieve metadata from Crossref and abstract from SemanticScholar for a given paper,
781 | Google Scholar bib if all else fails
782 |
783 | :param paper: Paper instance
784 | """
785 | paper.title = basicTitleCleaning(paper.title)
786 | original_title = paper.title
787 |
788 | if paper.pmid and not paper.extra_data.get("done_pubmed"):
789 | pubmed_scraper.enrichWithMetadata(paper)
790 | paper.extra_data['done_pubmed'] = True
791 |
792 | # if we don't have a DOI, we need to find it on Crossref
793 | if not paper.doi and not paper.extra_data.get('done_crossref', False):
794 | crossref_scraper.matchPaperFromResults(paper, identity)
795 |
796 | if paper.doi:
797 | new_bib = getBibtextFromDOI(paper.doi)
798 | paper = mergeResultData(paper,
799 | SearchResult(1, new_bib[0], 'crossref', paper.extra_data))
800 | paper.extra_data['done_crossref'] = True
801 |
802 | # if we have a DOI and we haven't got the abstract yet
803 | if paper.doi and not paper.extra_data.get('done_semanticscholar'):
804 | semanticscholarmetadata.getMetadata(paper)
805 | paper.extra_data['done_semanticscholar'] = True
806 |
807 | # try PubMed if we still don't have a PMID
808 | if not paper.pmid and not paper.extra_data.get('done_pubmed'):
809 | # if (not paper.doi or not paper.has_full_abstract) and not paper.pmid and not paper.extra_data.get('done_pubmed'):
810 | if pubmed_scraper.matchPaperFromResults(paper, identity, ok_title_distance=0.4):
811 | pubmed_scraper.enrichWithMetadata(paper)
812 | paper.extra_data['done_pubmed'] = True
813 |
814 | # still no DOI? maybe we can get something from SemanticScholar
815 | if not paper.extra_data.get('ss_id') and not paper.extra_data.get('done_semanticscholar'):
816 | semanticscholarmetadata.matchPaperFromResults(paper, identity)
817 | paper.extra_data['done_semanticscholar'] = True
818 |
819 | # # time to try Scopus, see if it's behind a paywall
820 | # if not paper.doi and not paper.extra_data.get('done_scopus'):
821 | # semanticscholarmetadata.getMetadata(paper)
822 | # paper.extra_data['done_semanticscholar'] = True
823 |
824 | # if we don't have an abstract maybe it's on arXiv
825 | if not paper.has_full_abstract and not paper.extra_data.get('done_arxiv'):
826 | # if not paper.extra_data.get('done_arxiv'):
827 | arxiv_scraper.matchPaperFromResults(paper, identity, ok_title_distance=0.35)
828 | paper.extra_data['done_arxiv'] = True
829 |
830 | # try to get open access links if DOI present and missing PDF link
831 | if not paper.has_pdf_link and paper.doi and not paper.extra_data.get('done_unpaywall'):
832 | unpaywall_scraper.getMetadata(paper, identity)
833 | paper.extra_data['done_unpaywall'] = True
834 |
835 | # if all else has failed but we have a link to Google Scholar bib data, get that
836 | if not paper.year and paper.extra_data.get('url_scholarbib'):
837 | scholar_scraper.getBibtex(paper)
838 |
839 | if paper.title != original_title:
840 | print('Original: %s\nNew: %s' % (original_title, paper.title))
841 | paper.bib = fixBibData(paper.bib, 1)
842 |
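# Example (illustrative sketch; variable names and the email address are placeholders): the usual way
# these helpers are driven from the command-line scripts is over a list of Paper objects plus a paper
# store used to persist the updated records.
#
#   ok, failed = enrichAndUpdateMetadata(papers, paperstore, 'researcher@example.org')
#   print('%d enriched, %d failed' % (len(ok), len(failed)))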
843 |
844 | def test():
845 | title = 'NegBio: a high-performance tool for negation and uncertainty detection in radiology reports'
846 |
847 | # res = searchSemanticScholar(title)
848 |
849 | # res = searchCrossref(title)
850 | # for r in res:
851 | # print(json.dumps(r, indent=3))
852 | pubmed_scraper.search(title, 'dduma@ed.ac.uk')
853 |
854 |
855 | if __name__ == '__main__':
856 | test()
857 |
--------------------------------------------------------------------------------