├── __init__.py ├── base ├── __init__.py ├── general_utils.py ├── file_download.py └── pdf_extract.py ├── db ├── __init__.py ├── csv.py ├── endnote_html.py ├── ris.py ├── bibtex.py ├── rayyan.py ├── ref_utils.py └── data.py ├── requirements.txt ├── search ├── __init__.py ├── other_search.py ├── base_search.py ├── google_scholar.py └── metadata_harvest.py ├── import_metadata.py ├── bib_to_csv.py ├── bulk_download.py ├── titles_and_bibs.py ├── export_rayyan_results.py ├── export_to_ris.py ├── .gitignore ├── import_from_endnote.py ├── gather_metadata.py ├── add_abstracts_from_pdf.py ├── search_to_file.py ├── bib_diff.py ├── reasons_for_exclusion.py ├── README.md ├── snowball_citations.py └── filter_results.py /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /base/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /db/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /db/csv.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | def readCSVFile(filename): 5 | df = pd.read_csv(filename) 6 | return df.to_dict(orient='records') 7 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | langdetect 2 | pandas 3 | beautifulsoup4 4 | lxml 5 | scholarly 6 | tqdm 7 | bibtexparser 8 | requests 9 | strsimpy 10 | RISparser==0.4.3 11 | -------------------------------------------------------------------------------- /search/__init__.py: -------------------------------------------------------------------------------- 1 | from .base_search import MAX_RESULTS, Searcher, SearchResult, getSearchResultsFromBib 2 | from .google_scholar import GScholarSearcher 3 | from .metadata_harvest import enrichMetadata, enrichAndUpdateMetadata 4 | from .other_search import PubMedSearcher, SemanticScholarSearcher -------------------------------------------------------------------------------- /import_metadata.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | from base.general_utils import loadEntriesAndSetUp, writeOutputBib 3 | import pandas as pd 4 | 5 | def main(conf): 6 | paperstore, papers_to_add, papers_existing, all_papers = loadEntriesAndSetUp(conf.input, True) 7 | 8 | paperstore.addPapers(papers_to_add) 9 | if conf.force: 10 | paperstore.updatePapers(papers_existing) 11 | 12 | 13 | if __name__ == '__main__': 14 | parser = ArgumentParser(description='Import metadata from bib file') 15 | parser.add_argument('-i', '--input', type=str, 16 | help='Input bib file name') 17 | parser.add_argument('-f', '--force', type=bool, default=False, 18 | help='Force updating of existing paper records') 19 | 20 | conf = parser.parse_args() 21 | 22 | main(conf) 23 | -------------------------------------------------------------------------------- /search/other_search.py: -------------------------------------------------------------------------------- 1 | from .base_search import Searcher, MAX_RESULTS 2 | from .metadata_harvest import SemanticScholarScraper, PubMedScraper 3 | 4 | 5 | class 
SemanticScholarSearcher(Searcher): 6 | def __init__(self, paperstore): 7 | super().__init__(paperstore) 8 | self.scraper = SemanticScholarScraper() 9 | 10 | def search(self, query, min_year=None, max_year=None, max_results=MAX_RESULTS): 11 | res = self.scraper.search(query, identity='', min_year=min_year, max_year=max_year) 12 | return res 13 | 14 | 15 | class PubMedSearcher(Searcher): 16 | def __init__(self, paperstore): 17 | super().__init__(paperstore) 18 | self.scraper = PubMedScraper() 19 | 20 | def search(self, query, min_year=None, max_year=None, max_results=MAX_RESULTS): 21 | self.scraper.search(query, identity='') 22 | -------------------------------------------------------------------------------- /bib_to_csv.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | from base.general_utils import loadEntriesAndSetUp 3 | 4 | import pandas as pd 5 | 6 | def dataframeFromPapers(papers): 7 | report = [] 8 | 9 | for paper in papers: 10 | report.append(paper.asDict()) 11 | 12 | df = pd.DataFrame(report, columns=['id', 'year', 'title', 'authors', 'venue', 'abstract', 'doi', 'pmid', ]) 13 | return df 14 | 15 | 16 | def main(conf): 17 | paperstore, papers_to_add, papers_existing, all_papers = loadEntriesAndSetUp(conf.input, conf.cache) 18 | 19 | 20 | df = dataframeFromPapers(all_papers) 21 | df.to_csv(conf.output) 22 | 23 | 24 | if __name__ == '__main__': 25 | parser = ArgumentParser(description='Filter results ') 26 | 27 | parser.add_argument('-i', '--input', type=str, 28 | help='Input bib file name') 29 | parser.add_argument('-o', '--output', type=str, 30 | help='Output csv file name') 31 | parser.add_argument('-c', '--cache', type=bool, default=True, 32 | help='Use local cache for results') 33 | 34 | conf = parser.parse_args() 35 | 36 | main(conf) 37 | -------------------------------------------------------------------------------- /bulk_download.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | 3 | from base.file_download import bulkDownload 4 | from base.general_utils import loadEntriesAndSetUp 5 | 6 | 7 | def main(conf): 8 | paperstore, papers_to_add, papers_existing, all_papers = loadEntriesAndSetUp(conf.input, conf.cache, conf.max) 9 | 10 | bulkDownload(all_papers, conf.dir, conf.report_path, do_not_download_just_list=False) 11 | 12 | 13 | if __name__ == '__main__': 14 | parser = ArgumentParser(description='Filter results ') 15 | 16 | parser.add_argument('-i', '--input', type=str, 17 | help='Input bib/RIS/CSV file name') 18 | parser.add_argument('-d', '--dir', type=str, 19 | help='Directory where to store the output') 20 | parser.add_argument('-c', '--cache', type=bool, default=True, 21 | help='Use local cache for results') 22 | parser.add_argument('-m', '--max', type=int, default=100, 23 | help='Maximum number of results to process') 24 | parser.add_argument('-r', '--report-path', type=str, default='results_report.csv', 25 | help='Path to CSV file with a download report') 26 | 27 | conf = parser.parse_args() 28 | 29 | main(conf) 30 | -------------------------------------------------------------------------------- /search/base_search.py: -------------------------------------------------------------------------------- 1 | import re 2 | from db.data import Paper 3 | 4 | MAX_RESULTS = 100 5 | 6 | 7 | class Searcher: 8 | def __init__(self, paperstore): 9 | self.paperstore = paperstore 10 | 11 | def search(self, query, min_year=None, 
max_year=None, max_results=MAX_RESULTS): 12 | pass 13 | 14 | 15 | class SearchResult(Paper): 16 | def __init__(self, index, bib, source, extra_data): 17 | super().__init__(bib, extra_data) 18 | self.index = index 19 | self.source = source 20 | self.paper = None 21 | 22 | def __getitem__(self, item): 23 | return self.extra_data.get(item, self.bib.get(item)) 24 | 25 | def __repr__(self): 26 | return f"<#%d: %s - %s - %s> \n %s" % ( 27 | self.index, self.bib.get("title", ""), 28 | self.bib.get("author", ""), 29 | self.bib.get("year", ""), str(self.bib)) 30 | 31 | 32 | def getSearchResultsFromBib(bib_entries, max_results=100000000): 33 | results = [] 34 | for index, bib in enumerate(bib_entries[:max_results]): 35 | res = SearchResult(index, bib, 'bibfile', {}) 36 | if bib.get('note'): 37 | match = re.search('(\d+)\scites:\s.+?scholar\?cites\=(\d+)', bib['note']) 38 | if match: 39 | res.source = 'scholar' 40 | res.extra_data['scholarid'] = match.group(2) 41 | res.extra_data['citedby'] = match.group(1) 42 | results.append(res) 43 | 44 | return results 45 | -------------------------------------------------------------------------------- /titles_and_bibs.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import pandas as pd 4 | 5 | from db.data import PaperStore 6 | from db.rayyan import loadRayyan, computeReviewerOverlap 7 | from db.rayyan import selectPapersToReview 8 | 9 | 10 | if __name__ == "__main__": 11 | 12 | parser = argparse.ArgumentParser(description='Gather metadata such as' 13 | 'reason for exclusion + bib information') 14 | parser.add_argument('-o', '--outfile', type=str, 15 | help='Output pandas csv filename') 16 | 17 | args = parser.parse_args() 18 | 19 | paper_store = PaperStore() 20 | 21 | # sysreview articles 22 | sysreviewdf = pd.read_excel(os.path.join('reasons_for_exclusion', 'sysreview-15-09-2020.xlsx')) 23 | 24 | bibs = [] 25 | 26 | # Add bib files to the dataframe for those that have a bib entry 27 | for title in sysreviewdf.title: 28 | paper = paper_store.findPapersByTitle(title) 29 | if paper: 30 | bibs.append(paper[0].bib) 31 | else: 32 | bibs.append(None) 33 | 34 | sysreviewdf['bib'] = bibs 35 | 36 | # Only keep titles and bibs 37 | sysreviewdf = sysreviewdf[['title', 'bib']] 38 | 39 | print(sysreviewdf) 40 | print('Writing results to %s' % args.outfile) 41 | sysreviewdf.to_csv(args.outfile, index=False) 42 | 43 | # notes = joined.notes.str.split('|').str[1] 44 | # notes = notes.str.split(':').str[-1] 45 | # notes = notes.str.split(',') 46 | # print(notes.isna().sum()) 47 | # 48 | # # Extract reasons from the notes section 49 | # pass 50 | -------------------------------------------------------------------------------- /export_rayyan_results.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | from db.rayyan import loadRayyan, computeReviewerOverlap 3 | from db.rayyan import selectPapersToReview, selectPapersToFilter 4 | 5 | 6 | def main(conf): 7 | df = loadRayyan(conf.input) 8 | computeReviewerOverlap(df) 9 | # If we want exact include/exclude - call filter 10 | if (conf.num_included, conf.num_excluded) != (None, None): 11 | to_filter = selectPapersToFilter(df, 12 | include_count=conf.num_included, 13 | exclude_count=conf.num_excluded) 14 | print('\nTotal selected for filtering', len(to_filter)) 15 | to_filter.to_csv(conf.output) 16 | else: 17 | to_review = selectPapersToReview(df, conf.min_votes) 18 | print('\nTotal 
selected for review', len(to_review)) 19 | to_review.to_csv(conf.output) 20 | 21 | 22 | if __name__ == '__main__': 23 | parser = ArgumentParser(description='Filter results ') 24 | 25 | parser.add_argument('-i', '--input', type=str, 26 | help='Input .zip file downloaded from Rayyan') 27 | parser.add_argument('-o', '--output', type=str, 28 | help='Path to output report CSV') 29 | parser.add_argument('-v', '--min-votes', type=int, default=1, 30 | help='Minimum votes for inclusion') 31 | parser.add_argument('--num_included', type=int, 32 | help='Exact number of inclusion votes') 33 | parser.add_argument('--num_excluded', type=int, 34 | help='Exact number of exclusion votes') 35 | 36 | conf = parser.parse_args() 37 | 38 | main(conf) 39 | -------------------------------------------------------------------------------- /db/endnote_html.py: -------------------------------------------------------------------------------- 1 | import re 2 | from db.ref_utils import isPDFURL 3 | 4 | mapping = [ 5 | # ('Reference Type: ', 'ENTRYTYPE'), 6 | ('Title', 'title'), 7 | ('Journal', 'journal'), 8 | ('DOI', 'doi'), 9 | ('Author Address', 'address'), 10 | ('Author', 'author'), 11 | ('volume', 'VL'), 12 | ] 13 | 14 | type_mapping = { 15 | 'Journal Article': 'article', 16 | 'Thesis': 'thesis', 17 | 'Book': 'book', 18 | } 19 | 20 | 21 | def loadRefsFromHTML(filename): 22 | with open(filename) as f: 23 | html = f.read() 24 | 25 | html = html[html.find('') + 6:] 26 | # html = re.sub('.+', '', html, flags=re.DOTALL) 27 | entries = re.split('(
<p>\n</p>\n<p>
)', html) 28 | res = [] 29 | 30 | for entry in entries: 31 | lines = entry.split('\n') 32 | new_bib = {} 33 | 34 | for line in lines: 35 | match = re.search('<b>Reference Type: <\/b> (.+?)<br>
', line) 36 | if match: 37 | if match.group(1) in type_mapping: 38 | new_bib['ENTRYTYPE'] = type_mapping[match.group(1)] 39 | else: 40 | new_bib['ENTRYTYPE'] = 'article' 41 | 42 | for bib_map in mapping: 43 | match = re.search('<b>' + bib_map[0] + ':<\/b> (.+?)<br>
', line) 44 | if match: 45 | new_bib[bib_map[1]] = match.group(1) 46 | 47 | for match in re.finditer('', entry): 48 | if isPDFURL(match.group(1)): 49 | new_bib['eprint'] = match.group(1) 50 | else: 51 | new_bib['url'] = match.group(1) 52 | 53 | res.append(new_bib) 54 | 55 | return res 56 | -------------------------------------------------------------------------------- /export_to_ris.py: -------------------------------------------------------------------------------- 1 | from base.general_utils import loadEntriesAndSetUp 2 | 3 | from argparse import ArgumentParser 4 | from db.ris import writeBibToRISFile 5 | 6 | 7 | def main(conf): 8 | paperstore, papers_to_add, papers_existing, all_papers = loadEntriesAndSetUp(conf.input, conf.cache) 9 | 10 | if conf.missing_abstract: 11 | all_bibs = [] 12 | for paper in all_papers: 13 | if not paper.has_pdf and not paper.has_abstract: 14 | all_bibs.append(paper.bib) 15 | elif conf.missing_pdf: 16 | all_bibs = [] 17 | for paper in all_papers: 18 | if not paper.has_pdf: 19 | all_bibs.append(paper.bib) 20 | else: 21 | all_bibs = [p.bib for p in all_papers] 22 | 23 | writeBibToRISFile(all_bibs, conf.output) 24 | 25 | 26 | if __name__ == '__main__': 27 | parser = ArgumentParser( 28 | description='Exports a bibliography to RIS (EndNote) for further gathering of PDFs') 29 | 30 | parser.add_argument('-i', '--input', type=str, 31 | help='Input Bibtex file with the previously cached search results') 32 | parser.add_argument('-o', '--output', type=str, 33 | help='Output RIS file') 34 | parser.add_argument('-x', '--missing-pdf', type=bool, default=False, 35 | help='Export *only* papers missing a PDF') 36 | parser.add_argument('-a', '--missing-abstract', type=bool, default=False, 37 | help='Export *only* papers that are also missing an abstract') 38 | parser.add_argument('-c', '--cache', type=bool, default=True, 39 | help='Use local cache for results') 40 | 41 | conf = parser.parse_args() 42 | 43 | main(conf) 44 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /base/general_utils.py: -------------------------------------------------------------------------------- 1 | from db.bibtex import readBibtexFile 2 | from db.data import PaperStore, Paper 3 | from search import getSearchResultsFromBib 4 | from db.ref_utils import simpleResultDeDupe 5 | from db.bibtex import writeBibtex 6 | from db.ris import writeRIS, readRIS 7 | from db.csv import readCSVFile 8 | from search.metadata_harvest import mergeResultData 9 | 10 | def loadEntriesAndSetUp(input, use_cache=True, max_results=10000000): 11 | if use_cache: 12 | paperstore = PaperStore() 13 | else: 14 | paperstore = None 15 | 16 | bib_entries = readInputBib(input) 17 | results = getSearchResultsFromBib(bib_entries, max_results) 18 | 19 | results = simpleResultDeDupe(results) 20 | 21 | if paperstore: 22 | found, missing = paperstore.matchResultsWithPapers(results) 23 | else: 24 | found = [] 25 | missing = results 26 | 27 | papers_to_add = [Paper(res.bib, res.extra_data) for res in missing] 28 | papers_existing = [mergeResultData(res, res.paper) for res in found] 29 | 30 | all_papers = papers_to_add + papers_existing 31 | 32 | # FIXME: a second dedupe is needed because it seems I'm matching the wrong paper 33 | # a total of 5 records suffer from this so it's no big deal 34 | all_papers = simpleResultDeDupe(all_papers) 35 | 36 | return paperstore, papers_to_add, papers_existing, all_papers 37 | 38 | def readInputBib(filename): 39 | if filename.endswith('.bib'): 40 | return readBibtexFile(filename) 41 | elif filename.endswith('.csv'): 42 | return readCSVFile(filename) 43 | elif filename.endswith('.ris'): 44 | return readRIS(filename) 45 | 46 | def writeOutputBib(bib, filename): 47 | if filename.endswith('.ris'): 48 | writeRIS(bib, filename) 49 | else: 50 | writeBibtex(bib, filename) -------------------------------------------------------------------------------- /import_from_endnote.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | 3 | from db.data import PaperStore, Paper 4 | from db.endnote_html import loadRefsFromHTML 5 | from search import getSearchResultsFromBib 6 | from db.ref_utils import addUrlIfNewWithType 7 | 8 | 9 | def main(conf): 10 | if conf.cache: 11 | paperstore = PaperStore() 12 | else: 13 | paperstore = None 14 | 
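    # Parse the EndNote HTML export into bib-style dicts, then match them against the local paper cache below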
15 | bib_entries = loadRefsFromHTML(conf.input) 16 | 17 | results = getSearchResultsFromBib(bib_entries) 18 | 19 | if paperstore: 20 | found, missing = paperstore.matchResultsWithPapers(results) 21 | else: 22 | found = [] 23 | missing = results 24 | 25 | papers_to_add = [Paper(res.bib, res.extra_data) for res in missing] 26 | 27 | counter = 0 28 | 29 | for res in found: 30 | if res.bib.get('url'): 31 | if addUrlIfNewWithType(res.paper, res['url'], 'endnote'): 32 | counter += 1 33 | if res.bib.get('eprint'): 34 | if addUrlIfNewWithType(res.paper, res['eprint'], 'endnote'): 35 | counter += 1 36 | 37 | papers_existing = [res.paper for res in found] 38 | paperstore.updatePapers(papers_existing) 39 | 40 | print('Papers found', len(papers_existing)) 41 | print('Papers not found', len(papers_to_add)) 42 | print('Added', counter, 'urls') 43 | 44 | if __name__ == '__main__': 45 | parser = ArgumentParser( 46 | description='Exports a bibliography to RIS (EndNote) for further gathering of PDFs') 47 | 48 | parser.add_argument('-i', '--input', type=str, 49 | help='Input EndNote HTML file') 50 | parser.add_argument('-c', '--cache', type=bool, default=True, 51 | help='Use local cache for results') 52 | 53 | conf = parser.parse_args() 54 | 55 | main(conf) 56 | -------------------------------------------------------------------------------- /gather_metadata.py: -------------------------------------------------------------------------------- 1 | from base.general_utils import loadEntriesAndSetUp, writeOutputBib 2 | 3 | from search import enrichAndUpdateMetadata 4 | from argparse import ArgumentParser 5 | from db.bibtex import writeBibtex 6 | 7 | 8 | def main(conf): 9 | paperstore, papers_to_add, papers_existing, all_papers = loadEntriesAndSetUp(conf.input, conf.cache, conf.max) 10 | 11 | if conf.cache: 12 | successful, unsuccessful = enrichAndUpdateMetadata(papers_to_add, paperstore, conf.email) 13 | 14 | if conf.force and conf.cache: 15 | enrichAndUpdateMetadata(papers_existing, paperstore, conf.email) 16 | 17 | all_papers = papers_to_add + papers_existing 18 | writeOutputBib(all_papers, conf.output) 19 | 20 | 21 | if __name__ == '__main__': 22 | parser = ArgumentParser( 23 | description='Gathers metadata, including the abstract, on a list of search results by searching on Crossref, PubMed, arXiv, Semantic Scholar and Unpaywall') 24 | 25 | parser.add_argument('-i', '--input', type=str, 26 | help='Input BIB/RIS file with the previously cached search results') 27 | parser.add_argument('-o', '--output', type=str, 28 | help='Output BIB/RIS file into which to update the new, augmented results') 29 | parser.add_argument('-m', '--max', type=int, default=100, 30 | help='Maximum number of results to process') 31 | parser.add_argument('-em', '--email', type=str, 32 | help='Email to serve as identity to API endpoints') 33 | parser.add_argument('-c', '--cache', type=bool, default=True, 34 | help='Use local cache for results') 35 | parser.add_argument('-f', '--force', type=bool, default=False, 36 | help='Force updating metadata for cached results') 37 | 38 | conf = parser.parse_args() 39 | 40 | main(conf) 41 | -------------------------------------------------------------------------------- /add_abstracts_from_pdf.py: -------------------------------------------------------------------------------- 1 | import os 2 | from base.general_utils import loadEntriesAndSetUp 3 | from base.file_download import bulkDownload 4 | from base.pdf_extract import getAbstractFromPDF 5 | from argparse import ArgumentParser 6 | from db.bibtex 
import writeBibtex 7 | 8 | 9 | def main(conf): 10 | paperstore, papers_to_add, papers_existing, all_papers = loadEntriesAndSetUp(conf.input, conf.cache, conf.max) 11 | 12 | no_abstract_but_pdf = [p for p in all_papers if not p.has_abstract and p.has_pdf] 13 | bulkDownload(no_abstract_but_pdf, conf.dir, 'results_report.csv', do_not_download_just_list=True) 14 | 15 | successful = [] 16 | for paper in no_abstract_but_pdf: 17 | if not os.path.exists(paper.pdf_filename): 18 | continue 19 | 20 | abstract = getAbstractFromPDF(paper.pdf_filename) 21 | 22 | if abstract: 23 | print(abstract) 24 | paper.bib['abstract'] = abstract 25 | paperstore.updatePapers([paper]) 26 | successful.append(paper) 27 | 28 | print('Generated',len(successful), 'new abstracts') 29 | writeBibtex(successful, conf.output) 30 | 31 | 32 | if __name__ == '__main__': 33 | parser = ArgumentParser( 34 | description='Tries to download the PDF for each file and extract the abstract from it') 35 | 36 | parser.add_argument('-i', '--input', type=str, 37 | help='Input Bibtex file with the previously cached search results') 38 | parser.add_argument('-o', '--output', type=str, 39 | help='Output Bbibex file into which to update the new, augmented results') 40 | parser.add_argument('-d', '--dir', type=str, 41 | help='Directory where to store the downloaded PDFs') 42 | parser.add_argument('-m', '--max', type=int, default=100, 43 | help='Maximum number of results to process') 44 | parser.add_argument('-em', '--email', type=str, 45 | help='Email to serve as identity to API endpoints') 46 | parser.add_argument('-c', '--cache', type=bool, default=True, 47 | help='Use local cache for results') 48 | 49 | conf = parser.parse_args() 50 | 51 | main(conf) 52 | -------------------------------------------------------------------------------- /search_to_file.py: -------------------------------------------------------------------------------- 1 | from db.data import PaperStore, Paper 2 | 3 | from search import GScholarSearcher, enrichAndUpdateMetadata 4 | from argparse import ArgumentParser 5 | from db.bibtex import writeBibtex 6 | 7 | 8 | def main(conf): 9 | if conf.cache: 10 | paperstore = PaperStore() 11 | else: 12 | paperstore = None 13 | 14 | if conf.engine == "scholar": 15 | searcher = GScholarSearcher(paperstore) 16 | # elif conf.engine == "pubmed": 17 | # searcher = PubMedSearcher(paperstore) 18 | else: 19 | raise ValueError 20 | 21 | if conf.query_file: 22 | with open(conf.query_file, 'r') as f: 23 | query = f.read() 24 | else: 25 | query = conf.query 26 | 27 | print("Query:", query) 28 | 29 | results = searcher.search(query, min_year=conf.year_start, max_results=conf.max) 30 | 31 | if conf.cache: 32 | found, missing = paperstore.matchResultsWithPapers(results) 33 | 34 | papers_to_add = [Paper(res.bib, res.extra_data) for res in missing] 35 | paperstore.updatePapers(papers_to_add) 36 | 37 | writeBibtex([Paper(res.bib, res.extra_data) for res in results], conf.file) 38 | 39 | 40 | if __name__ == '__main__': 41 | parser = ArgumentParser(description='Searches an engine and saves results to a file') 42 | 43 | parser.add_argument('-q', '--query', type=str, 44 | help='The query to use to retrieve the articles') 45 | parser.add_argument('-qf', '--query-file', type=str, 46 | help='Text file containing the query to use to retrieve the articles') 47 | parser.add_argument('-h', '--headers-file', type=str, 48 | help='YAML file containing the headers to use for requests, particularly to Google Scholar') 49 | parser.add_argument('-ys', '--year-start', 
type=int, 50 | help='The minimum year for results') 51 | parser.add_argument('-ye', '--year-end', type=int, 52 | help='The maximum year for results') 53 | parser.add_argument('-f', '--file', type=str, 54 | help='Filename to dump the results to') 55 | parser.add_argument('-m', '--max', type=int, default=100, 56 | help='Maximum number of results to retrieve') 57 | parser.add_argument('-e', '--engine', type=str, default="scholar", 58 | help='Which search engine to use. Currently only "scholar" (Google Scholar) available ') 59 | parser.add_argument('-em', '--email', type=str, 60 | help='Email to serve as identity to API endpoints') 61 | parser.add_argument('-c', '--cache', type=bool, default=True, 62 | help='Use local cache for results') 63 | 64 | conf = parser.parse_args() 65 | 66 | main(conf) 67 | -------------------------------------------------------------------------------- /db/ris.py: -------------------------------------------------------------------------------- 1 | from db.bibtex import fixBibData 2 | from db.ref_utils import parseBibAuthors, authorListFromListOfAuthors 3 | from RISparser import readris 4 | 5 | mapping = [ 6 | ('address', 'AD'), 7 | ('abstract', 'AB'), 8 | ('doi', 'DO'), 9 | ('eprint', 'LK'), 10 | ('editor', 'ED'), 11 | ('issue', 'IS'), 12 | ('journal', 'JF'), 13 | ('publisher', 'PB'), 14 | ('title', 'TI'), 15 | ('url', 'UR'), 16 | ('volume', 'VL'), 17 | ] 18 | 19 | type_mapping = { 20 | 'inproceedings': 'CONF', 21 | 'article': 'JOUR', 22 | 'thesis': 'THES', 23 | 'book': 'BOOK', 24 | } 25 | 26 | reverse_type_mapping = {b: a for a, b in type_mapping.items()} 27 | 28 | 29 | def exportBibToRIS(entries): 30 | lines = [] 31 | for entry in entries: 32 | authors = parseBibAuthors(entry['author']) 33 | 34 | if entry['ENTRYTYPE'].lower() in type_mapping: 35 | ris_type = type_mapping[entry['ENTRYTYPE'].lower()] 36 | else: 37 | ris_type = 'JOUR' 38 | 39 | lines.append('TY - ' + ris_type) 40 | 41 | for author in authors: 42 | au_line = 'AU - %s, %s' % (author['family'], author['given']) 43 | if author.get('middle'): 44 | au_line += ' ' + author['middle'] 45 | lines.append(au_line) 46 | 47 | # lines.append('PY - %s/%s/%s/' % (entry['year'], entry['month'], entry['day'])) 48 | lines.append('PY - %s' % (entry.get('year', ''),)) 49 | 50 | pages = entry.get('pages') 51 | if pages: 52 | bits = pages.split('-') 53 | 54 | lines.append('SP - ' + bits[0]) 55 | lines.append('EP - ' + bits[-1]) 56 | 57 | for eq in mapping: 58 | if entry.get(eq[0]): 59 | lines.append(str(eq[1]) + ' - ' + str(entry[eq[0]])) 60 | 61 | lines.append('ER - ') 62 | 63 | return '\n'.join(lines) 64 | 65 | 66 | def writeBibToRISFile(entries, filename): 67 | with open(filename, 'w') as f: 68 | text = exportBibToRIS(entries) 69 | f.write(text) 70 | 71 | 72 | def writeRIS(papers, filename): 73 | bibs = [paper.bib for paper in papers] 74 | writeBibToRISFile(bibs, filename) 75 | 76 | 77 | def readRIS(filename): 78 | with open(filename, 'r') as f: 79 | entries = readris(f) 80 | 81 | res = [] 82 | 83 | for entry in entries: 84 | entry['author'] = authorListFromListOfAuthors(entry.get('authors', [])) 85 | if 'authors' in entry: 86 | del entry['authors'] 87 | 88 | new_type = 'article' 89 | if entry.get('type_of_reference'): 90 | if entry['type_of_reference'] in reverse_type_mapping: 91 | new_type = reverse_type_mapping[entry['type_of_reference']] 92 | 93 | entry['ENTRYTYPE'] = new_type 94 | entry = fixBibData(entry, 0) 95 | res.append(entry) 96 | 97 | return res 98 | 
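# Illustrative sketch: what exportBibToRIS produces for a minimal, made-up entry.
# (Hypothetical field values; the AU lines depend on how parseBibAuthors in
#  db/ref_utils.py splits the author string, so they are omitted here.)
#
# example = {'ENTRYTYPE': 'article', 'author': 'Doe, Jane', 'title': 'A sample title',
#            'year': '2019', 'journal': 'Nice Journal', 'pages': '10-20', 'doi': '10.1000/xyz'}
# print(exportBibToRIS([example]))
# # -> lines such as "TY - JOUR", "PY - 2019", "SP - 10", "EP - 20",
# #    "DO - 10.1000/xyz", "JF - Nice Journal", "TI - A sample title", "ER - "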
-------------------------------------------------------------------------------- /bib_diff.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | 3 | from db.data import Paper 4 | from db.ref_utils import normalizeTitle 5 | from base.general_utils import readInputBib, writeOutputBib 6 | 7 | 8 | def merge_two_dicts(x, y): 9 | z = x.copy() # start with x's keys and values 10 | z.update(y) # modifies z with y's keys and values & returns None 11 | return z 12 | 13 | 14 | def buildHashTable(bib): 15 | res = {} 16 | for entry in bib: 17 | norm_title = normalizeTitle(entry['title']) 18 | res[norm_title] = entry 19 | return res 20 | 21 | 22 | def set_sub(a, b): 23 | res = set(a.keys()) - set(b.keys()) 24 | res_list = [value for key, value in a.items() if key in res] 25 | return [Paper(x, {}) for x in res_list] 26 | 27 | 28 | def set_intersect(a, b): 29 | res = set(a.keys()) & set(b.keys()) 30 | res_list = [value for key, value in a.items() if key in res] 31 | return [Paper(x, {}) for x in res_list] 32 | 33 | 34 | def set_union(a, b): 35 | res = set(a.keys()) | set(b.keys()) 36 | full_dict = merge_two_dicts(a, b) 37 | res_list = [value for key, value in full_dict.items() if key in res] 38 | return [Paper(x, {}) for x in res_list] 39 | 40 | 41 | def main(conf): 42 | bib1 = readInputBib(conf.input1) 43 | bib2 = readInputBib(conf.input2) 44 | 45 | s1 = buildHashTable(bib1) 46 | s2 = buildHashTable(bib2) 47 | 48 | list_sub1 = set_sub(s1, s2) 49 | list_sub2 = set_sub(s2, s1) 50 | list_and = set_intersect(s1, s2) 51 | list_or = set_union(s1, s2) 52 | 53 | output_format = conf.format.lower() 54 | 55 | writeOutputBib(list_sub1, conf.output + '_a-b.' + output_format) 56 | writeOutputBib(list_sub2, conf.output + '_b-a.' + output_format) 57 | writeOutputBib(list_and, conf.output + '_a_and_b.' + output_format) 58 | writeOutputBib(list_or, conf.output + '_a_or_b.' + output_format) 59 | 60 | print('A - B:', len(list_sub1)) 61 | print('B - A:', len(list_sub2)) 62 | print('B & A:', len(list_and)) 63 | print('B | A:', len(list_or)) 64 | 65 | 66 | if __name__ == '__main__': 67 | parser = ArgumentParser( 68 | description='Compute diff between bib lists. Takes 2 lists of bib entries, an "old" and a "new" one. It outputs 3 lists: 1. papers only found in input1 2. papers only in input 2 3. 
papers in both') 69 | 70 | parser.add_argument('-i1', '--input1', type=str, 71 | help='Input BIB/RIS/CSV file name (set A)') 72 | parser.add_argument('-i2', '--input2', type=str, 73 | help='Input BIB/RIS/CSV file name (set B)') 74 | parser.add_argument('-o', '--output', type=str, 75 | help='Beginning of output filename') 76 | parser.add_argument('-f', '--format', type=str, default='bib', 77 | help='Output format: bib, ris, csv') 78 | 79 | conf = parser.parse_args() 80 | 81 | main(conf) 82 | -------------------------------------------------------------------------------- /search/google_scholar.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import scholarly 3 | from time import sleep 4 | from .base_search import Searcher, MAX_RESULTS, SearchResult 5 | import bibtexparser 6 | from tqdm import tqdm 7 | from random import random 8 | from db.bibtex import fixBibData 9 | from db.ref_utils import isPDFURL, getDOIfromURL, addUrlIfNew, addUrlIfNewWithType 10 | 11 | 12 | class GScholarSearcher(Searcher): 13 | """ 14 | Retrieves results and bibtex data from Google Scholar 15 | """ 16 | 17 | def __init__(self, paperstore): 18 | super().__init__(paperstore) 19 | self.min_delay_between_requests = 0.1 20 | 21 | def randomSleep(self): 22 | sleep(self.min_delay_between_requests + random() / 10) # random sleep so we don't get blocked 23 | 24 | def search(self, query, min_year=None, max_year=None, max_results=MAX_RESULTS): 25 | # TODO implement max year 26 | if min_year: 27 | scholarly.scholarly._PUBSEARCH = '/scholar?as_ylo=' + str(min_year) + '&q={0}' 28 | 29 | query = scholarly.search_pubs_query(query) 30 | results = [] 31 | index = 0 32 | for result in tqdm(query, desc="Getting results", total=max_results): 33 | bib = fixBibData(result.bib, index) 34 | 35 | extra_data = {} 36 | 37 | for field in ['citedby', 'url_scholarbib']: 38 | if hasattr(result, field): 39 | extra_data[field] = getattr(result, field) 40 | 41 | if hasattr(result, 'id_scholarcitedby'): 42 | extra_data['scholarid'] = result.id_scholarcitedby 43 | 44 | for field in ['url', 'eprint']: 45 | 46 | if hasattr(result, field): 47 | bib[field] = getattr(result, field) 48 | 49 | addUrlIfNewWithType(result, result.url, 'scholar') 50 | 51 | doi = getDOIfromURL(bib.get('url')) 52 | if not doi: 53 | doi = getDOIfromURL(bib.get('eprint', '')) 54 | 55 | if doi: 56 | bib['doi'] = doi 57 | 58 | result = SearchResult(index, bib, result.source, extra_data) 59 | results.append(result) 60 | index += 1 61 | 62 | if len(results) == max_results: 63 | break 64 | 65 | if len(results) % 10 == 0: 66 | self.randomSleep() 67 | return results 68 | 69 | def getScholarBibForResults(self, results): 70 | res = [] 71 | for result in tqdm(results, desc="Getting Scholar bib data"): 72 | if result.get("url_scholarbib"): 73 | bib = result["bib"] 74 | try: 75 | r = requests.get(result["url_scholarbib"]) 76 | # print(r) 77 | db = bibtexparser.loads(r.text) 78 | bib = db.entries[0] 79 | 80 | except Exception as e: 81 | print(e) 82 | 83 | bib['abstract'] = result["bib"]['abstract'] 84 | for key in ['abstract', 'eprint', 'url']: 85 | if key in result["bib"]: 86 | bib[key] = result["bib"][key] 87 | result["bib"] = bib 88 | 89 | self.randomSleep() 90 | -------------------------------------------------------------------------------- /base/file_download.py: -------------------------------------------------------------------------------- 1 | import os 2 | from multiprocessing.pool import ThreadPool 3 | 4 | import pandas as pd 
5 | import requests 6 | 7 | from db.ref_utils import parseBibAuthors, isPDFURL 8 | 9 | 10 | def fetch_url(entry): 11 | result = {'id': entry['id'], 12 | 'file_exists': False, 13 | 'return_code': None} 14 | 15 | if not os.path.exists(entry['filename']): 16 | print("Get %s - %s" % (entry['id'][:30], entry['url'])) 17 | try: 18 | r = requests.get(entry['url'], stream=True) 19 | result['return_code'] = r.status_code 20 | if r.status_code == 200: 21 | with open(entry['filename'], 'wb') as f: 22 | for chunk in r: 23 | f.write(chunk) 24 | except Exception as e: 25 | print(e.__class__.__name__, e) 26 | result['return_code'] = 'TooManyRedirects' 27 | 28 | else: 29 | result['file_exists'] = True 30 | 31 | return result 32 | 33 | 34 | def generateFilename(paper): 35 | res = '' 36 | authors = parseBibAuthors(paper.authors) 37 | if not authors: 38 | print(paper.authors) 39 | print() 40 | 41 | if authors and authors[0].get('family'): 42 | res += authors[0]['family'] + ' ' 43 | if paper.year: 44 | res += '(%s)' % paper.year 45 | 46 | if len(res) > 0: 47 | res += ' - ' 48 | res += paper.norm_title.title() 49 | return res 50 | 51 | 52 | def bulkDownload(papers, root_dir, report_path, do_not_download_just_list=False): 53 | root_dir = os.path.abspath(root_dir) 54 | 55 | if not os.path.exists(root_dir): 56 | os.makedirs(root_dir) 57 | 58 | download_tasks = [] 59 | 60 | for paper in papers: 61 | # if not paper.year: 62 | # print("missing year", paper) 63 | 64 | filename = os.path.join(root_dir, generateFilename(paper)) + '.pdf' 65 | paper.pdf_filename = filename 66 | 67 | task_record = {'id': paper.id, 68 | 'doi': paper.doi, 69 | 'filename': filename, 70 | 'abstract': paper.abstract 71 | } 72 | url = None 73 | url_source = None 74 | 75 | for url_rec in paper.extra_data.get('urls', []): 76 | if url_rec['type'] == 'pdf': 77 | url = url_rec['url'] 78 | url_source = url_rec['source'] 79 | break 80 | 81 | if not url: 82 | if paper.bib.get('eprint'): 83 | url = paper.bib['eprint'] 84 | url_source = 'search' 85 | elif paper.bib.get('url') and isPDFURL(paper.bib['url']): 86 | url = paper.bib['url'] 87 | url_source = 'search' 88 | 89 | if url: 90 | task_record['url'] = url 91 | task_record['url_source'] = url_source 92 | download_tasks.append(task_record) 93 | else: 94 | print(paper.extra_data) 95 | print(paper.bib) 96 | print() 97 | 98 | df = pd.DataFrame(download_tasks) 99 | df.to_csv('download_tasks.csv') 100 | 101 | if do_not_download_just_list: 102 | return 103 | 104 | results = ThreadPool(8).imap_unordered(fetch_url, download_tasks) 105 | 106 | df = pd.DataFrame(results) 107 | df.to_csv(report_path) 108 | -------------------------------------------------------------------------------- /db/bibtex.py: -------------------------------------------------------------------------------- 1 | import bibtexparser 2 | import re 3 | import random 4 | 5 | import requests 6 | 7 | from db.ref_utils import parseBibAuthors, normalizeTitle 8 | 9 | 10 | def fixBibData(bib, index): 11 | """ 12 | Add mandatory missing fields to bibtex data 13 | 14 | :param bib: 15 | :param index: 16 | :return: 17 | """ 18 | if "ENTRYTYPE" not in bib: 19 | bib["ENTRYTYPE"] = "ARTICLE" 20 | if "ID" not in bib: 21 | authors = parseBibAuthors(bib["author"]) 22 | if not authors: 23 | bib['ID'] = 'id' + str(random.randint(1000, 9000)) 24 | else: 25 | bib["ID"] = authors[0]["family"] 26 | 27 | bib['ID'] += str(bib.get("year", "YEAR")) + bib["title"].split()[0].lower() 28 | 29 | return bib 30 | 31 | 32 | def readBibtexString(bibstr): 33 | return 
bibtexparser.loads(bibstr).entries 34 | 35 | 36 | def readBibtexFile(filename): 37 | return bibtexparser.load(open(filename, 'r')).entries 38 | 39 | 40 | def writeBibtex(results: list, filename: str): 41 | """ 42 | Exports the list of results to a BibTeX file. 43 | 44 | :param results: a list of either SearchResult or Paper objects, with a .bib dict property 45 | :param filename: file to export the bibtex to 46 | """ 47 | db = bibtexparser.bibdatabase.BibDatabase() 48 | 49 | for index, result in enumerate(results): 50 | db.entries.append(fixBibData(result.bib, index)) 51 | 52 | with open(filename, 'w') as bibtex_file: 53 | bibtexparser.dump(db, bibtex_file) 54 | 55 | 56 | def getBibtextFromDOI(doi: str): 57 | assert doi 58 | headers = {'Accept': 'text/bibliography; style=bibtex'} 59 | url = 'http://doi.org/' + doi 60 | r = requests.get(url, headers=headers) 61 | text = r.content.decode('utf-8') 62 | bib = readBibtexString(text) 63 | return bib 64 | 65 | 66 | def generateUniqueID(paper): 67 | """ 68 | Returns a simple string id that is the mashup of the title and authors 69 | 70 | :param paper: 71 | :return: 72 | """ 73 | author_bit = '' 74 | if paper.extra_data.get('xref_author'): 75 | authors = paper.extra_data['xref_author'] 76 | else: 77 | try: 78 | authors = parseBibAuthors(paper.authors) 79 | except: 80 | print("Failed to parse authors string", paper.authors) 81 | authors = [{'given': '', 'family': ''}] 82 | 83 | for author in authors: 84 | if isinstance(author, str): 85 | author_bit += author 86 | else: 87 | if author.get('family'): 88 | author_bit += author.get('family', '_')[0] + author.get('given', '_')[0] 89 | 90 | title_bit = normalizeTitle(paper.title) 91 | title_bit = re.sub("\s+", "", title_bit) 92 | full_id = title_bit + "_" + author_bit 93 | full_id = full_id.lower() 94 | 95 | return full_id 96 | 97 | 98 | def test(): 99 | bibtex = """@ARTICLE{Cesar2013, 100 | author = {Jean César}, 101 | title = {An amazing title}, 102 | year = {2013}, 103 | volume = {12}, 104 | pages = {12--23}, 105 | journal = {Nice Journal}, 106 | abstract = {This is an abstract. 
This line should be long enough to test 107 | multilines...}, 108 | comments = {A comment}, 109 | keywords = {keyword1, keyword2} 110 | } 111 | """ 112 | 113 | with open('bibtex.bib', 'w') as bibfile: 114 | bibfile.write(bibtex) 115 | 116 | with open("bibtex.bib") as bibtex_file: 117 | bib_database = bibtexparser.load(bibtex_file) 118 | 119 | print(bib_database.entries) 120 | 121 | 122 | if __name__ == '__main__': 123 | test() 124 | -------------------------------------------------------------------------------- /reasons_for_exclusion.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import pandas as pd 4 | 5 | from itertools import chain 6 | from collections import Counter 7 | 8 | from db.data import PaperStore 9 | from db.rayyan import loadRayyan, computeReviewerOverlap 10 | from db.rayyan import selectPapersToReview 11 | 12 | 13 | EXCLUSION_PRECEDENCE = [ 14 | 'foreign language', 15 | 'is review', 16 | 'uses images', 17 | 'not radiology', 18 | 'not nlp', 19 | 'wrong publication type', 20 | 'not peer reviewed', 21 | 'cannot find fulltext', 22 | 'conference', 23 | 'too short' 24 | ] 25 | 26 | 27 | def fix_reasons(r): 28 | if r == 'not radiology report': 29 | return 'not radiology' 30 | if r == 'not radiology reports': 31 | return 'not radiology' 32 | if r == 'review': 33 | return 'is review' 34 | if r == 'with_images': 35 | return 'uses images' 36 | if '_' in r: 37 | return r.replace('_', ' ') 38 | return r.strip() 39 | 40 | 41 | def get_main_reason(reasons): 42 | reasons = set(map(fix_reasons, reasons)) 43 | for r in EXCLUSION_PRECEDENCE: 44 | if r in reasons: 45 | return r 46 | print(reasons) 47 | return None 48 | 49 | 50 | if __name__ == "__main__": 51 | 52 | parser = argparse.ArgumentParser(description='Gather metadata such as' 53 | 'reason for exclusion + bib information') 54 | 55 | args = parser.parse_args() 56 | 57 | paper_store = PaperStore() 58 | 59 | columns = ['title', 'abstract', 'authors', 'url'] 60 | 61 | # 220 articles - original query 62 | querydf = loadRayyan(os.path.join('reasons_for_exclusion', 'rayyan-old-query.zip')) 63 | # Include all 64 | querydf = selectPapersToReview(querydf, 0) 65 | querydf['rayyan_source'] = 'old_query' 66 | 67 | # 397 articles, follow up snowballing and new query 68 | snowdf = loadRayyan(os.path.join('reasons_for_exclusion', 'rayyan-snowball.zip')) 69 | # Include all 70 | snowdf = selectPapersToReview(snowdf, 0) 71 | snowdf['rayyan_source'] = 'snowball' 72 | 73 | # sysreview articles 74 | sysreviewdf = pd.read_excel(os.path.join('reasons_for_exclusion', 'sysreview-15-09-2020.xlsx')) 75 | sysreviewdf['rayyan_source'] = 'combined' 76 | # Only keep columns we care about 77 | sysreviewdf = sysreviewdf[columns] 78 | # The last paper was added by Hang 79 | sysreviewdf = sysreviewdf.head(274) 80 | 81 | # Join on title - unsure if there is a better join to do 82 | 83 | joined = pd.concat([querydf, snowdf], ignore_index=True, sort=True) 84 | joined['lower_title'] = joined['title'].str.strip().str.lower() 85 | # Keep the snowballing entry if duplicate exists 86 | joined = joined.drop_duplicates(subset='lower_title', keep='last') 87 | 88 | joined = pd.concat([sysreviewdf, joined], ignore_index=True, sort=True) 89 | joined['lower_title'] = joined['title'].str.strip().str.lower() 90 | 91 | # Drop all duplicates (hence only keep entries that didn't make 92 | # it past Rayyan filtering) 93 | joined = joined.drop_duplicates(subset='lower_title', keep=False) 94 | 95 | joined = 
joined.reset_index(drop=True) 96 | del joined['lower_title'] 97 | 98 | print(joined) 99 | 100 | possible_exclusion_reasons = set(map(fix_reasons, chain(*joined['exclusion_reasons'].tolist()))) 101 | print('Possible exclusion reasons') 102 | print(possible_exclusion_reasons) 103 | 104 | exclusion_reasons = joined['exclusion_reasons'] 105 | 106 | main_reasons = [get_main_reason(r) for r in exclusion_reasons] 107 | counts = Counter(main_reasons) 108 | print() 109 | for k, v in counts.most_common(): 110 | print('%s: %d' % (k.ljust(25), v)) 111 | print() 112 | print('Excluded %d articles' % sum(counts.values())) 113 | -------------------------------------------------------------------------------- /base/pdf_extract.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tika 3 | import enchant 4 | 5 | d1 = enchant.Dict("en_US") 6 | d2 = enchant.Dict("en_UK") 7 | 8 | import re 9 | 10 | if not os.environ.get('TIKA_PATH'): 11 | os.environ['TIKA_PATH'] = '~/' 12 | 13 | tika.initVM() 14 | from tika import parser 15 | 16 | 17 | def dehyphenate(text): 18 | """ 19 | Removes hyphens from text intelligently, checking plausible spelling 20 | 21 | :param text: hyphenated text 22 | :return: text: de-hyphenated text 23 | """ 24 | 25 | def rep_func(match): 26 | full_word = match.group(1) + match.group(2) 27 | if d1.check(full_word) or d2.check(full_word): 28 | return full_word 29 | else: 30 | return match.group(1) + '-' + match.group(2) 31 | 32 | text = re.sub('(\w+)-\n(\w+)', rep_func, text) 33 | return text 34 | 35 | 36 | def cleanUpTikaText(text): 37 | text = re.sub('\n+', '\n', text) 38 | return text 39 | 40 | 41 | def findHeaders(strings, text, default): 42 | str_start = -1 43 | 44 | for str_string in strings: 45 | str_start = text.find(str_string) 46 | if str_start != -1: 47 | break 48 | 49 | if str_start == -1: 50 | str_start = default 51 | 52 | return str_start 53 | 54 | 55 | # def getAbstract(text): 56 | # abs_start = findHeaders(['Abstract', 'ABSTRACT'], text, 0) 57 | # abs_end = findHeaders(["Keywords:", "Keywords :", "KEYWORDS:", 'Related Work', 'Previous Work'], text[abs_start:], 58 | # len(text)) 59 | # 60 | # abstract = text[abs_start:abs_end] 61 | # return abstract 62 | 63 | regex_abstract = re.compile('(^Abstract[\:\—\-\s\n]*.+?)^(\d*\.?\s*Introduction|Keywords\s*\:?|Previous work)', 64 | re.MULTILINE | re.IGNORECASE | re.DOTALL) 65 | 66 | regex_summary = re.compile( 67 | '(^(Abstract|Summary)\s*\:?\n.+?)^(\d*\.?\s*Introduction|Keywords\s*\:?|Previous work|Table of contents)', 68 | re.MULTILINE | re.IGNORECASE | re.DOTALL) 69 | 70 | regex_thesis = re.compile('I.+?declare that.+?(dissertation|thesis)', re.MULTILINE | re.DOTALL) 71 | 72 | 73 | def getAbstractFromPDF(filename): 74 | parsed = readPDF(filename) 75 | 76 | if parsed.get('error'): 77 | print(parsed['error']) 78 | return None 79 | 80 | if parsed.get('status', 200) == 422: 81 | print('Tika:: Unprocessable entity', filename) 82 | return None 83 | 84 | text = parsed['content'] 85 | if not text: 86 | print('Tika:: No text in file', filename) 87 | return None 88 | 89 | text = cleanUpTikaText(text) 90 | 91 | if regex_thesis.search(text): 92 | match = regex_summary.search(text) 93 | else: 94 | match = regex_abstract.search(text) 95 | 96 | if match: 97 | abstract = match.group(1) 98 | else: 99 | print('[[[[[[Could not find the abstract]]]]]]') 100 | print(text[:1000]) 101 | print('\n\n') 102 | return None 103 | 104 | abstract = dehyphenate(abstract) 105 | abstract = 
cleanUpTikaText(abstract) 106 | 107 | return abstract 108 | 109 | 110 | def readPDF(filename, to_xml=False): 111 | try: 112 | parsed = parser.from_file(filename, xmlContent=to_xml) 113 | except UnicodeEncodeError as e: 114 | print(e.__class__.__name__, e) 115 | return {'error': e.__class__.__name__ + ': ' + e.__str__()} 116 | return parsed 117 | 118 | 119 | def getStructuredArticle(xml): 120 | pass 121 | 122 | 123 | def test(): 124 | parsed = readPDF( 125 | '/Users/masterman/Downloads/Towards dataset creation and establishing baselines for sentence-level neural clinical paraphrase generation and simplification.pdf', 126 | to_xml=True) 127 | print(parsed['content']) 128 | 129 | 130 | def test2(): 131 | parsed = readPDF( 132 | '/Users/masterman/Downloads/Towards dataset creation and establishing baselines for sentence-level neural clinical paraphrase generation and simplification.pdf', 133 | to_xml=False) 134 | full_text = cleanUpTikaText(parsed['content']) 135 | abstract = getAbstractFromPDF(full_text) 136 | clean_abstract = dehyphenate(abstract) 137 | print(clean_abstract) 138 | print() 139 | 140 | 141 | if __name__ == '__main__': 142 | test() 143 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ReviewBuilder 2 | A collection of tools for automating parts of a [systematic review](https://consumers.cochrane.org/what-systematic-review) of scientific literature. 3 | 4 | Currently supports one use case: creating a bibtex file with the results of a Google Scholar search and augmenting the metadata for each result by retrieving its abstract and finding [Open Access](https://en.wikipedia.org/wiki/Open_access) versions of the paper on the web, including preprints. 5 | 6 | - All results are cached locally in a SQLite database, aiming to make iterating over queries for obtaining papers for a review less painful. 7 | - All data ingestion is _nice_ :), locally enforcing rate limiting, both from the known requirements of each service, and by parsing the `X-Rate-Limit-Limit` and `X-Rate-Limit-Interval` where provided in the response. 8 | - Implemented: [Google Scholar](https://scholar.google.com), [Crossref](https://www.crossref.org/services/metadata-delivery/rest-api/), [SemanticScholar (metadata)](https://api.semanticscholar.org/), [PubMed](https://www.ncbi.nlm.nih.gov/home/develop/api/), [arXiv](https://arxiv.org/help/api), [Unpaywall](https://unpaywall.org/products/api). 9 | - Not yet implemented: [Microsoft Academic](https://academic.microsoft.com), Semantic Scholar (search), [Web of Science](https://developer.clarivate.com/apis/wos) 10 | - Coming very soon: 11 | - locally filtering results (i.e. "selecting articles for inclusion") based on keywords and the detected language the paper is written in 12 | - automatic downloading of PDFs 13 | 14 | ## Installation 15 | 16 | Tested on Python 3.7 only. May work with earlier versions of Python 3, but not 2. 17 | 18 | > pip install -r requirements.txt 19 | 20 | ## Example usage 21 | 22 | > python search_to_file.py -q "OR \"natural language\" OR \"radiology reports\" OR lstm OR rnn OR bert OR elmo OR word2vec" -m 100 -f test.bib -ys 2015 23 | 24 | This will send the supplied query to Google Scholar, and set the minimum year (--year-start) to 2015, retrieve a maximum of 100 results and save them in the file `test.bib`. 
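The same search can also be run from Python; a minimal sketch mirroring what `search_to_file.py` does internally (the query string below is just a placeholder):

```python
from db.data import PaperStore, Paper
from db.bibtex import writeBibtex
from search import GScholarSearcher

searcher = GScholarSearcher(PaperStore())  # PaperStore() caches results locally
results = searcher.search('"radiology reports" "natural language"', min_year=2015, max_results=100)
writeBibtex([Paper(res.bib, res.extra_data) for res in results], 'test.bib')
```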
25 | 26 | Alternatively, we can save the query in a text file and pass that as a parameter: 27 | 28 | > python search_to_file.py -qf query1.txt -m 100 -f test.bib -ys 2015 29 | 30 | Bibtex does not store everything we are interested in, so by default, extra data from Scholar such as the link to the "related articles", number of citations and other tidbits will be directly saved to the local SQLite cache (see below). 31 | 32 | Google Scholar offers perhaps the best coverage (recall) over all fields of science and does a great job at surfacing relevant articles. What it does not do, however, is make it easy to scrape, or connect these results to anything else useful. It does not provide any useful identifier for the results ([DOI](http://www.doi.org/), [PMID](https://www.ncbi.nlm.nih.gov/pmc/pmctopmid/), etc) or the abstract of the paper, and a lot of information is mangled in the results, including authors' names. To get high quality data, we need to use other services. 33 | 34 | Once we have the list of results, we can collect extra data, such as the abstract of the paper and locations on the web where we may find it in open access, whether in HTML or PDF. 35 | 36 | > python gather_metadata.py -i test.bib -o test_plus.bib --max 20 37 | 38 | This will process a maximum of 20 entries from the `test.bib` file, and output an "enriched" version to `test_plus.bib`. For each entry it will try to: 39 | 1. match it with an entry in the local cache. If it can't be found, go to step 2. 40 | 1. attempt to match the paper with its DOI via the [Crossref](http://www.crossref.org/) API. 41 | 1. once we have a DOI, check [SemanticScholar](http://www.semanticscholar.org/) for metadata and abstract for the paper 42 | 1. if we don't have a DOI or abstract, search [PubMed](http://www.ncbi.nlm.nih.gov/pubmed/) for its PubMed ID (PMID) and retrieve the abstract from there, if available 43 | 1. search [arXiv](http://arxiv.org) for a preprint of the paper 44 | 1. search [Unpaywall](http://unpaywall.org) for available open access versions of the paper if we are missing a PDF link from the above 45 | 46 | Many of these steps require approximate matching, both for the local cache and the results from the remote APIs. Often a preprint version of a paper will have a slightly different title or will be missing an author or two. This repo implements several heuristics for dealing with this. 47 | 48 | A SQLite database cache is automatically created in `papers.sqlite` in the /db directory. 
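Downstream scripts reuse this cache through `base.general_utils.loadEntriesAndSetUp`; a minimal sketch of loading an enriched bib file together with the cache (the file name is the one produced in the example above):

```python
from base.general_utils import loadEntriesAndSetUp

# Matches the entries in the bib file against the papers.sqlite cache
paperstore, papers_to_add, papers_existing, all_papers = loadEntriesAndSetUp('test_plus.bib', use_cache=True)
print(len(all_papers), 'papers;', len(papers_existing), 'already in the cache')
```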
49 | 50 | 51 | -------------------------------------------------------------------------------- /snowball_citations.py: -------------------------------------------------------------------------------- 1 | from base.general_utils import loadEntriesAndSetUp, writeOutputBib 2 | from argparse import ArgumentParser 3 | from filter_results import filterPapers, printReport, filterOnePaper 4 | from search.metadata_harvest import semanticscholarmetadata, enrichAndUpdateMetadata 5 | import pandas as pd 6 | 7 | 8 | def getCitingPapers(paper): 9 | try: 10 | paper, citing_papers = semanticscholarmetadata.getMetadata(paper, get_citing_papers=True) 11 | except Exception as e: 12 | print(e.__class__.__name__, e) 13 | return [] 14 | 15 | return citing_papers 16 | 17 | 18 | def deDupePaperList(): 19 | pass 20 | 21 | 22 | def snowballCitations(paperstore, all_papers): 23 | newfound_paper_list = [] 24 | report = [] 25 | 26 | all_titles_ever_seen = {} 27 | search_nodes = all_papers 28 | 29 | while len(search_nodes) > 0: 30 | paper = search_nodes.pop(0) 31 | new_papers = getCitingPapers(paper) 32 | for new_paper in new_papers: 33 | if new_paper.title in all_titles_ever_seen: 34 | print('[Skipping] already seen paper', new_paper.title) 35 | all_titles_ever_seen[new_paper.title] += 1 36 | continue 37 | 38 | semanticscholarmetadata.getMetadata(new_paper) 39 | new_paper.extra_data['done_semanticscholar'] = True 40 | paperstore.updatePapers([new_paper]) 41 | 42 | all_titles_ever_seen[new_paper.title] = 1 43 | # year = new_paper.bib.get('year', 0) 44 | # if year and int(year) >= 2015: 45 | # newfound_paper_list.append(Paper(paper.bib, paper.extra_data)) 46 | # else: 47 | # print(new_paper) 48 | if not new_paper.has_abstract: 49 | record = { 50 | 'title': paper.title, 51 | 'year': paper.year, 52 | 'authors': paper.authors, 53 | 'venue': paper.venue, 54 | 'abstract': paper.abstract, 55 | 'excluded': False, 56 | 'exclude_reason': None 57 | } 58 | paper_add = new_paper 59 | else: 60 | paper_add, record = filterOnePaper(new_paper, exclude_rules={'no_pdf': False, 61 | 'year': False, 62 | 'is_review': False}) 63 | report.append(record) 64 | 65 | if paper_add: 66 | newfound_paper_list.append(paper_add) 67 | print('Adding new seed paper', paper_add.bib['title']) 68 | search_nodes.append(new_paper) 69 | else: 70 | print('[Excluded]:', record['exclude_reason'], new_paper.bib['title']) 71 | 72 | df = pd.DataFrame(report, columns=['id', 'year', 'title', 'excluded', 'exclude_reason', 'language', 'abstract']) 73 | 74 | return newfound_paper_list, df 75 | 76 | 77 | def main(conf): 78 | paperstore, papers_to_add, papers_existing, all_papers = loadEntriesAndSetUp(conf.input, conf.cache) 79 | 80 | # successful, unsuccessful = enrichAndUpdateMetadata(papers_to_add, paperstore, conf.email) 81 | 82 | snowballed_papers, df = snowballCitations(paperstore, all_papers) 83 | print('Number of snowballed papers:', len(snowballed_papers)) 84 | printReport(df) 85 | 86 | successful, unsuccessful = enrichAndUpdateMetadata(snowballed_papers, paperstore, conf.email) 87 | 88 | # included, df = filterPapers(snowballed_papers) 89 | # printReport(df) 90 | 91 | writeOutputBib(snowballed_papers, conf.output) 92 | 93 | 94 | if __name__ == '__main__': 95 | parser = ArgumentParser(description='Filter results ') 96 | 97 | parser.add_argument('-i', '--input', type=str, 98 | help='Input bib file name with seed papers') 99 | parser.add_argument('-o', '--output', type=str, 100 | help='Output bib file name with snowballed papers') 101 | parser.add_argument('-r', 
'--report-path', type=str, default='filter_report.csv', 102 | help='Path to output report CSV') 103 | parser.add_argument('-c', '--cache', type=bool, default=True, 104 | help='Use local cache for results') 105 | parser.add_argument('-em', '--email', type=str, 106 | help='Email to serve as identity to API endpoints') 107 | 108 | conf = parser.parse_args() 109 | 110 | main(conf) 111 | -------------------------------------------------------------------------------- /db/rayyan.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import re 3 | from zipfile import ZipFile 4 | from itertools import combinations 5 | import numpy as np 6 | from io import BytesIO 7 | 8 | 9 | DROP_FIELDS = ['key', 10 | 'issn', 11 | 'volume', 12 | 'pages', 13 | 'issue', 14 | 'language', 15 | 'location', 16 | 'notes', 17 | 'journal', 18 | 'day', 19 | 'month', 20 | 'maybe_count'] 21 | 22 | 23 | def parseInclusion(text): 24 | reviewers = {} 25 | exclusion_reasons = [] 26 | labels = [] 27 | 28 | for match in re.findall('\"([\w\s\.]+?)\"=>\"([\w\s]+?)\"', text): 29 | reviewers[match[0]] = match[1] 30 | 31 | if match[1].lower() == 'excluded': 32 | exclusion_reasons = [] 33 | match = re.search('RAYYAN-EXCLUSION-REASONS: ([\w\s,]+)', text) 34 | if match: 35 | exclusion_reasons.extend(match.group(1).split(',')) 36 | 37 | match = re.search('RAYYAN-LABELS: ([\w\s,]+)', text) 38 | if match: 39 | labels.extend(match.group(1).split(',')) 40 | 41 | return reviewers, exclusion_reasons, labels 42 | 43 | 44 | def loadRayyan(filename): 45 | with ZipFile(filename, 'r') as zip: 46 | data = zip.read('articles.csv') 47 | data = BytesIO(data) 48 | df = pd.read_csv(data) 49 | 50 | column_labels = [] 51 | column_exclusion_reasons = [] 52 | column_reviewers = [] 53 | all_unique_reviewers = set() 54 | 55 | for index, row in df.iterrows(): 56 | reviewers, exclusion_reasons, labels = parseInclusion(row['notes']) 57 | column_labels.append(labels) 58 | column_exclusion_reasons.append(exclusion_reasons) 59 | column_reviewers.append(reviewers) 60 | 61 | all_unique_reviewers = all_unique_reviewers | set(reviewers.keys()) 62 | 63 | reviewer_titles = [] 64 | 65 | for reviewer in all_unique_reviewers: 66 | reviewer_column_title = 'reviewer_' + reviewer 67 | reviewer_titles.append('reviewer_' + reviewer) 68 | reviewer_column_data = [r.get(reviewer) for r in column_reviewers] 69 | df.insert(len(df.columns), reviewer_column_title, reviewer_column_data) 70 | 71 | df.insert(len(df.columns), 'exclusion_reasons', column_exclusion_reasons) 72 | df.insert(len(df.columns), 'labels', column_labels) 73 | 74 | for index, row in df.iterrows(): 75 | match = re.search(r'PY - (\d+)\/+?', row['authors']) 76 | if match: 77 | df.at[index, 'year'] = match.group(1) 78 | df.at[index, 'authors'] = df.iloc[index]['authors'][:match.start()] 79 | 80 | included_counts = [] 81 | excluded_counts = [] 82 | maybe_counts = [] 83 | 84 | for index, row in df.iterrows(): 85 | included_count = 0 86 | excluded_count = 0 87 | maybe_count = 0 88 | for reviewer in reviewer_titles: 89 | if row.get(reviewer) == 'Included': 90 | included_count += 1 91 | elif row.get(reviewer) == 'Excluded': 92 | excluded_count += 1 93 | elif row.get(reviewer) == 'Maybe': 94 | maybe_count += 1 95 | included_counts.append(included_count) 96 | excluded_counts.append(excluded_count) 97 | maybe_counts.append(maybe_count) 98 | 99 | df.insert(len(df.columns), 'included_count', included_counts) 100 | df.insert(len(df.columns), 'excluded_count', excluded_counts) 
101 | df.insert(len(df.columns), 'maybe_count', maybe_counts) 102 | 103 | return df 104 | 105 | 106 | def computeOverlap(df): 107 | reviewer_columns = [c for c in df.columns if c.startswith('reviewer_')] 108 | df = df[reviewer_columns] 109 | 110 | a = df.values 111 | d = {(i, j): np.mean(a[:, i] == a[:, j]) for i, j in combinations(range(a.shape[1]), 2)} 112 | 113 | res, c, vals = np.zeros((a.shape[1], a.shape[1])), \ 114 | list(map(list, zip(*d.keys()))), list(d.values()) 115 | 116 | res[c[0], c[1]] = vals 117 | 118 | return pd.DataFrame(res, columns=df.columns, index=df.columns) 119 | 120 | 121 | # def compute_agreement(vals, vala, valb): 122 | # # Use to compute TP/TN/FP/FN 123 | # d = {(i, j): np.sum((vals[:, i] == vala) & (vals[:, j] == valb)) 124 | # for i, j in combinations(range(vals.shape[1]), 2)} 125 | # df, c, vals = np.zeros((vals.shape[1], vals.shape[1])), \ 126 | # list(map(list, zip(*d.keys()))), list(d.values()) 127 | # df[c[0], c[1]] = vals 128 | # return df 129 | 130 | 131 | # def computeStats(df): 132 | # reviewer_columns = [c for c in df.columns if c.startswith('reviewer_')] 133 | # df = df[reviewer_columns] 134 | # 135 | # a = df.values 136 | # TP = compute_agreement(a, 'Included', 'Included') 137 | # TN = compute_agreement(a, 'Excluded', 'Excluded') 138 | # FP = compute_agreement(a, 'Included', 'Excluded') 139 | # FN = compute_agreement(a, 'Excluded', 'Included') 140 | # 141 | # print('TP', TP) 142 | # print('TN', TN) 143 | # print('FP', FP) 144 | # print('FN', FN) 145 | # 146 | # print('Total', TP+TN+FP+FN) 147 | 148 | 149 | def computeFleiss(df): 150 | reviewer_columns = [c for c in df.columns if c.startswith('reviewer_')] 151 | df = df[reviewer_columns] 152 | 153 | a = df.values 154 | classes = set(a.ravel()) 155 | 156 | # rows are instances/examples 157 | # columns are classes 158 | # values are number of annotators assigned instance to class 159 | # so sum of each rows = num annotators 160 | P = np.hstack([np.sum(a == c, axis=1, keepdims=True) 161 | for c in classes]) 162 | # Below is wikipedia example - expected kappa: 0.210 163 | # P = np.array([[0, 0, 0, 0, 14], 164 | # [0, 2, 6, 4, 2], 165 | # [0, 0, 3, 5, 6], 166 | # [0, 3, 9, 2, 0], 167 | # [2, 2, 8, 1, 1], 168 | # [7, 7, 0, 0, 0], 169 | # [3, 2, 6, 3, 0], 170 | # [2, 5, 3, 2, 2], 171 | # [6, 5, 2, 1, 0], 172 | # [0, 2, 2, 3, 7]]) 173 | 174 | # N: number examples, k = number classes 175 | N, k = P.shape 176 | # n: number of annotators 177 | n = P.sum(axis=1)[0] 178 | assert(np.all(P.sum(axis=1) == n)) 179 | # P_j.. 
180 | pee_jays = np.sum(P, axis=0) / (N * n) 181 | assert np.isclose(pee_jays.sum(), 1.), 'P_j calculation is wrong' 182 | 183 | # P_is 184 | pee_eye = np.sum(P * (P - 1), axis=1) / (n * (n - 1)) 185 | 186 | pee_tilde = pee_eye.mean() 187 | pee_ee = np.sum(pee_jays ** 2) 188 | 189 | # Fleiss' kappa 190 | fleiss = (pee_tilde - pee_ee) / (1 - pee_ee) 191 | return fleiss 192 | 193 | 194 | def computeOverlap3(df): 195 | Yourdf = pd.DataFrame(columns=df.columns, index=df.columns) 196 | Yourdf = Yourdf.stack(dropna=False).to_frame().apply(lambda x: (df[x.name[0]] == df[x.name[1]]).mean(), 197 | axis=1).unstack() 198 | Yourdf = Yourdf.where(np.triu(np.ones(Yourdf.shape), 1).astype(np.bool)) 199 | return Yourdf 200 | 201 | 202 | # def computeOverlap(df): 203 | # pd.crosstab(df.columns, df.columns, ) 204 | 205 | def filterDFForInclusion(df, screen='Included'): 206 | if screen == 'Included': 207 | return df[df['included_count'] > 0] 208 | elif screen == 'Excluded': 209 | return df[df['excluded_count'] > 0] 210 | elif screen == 'Maybe': 211 | return df[df['maybe_count'] > 0] 212 | 213 | 214 | def computeReviewerOverlap(df): 215 | # df.at[df['reviewer_agrivas'] == 'Maybe', 'reviewer_agrivas'] = 'Included' 216 | # df.at[df['reviewer_Daniel'] == 'Maybe', 'reviewer_Daniel'] = 'Included' 217 | 218 | print('Total overlap') 219 | print(computeOverlap(df)) 220 | print("Fleiss' kappa: %.2f" % computeFleiss(df)) 221 | 222 | print('\nIncluded overlap') 223 | print(computeOverlap(filterDFForInclusion(df, 'Included'))) 224 | 225 | print('\nExcluded overlap') 226 | print(computeOverlap(filterDFForInclusion(df, 'Excluded'))) 227 | 228 | 229 | def selectPapersToReview(df, min_agreement=1): 230 | res = df[df['included_count'] >= min_agreement] 231 | res.drop(DROP_FIELDS, axis=1, inplace=True) 232 | return res 233 | 234 | 235 | def selectPapersToFilter(df, include_count, exclude_count): 236 | res = df[(df['included_count'] == include_count) & (df['excluded_count'] == exclude_count)] 237 | res.drop(DROP_FIELDS, axis=1, inplace=True) 238 | return res 239 | -------------------------------------------------------------------------------- /filter_results.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | from base.general_utils import loadEntriesAndSetUp, writeOutputBib 3 | import pandas as pd 4 | from langdetect import detect 5 | from langdetect import DetectorFactory 6 | 7 | DetectorFactory.seed = 0 8 | import re 9 | 10 | 11 | def getPaperText(paper): 12 | res = paper.title 13 | abs = paper.bib.get('abstract') 14 | if abs: 15 | abstract = paper.bib.get('abstract', '') 16 | abstract = re.sub(r'[\n\s]+', ' ', abstract) 17 | abstract = re.sub(r'\s+', ' ', abstract) 18 | 19 | res += " " + abstract 20 | return res 21 | 22 | 23 | def isPatent(paper): 24 | url = paper.bib.get('url', paper.bib.get('eprint')) 25 | return 'patent' in paper.bib.get('journal', '') or (url and 'patent' in url.lower()) 26 | 27 | 28 | def oneKeywordInText(keywords, text): 29 | text_lower = text.lower() 30 | for kw in keywords: 31 | kw = kw.lower() 32 | if kw in text_lower: 33 | return True 34 | 35 | return False 36 | 37 | 38 | def allKeywordsInText(keywords, text): 39 | text_lower = text.lower() 40 | 41 | in_text = 0 42 | 43 | for kw in keywords: 44 | kw = kw.lower() 45 | if kw in text_lower: 46 | in_text += 1 47 | 48 | return in_text == len(keywords) 49 | 50 | 51 | def oneKeywordNotInText(keywords, text): 52 | text_lower = text.lower() 53 | for kw in keywords: 54 | kw = 
kw.lower() 55 | if kw not in text_lower: 56 | return True 57 | 58 | return False 59 | 60 | 61 | def allKeywordsNotInText(keywords, text): 62 | text_lower = text.lower() 63 | not_in_text = 0 64 | 65 | for kw in keywords: 66 | kw = kw.lower() 67 | if kw not in text_lower: 68 | not_in_text += 1 69 | 70 | return not_in_text == len(keywords) 71 | 72 | 73 | def printReport(df): 74 | print(df) 75 | print('Included papers', len(df[df['excluded'] == False])) 76 | print('Excluded papers', len(df[df['excluded'] == True])) 77 | print('Excluded because of') 78 | print(' language', len(df[df['exclude_reason'] == 'language'])) 79 | print(' is a patent', len(df[df['exclude_reason'] == 'is_patent'])) 80 | print(' year out of range', len(df[df['exclude_reason'] == 'year'])) 81 | print(' is a review', len(df[df['exclude_reason'] == 'is_review'])) 82 | print(' using images', len(df[df['exclude_reason'] == 'uses_images'])) 83 | # print(' full text not available', len(df[df['exclude_reason'] == 'no_pdf'])) 84 | print(' not radiology', len(df[df['exclude_reason'] == 'not_radiology'])) 85 | print(' not NLP', len(df[df['exclude_reason'] == 'not_nlp'])) 86 | 87 | 88 | def collectStats(papers): 89 | results = [] 90 | for paper in papers: 91 | res = { 92 | # 'id': paper.id, 93 | 'has_year': bool(paper.year), 94 | 'has_title': bool(paper.title), 95 | # 'authors': paper.authors, 96 | 'has_doi': bool(paper.doi), 97 | 'has_arxivid': bool(paper.arxivid), 98 | 'has_pmid': bool(paper.pmid), 99 | 'has_ssid': bool(paper.extra_data.get('ss_id')), 100 | 'has_valid_id': paper.has_valid_id, 101 | 'has_abstract': paper.has_abstract, 102 | 'has_full_abstract': paper.has_full_abstract, 103 | 'has_pdf': paper.has_pdf_link, 104 | 'not_abstract_but_pdf': not paper.has_abstract and paper.has_pdf 105 | } 106 | results.append(res) 107 | 108 | df = pd.DataFrame(results) 109 | for field in df.columns: 110 | print(field, len(df[df[field] == True])) 111 | print() 112 | 113 | 114 | def filterOnePaper(paper, exclude_rules={}): 115 | record = { 116 | 'title': paper.title, 117 | # 'year': int(paper.year) if paper.year else None, 118 | 'year': paper.year, 119 | 'authors': paper.authors, 120 | 'venue': paper.venue, 121 | 'abstract': paper.abstract, 122 | 'excluded': False, 123 | 'exclude_reason': None 124 | } 125 | accept = True 126 | 127 | text = getPaperText(paper) 128 | language = paper.extra_data.get('language') 129 | 130 | if not language: 131 | if len(text) < 62 or text.isupper(): 132 | language = 'en' 133 | else: 134 | language = detect(text) 135 | 136 | # if language != 'en': 137 | # print(text) 138 | # print("Lang:", language) 139 | # print() 140 | 141 | language = language.lower() 142 | record['language'] = language 143 | 144 | lower_text = text.lower() 145 | 146 | if paper.title == "Identifying peripheral arterial disease cases using natural language processing of clinical notes": 147 | print() 148 | 149 | try: 150 | if paper.bib.get('year') is None or paper.bib.get('year') == '': 151 | paper.bib['year'] = 0 152 | else: 153 | paper.bib['year'] = int(paper.bib['year']) 154 | except: 155 | paper.bib['year'] = 0 156 | 157 | if exclude_rules.get('language', True) and not language.startswith('en'): 158 | record['excluded'] = True 159 | record['exclude_reason'] = 'language' 160 | accept = False 161 | elif exclude_rules.get('is_patent', True) and isPatent(paper): 162 | record['excluded'] = True 163 | record['exclude_reason'] = 'is_patent' 164 | accept = False 165 | elif exclude_rules.get('year', True) and int(paper.bib.get('year', 0)) < 2015: 
166 | record['excluded'] = True 167 | record['exclude_reason'] = 'year' 168 | accept = False 169 | elif exclude_rules.get('is_review', True) and oneKeywordInText(['review', 'overview'], 170 | paper.title.lower()) or oneKeywordInText( 171 | ['this review', 'this chapter'], lower_text): 172 | record['excluded'] = True 173 | record['exclude_reason'] = 'is_review' 174 | accept = False 175 | elif exclude_rules.get('uses_images', True) and oneKeywordInText( 176 | ['images', 'visual', 'chest x-ray', 'segmentation'], lower_text): 177 | record['excluded'] = True 178 | record['exclude_reason'] = 'uses_images' 179 | accept = False 180 | # elif exclude_rules.get('no_pdf', True) and not paper.has_pdf: 181 | # record['excluded'] = True 182 | # record['exclude_reason'] = 'no_pdf' 183 | # accept = False 184 | elif exclude_rules.get('not_radiology', True) and allKeywordsNotInText( 185 | ['radiolo', 'imaging report', ' CT', ',CT', ':CT', 'MRI'], lower_text): 186 | record['excluded'] = True 187 | record['exclude_reason'] = 'not_radiology' 188 | accept = False 189 | elif exclude_rules.get('not_nlp', True) and allKeywordsNotInText( 190 | ['text', 'langu', 'lingu', 'nlp', 'synta', 'embedding', 'information extraction', 191 | 'text mining', 'words', 192 | 'deep learning', 'deep neural', 193 | 'machine learning', 'artificial intelligence', 'document classification', ], 194 | lower_text): 195 | record['excluded'] = True 196 | record['exclude_reason'] = 'not_nlp' 197 | accept = False 198 | 199 | if accept: 200 | return paper, record 201 | else: 202 | return None, record 203 | 204 | 205 | def filterPapers(papers): 206 | included = [] 207 | report = [] 208 | 209 | for paper in papers: 210 | paper, record = filterOnePaper(paper) 211 | if paper: 212 | included.append(paper) 213 | report.append(record) 214 | 215 | df = pd.DataFrame(report, columns=['id', 'year', 'title', 'excluded', 'exclude_reason', 'language', 'abstract']) 216 | return included, df 217 | 218 | 219 | def main(conf): 220 | paperstore, papers_to_add, papers_existing, all_papers = loadEntriesAndSetUp(conf.input, conf.cache) 221 | 222 | collectStats(all_papers) 223 | included, df = filterPapers(all_papers) 224 | 225 | printReport(df) 226 | 227 | df.to_csv(conf.report_path) 228 | 229 | writeOutputBib(included, conf.output) 230 | 231 | return df 232 | 233 | 234 | if __name__ == '__main__': 235 | parser = ArgumentParser(description='Filter results ') 236 | 237 | parser.add_argument('-i', '--input', type=str, 238 | help='Input bib file name') 239 | parser.add_argument('-o', '--output', type=str, 240 | help='Output bib file name') 241 | parser.add_argument('-r', '--report-path', type=str, default='filter_report.csv', 242 | help='Path to output report CSV') 243 | parser.add_argument('-c', '--cache', type=bool, default=True, 244 | help='Use local cache for results') 245 | 246 | conf = parser.parse_args() 247 | 248 | df = main(conf) 249 | -------------------------------------------------------------------------------- /db/ref_utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | import unicodedata 3 | 4 | 5 | def isPDFURL(url): 6 | return ('pdf' in url or 'openreview' in url) 7 | 8 | 9 | def getDOIfromURL(url): 10 | if not url: 11 | return None 12 | 13 | match = re.search('(10\.\d+\/[a-zA-Z\.\d\-\_]+)\.pdf', url) 14 | if match: 15 | return match.group(1) 16 | 17 | match = re.search('(10\.\d+\/[a-zA-Z\.\d\-\_]+)/', url) 18 | if match: 19 | return match.group(1) 20 | 21 | match = 
re.search('(10\.\d+\/[a-zA-Z\.\d\-\_]+)\?', url) 22 | if match: 23 | return match.group(1) 24 | 25 | match = re.search('(10\.\d+\/[a-zA-Z\.\d\-\_]+)', url) 26 | if match: 27 | return match.group(1) 28 | 29 | return None 30 | 31 | 32 | def unicodeToASCII(input_str): 33 | nfkd_form = unicodedata.normalize('NFKD', input_str) 34 | only_ascii = nfkd_form.encode('ASCII', 'ignore').decode("utf-8") 35 | return only_ascii 36 | 37 | 38 | pLu = "[A-Z\u00C0-\u00D6\u00D8-\u00DE\u0100\u0102\u0104\u0106\u0108\u010A\u010C\u010E\u0110\u0112\u0114\u0116\u0118\u011A\u011C\u011E\u0120\u0122\u0124\u0126\u0128\u012A\u012C\u012E\u0130\u0132\u0134\u0136\u0139\u013B\u013D\u013F\u0141\u0143\u0145\u0147\u014A\u014C\u014E\u0150\u0152\u0154\u0156\u0158\u015A\u015C\u015E\u0160\u0162\u0164\u0166\u0168\u016A\u016C\u016E\u0170\u0172\u0174\u0176\u0178\u0179\u017B\u017D\u0181\u0182\u0184\u0186\u0187\u0189-\u018B\u018E-\u0191\u0193\u0194\u0196-\u0198\u019C\u019D\u019F\u01A0\u01A2\u01A4\u01A6\u01A7\u01A9\u01AC\u01AE\u01AF\u01B1-\u01B3\u01B5\u01B7\u01B8\u01BC\u01C4\u01C7\u01CA\u01CD\u01CF\u01D1\u01D3\u01D5\u01D7\u01D9\u01DB\u01DE\u01E0\u01E2\u01E4\u01E6\u01E8\u01EA\u01EC\u01EE\u01F1\u01F4\u01F6-\u01F8\u01FA\u01FC\u01FE\u0200\u0202\u0204\u0206\u0208\u020A\u020C\u020E\u0210\u0212\u0214\u0216\u0218\u021A\u021C\u021E\u0220\u0222\u0224\u0226\u0228\u022A\u022C\u022E\u0230\u0232\u023A\u023B\u023D\u023E\u0241\u0243-\u0246\u0248\u024A\u024C\u024E\u0370\u0372\u0376\u037F\u0386\u0388-\u038A\u038C\u038E\u038F\u0391-\u03A1\u03A3-\u03AB\u03CF\u03D2-\u03D4\u03D8\u03DA\u03DC\u03DE\u03E0\u03E2\u03E4\u03E6\u03E8\u03EA\u03EC\u03EE\u03F4\u03F7\u03F9\u03FA\u03FD-\u042F\u0460\u0462\u0464\u0466\u0468\u046A\u046C\u046E\u0470\u0472\u0474\u0476\u0478\u047A\u047C\u047E\u0480\u048A\u048C\u048E\u0490\u0492\u0494\u0496\u0498\u049A\u049C\u049E\u04A0\u04A2\u04A4\u04A6\u04A8\u04AA\u04AC\u04AE\u04B0\u04B2\u04B4\u04B6\u04B8\u04BA\u04BC\u04BE\u04C0\u04C1\u04C3\u04C5\u04C7\u04C9\u04CB\u04CD\u04D0\u04D2\u04D4\u04D6\u04D8\u04DA\u04DC\u04DE\u04E0\u04E2\u04E4\u04E6\u04E8\u04EA\u04EC\u04EE\u04F0\u04F2\u04F4\u04F6\u04F8\u04FA\u04FC\u04FE\u0500\u0502\u0504\u0506\u0508\u050A\u050C\u050E\u0510\u0512\u0514\u0516\u0518\u051A\u051C\u051E\u0520\u0522\u0524\u0526\u0528\u052A\u052C\u052E\u0531-\u0556\u10A0-\u10C5\u10C7\u10CD\u13A0-\u13F5\u1E00\u1E02\u1E04\u1E06\u1E08\u1E0A\u1E0C\u1E0E\u1E10\u1E12\u1E14\u1E16\u1E18\u1E1A\u1E1C\u1E1E\u1E20\u1E22\u1E24\u1E26\u1E28\u1E2A\u1E2C\u1E2E\u1E30\u1E32\u1E34\u1E36\u1E38\u1E3A\u1E3C\u1E3E\u1E40\u1E42\u1E44\u1E46\u1E48\u1E4A\u1E4C\u1E4E\u1E50\u1E52\u1E54\u1E56\u1E58\u1E5A\u1E5C\u1E5E\u1E60\u1E62\u1E64\u1E66\u1E68\u1E6A\u1E6C\u1E6E\u1E70\u1E72\u1E74\u1E76\u1E78\u1E7A\u1E7C\u1E7E\u1E80\u1E82\u1E84\u1E86\u1E88\u1E8A\u1E8C\u1E8E\u1E90\u1E92\u1E94\u1E9E\u1EA0\u1EA2\u1EA4\u1EA6\u1EA8\u1EAA\u1EAC\u1EAE\u1EB0\u1EB2\u1EB4\u1EB6\u1EB8\u1EBA\u1EBC\u1EBE\u1EC0\u1EC2\u1EC4\u1EC6\u1EC8\u1ECA\u1ECC\u1ECE\u1ED0\u1ED2\u1ED4\u1ED6\u1ED8\u1EDA\u1EDC\u1EDE\u1EE0\u1EE2\u1EE4\u1EE6\u1EE8\u1EEA\u1EEC\u1EEE\u1EF0\u1EF2\u1EF4\u1EF6\u1EF8\u1EFA\u1EFC\u1EFE\u1F08-\u1F0F\u1F18-\u1F1D\u1F28-\u1F2F\u1F38-\u1F3F\u1F48-\u1F4D\u1F59\u1F5B\u1F5D\u1F5F\u1F68-\u1F6F\u1FB8-\u1FBB\u1FC8-\u1FCB\u1FD8-\u1FDB\u1FE8-\u1FEC\u1FF8-\u1FFB\u2102\u2107\u210B-\u210D\u2110-\u2112\u2115\u2119-\u211D\u2124\u2126\u2128\u212A-\u212D\u2130-\u2133\u213E\u213F\u2145\u2160-\u216F\u2183\u24B6-\u24CF\u2C00-\u2C2E\u2C60\u2C62-\u2C64\u2C67\u2C69\u2C6B\u2C6D-\u2C70\u2C72\u2C75\u2C7E-\u2C80\u2C82\u2C84\u2C86\u2C88\u2C8A\u2C8C\u2C8E\u2C90\u2C92\u2C94\u2C96\u2C98\u2C9A\u2C9C\u2C9E\u2CA0\u2CA2\u2CA4\u2CA6\u
2CA8\u2CAA\u2CAC\u2CAE\u2CB0\u2CB2\u2CB4\u2CB6\u2CB8\u2CBA\u2CBC\u2CBE\u2CC0\u2CC2\u2CC4\u2CC6\u2CC8\u2CCA\u2CCC\u2CCE\u2CD0\u2CD2\u2CD4\u2CD6\u2CD8\u2CDA\u2CDC\u2CDE\u2CE0\u2CE2\u2CEB\u2CED\u2CF2\uA640\uA642\uA644\uA646\uA648\uA64A\uA64C\uA64E\uA650\uA652\uA654\uA656\uA658\uA65A\uA65C\uA65E\uA660\uA662\uA664\uA666\uA668\uA66A\uA66C\uA680\uA682\uA684\uA686\uA688\uA68A\uA68C\uA68E\uA690\uA692\uA694\uA696\uA698\uA69A\uA722\uA724\uA726\uA728\uA72A\uA72C\uA72E\uA732\uA734\uA736\uA738\uA73A\uA73C\uA73E\uA740\uA742\uA744\uA746\uA748\uA74A\uA74C\uA74E\uA750\uA752\uA754\uA756\uA758\uA75A\uA75C\uA75E\uA760\uA762\uA764\uA766\uA768\uA76A\uA76C\uA76E\uA779\uA77B\uA77D\uA77E\uA780\uA782\uA784\uA786\uA78B\uA78D\uA790\uA792\uA796\uA798\uA79A\uA79C\uA79E\uA7A0\uA7A2\uA7A4\uA7A6\uA7A8\uA7AA-\uA7AE\uA7B0-\uA7B4\uA7B6\uFF21-\uFF3A\U00010400-\U00010427\U000104B0-\U000104D3\U00010C80-\U00010CB2\U000118A0-\U000118BF\U0001D400-\U0001D419\U0001D434-\U0001D44D\U0001D468-\U0001D481\U0001D49C\U0001D49E\U0001D49F\U0001D4A2\U0001D4A5\U0001D4A6\U0001D4A9-\U0001D4AC\U0001D4AE-\U0001D4B5\U0001D4D0-\U0001D4E9\U0001D504\U0001D505\U0001D507-\U0001D50A\U0001D50D-\U0001D514\U0001D516-\U0001D51C\U0001D538\U0001D539\U0001D53B-\U0001D53E\U0001D540-\U0001D544\U0001D546\U0001D54A-\U0001D550\U0001D56C-\U0001D585\U0001D5A0-\U0001D5B9\U0001D5D4-\U0001D5ED\U0001D608-\U0001D621\U0001D63C-\U0001D655\U0001D670-\U0001D689\U0001D6A8-\U0001D6C0\U0001D6E2-\U0001D6FA\U0001D71C-\U0001D734\U0001D756-\U0001D76E\U0001D790-\U0001D7A8\U0001D7CA\U0001E900-\U0001E921\U0001F130-\U0001F149\U0001F150-\U0001F169\U0001F170-\U0001F189]" 39 | author_regex1 = re.compile("^(" + pLu + "+)\s+([\w\-]+)") 40 | author_regex2 = re.compile("^(" + pLu + "[\w\-]+)\s*,\s*(" + pLu + "\w*)") 41 | author_regex3 = re.compile("^(" + pLu + "[\w\-]+)\s+([\w\.\s\-]+),\s([\w\-]+)") 42 | author_regex4 = re.compile("^(" + pLu + "[\w\-]+)\s+([\w\.\s\-]+)(van|von|dos|del|de la),\s([\w\-]+)") 43 | 44 | 45 | def parseBibAuthors(authors): 46 | if not authors: 47 | return [{"given": '', "family": ''}] 48 | 49 | bits = authors.split('and') 50 | authors = [] 51 | for bit in bits: 52 | bit = bit.strip() 53 | # matches 'SKC Chiang', 'S Chiang' 54 | match = author_regex1.search(bit) 55 | if match: 56 | author = {"given": match.group(1)[0], "family": match.group(2)} 57 | if len(match.group(1)) > 1: 58 | author['middle'] = match.group(1)[1:] 59 | authors.append(author) 60 | else: 61 | # matches 'Smith, Bob' 62 | match = author_regex2.search(bit) 63 | if match: 64 | author = {"given": match.group(2), "family": match.group(1)} 65 | authors.append(author) 66 | else: 67 | # matchesn 'Otmani Abdeldjallal, Nassim' 68 | match = author_regex3.search(bit) 69 | if match: 70 | author = {"given": match.group(3), "family": match.group(1)} 71 | if match.group(2): 72 | author['middle'] = match.group(2) 73 | authors.append(author) 74 | else: 75 | # special for Dutch/German/Portuguese/Spanish surnames - van/von/dos/del/de la Blah 76 | match = author_regex4.search(bit) 77 | if match: 78 | author = {"given": match.group(3), 79 | "family": match.group(3) + " " + match.group(1)} 80 | if match.group(2): 81 | author['middle'] = match.group(2) 82 | authors.append(author) 83 | 84 | author = {"given": '', "family": bit} 85 | # raise ValueError("Couldn't find names") 86 | return authors 87 | 88 | 89 | def authorListFromDict(authors): 90 | authorstrings = [] 91 | for author in authors: 92 | authorstring = author.get('family', '') 93 | if author.get('middle', ''): 94 | authorstring += ' ' + author.get('middle') 95 | 
authorstring += ', ' + author.get('given', '') 96 | authorstrings.append(authorstring) 97 | 98 | authors_string = " and ".join(authorstrings) 99 | return authors_string 100 | 101 | 102 | def authorListFromListOfAuthors(authors): 103 | authors_string = " and ".join(authors) 104 | return authors_string 105 | 106 | 107 | def normalizeURL(url: str): 108 | return url.replace('https:', 'http:') 109 | 110 | 111 | def addUrlIfNew(paper, url: str, type: str, source: str): 112 | paper.extra_data['urls'] = paper.extra_data.get('urls', []) 113 | 114 | existing_urls = [normalizeURL(u['url']).lower() for u in paper.extra_data['urls']] 115 | norm_url = normalizeURL(url) 116 | 117 | if norm_url.lower() not in existing_urls: 118 | paper.extra_data['urls'].append({'url': norm_url, 119 | 'type': type, 120 | 'source': source}) 121 | return True 122 | return False 123 | 124 | 125 | def addUrlIfNewWithType(paper, url: str, source: str): 126 | if isPDFURL(url): 127 | type = 'pdf' 128 | else: 129 | type = 'main' 130 | 131 | return addUrlIfNew(paper, url, type, source) 132 | 133 | 134 | def simpleResultDeDupe(results): 135 | from collections import OrderedDict 136 | 137 | duplicates = [] 138 | 139 | unique_entries = OrderedDict() 140 | for result in results: 141 | 142 | if result.bib['ID'] in unique_entries: 143 | if normalizeTitle(result.bib['title']) == normalizeTitle(unique_entries[result.bib['ID']].bib['title']): 144 | # print(unique_entries[result.bib['ID']], '\n\n', result.bib, '\n---------------') 145 | duplicates.append(result) 146 | continue 147 | else: 148 | result.bib['ID'] += "_2" 149 | 150 | unique_entries[result.bib['ID']] = result 151 | 152 | print('Duplicates found:', len(duplicates)) 153 | return [v for k, v in unique_entries.items()] 154 | 155 | 156 | def normalizeTitle(title): 157 | """ 158 | Returns a "normalized" title for easy matching 159 | """ 160 | title = title.lower() 161 | title = re.sub(r"–", " ", title) 162 | title = unicodeToASCII(title) 163 | title = title.replace("- ", "").replace("- ", "") 164 | title = re.sub(r"[\"\#\$\%\&\\\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\¿\!\¡\@\[\]\^\_\`\{\|\}\~]", " ", title) 165 | title = re.sub(r"\s+", " ", title) 166 | title = title.strip() 167 | title = title[:200] 168 | return title 169 | -------------------------------------------------------------------------------- /db/data.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import os, re, json 3 | import pandas as pd 4 | import bibtexparser 5 | 6 | from strsimpy import NormalizedLevenshtein 7 | 8 | stopwords = set(["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", 9 | "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", 10 | "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", 11 | "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", 12 | "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", 13 | "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", 14 | "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", 15 | "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", 16 | "when", "where", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]) 17 | 18 | from db.bibtex import 
generateUniqueID 19 | from db.ref_utils import parseBibAuthors, normalizeTitle 20 | 21 | current_dir = os.path.dirname(os.path.realpath(__file__)) 22 | 23 | CACHE_FILE = os.path.join(current_dir, "papers.sqlite") 24 | 25 | 26 | class Paper: 27 | """ 28 | A Paper consists of 2 dicts: .bib and .extra_data 29 | - .bib is simply a bibtex dict 30 | - .extra_data stores everything else we can't properly store in a BibTeX file 31 | """ 32 | 33 | def __init__(self, bib: dict = None, extra_data: dict = None): 34 | self.bib = bib 35 | self.extra_data = extra_data 36 | 37 | for field in bib: 38 | if bib[field] is None: 39 | bib[field] = '' 40 | 41 | @classmethod 42 | def fromRecord(cls, paper_record): 43 | res = Paper(json.loads(paper_record["bib"]), 44 | json.loads(paper_record["extra_data"])) 45 | 46 | res.pmid = paper_record["pmid"] 47 | res.scholarid = paper_record["scholarid"] 48 | res.arxivid = paper_record["arxivid"] 49 | return res 50 | 51 | @property 52 | def id(self): 53 | return generateUniqueID(self) 54 | 55 | @property 56 | def doi(self): 57 | return self.bib.get("doi") 58 | 59 | @doi.setter 60 | def doi(self, doi): 61 | self.bib["doi"] = doi 62 | 63 | @property 64 | def arxivid(self): 65 | return self.extra_data.get("arxivid") 66 | 67 | @arxivid.setter 68 | def arxivid(self, arxivid): 69 | self.extra_data["arxivid"] = arxivid 70 | 71 | @property 72 | def pmid(self): 73 | return self.extra_data.get("pmid") 74 | 75 | @pmid.setter 76 | def pmid(self, pmid): 77 | self.extra_data["pmid"] = pmid 78 | 79 | @property 80 | def scholarid(self): 81 | return self.extra_data.get("scholarid") 82 | 83 | @scholarid.setter 84 | def scholarid(self, scholarid): 85 | self.extra_data["scholarid"] = scholarid 86 | 87 | @property 88 | def title(self): 89 | return self.bib.get("title") 90 | 91 | @title.setter 92 | def title(self, title): 93 | self.bib["title"] = title 94 | 95 | @property 96 | def norm_title(self): 97 | return normalizeTitle(self.title) 98 | 99 | @property 100 | def abstract(self): 101 | return self.bib.get("abstract") 102 | 103 | @property 104 | def year(self): 105 | return self.bib.get("year") 106 | 107 | @property 108 | def authors(self): 109 | return self.bib.get("author") 110 | 111 | @authors.setter 112 | def authors(self, authors): 113 | self.bib["author"] = authors 114 | 115 | @property 116 | def entrytype(self): 117 | return self.bib.get("ENTRYTYPE").lower() 118 | 119 | @property 120 | def venue(self): 121 | entrytype = self.entrytype 122 | if entrytype == "article": 123 | return self.bib.get("journal", "") 124 | elif entrytype in ["book", "booklet", "manual", "proceedings"]: 125 | # return self.bib.get("title", "") 126 | return "" 127 | elif entrytype in ["conference", "inproceedings", "incollection"]: 128 | return self.bib.get("booktitle", "") 129 | elif entrytype in ["mastersthesis", "phdthesis"]: 130 | return self.bib.get("school", "") 131 | elif entrytype in ["techreport"]: 132 | return self.bib.get("institution", "") 133 | elif entrytype in ["misc", "unpublished"]: 134 | return "" 135 | 136 | @property 137 | def has_pdf(self): 138 | for url in self.extra_data.get('urls', []): 139 | if url['type'] == 'pdf': 140 | return True 141 | return False 142 | 143 | @property 144 | def has_full_abstract(self): 145 | if not self.abstract: 146 | return False 147 | 148 | if self.abstract.endswith('…'): 149 | return False 150 | 151 | return True 152 | 153 | @property 154 | def has_abstract(self): 155 | return self.abstract is not None and self.abstract != '' 156 | 157 | @property 158 | def 
has_valid_id(self): 159 | return any([self.doi, self.pmid, self.arxivid, self.extra_data.get('ss_id')]) 160 | 161 | @property 162 | def has_pdf_link(self): 163 | for url in self.extra_data.get('urls', []): 164 | if url.get('type') == 'pdf' or 'pdf' in url.get('url', ''): 165 | return True 166 | 167 | return False 168 | 169 | def asDict(self): 170 | return { 171 | "id": self.id, 172 | "title": self.title, 173 | "norm_title": self.norm_title, 174 | "authors": self.authors, 175 | "year": self.year, 176 | "venue": self.venue, 177 | "bib": json.dumps(self.bib), 178 | "doi": self.doi, 179 | "arxivid": self.arxivid, 180 | "scholarid": self.scholarid, 181 | "pmid": self.pmid, 182 | "extra_data": json.dumps(self.extra_data) 183 | } 184 | 185 | def __repr__(self): 186 | return f"<%s - %s - %s> \n %s" % ( 187 | self.bib.get("title", ""), 188 | self.bib.get("author", ""), 189 | self.bib.get("year", ""), str(self.bib)) 190 | 191 | 192 | class PaperStore: 193 | def __init__(self): 194 | self.conn = sqlite3.connect(CACHE_FILE) 195 | self.conn.row_factory = sqlite3.Row 196 | self.initaliseDB() 197 | 198 | def initaliseDB(self): 199 | self.conn.execute("""CREATE TABLE IF NOT EXISTS "papers" ( 200 | "id" text primary key, 201 | "doi" text unique, 202 | "pmid" text unique, 203 | "scholarid" text unique, 204 | "arxivid" text unique, 205 | "authors" text, 206 | "year" integer, 207 | "title" text, 208 | "norm_title" text, 209 | "venue" text, 210 | "bib" text, 211 | "extra_data" text 212 | ) 213 | """) 214 | 215 | self.conn.execute( 216 | """CREATE UNIQUE INDEX IF NOT EXISTS idx_papers_ids ON papers(id, doi)""") 217 | 218 | self.conn.execute( 219 | """CREATE INDEX IF NOT EXISTS idx_papers_otherids ON papers(pmid, scholarid, arxivid)""") 220 | 221 | self.conn.execute( 222 | """CREATE INDEX IF NOT EXISTS idx_papers_title ON papers(title, norm_title)""") 223 | 224 | self.conn.commit() 225 | 226 | # def runSelectStatement(self, sql, parameters): 227 | # """ 228 | # 229 | # :param sql: SQL string to run 230 | # :return: Cursor to the results 231 | # """ 232 | # c = self.conn.cursor() 233 | # c.execute(sql, parameters) 234 | # return c 235 | 236 | def getPaper(self, id_string, id_type="doi"): 237 | """ 238 | Looks for a paper given an id. 239 | 240 | :param id_string: the actual id 241 | :param id_type: the type of id (doi, arxivid, pmid, scholarid) 242 | :return: paper if found, or None 243 | """ 244 | c = self.conn.cursor() 245 | 246 | c.execute("SELECT * FROM papers WHERE %s=?" % id_type, (id_string,)) 247 | paper_record = c.fetchone() 248 | if not paper_record: 249 | return None 250 | 251 | res = Paper.fromRecord(paper_record) 252 | return res 253 | 254 | def findPapersByTitle(self, title): 255 | """ 256 | Looks for a paper given a title. 
257 | 258 | :param title: 259 | :return: 260 | """ 261 | c = self.conn.cursor() 262 | norm_title = normalizeTitle(title) 263 | 264 | c.execute("SELECT * FROM papers WHERE norm_title=?", (norm_title,)) 265 | paper_records = c.fetchall() 266 | if not paper_records: 267 | return None 268 | 269 | res = [] 270 | for paper_record in paper_records: 271 | res.append(Paper.fromRecord(paper_record)) 272 | return res 273 | 274 | def findPaperByApproximateTitle(self, paper, ok_title_distance=0.35, ok_author_distance=0.1): 275 | """ 276 | Very simple ngram-based similarity matching 277 | 278 | :param title: 279 | :return: 280 | """ 281 | c = self.conn.cursor() 282 | 283 | self.createVirtualTable() 284 | 285 | norm_title = normalizeTitle(paper.title) 286 | 287 | bits = norm_title.split() 288 | bits = [b for b in bits if b not in stopwords] 289 | 290 | query_string = " OR ".join(bits) 291 | 292 | c.execute('SELECT id, norm_title FROM papers_search WHERE norm_title MATCH ?', (query_string,)) 293 | paper_ids = c.fetchall() 294 | if not paper_ids: 295 | return None 296 | 297 | paper_id_list = [res['id'] for res in paper_ids] 298 | id_query_string = ",".join(['"%s"' % res['id'] for res in paper_ids]) 299 | 300 | c.execute('SELECT * FROM papers WHERE id IN (%s)' % id_query_string) 301 | paper_records = c.fetchall() 302 | if not paper_records: 303 | return None 304 | 305 | results = [Paper.fromRecord(r) for r in paper_records] 306 | 307 | sorted_results = rerankByTitleSimilarity(results, paper.title) 308 | 309 | top_res = sorted_results[0][1] 310 | 311 | title_distance = dist.distance(top_res.title.lower(), paper.title.lower()) 312 | author_distance = computeAuthorDistance(paper, top_res) 313 | 314 | 315 | if title_distance <= ok_title_distance and author_distance <= ok_author_distance: 316 | print('\n[matched] ', paper.title) 317 | print('Best match:', top_res.title) 318 | else: 319 | print('\n[skipped] ', paper.title) 320 | print('Options:\n' + '\n'.join([r[1].title for r in sorted_results[:5]]), '\n') 321 | return None 322 | 323 | print('title distance:', title_distance, 'author distance:', author_distance) 324 | 325 | new_paper = top_res 326 | # new_paper.title = paper.title 327 | 328 | return new_paper 329 | 330 | def addPaper(self, paper: Paper): 331 | self.addPapers([paper]) 332 | 333 | def addPapers(self, papers: list): 334 | to_add = [paper.asDict() for paper in papers] 335 | 336 | df = pd.DataFrame(to_add) 337 | df.to_sql("papers", self.conn, if_exists="append", index=False) 338 | 339 | def updatePapers(self, papers: list): 340 | for paper in papers: 341 | values = paper.asDict() 342 | try: 343 | self.conn.execute( 344 | """REPLACE INTO papers (id, doi, pmid, scholarid, arxivid, authors, year, title, norm_title, venue, bib, extra_data) values (?,?,?,?,?,?,?,?,?,?,?,?)""", 345 | (values['id'], values['doi'], values['pmid'], values['scholarid'], 346 | values['arxivid'], values['authors'], values['year'], 347 | values['title'], values['norm_title'], values['venue'], 348 | values['bib'], values['extra_data'])) 349 | except Exception as e: 350 | print(e.__class__.__name__, e) 351 | self.conn.commit() 352 | 353 | def createVirtualTable(self): 354 | self.conn.execute( 355 | """CREATE VIRTUAL TABLE IF NOT EXISTS papers_search USING fts5(id, norm_title, title);""") 356 | self.conn.execute( 357 | """REPLACE INTO papers_search (id, norm_title, title) SELECT id, norm_title, title from papers""") 358 | 359 | self.conn.commit() 360 | 361 | def deleteVirtualTable(self): 362 | self.conn.execute("DROP TABLE 
papers_search") 363 | self.conn.commit() 364 | 365 | def matchResultsWithPapers(self, results): 366 | """ 367 | Tries to match each result with a paper already in the db. 368 | 369 | :param results: 370 | :return: 371 | """ 372 | found = [] 373 | missing = [] 374 | self.createVirtualTable() 375 | for result in results: 376 | paper = Paper(result.bib, result.extra_data) 377 | 378 | paper_found = False 379 | for id_type in ["doi", "pmid", "arxivid", "scholarid"]: 380 | id_string = getattr(paper, id_type) 381 | if id_string: 382 | paper_record = self.getPaper(id_string, id_type=id_type) 383 | if paper_record: 384 | result.paper = paper_record 385 | found.append(result) 386 | paper_found = True 387 | break 388 | 389 | if not paper_found and paper.title: 390 | paper_records = self.findPapersByTitle(paper.title) 391 | if paper_records: 392 | result.paper = paper_records[0] 393 | found.append(result) 394 | paper_found = True 395 | 396 | if not paper_found and paper.title: 397 | paper_record = self.findPaperByApproximateTitle(paper) 398 | if paper_record: 399 | result.paper = paper_record 400 | found.append(result) 401 | paper_found = True 402 | 403 | if not paper_found: 404 | missing.append(result) 405 | 406 | self.deleteVirtualTable() 407 | return found, missing 408 | 409 | 410 | def computeAuthorDistance(paper1, paper2): 411 | """ 412 | Returns a measure of how much the authors of papers overlap 413 | 414 | :param paper1: 415 | :param paper2: 416 | :return: 417 | """ 418 | if not paper1.bib.get('author') or not paper2.bib.get('author'): 419 | return 1 420 | 421 | authors1 = paper1.extra_data.get('x_authors', parseBibAuthors(paper1.bib.get('author'))) 422 | authors2 = paper2.extra_data.get('x_authors', parseBibAuthors(paper2.bib.get('author'))) 423 | 424 | score = 0 425 | if len(authors1) >= len(authors2): 426 | a_short = authors2 427 | a_long = authors1 428 | else: 429 | a_short = authors1 430 | a_long = authors2 431 | 432 | max_score = 0 433 | 434 | for index, author in enumerate(a_short): 435 | factor = (len(a_long) - index) ** 2 436 | if author['family'].lower() == a_long[index]['family'].lower(): 437 | score += factor 438 | 439 | max_score += factor 440 | 441 | if max_score == 0: 442 | return 1 443 | 444 | distance = 1 - (score / max_score) 445 | return distance 446 | 447 | 448 | def basicTitleCleaning(title): 449 | return re.sub(r'\s+', ' ', title, flags=re.MULTILINE) 450 | 451 | 452 | def rerankByTitleSimilarity(results: list, title): 453 | scores = [] 454 | for res in results: 455 | res.bib['title'] = basicTitleCleaning(res.bib['title']) 456 | scores.append((dist.distance(res.bib['title'].lower(), title.lower()), res)) 457 | 458 | return sorted(scores, key=lambda x: x[0], reverse=False) 459 | 460 | def removeListWrapper(value): 461 | while isinstance(value, list): 462 | value = value[0] 463 | return value 464 | 465 | def test1(): 466 | bibstr = """@ARTICLE{Cesar2013, 467 | author = {Jean César}, 468 | title = {An amazing title}, 469 | year = {2013}, 470 | volume = {12}, 471 | pages = {12--23}, 472 | journal = {Nice Journal}, 473 | abstract = {This is an abstract. 
This line should be long enough to test 474 | multilines...}, 475 | comments = {A comment}, 476 | keywords = {keyword1, keyword2} 477 | } 478 | """ 479 | bib = bibtexparser.loads(bibstr) 480 | paper = Paper(bib.entries[0]) 481 | paperstore = PaperStore() 482 | paperstore.addPapers([paper]) 483 | 484 | 485 | def test2(): 486 | paperstore = PaperStore() 487 | paper = paperstore.getPaper('10.1148/radiol.2018171093') 488 | paper.arxivid = None 489 | paperstore.updatePapers([paper]) 490 | 491 | 492 | if __name__ == '__main__': 493 | test2() 494 | dist = NormalizedLevenshtein() 495 | 496 | 497 | -------------------------------------------------------------------------------- /search/metadata_harvest.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | warnings.filterwarnings("ignore") 4 | 5 | import requests 6 | import re, json 7 | import urllib.parse 8 | from db.bibtex import readBibtexString, fixBibData, getBibtextFromDOI 9 | from db.ref_utils import isPDFURL, getDOIfromURL, authorListFromDict, addUrlIfNew 10 | from db.data import Paper, computeAuthorDistance, rerankByTitleSimilarity, basicTitleCleaning, dist, removeListWrapper 11 | from .base_search import SearchResult 12 | from tqdm import tqdm 13 | import datetime 14 | from time import sleep 15 | from datetime import timedelta 16 | from io import StringIO, BytesIO 17 | from lxml import etree 18 | import datetime 19 | 20 | BIB_FIELDS_TRANSFER = ['abstract', 'address', 'annote', 'author', 'booktitle', 'chapter', 21 | 'crossref', 'doi', 'edition', 'editor', 22 | 'howpublished', 'institution', 'issue', 'journal', 'key', 23 | 'month', 'note', 'number', 'organization', 24 | 'pages', 'publisher', 'school', 'series', 'type', 'volume', 'year'] 25 | 26 | interval_regex = re.compile(r'((?P\d+?)hr)?((?P\d+?)m)?((?P\d+?)s)?') 27 | 28 | 29 | def parse_time(time_str): 30 | parts = interval_regex.match(time_str) 31 | if not parts: 32 | return 33 | parts = parts.groupdict() 34 | time_params = {} 35 | for (name, param) in parts.items(): 36 | if param: 37 | time_params[name] = int(param) 38 | return timedelta(**time_params) 39 | 40 | 41 | def refreshDOIfromURLs(paper): 42 | """ 43 | If paper has no DOI, it tries to find one in any URLs stored in the bib or extra_data dicts 44 | 45 | :param paper: Paper or SearchResult 46 | """ 47 | if paper.doi: 48 | return 49 | 50 | doi = getDOIfromURL(paper.bib.get('url', '')) 51 | if doi: 52 | paper.bib['doi'] = doi 53 | else: 54 | for url_dict in paper.extra_data.get('urls', []): 55 | doi = getDOIfromURL(url_dict['url']) 56 | if doi: 57 | paper.bib['doi'] = doi 58 | break 59 | 60 | 61 | def mergeResultData(result1, result2): 62 | """ 63 | Merges bibtex and extra_data dictionaries for a SearchResult and/or a Paper 64 | 65 | :param result1: 66 | :param result2: 67 | :return: 68 | """ 69 | # if there's no year we should update the ID after getting the year 70 | to_update_id = not result1.bib.get('year') or not 'ID' in result1.bib 71 | 72 | for field in BIB_FIELDS_TRANSFER: 73 | if len(str(result2.bib.get(field, ''))) > len(str(result1.bib.get(field, ''))): 74 | result1.bib[field] = str(result2.bib[field]) 75 | 76 | for field in ['ID', 'ENTRYTYPE']: 77 | if field in result2.bib: 78 | result1.bib[field] = str(result2.bib[field]) 79 | 80 | if 'ID' not in result2.bib and to_update_id: 81 | if 'ID' in result1.bib: 82 | del result1.bib['ID'] 83 | fixBibData(result1.bib, 1) 84 | 85 | for field in result2.extra_data: 86 | if field not in result1.extra_data: 87 | 
result1.extra_data[field] = result2.extra_data[field] 88 | 89 | if 'urls' in result2.extra_data: 90 | for url in result2.extra_data['urls']: 91 | addUrlIfNew(result1, url['url'], url['type'], url['source']) 92 | 93 | refreshDOIfromURLs(result1) 94 | return result1 95 | 96 | 97 | class NiceScraper: 98 | def __init__(self, basic_delay=0., rate_limit=None, rate_interval=None): 99 | self.response_times = [] 100 | self.request_times = [] 101 | self.avg_response_time = 0 102 | self.basic_delay = basic_delay 103 | self.delay = 0.0 104 | self.rate_limit = rate_limit 105 | if isinstance(rate_interval, str): 106 | self.rate_interval = parse_time(rate_interval) 107 | else: 108 | self.rate_interval = rate_interval 109 | 110 | def playNice(self): 111 | 112 | if self.rate_limit and len(self.request_times) >= self.rate_limit: 113 | now = datetime.datetime.now() 114 | 115 | diff = now - self.request_times[-self.rate_limit] 116 | if diff < self.rate_interval: 117 | print('Waiting for the rate limit') 118 | sleep(self.rate_interval - diff.total_seconds()) 119 | 120 | if len(self.response_times) > 0: 121 | self.avg_response_time = sum(self.response_times[-10:]) / len(self.response_times[-10:]) 122 | if self.response_times[-1] > self.avg_response_time: 123 | self.delay += 0.1 124 | else: 125 | self.delay -= 0.1 126 | self.delay = max(self.delay, 0) 127 | else: 128 | self.avg_response_time = 0 129 | 130 | if self.delay: 131 | sleep(self.delay) 132 | 133 | def request(self, url, headers=None, data=None, post=False): 134 | """ 135 | Makes a nice request, enforcing rate limits and adjusting the wait time 136 | between requests based on latency 137 | 138 | :param url: url to fetch 139 | :param headers: headers to pass 140 | :return: request object 141 | """ 142 | class_name = self.__class__.__name__.split('.')[-1] 143 | status_code = 0 144 | retries = 0 145 | 146 | while status_code != 200 and retries < 2: 147 | self.playNice() 148 | 149 | self.request_times.append(datetime.datetime.now()) 150 | before = datetime.datetime.now() 151 | 152 | if post: 153 | r = requests.post(url, json=data, headers=headers) 154 | else: 155 | r = requests.get(url, headers=headers) 156 | 157 | if r.status_code == 429: 158 | print(class_name, ': Status code 429: waiting and retrying') 159 | sleep(30) 160 | 161 | status_code = r.status_code 162 | retries += 1 163 | 164 | duration = datetime.datetime.now() - before 165 | 166 | self.setRateLimitsFromHeaders(r) 167 | 168 | self.response_times.append(duration.total_seconds()) 169 | print(class_name, "request took", self.response_times[-1]) 170 | 171 | return r 172 | 173 | def setRateLimitsFromHeaders(self, request): 174 | if request.headers.get('X-Rate-Limit-Limit'): 175 | self.rate_limit = int(request.headers.get('X-Rate-Limit-Limit')) 176 | if 'X-Rate-Limit-Interval' in request.headers: 177 | try: 178 | self.rate_interval = parse_time(request.headers['X-Rate-Limit-Interval']) 179 | except: 180 | print("Failed to parse X-Rate-Limit-Interval string", 181 | request.headers['X-Rate-Limit-Interval']) 182 | self.rate_interval = None 183 | 184 | def search(self, title, identity, max_results=5): 185 | raise NotImplementedError 186 | 187 | def matchPaperFromResults(self, paper, identity, ok_title_distance=0.1, ok_author_distance=0.1): 188 | """ 189 | Tries to match a paper with a DOI and retrieves its metadata if successful 190 | 191 | :param paper: 192 | :param identity: 193 | :return: 194 | """ 195 | class_name = self.__class__.__name__.split('.')[-1] 196 | 197 | try: 198 | results = 
self.search(paper.title, identity, max_results=5) 199 | except Exception as e: 200 | print('Error during %s.matchPaperFromResults()' % class_name, e) 201 | results = None 202 | 203 | if not results: 204 | return False 205 | 206 | sorted_results = rerankByTitleSimilarity(results, paper.title) 207 | 208 | top_res = sorted_results[0][1] 209 | 210 | title_distance = dist.distance(top_res['title'].lower(), paper.title.lower()) 211 | author_distance = computeAuthorDistance(paper, top_res) 212 | 213 | if title_distance > 0.1: 214 | if title_distance <= ok_title_distance and author_distance <= ok_author_distance: 215 | print('\n[matched] Title distance is above 0.1, but within settings') 216 | print('Title:', paper.title) 217 | print('Best match:', top_res['title']) 218 | print('title distance:', title_distance, 'author distance:', author_distance) 219 | else: 220 | print('\n[skipped] Distance is too great \n') 221 | print('Title:', paper.title) 222 | print('title distance:', title_distance, 'author distance:', author_distance) 223 | print('Options:\n' + '\n'.join([r[1]['title'] for r in sorted_results]), '\n') 224 | return False 225 | 226 | try: 227 | mergeResultData(paper, top_res) 228 | return True 229 | except Exception as e: 230 | print('Error during %s.matchPaperFromResults() mergeResultData()' % class_name, e) 231 | return False 232 | 233 | 234 | class CrossrefScraper(NiceScraper): 235 | 236 | def bulkSearchCrossref(self, papers): 237 | pass 238 | # r = requests.get("https://doi.crossref.org/simpleTextQuery") 239 | 240 | def search(self, title, identity, year=None, max_results=1): 241 | """ 242 | Searchs and returns a number of results from Crossref 243 | 244 | :param title: article title 245 | :param identity: email address to provide to Crossref 246 | :param year: publication year 247 | :param max_results: 248 | :return: list of Crossref JSON data results 249 | """ 250 | urllib.parse.quote(title, safe='') 251 | headers = {'User-Agent': 'ReviewBuilder(mailto:%s)' % identity} 252 | # changed because of https://status.crossref.org/incidents/4y45gj63jsp4 253 | url = 'https://api.crossref.org/works?rows={}&query.bibliographic={}'.format(max_results, title) 254 | if year: 255 | url += '&query.published=' + str(year) 256 | 257 | r = self.request(url, headers) 258 | 259 | d = r.json() 260 | if d['status'] != 'ok': 261 | raise ValueError('Error in request:' + d.get('status', 'NO STATUS') + str(d.get('message', 'NO MESSAGE'))) 262 | 263 | results = [] 264 | for index, item in enumerate(d['message']['items']): 265 | # print(item.get('type')) 266 | new_bib = {'doi': item['DOI'], 267 | 'title': basicTitleCleaning(removeListWrapper(item['title']))} 268 | 269 | if 'container-title' in item: 270 | # reference-entry, book 271 | 272 | if item.get('type') in ['journal-article', 'reference-entry']: 273 | new_bib['journal'] = removeListWrapper(item['container-title']) 274 | new_bib['ENTRYTYPE'] = 'article' 275 | elif item.get('type') in ['book-chapter']: 276 | new_bib['ENTRYTYPE'] = 'inbook' 277 | new_bib['booktitle'] = removeListWrapper(item['container-title']) 278 | elif item.get('type') in ['proceedings-article']: 279 | new_bib['ENTRYTYPE'] = 'inproceedings' 280 | new_bib['booktitle'] = removeListWrapper(item['container-title']) 281 | 282 | if item.get('type') in ['book']: 283 | new_bib['ENTRYTYPE'] = 'book' 284 | 285 | if item.get('type') not in ['journal-article', 'reference-entry', 'book', 'book-chapter', 286 | 'proceedings-article']: 287 | print(json.dumps(item, indent=3)) 288 | 289 | for field in 
[('publisher-location', 'address'), 290 | ('publisher', 'publisher'), 291 | ('issue', 'issue'), 292 | ('volume', 'volume'), 293 | ('page', 'pages'), 294 | ]: 295 | if field[0] in item: 296 | new_bib[field[1]] = str(item[field[0]]) 297 | 298 | if 'URL' in item: 299 | new_bib['url'] = item['URL'] 300 | 301 | if "issued" in item: 302 | date_parts = item['issued']['date-parts'][0] 303 | new_bib['year'] = str(date_parts[0]) 304 | if len(date_parts) > 1: 305 | new_bib['month'] = str(date_parts[1]) 306 | if len(date_parts) > 2: 307 | new_bib['day'] = str(date_parts[2]) 308 | 309 | authors = [] 310 | for author in item.get('author', []): 311 | authors.append({'given': author.get('given', ''), 'family': author.get('family', '')}) 312 | 313 | if item.get('author'): 314 | new_bib['author'] = authorListFromDict(authors) 315 | 316 | new_extra = {'x_authors': authors, 317 | 'language': item.get('language') 318 | } 319 | 320 | new_res = SearchResult(index, new_bib, 'crossref', new_extra) 321 | 322 | addUrlIfNew(new_res, item['URL'], 'main', 'crossref') 323 | 324 | if 'link' in item: 325 | for link in item['link']: 326 | if isPDFURL(link['URL']): 327 | addUrlIfNew(new_res, link['URL'], 'pdf', 'crossref') 328 | 329 | results.append(new_res) 330 | 331 | return results 332 | 333 | 334 | class UnpaywallScraper(NiceScraper): 335 | 336 | def getMetadata(self, paper, identity): 337 | if not paper.doi: 338 | raise ValueError("Paper has no DOI") 339 | 340 | url = 'https://api.unpaywall.org/v2/%s?email=%s' % (paper.doi, identity) 341 | 342 | r = self.request(url) 343 | 344 | data = r.json() 345 | if data.get('error') == 'true': 346 | return 347 | 348 | top_url = data.get('best_oa_location') 349 | if not top_url: 350 | return 351 | 352 | if top_url.get('url_for_pdf') in top_url: 353 | addUrlIfNew(paper, top_url['url_for_pdf'], 'pdf', 'unpaywall') 354 | if top_url.get('url_for_landing_page'): 355 | addUrlIfNew(paper, top_url['url_for_landing_page'], 'main', 'unpaywall') 356 | if top_url.get('url'): 357 | url = top_url['url'] 358 | if isPDFURL(url): 359 | type = 'pdf' 360 | else: 361 | type = 'main' 362 | 363 | addUrlIfNew(paper, url, type, 'unpaywall') 364 | 365 | paper.extra_data['done_unpaywall'] = True 366 | 367 | 368 | class PubMedScraper(NiceScraper): 369 | def search(self, title, identity, max_results=5): 370 | url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&retmode=json&retmax={max_results}&sort=relevance&term=' 371 | url += urllib.parse.quote(title) 372 | 373 | r = self.request(url) 374 | d = r.json() 375 | id_list = d['esearchresult']['idlist'] 376 | 377 | try: 378 | result = self.getMetadata(id_list) 379 | except Exception as e: 380 | print('Error during %s.getMetadata()' % self.__class__.__name__.split('.')[-1], e) 381 | result = None 382 | 383 | return result 384 | 385 | def getMetadata(self, pmids: list): 386 | """ 387 | Returns a dict with metadata extracted from PubMed from a PMID 388 | 389 | rettype = {NULL = xml, abstract, medline, uilist, docsum} 390 | retmode = {xml, text} 391 | 392 | :param pmids: list of PMID to get 393 | :return: dict with metadata from XML returned 394 | """ 395 | assert isinstance(pmids, list) 396 | 397 | if not pmids: 398 | return [] 399 | 400 | if len(pmids) > 1: 401 | pmids = ','.join(pmids) 402 | else: 403 | pmids = pmids[0] 404 | 405 | url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id=' + pmids 406 | r = self.request(url) 407 | 408 | text = StringIO(r.content.decode('utf-8')) 409 | tree = 
etree.parse(text) 410 | 411 | results = [] 412 | 413 | for index, article_node in enumerate(tree.xpath('/PubmedArticleSet/PubmedArticle')): 414 | new_bib = {} 415 | 416 | article = article_node.xpath('MedlineCitation/Article')[0] 417 | 418 | doi = article.xpath('ELocationID[@EIdType="doi"]') 419 | if doi: 420 | new_bib['doi'] = doi[0].text 421 | 422 | new_bib['title'] = article.xpath('ArticleTitle')[0].text 423 | 424 | abstract = "" 425 | for abs_piece in article.xpath('Abstract/AbstractText'): 426 | if 'Label' in abs_piece.keys(): 427 | abstract += abs_piece.get('Label') + "\n" 428 | 429 | abstract += abs_piece.text + '\n' 430 | new_bib['abstract'] = abstract 431 | 432 | authors = [] 433 | for author in article.xpath('AuthorList/Author'): 434 | new_author = {'given': author.xpath('ForeName')[0].text, 435 | 'family': author.xpath('LastName')[0].text, } 436 | authors.append(new_author) 437 | 438 | new_bib['author'] = authorListFromDict(authors) 439 | if article.xpath('ArticleDate'): 440 | date_node = article.xpath('ArticleDate')[0] 441 | elif article_node.xpath('PubmedData/History/PubMedPubDate[@PubStatus="pubmed"]'): 442 | date_node = article_node.xpath('PubmedData/History/PubMedPubDate[@PubStatus="pubmed"]')[0] 443 | 444 | new_bib['year'] = date_node.xpath('Year')[0].text 445 | new_bib['month'] = date_node.xpath('Month')[0].text 446 | new_bib['day'] = date_node.xpath('Day')[0].text 447 | 448 | new_extra = {'pmid': article_node.xpath('MedlineCitation/PMID')[0].text, 449 | 'x_authors': authors, 450 | 'language': article.xpath('Language')[0].text} 451 | 452 | new_res = SearchResult(index, new_bib, 'pubmed', new_extra) 453 | results.append(new_res) 454 | 455 | return results 456 | 457 | def getAlternateIDs(self, pmids: list): 458 | """ 459 | Gets DOI and PMCID for a list of PMIDs 460 | 461 | :param pmids: list of PMID to resolve 462 | :return: 463 | """ 464 | if isinstance(pmids, list): 465 | if len(pmids) > 1: 466 | pmids = ','.join([str(p) for p in pmids]) 467 | else: 468 | pmids = pmids[0] 469 | 470 | res = {} 471 | 472 | r = requests.get( 473 | 'https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?tool=my_tool&email=my_email@example.com&ids=' + str( 474 | pmids)) 475 | 476 | text = StringIO(r.content.decode('utf-8')) 477 | tree = etree.parse(text) 478 | for record in tree.xpath('/pmcids/record'): 479 | new_res = {} 480 | if 'pmcid' in record.keys(): 481 | new_res['pmcid'] = record.get('pmcid') 482 | if 'doi' in record.keys(): 483 | new_res['doi'] = record.get('doi') 484 | res[record.get('pmid')] = new_res 485 | return res 486 | 487 | def enrichWithMetadata(self, paper): 488 | if not paper.pmid: 489 | return 490 | 491 | if not paper.doi: 492 | ids = self.getAlternateIDs(paper.pmid) 493 | if 'doi' in ids[paper.pmid]: 494 | paper.doi = ids[paper.pmid]['doi'] 495 | if 'pmcid' in ids[paper.pmid]: 496 | paper.pmcid = ids[paper.pmid]['pmcid'] 497 | 498 | res = self.getMetadata([paper.pmid])[0] 499 | 500 | mergeResultData(paper, res) 501 | 502 | paper.extra_data['done_pubmed'] = True 503 | 504 | 505 | class arXivSearcher(NiceScraper): 506 | def search(self, title, identity, max_results=5): 507 | url = 'http://export.arxiv.org/api/query?search_query=title:{}&start=0&max_results={}'.format( 508 | urllib.parse.quote(title), max_results) 509 | r = self.request(url) 510 | 511 | text = BytesIO(r.content) 512 | tree = etree.parse(text) 513 | 514 | ns_map = {'ns': 'http://www.w3.org/2005/Atom', 515 | 'arxiv': 'http://arxiv.org/schemas/atom'} 516 | 517 | results = [] 518 | for index, entry in 
enumerate(tree.xpath('/ns:feed/ns:entry', namespaces=ns_map)): 519 | new_bib = {'arxivid': entry.xpath('ns:id', namespaces=ns_map)[0].text.split('/')[-1], 520 | 'title': entry.xpath('ns:title', namespaces=ns_map)[0].text, 521 | 'abstract': entry.xpath('ns:summary', namespaces=ns_map)[0].text, 522 | } 523 | 524 | published = entry.xpath('ns:published', namespaces=ns_map)[0].text 525 | match = re.search(r"(\d{4})-(\d{2})-(\d{2})", published) 526 | 527 | new_bib['year'] = match.group(1) 528 | new_bib['month'] = str(int(match.group(2))) 529 | new_bib['date'] = str(int(match.group(3))) 530 | 531 | authors = [] 532 | for author in entry.xpath('ns:author', namespaces=ns_map): 533 | bits = author.xpath('ns:name', namespaces=ns_map)[0].text.split() 534 | authors.append({'given': bits[0], 'family': bits[-1]}) 535 | 536 | new_bib['author'] = authorListFromDict(authors) 537 | new_extra = { 538 | 'x_authors': authors, 539 | 'ax_main_category': entry.xpath('arxiv:primary_category', namespaces=ns_map)[0].get('term'), 540 | 541 | } 542 | 543 | categories = [] 544 | for cat in entry.xpath('ns:category', namespaces=ns_map): 545 | categories.append(cat.get('term')) 546 | 547 | new_extra['ax_categories'] = categories 548 | 549 | new_res = SearchResult(index, new_bib, 'arxiv', new_extra) 550 | 551 | for link in entry.xpath('ns:link', namespaces=ns_map): 552 | if link.get('title') == 'pdf': 553 | addUrlIfNew(new_res, link.get('href'), 'pdf', 'arxiv') 554 | elif 'arxiv.org/abs/' in link.get('href'): 555 | addUrlIfNew(new_res, link.get('href'), 'main', 'arxiv') 556 | 557 | results.append(new_res) 558 | 559 | return results 560 | 561 | 562 | class GScholarScraper(NiceScraper): 563 | def getBibtex(self, paper): 564 | if paper.extra_data.get("url_scholarbib"): 565 | bib = paper.bib 566 | url = paper.extra_data.get("url_scholarbib") 567 | try: 568 | r = self.request(url) 569 | 570 | # print(r) 571 | text = r.content.decode('utf-8') 572 | bib = readBibtexString(text)[0] 573 | 574 | except Exception as e: 575 | print(e.__class__.__name__, e) 576 | 577 | bib['abstract'] = paper.abstract 578 | for key in ['abstract', 'eprint', 'url']: 579 | if key in paper.bib: 580 | bib[key] = paper.bib[key] 581 | paper.bib = bib 582 | 583 | 584 | class SemanticScholarScraper(NiceScraper): 585 | 586 | @classmethod 587 | def loadSSAuthors(self, authors_dict): 588 | authors = [] 589 | for author in authors_dict: 590 | bits = author['name'].split() 591 | new_author = {'given': bits[0], 'family': bits[-1]} 592 | if len(bits) > 2: 593 | new_author['middle'] = " ".join(bits[1:len(bits) - 1]) 594 | authors.append(new_author) 595 | return authors 596 | 597 | def search(self, title, identity, max_results=5, min_year=None, max_year=None): 598 | url = 'https://www.semanticscholar.org/api/1/search' 599 | 600 | yearFilter = None 601 | 602 | if min_year or max_year: 603 | yearFilter = {} 604 | if not max_year: 605 | now = datetime.datetime.now() 606 | max_year = now.year 607 | 608 | if min_year: 609 | yearFilter['min'] = int(min_year) 610 | if max_year: 611 | yearFilter['max'] = int(max_year) 612 | 613 | results_left = max_results 614 | page_num = 1 615 | 616 | return_results = [] 617 | 618 | while results_left > 0: 619 | data = {"queryString": title, 620 | "page": page_num, 621 | "pageSize": 10, 622 | "sort": "relevance", 623 | "authors": [], 624 | "coAuthors": [], 625 | "venues": [], 626 | "yearFilter": yearFilter, 627 | "requireViewablePdf": False, 628 | "publicationTypes": [], 629 | "externalContentTypes": [] 630 | } 631 | 632 | r = 
self.request(url, data=data, post=True) 633 | 634 | results_dict = r.json() 635 | 636 | if results_dict.get('totalResults') and max_results != results_dict['totalResults']: 637 | max_results = min(max_results, results_dict['totalResults']) 638 | results_left = max_results 639 | 640 | if 'results' in results_dict: 641 | results = results_dict['results'] 642 | else: 643 | results = [] 644 | 645 | results_left -= len(results) 646 | 647 | for index, res in enumerate(results[:results_left]): 648 | 649 | res_title = res['title']['text'] 650 | 651 | authors_processed = [] 652 | for author_list in res['authors']: 653 | for author_dict in author_list: 654 | if 'name' in author_dict: 655 | authors_processed.append(author_dict) 656 | 657 | authors = self.loadSSAuthors(authors_processed) 658 | 659 | bib = {'title': res_title, 660 | 'abstract': res['paperAbstract']['text'], 661 | 'year': res['year']['text'], 662 | 'url': 'https://www.semanticscholar.org/paper/{}/{}'.format(res['slug'], 663 | res['id']), 664 | 'author': authorListFromDict(authors), 665 | } 666 | 667 | if res.get('doiInfo'): 668 | bib['doi'] = res['doiInfo'].get('doi') 669 | 670 | extra_data = { 671 | 'ss_id': res['id'], 672 | 'x_authors': authors 673 | } 674 | 675 | new_res = SearchResult(index, bib, 'semanticscholar', extra_data) 676 | 677 | for link in res.get('links', []): 678 | if isPDFURL(link['url']): 679 | bib['eprint'] = link['url'] 680 | addUrlIfNew(new_res, link['url'], 'pdf', 'semanticscholar') 681 | 682 | venue = res['venue'].get('text') 683 | extra_data['venue'] = venue 684 | return_results.append(new_res) 685 | 686 | return return_results 687 | 688 | def getMetadata(self, paper, get_citing_papers=False): 689 | if not paper.doi and not paper.extra_data.get('ss_id'): 690 | raise ValueError('paper has no DOI or SSID') 691 | 692 | if paper.extra_data.get('ss_id'): 693 | unique_id = paper.extra_data.get('ss_id') 694 | else: 695 | unique_id = paper.doi 696 | 697 | url = 'https://api.semanticscholar.org/v1/paper/' + unique_id 698 | 699 | r = self.request(url) 700 | d = r.json() 701 | 702 | if 'error' in d: 703 | print("SemanticScholar error:", d['error']) 704 | return 705 | 706 | for field in ['abstract', 'year', 'venue']: 707 | if d.get(field): 708 | paper.bib[field] = str(d[field]) 709 | 710 | if d.get('arxivId'): 711 | paper.arxivid = d['arxivId'] 712 | 713 | for topic in d['topics']: 714 | # we really don't need to store the url, it's just 715 | # https://www.semanticscholar.org/topic/{topicId} 716 | del topic['url'] 717 | 718 | authors = self.loadSSAuthors(d['authors']) 719 | paper.bib['author'] = authorListFromDict(authors) 720 | 721 | paper.extra_data['ss_topics'] = d['topics'] 722 | paper.extra_data['ss_authors'] = d['authors'] 723 | paper.extra_data['ss_id'] = d['paperId'] 724 | 725 | if get_citing_papers: 726 | citing_papers = [] 727 | for index, citation in enumerate(d['citations']): 728 | ss_authors = semanticscholarmetadata.loadSSAuthors(citation['authors']) 729 | authors = authorListFromDict(ss_authors) 730 | 731 | bib = { 732 | 'title': citation['title'], 733 | 'author': authors, 734 | 'year': citation['year'], 735 | 'doi': citation.get('doi'), 736 | } 737 | bib = fixBibData(bib, index) 738 | 739 | extra_data = { 740 | 'ss_id': citation['paperId'], 741 | 'ss_influential': citation['isInfluential'], 742 | 'ss_authors': ss_authors 743 | } 744 | if citation.get('arxivId'): 745 | extra_data['arxivid'] = citation.get('arxivId') 746 | 747 | new_paper = Paper(bib, extra_data) 748 | citing_papers.append(new_paper) 749 | return 
paper, citing_papers 750 | return paper 751 | 752 | 753 | crossref_scraper = CrossrefScraper() 754 | scholar_scraper = GScholarScraper(basic_delay=0.1) 755 | unpaywall_scraper = UnpaywallScraper(rate_limit=100000, rate_interval='24h') 756 | pubmed_scraper = PubMedScraper() 757 | arxiv_scraper = arXivSearcher() 758 | semanticscholarmetadata = SemanticScholarScraper() 759 | 760 | 761 | def enrichAndUpdateMetadata(papers, paperstore, identity): 762 | successful = [] 763 | unsuccessful = [] 764 | 765 | for paper in tqdm(papers, desc='Enriching metadata'): 766 | try: 767 | enrichMetadata(paper, identity) 768 | successful.append(paper) 769 | except Exception as e: 770 | print(e.__class__.__name__, e) 771 | unsuccessful.append(paper) 772 | 773 | paperstore.updatePapers([paper]) 774 | 775 | return successful, unsuccessful 776 | 777 | 778 | def enrichMetadata(paper: Paper, identity): 779 | """ 780 | Tries to retrieve metadata from Crossref and abstract from SemanticScholar for a given paper, 781 | Google Scholar bib if all else fails 782 | 783 | :param paper: Paper instance 784 | """ 785 | paper.title = basicTitleCleaning(paper.title) 786 | original_title = paper.title 787 | 788 | if paper.pmid and not paper.extra_data.get("done_pubmed"): 789 | pubmed_scraper.enrichWithMetadata(paper) 790 | paper.extra_data['done_pubmed'] = True 791 | 792 | # if we don't have a DOI, we need to find it on Crossref 793 | if not paper.doi and not paper.extra_data.get('done_crossref', False): 794 | crossref_scraper.matchPaperFromResults(paper, identity) 795 | 796 | if paper.doi: 797 | new_bib = getBibtextFromDOI(paper.doi) 798 | paper = mergeResultData(paper, 799 | SearchResult(1, new_bib[0], 'crossref', paper.extra_data)) 800 | paper.extra_data['done_crossref'] = True 801 | 802 | # if we have a DOI and we haven't got the abstract yet 803 | if paper.doi and not paper.extra_data.get('done_semanticscholar'): 804 | semanticscholarmetadata.getMetadata(paper) 805 | paper.extra_data['done_semanticscholar'] = True 806 | 807 | # try PubMed if we still don't have a PMID 808 | if not paper.pmid and not paper.extra_data.get('done_pubmed'): 809 | # if (not paper.doi or not paper.has_full_abstract) and not paper.pmid and not paper.extra_data.get('done_pubmed'): 810 | if pubmed_scraper.matchPaperFromResults(paper, identity, ok_title_distance=0.4): 811 | pubmed_scraper.enrichWithMetadata(paper) 812 | paper.extra_data['done_pubmed'] = True 813 | 814 | # still no DOI? 
maybe we can get something from SemanticScholar 815 | if not paper.extra_data.get('ss_id') and not paper.extra_data.get('done_semanticscholar'): 816 | semanticscholarmetadata.matchPaperFromResults(paper, identity) 817 | paper.extra_data['done_semanticscholar'] = True 818 | 819 | # # time to try Scopus, see if it's behind a paywall 820 | # if not paper.doi and not paper.extra_data.get('done_scopus'): 821 | # semanticscholarmetadata.getMetadata(paper) 822 | # paper.extra_data['done_semanticscholar'] = True 823 | 824 | # if we don't have an abstract maybe it's on arXiv 825 | if not paper.has_full_abstract and not paper.extra_data.get('done_arxiv'): 826 | # if not paper.extra_data.get('done_arxiv'): 827 | arxiv_scraper.matchPaperFromResults(paper, identity, ok_title_distance=0.35) 828 | paper.extra_data['done_arxiv'] = True 829 | 830 | # try to get open access links if DOI present and missing PDF link 831 | if not paper.has_pdf_link and paper.doi and not paper.extra_data.get('done_unpaywall'): 832 | unpaywall_scraper.getMetadata(paper, identity) 833 | paper.extra_data['done_unpaywall'] = True 834 | 835 | # if all else has failed but we have a link to Google Scholar bib data, get that 836 | if not paper.year and paper.extra_data.get('url_scholarbib'): 837 | scholar_scraper.getBibtex(paper) 838 | 839 | if paper.title != original_title: 840 | print('Original: %s\nNew: %s' % (original_title, paper.title)) 841 | paper.bib = fixBibData(paper.bib, 1) 842 | 843 | 844 | def test(): 845 | title = 'NegBio: a high-performance tool for negation and uncertainty detection in radiology reports' 846 | 847 | # res = searchSemanticScholar(title) 848 | 849 | # res = searchCrossref(title) 850 | # for r in res: 851 | # print(json.dumps(r, indent=3)) 852 | pubmed_scraper.search(title, 'dduma@ed.ac.uk') 853 | 854 | 855 | if __name__ == '__main__': 856 | test() 857 | --------------------------------------------------------------------------------
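The UnpaywallScraper above depends on the shape of the Unpaywall v2 response. A minimal standalone sketch of that call, assuming only the requests library; the DOI and contact email below are placeholders:

import requests

doi = '10.7717/peerj.4375'        # placeholder DOI, for illustration only
email = 'you@example.com'         # Unpaywall requires a contact email
r = requests.get('https://api.unpaywall.org/v2/%s?email=%s' % (doi, email))
data = r.json()
best = data.get('best_oa_location') or {}
# best_oa_location carries separate PDF and landing-page URLs when they are known
print(best.get('url_for_pdf'), best.get('url_for_landing_page'))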
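A short sketch of resolving PMIDs to DOI and PMCID with PubMedScraper.getAlternateIDs(), assuming the module is importable as search.metadata_harvest; the PMID is a placeholder:

from search.metadata_harvest import pubmed_scraper

# Returns a dict keyed by PMID, each value holding 'doi' and/or 'pmcid' when the
# NCBI ID converter knows them
id_map = pubmed_scraper.getAlternateIDs(['23193287'])   # placeholder PMID
for pmid, ids in id_map.items():
    print(pmid, ids.get('doi'), ids.get('pmcid'))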
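A minimal sketch of enriching a single record with enrichMetadata(), assuming Paper accepts a bib dict plus an extra_data dict (as in getMetadata() above) and that the caller supplies their own contact email:

from db.data import Paper
from search.metadata_harvest import enrichMetadata

bib = {'title': 'NegBio: a high-performance tool for negation and uncertainty '
                'detection in radiology reports'}
paper = Paper(bib, {})
enrichMetadata(paper, identity='you@example.com')   # fills in DOI, abstract and URLs where found
print(paper.doi, paper.extra_data.get('done_crossref'))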
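When called with get_citing_papers=True, SemanticScholarScraper.getMetadata() also returns Paper objects for the citing papers, presumably what snowball_citations.py builds on. A hedged sketch, assuming the paper already carries a DOI; the DOI is a placeholder:

from db.data import Paper
from search.metadata_harvest import semanticscholarmetadata

paper = Paper({'title': 'Example title'}, {})
paper.doi = '10.7717/peerj.4375'          # placeholder DOI, for illustration only
# Returns (paper, citing_papers) when the lookup succeeds
paper, citing = semanticscholarmetadata.getMetadata(paper, get_citing_papers=True)
for citing_paper in citing[:5]:
    print(citing_paper.bib.get('title'), citing_paper.extra_data.get('ss_influential'))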
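Finally, a sketch of bulk enrichment through enrichAndUpdateMetadata(). The _NullStore class is a hypothetical stand-in that only satisfies the updatePapers() call; in the real project a proper paper store would be passed instead:

from db.data import Paper
from search.metadata_harvest import enrichAndUpdateMetadata

class _NullStore:
    # hypothetical stand-in: accepts the per-paper update and discards it
    def updatePapers(self, papers):
        pass

papers = [Paper({'title': 'Example title one'}, {}),
          Paper({'title': 'Example title two'}, {})]
ok, failed = enrichAndUpdateMetadata(papers, _NullStore(), identity='you@example.com')
print(len(ok), 'enriched,', len(failed), 'failed')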