├── README.md
└── Scrape_PubMed.py

/README.md:
--------------------------------------------------------------------------------
# Web-Scraping-PubMed
This Python program takes a search keyword and writes the abstracts and formatted bibliography entries of the top n PubMed results to a CSV file.

Please use this script responsibly and do not put too much strain on the PubMed servers.

If you have any questions or comments, don't hesitate to contact me.

--------------------------------------------------------------------------------
/Scrape_PubMed.py:
--------------------------------------------------------------------------------
from bs4 import BeautifulSoup
import requests
import urllib.request, urllib.parse, urllib.error
import ssl
import json
import calendar
import pandas as pd

# ESearch returns the PubMed IDs (PMIDs) matching a search term.
url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&retmode=json&retmax=NUM&sort=relevance&term=KEYWORD"

# Ask the user for the keyword and the number of results, then substitute
# these values into the URL template. The keyword is URL-encoded so that
# spaces and special characters do not break the request.
keyword = input('Please enter the keyword ')
num = int(input('Please enter the number of results '))
url = url.replace('NUM', str(num))
url = url.replace('KEYWORD', urllib.parse.quote_plus(keyword))

# Fall back to an unverified SSL context on platforms whose default
# certificate store cannot verify the NCBI certificate.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # Legacy Python that doesn't verify HTTPS certificates by default
    pass
else:
    # Handle target environment that doesn't support HTTPS verification
    ssl._create_default_https_context = _create_unverified_https_context

webpage = urllib.request.urlopen(url).read()
dict_page = json.loads(webpage)
idlist = dict_page["esearchresult"]["idlist"]


# Remove square brackets from titles (PubMed wraps translated titles in them).
def strip_brackets(s):
    no_brackets = ""
    dont_want = ['[', ']']
    # traverse the string, keeping every character that is not a bracket
    for char in s:
        if char not in dont_want:
            no_brackets += char
    return no_brackets
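
# For orientation, a quick sanity check of the two pieces above (the PMID
# values are made up for illustration):
#
#   >>> strip_brackets('[Translated article title].')
#   'Translated article title.'
#
# and the relevant part of the ESearch JSON response looks roughly like
#   {"esearchresult": {"idlist": ["12345678", "23456789"], ...}}
# which is why idlist above is a plain list of PMID strings.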

# Take the soup of one EFetch record and extract every element needed for
# the bibliography entry and the abstract.
# Example output: A. Bester, R. Zelazny, and H. Ellison, “On the Role of Viruses in Future Epidemics,” Journal of Irreproducible Results 3(4) pp. 29–35 (Mar. 2103). PUBMED: 23456789; DOI 12.1119/2847595.

def get_bibliography(soup):

    # For each element, start from an empty string and fill in the real
    # value only if it exists in the record.

    article = soup.find('article')
    journal = soup.find('journal')

    authorlist = article.find('authorlist')

    # Build an "A. Bester, R. Zelazny, and H. Ellison" style author string.
    authors = ""
    if authorlist:
        last_names = authorlist.find_all('lastname')
        for i in range(len(last_names)):
            authors += authorlist.find_all('initials')[i].text + '. '
            authors += last_names[i].text
            if i == len(last_names) - 2:
                authors += ' and '
            elif i != len(last_names) - 1:
                authors += ', '
        authors += ", "

    ArticleTitle = ''
    if article.find('articletitle'):
        ArticleTitle = '"'
        title_str = strip_brackets(article.find('articletitle').text)
        ArticleTitle += title_str
        # If the title already ends with a period, leave it and put the
        # comma after the quotation marks. - Professor Bishop
        if ArticleTitle[-1] == '.':
            ArticleTitle += '", '
        else:
            ArticleTitle += '," '

    volume = ''
    if journal.find('volume'):
        volume = journal.find('volume').text
        if soup.find('issue'):
            volume += '(' + soup.find('issue').text + ')'
        volume += ' '

    # Page ranges get "pp.", single pages get "p.".
    page = ''
    if article.find('pagination'):
        page_str = article.find('pagination').text.strip('\n')
        if '-' in page_str:
            page = 'pp. ' + page_str + ' '
        else:
            page = 'p. ' + page_str + ' '

    journal_title = ''
    if journal.find('title'):
        journal_title = journal.find('title').text + ' '

    JournalIssue = journal.find('journalissue')

    # The date reads "(Mar. 2103). " when a month is given, "(2103). " when
    # only a year is given, and stays empty otherwise.
    month = JournalIssue.find('month')
    date = ''
    if month:
        month = month.text
        if len(month) < 3:
            # Numeric months are converted to their abbreviations.
            month = calendar.month_abbr[int(month)]
        year = JournalIssue.find('year').text
        date = '(' + month + '. ' + year + '). '
    elif JournalIssue.find('year'):
        date = '(' + JournalIssue.find('year').text + '). '

    pubmed = ''
    if soup.find('articleid'):
        pubmed = 'PUBMED: ' + soup.find('articleid').text + '; '

    # Prefer a DOI if one is listed; fall back to a PII otherwise.
    doi_pii_str = ""
    elocations = article.find_all('elocationid')
    for eloc in elocations:
        if 'doi' in str(eloc):
            doi_pii_str = "DOI " + eloc.text + "."
            break
    if not doi_pii_str:
        for eloc in elocations:
            if 'pii' in str(eloc):
                doi_pii_str = "PII " + eloc.text + "."
                break

    abstract = ''
    if article.find('abstracttext'):
        abstract = article.find('abstracttext').text

    result = [authors, ArticleTitle, journal_title, volume, page, date,
              pubmed, doi_pii_str, abstract]

    return result
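
# Aside: EFetch also accepts a comma-separated list of IDs, so all records
# could be pulled in a single request instead of one request per PMID, which
# puts less strain on the PubMed servers (NCBI asks for no more than roughly
# three requests per second without an API key). A minimal sketch, not wired
# into the loop below:
#
#   batch_url = ("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
#                "?db=pubmed&retmode=xml&id=" + ",".join(idlist))
#   batch_soup = BeautifulSoup(requests.get(batch_url).content, "html.parser")
#   for record in batch_soup.find_all('pubmedarticle'):
#       ...  # each record can be passed to get_bibliography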

articles_list = []

# Loop over each PMID, fetch its XML record with EFetch, and feed the soup
# into get_bibliography.
for link in idlist:
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id=PMID"
    url = url.replace('PMID', link)

    r = requests.get(url)
    soup = BeautifulSoup(r.content, "html.parser")
    article = get_bibliography(soup)
    articles_list.append(article)

df = pd.DataFrame(articles_list)
df.columns = ['authors', 'ArticleTitle', 'journal_title', 'volume', 'pages', 'date', 'pubmed', 'doi_pii_str', 'abstract']
file_name = keyword + '_' + str(num) + '.csv'
df.to_csv(file_name)
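# A quick way to sanity-check the output afterwards (a hypothetical keyword
# 'aspirin' with 5 results would produce 'aspirin_5.csv'):
#
#   df_check = pd.read_csv(file_name, index_col=0)
#   print(df_check[['authors', 'ArticleTitle', 'date']].head())
--------------------------------------------------------------------------------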