├── README.md
└── Scrape_PubMed.py

/README.md:
--------------------------------------------------------------------------------
# Web-Scraping-PubMed
This Python program takes a search keyword and writes the abstracts and formatted bibliography entries of the top n PubMed results to a CSV file.

Please use this script responsibly and do not put too much strain on the PubMed servers.

If you have any questions or comments, don't hesitate to contact me.

--------------------------------------------------------------------------------
/Scrape_PubMed.py:
--------------------------------------------------------------------------------
from bs4 import BeautifulSoup
import requests
import urllib.request, urllib.parse, urllib.error
import ssl
import json
import calendar
import pandas as pd

# ESearch returns the PubMed IDs (PMIDs) matching a search term.
url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&retmode=json&retmax=NUM&sort=relevance&term=KEYWORD"

# Ask the user for the keyword and the number of results, then substitute
# these values into the URL template. The keyword is URL-encoded so that
# spaces and special characters do not break the request.
keyword = input('Please enter the keyword ')
num = int(input('Please enter the number of results '))
url = url.replace('NUM', str(num))
url = url.replace('KEYWORD', urllib.parse.quote_plus(keyword))

# Fall back to an unverified SSL context on platforms whose default
# certificate store cannot verify the NCBI certificate.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # Legacy Python that doesn't verify HTTPS certificates by default
    pass
else:
    # Handle target environment that doesn't support HTTPS verification
    ssl._create_default_https_context = _create_unverified_https_context

webpage = urllib.request.urlopen(url).read()
dict_page = json.loads(webpage)
idlist = dict_page["esearchresult"]["idlist"]


# Remove square brackets from titles (PubMed wraps translated titles in them).
def strip_brackets(s):
    no_brackets = ""
    dont_want = ['[', ']']
    # traverse the string, keeping every character that is not a bracket
    for char in s:
        if char not in dont_want:
            no_brackets += char
    return no_brackets
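
# For orientation, a quick sanity check of the two pieces above (the PMID
# values are made up for illustration):
#
#   >>> strip_brackets('[Translated article title].')
#   'Translated article title.'
#
# and the relevant part of the ESearch JSON response looks roughly like
#   {"esearchresult": {"idlist": ["12345678", "23456789"], ...}}
# which is why idlist above is a plain list of PMID strings.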

# Take the soup of one EFetch record and extract every element needed for
# the bibliography entry and the abstract.
# Example output: A. Bester, R. Zelazny, and H. Ellison, “On the Role of Viruses in Future Epidemics,” Journal of Irreproducible Results 3(4) pp. 29–35 (Mar. 2103). PUBMED: 23456789; DOI 12.1119/2847595.

def get_bibliography(soup):

    # For each element, start from an empty string and fill in the real
    # value only if it exists in the record.

    article = soup.find('article')
    journal = soup.find('journal')

    authorlist = article.find('authorlist')

    # Build an "A. Bester, R. Zelazny, and H. Ellison" style author string.
    authors = ""
    if authorlist:
        last_names = authorlist.find_all('lastname')
        for i in range(len(last_names)):
            authors += authorlist.find_all('initials')[i].text + '. '
            authors += last_names[i].text
            if i == len(last_names) - 2:
                authors += ' and '
            elif i != len(last_names) - 1:
                authors += ', '
        authors += ", "

    ArticleTitle = ''
    if article.find('articletitle'):
        ArticleTitle = '"'
        title_str = strip_brackets(article.find('articletitle').text)
        ArticleTitle += title_str
        # If the title already ends with a period, leave it and put the
        # comma after the quotation marks. - Professor Bishop
        if ArticleTitle[-1] == '.':
            ArticleTitle += '", '
        else:
            ArticleTitle += '," '

    volume = ''
    if journal.find('volume'):
        volume = journal.find('volume').text
        if soup.find('issue'):
            volume += '(' + soup.find('issue').text + ')'
        volume += ' '

    # Page ranges get "pp.", single pages get "p.".
    page = ''
    if article.find('pagination'):
        page_str = article.find('pagination').text.strip('\n')
        if '-' in page_str:
            page = 'pp. ' + page_str + ' '
        else:
            page = 'p. ' + page_str + ' '

    journal_title = ''
    if journal.find('title'):
        journal_title = journal.find('title').text + ' '

    JournalIssue = journal.find('journalissue')

    # The date reads "(Mar. 2103). " when a month is given, "(2103). " when
    # only a year is given, and stays empty otherwise.
    month = JournalIssue.find('month')
    date = ''
    if month:
        month = month.text
        if len(month) < 3:
            # Numeric months are converted to their abbreviations.
            month = calendar.month_abbr[int(month)]
        year = JournalIssue.find('year').text
        date = '(' + month + '. ' + year + '). '
    elif JournalIssue.find('year'):
        date = '(' + JournalIssue.find('year').text + '). '

    pubmed = ''
    if soup.find('articleid'):
        pubmed = 'PUBMED: ' + soup.find('articleid').text + '; '

    # Prefer a DOI if one is listed; fall back to a PII otherwise.
    doi_pii_str = ""
    elocations = article.find_all('elocationid')
    for eloc in elocations:
        if 'doi' in str(eloc):
            doi_pii_str = "DOI " + eloc.text + "."
            break
    if not doi_pii_str:
        for eloc in elocations:
            if 'pii' in str(eloc):
                doi_pii_str = "PII " + eloc.text + "."
                break

    abstract = ''
    if article.find('abstracttext'):
        abstract = article.find('abstracttext').text

    result = [authors, ArticleTitle, journal_title, volume, page, date,
              pubmed, doi_pii_str, abstract]

    return result
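
# Aside: EFetch also accepts a comma-separated list of IDs, so all records
# could be pulled in a single request instead of one request per PMID, which
# puts less strain on the PubMed servers (NCBI asks for no more than roughly
# three requests per second without an API key). A minimal sketch, not wired
# into the loop below:
#
#   batch_url = ("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
#                "?db=pubmed&retmode=xml&id=" + ",".join(idlist))
#   batch_soup = BeautifulSoup(requests.get(batch_url).content, "html.parser")
#   for record in batch_soup.find_all('pubmedarticle'):
#       ...  # each record can be passed to get_bibliography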

articles_list = []

# Loop over each PMID, fetch its XML record with EFetch, and feed the soup
# into get_bibliography.
for link in idlist:
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id=PMID"
    url = url.replace('PMID', link)

    r = requests.get(url)
    soup = BeautifulSoup(r.content, "html.parser")
    article = get_bibliography(soup)
    articles_list.append(article)

df = pd.DataFrame(articles_list)
df.columns = ['authors', 'ArticleTitle', 'journal_title', 'volume', 'pages', 'date', 'pubmed', 'doi_pii_str', 'abstract']
file_name = keyword + '_' + str(num) + '.csv'
df.to_csv(file_name)
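# A quick way to sanity-check the output afterwards (a hypothetical keyword
# 'aspirin' with 5 results would produce 'aspirin_5.csv'):
#
#   df_check = pd.read_csv(file_name, index_col=0)
#   print(df_check[['authors', 'ArticleTitle', 'date']].head())
--------------------------------------------------------------------------------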