├── PyperGrabber
│   ├── work_n_shutdown.sh
│   ├── retriever
│   │   ├── flatten_list.py
│   │   ├── web2pdf.py
│   │   ├── string_funcs.py
│   │   ├── log_this.py
│   │   ├── easy_parallelize.py
│   │   ├── get_title.py
│   │   └── retriever.py
│   ├── config.ini
│   ├── mail2pmid
│   │   ├── mail2pmid.py
│   │   └── imbox
│   │       ├── utils.py
│   │       ├── imap.py
│   │       ├── query.py
│   │       ├── __init__.py
│   │       └── parser.py
│   ├── config.py
│   └── PyperGrabber.py
├── README.md
└── LICENSE

/PyperGrabber/work_n_shutdown.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # due to the long time to completion - let the script run while you are away, then power off the machine
3 |
4 | ./PyperGrabber.py && sudo poweroff
--------------------------------------------------------------------------------
/PyperGrabber/retriever/flatten_list.py:
--------------------------------------------------------------------------------
1 | def flatten_list(lis):
2 |     """Given a list, possibly nested to any level, return it flattened."""
3 |     new_lis = []
4 |     for item in lis:
5 |         if isinstance(item, list):  # recurse into nested lists
6 |             new_lis.extend(flatten_list(item))
7 |         else:
8 |             new_lis.append(item)
9 |     return new_lis
10 |
--------------------------------------------------------------------------------
/PyperGrabber/retriever/web2pdf.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import subprocess
4 |
5 |
6 | def pdf_print(pmid, save_dir):
7 |     link = 'https://www.ncbi.nlm.nih.gov/pubmed/' + pmid
8 |
9 |     filename = save_dir + pmid + '.pdf'
10 |
11 |     # number of trials:
12 |     trials = 7
13 |     for trial in range(trials):
14 |         try:
15 |             cmd = 'wkhtmltopdf --quiet --page-size A4 '
16 |             if subprocess.call([cmd + link + ' ' + filename], shell=True) == 0: break  # run wkhtmltopdf; stop retrying once it exits cleanly
17 |         except IOError:
18 |             print 'Problem with wkhtmltopdf. Trying again'
19 |
20 |
21 |
22 |
23 |
--------------------------------------------------------------------------------
/PyperGrabber/config.ini:
--------------------------------------------------------------------------------
1 | [general]
2 | version = 0.1
3 |
4 | [paths]
5 | save_to = /Desktop/pulled_pdfs/
6 |
7 | [email_auth]
8 | em_server =
9 | em_usr =
10 | em_pw =
11 | sender = efback@ncbi.nlm.nih.gov
12 |
13 | [crawler]
14 | depth = 2
15 | # choose: lxml or html5lib:
16 | cr_parser = html5lib
17 | usr_agent = Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36
18 | email =
19 | pubmed_base_url = https://www.ncbi.nlm.nih.gov/pubmed/
20 | scihub_base_url = http://www.ncbi.nlm.nih.gov.sci-hub.cc/pubmed/
21 | schola_base_url = https://scholar.google.de/scholar?as_vis=1&q={}&hl=en&as_sdt=1,5
22 |
--------------------------------------------------------------------------------
/PyperGrabber/retriever/string_funcs.py:
--------------------------------------------------------------------------------
1 | from string import punctuation, whitespace
2 | import re
3 |
4 | def rem_whitespace(string):
5 |     """ careful to keep this order of patterns, or duplicate whitespace created in the first round
6 |     will not be removed
7 |     """
8 |     unwanted_chars = punctuation + whitespace
9 |
10 |     pat_l = [r'[' + unwanted_chars + ']',
11 |              r'\s+',
12 |              r' ',
13 |              r' \\',
14 |              r' \ '
15 |              ]
16 |
17 |     for p in pat_l:
18 |         rx = re.compile(p)
19 |         string = re.sub(rx, ' ', string)
20 |
21 |     return string.strip()
22 |
23 |
24 | def encode(ustr):
25 |     return ustr.encode('utf8')
26 |
--------------------------------------------------------------------------------
/PyperGrabber/retriever/log_this.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 |
4 | # logging functions:
5 | def log_search(msg):
6 |     # print msg
7 |     log_filename = 'search.log'
8 |     logging.basicConfig(filename=log_filename,
9 |                         level=logging.INFO,
10 |                         format='%(asctime)s %(message)s'
11 |                         )
12 |     logging.info(' - ' + msg)
13 |
14 |
15 | def log_download(msg):
16 |     # print msg
17 |     LOG_FILENAME = 'download.log'
18 |     logging.basicConfig(filename=LOG_FILENAME,
19 |                         level=logging.INFO,
20 |                         format='%(asctime)s %(message)s'
21 |                         )
22 |     logging.info(' - ' + msg)
23 |
--------------------------------------------------------------------------------
/PyperGrabber/mail2pmid/mail2pmid.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from imbox import Imbox
4 | from config import *
5 | import re
6 | import sys
7 |
8 |
9 | pmids = []
10 | msgs_l = []
11 |
12 |
13 | def mail2pmid():
14 |     imbox = Imbox(em_server, username=em_usr, password=em_pw)
15 |
16 |     print '\n\nFetching emails ...',
17 |     email_gen = (imbox.messages(sent_from=sender))
18 |     while True:
19 |         try:
20 |             msgs_l.append(email_gen.next())
21 |         except StopIteration:
22 |             break
23 |         except Exception as e:
24 |             print(e)
25 |             sys.exit(1)
26 |
27 |     print("Done.\nSuccessfully retrieved {} messages from NCBI.".format(len(msgs_l)))
28 |
29 |     msgs_str = str(msgs_l)
30 |
31 |     pat = re.compile(r'pubmed\/(?P<pmid>\d+)')
32 |
33 |     pmids = re.findall(pat, msgs_str)
34 |
35 |     pmids = list(set(pmids))
36 |
37 |     print('Found {} unique PMIDs.\n'.format(len(pmids)))
38 |
39 |     return pmids
--------------------------------------------------------------------------------
/PyperGrabber/mail2pmid/imbox/utils.py:
--------------------------------------------------------------------------------
1 | from __future__ import unicode_literals
2 | from six import PY3
3 |
4 | import logging
5 | logger = logging.getLogger(__name__)
6 |
7 | if PY3:
8 |     def str_encode(value='', encoding=None, errors='strict'):
9 |         logger.debug("Encode str {} with encoding {} and errors {}".format(value, encoding, errors))
10 |         return str(value, encoding, errors)
11 |
12 |     def str_decode(value='', encoding=None, errors='strict'):
13 |         if isinstance(value, str):
14 |             return bytes(value, encoding, errors).decode('utf-8')
15 |         elif isinstance(value, bytes):
16 |             return value.decode(encoding or 'utf-8', errors=errors)
17 |         else:
18 |             raise TypeError("Cannot decode '{}' object".format(value.__class__))
19 | else:
20 |     def str_encode(string='', encoding=None, errors='strict'):
21 |         return unicode(string, encoding, errors)
22 |
23 |     def str_decode(value='', encoding=None, errors='strict'):
24 |         return value.decode(encoding, errors)
25 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # PyperGrabber
2 | Fetches PubMed article IDs (PMIDs) from your email inbox, then crawls **PubMed**, **Google Scholar** and **Sci-Hub** for the respective PDF files.
3 |
4 |
5 | PubMed can send you regular updates on new articles matching your specified search criteria. PyperGrabber automatically downloads those papers, saving you the time of tracking down and downloading them manually. When no PDF is found, PyperGrabber saves the PubMed abstract of the respective article as a PDF instead. All files are named after their PMID for convenience.
6 |
7 |
8 | ## NOTES:
9 | - _Messy code ahead!_
10 | - The program may halt without an error message. The source of this bug is yet to be determined.
11 | - The web crawler function may be used with other sources of PMIDs than email (e.g. a command-line parameter or a file holding a list of PMIDs)
12 |
13 |
14 | ## Required dependencies:
15 |     sudo apt-get install wkhtmltopdf
16 |     sudo pip install pypdf
17 |
18 | ## USAGE:
19 | - **Step 1** - Enter your email access data in `config.ini` or prepare to be prompted (works with IMAP)
20 | - **Step 2** - Start with: `python ./PyperGrabber.py`
21 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2016 bfelder
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /PyperGrabber/retriever/easy_parallelize.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Pool 2 | # from multiprocessing.dummy import Pool # use this module for threading: default is processing 3 | 4 | 5 | """ 6 | NOTE: WORKS only when only a single argument/parameter needs to be passed to worker func! 7 | 8 | http://chriskiehl.com/article/parallelism-in-one-line/ 9 | https://www.binpress.com/tutorial/simple-python-parallelism/121 10 | 11 | speed of multithreading vs. multiprocessing: 12 | http://eli.thegreenplace.net/2012/01/16/python-parallelizing-cpu-bound-tasks-with-multiprocessing/ 13 | 14 | multiple arguments: 15 | https://stackoverflow.com/questions/5442910/python-multiprocessing-pool-map-for-multiple-arguments 16 | """ 17 | 18 | 19 | def easy_parallelize(func, data, pool_size=None): 20 | if pool_size is None or pool_size < 1: # make number of workers fit size of input data, if not specified otherwise 21 | pool = Pool(processes=len(data)) 22 | else: 23 | pool = Pool(processes=pool_size) 24 | 25 | results = pool.map(func, data) 26 | 27 | cleaned = filter(None, results) # cleaning out None results 28 | 29 | pool.close() 30 | pool.join() 31 | 32 | return cleaned 33 | 34 | 35 | -------------------------------------------------------------------------------- /PyperGrabber/mail2pmid/imbox/imap.py: -------------------------------------------------------------------------------- 1 | from imaplib import IMAP4, IMAP4_SSL 2 | 3 | import logging 4 | import ssl as pythonssllib 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class ImapTransport(object): 10 | 11 | def __init__(self, hostname, port=None, ssl=True): 12 | self.hostname = hostname 13 | self.port = port 14 | kwargs = {} 15 | 16 | if ssl: 17 | self.transport = IMAP4_SSL 18 | if not self.port: 19 | self.port = 993 20 | 21 | else: 22 | self.transport = IMAP4 23 | if not self.port: 24 | self.port = 143 25 | 26 | self.server = self.transport(self.hostname, self.port) 27 | logger.debug("Created IMAP4 transport for {host}:{port}" 28 | .format(host=self.hostname, port=self.port)) 29 | 30 | def list_folders(self): 31 | logger.debug("List all folders in mailbox") 32 | return self.server.list() 33 | 34 | def connect(self, username, password): 35 | self.server.login(username, password) 36 | self.server.select() 37 | logger.debug("Logged into server {} and selected mailbox 'INBOX'" 38 | .format(self.hostname)) 39 | return self.server 40 | -------------------------------------------------------------------------------- /PyperGrabber/config.py: -------------------------------------------------------------------------------- 1 | from ConfigParser import SafeConfigParser 2 | from os.path import expanduser 3 | import getpass 4 | 5 | parser = SafeConfigParser() 6 | parser.read('config.ini') 7 | 8 | VERSION = parser.get('general', 'version') 9 | 10 | # email access: 11 | em_server = parser.get('email_auth', 'em_server') 12 | em_usr = parser.get('email_auth', 'em_usr') 13 | em_pw = parser.get('email_auth', 'em_pw') 14 | sender = parser.get('email_auth', 'sender') 15 | 16 | # prompting for email user name and password in 
case not provided by config.ini 17 | if len(em_usr) == 0: 18 | em_usr = getpass.getpass(prompt='Input username for email account: ') 19 | if len(em_pw) == 0: 20 | em_pw = getpass.getpass() 21 | 22 | 23 | 24 | # most common user agent according to: https://techblog.willshouse.com/2012/01/03/most-common-user-agents/ 25 | USER_AGENT = parser.get('crawler', 'usr_agent') 26 | email = parser.get('crawler', 'email') 27 | cr_parser = parser.get('crawler', 'cr_parser') 28 | pubmed_base_url = parser.get('crawler', 'pubmed_base_url') 29 | scihub_base_url = parser.get('crawler', 'scihub_base_url') 30 | schola_base_url = parser.get('crawler', 'schola_base_url') 31 | 32 | # dir to save files to: 33 | home = expanduser("~") 34 | save_dir = home + parser.get('paths', 'save_to') 35 | tmp_dir = save_dir + '.tmp/' # hidden 36 | -------------------------------------------------------------------------------- /PyperGrabber/mail2pmid/imbox/query.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import logging 3 | # TODO - Validate query arguments 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | IMAP_MONTHS = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", 8 | "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"] 9 | 10 | 11 | def format_date(date): 12 | 13 | return "%s-%s-%s" % (date.day, IMAP_MONTHS[date.month - 1], date.year) 14 | 15 | 16 | def build_search_query(**kwargs): 17 | 18 | # Parse keyword arguments 19 | unread = kwargs.get('unread', False) 20 | sent_from = kwargs.get('sent_from', False) 21 | sent_to = kwargs.get('sent_to', False) 22 | date__gt = kwargs.get('date__gt', False) 23 | if type(date__gt) is datetime.date: 24 | date__gt = format_date(date__gt) 25 | date__lt = kwargs.get('date__lt', False) 26 | if type(date__lt) is datetime.date: 27 | date__lt = format_date(date__lt) 28 | subject = kwargs.get('subject') 29 | 30 | query = [] 31 | 32 | if unread: 33 | query.append("(UNSEEN)") 34 | 35 | if sent_from: 36 | query.append('(FROM "%s")' % sent_from) 37 | 38 | if sent_to: 39 | query.append('(TO "%s")' % sent_to) 40 | 41 | if date__gt: 42 | query.append('(SINCE "%s")' % date__gt) 43 | 44 | if date__lt: 45 | query.append('(BEFORE "%s")' % date__lt) 46 | 47 | if subject is not None: 48 | query.append('(SUBJECT "%s")' % subject) 49 | 50 | if query: 51 | logger.debug("IMAP query: {}".format(" ".join(query))) 52 | return " ".join(query) 53 | 54 | logger.debug("IMAP query: {}".format("(ALL)")) 55 | return "(ALL)" 56 | -------------------------------------------------------------------------------- /PyperGrabber/retriever/get_title.py: -------------------------------------------------------------------------------- 1 | from Bio import Entrez 2 | from string_funcs import rem_whitespace 3 | 4 | 5 | # get title by polling NVBI with PMID as input 6 | def get_title(query, email): 7 | Entrez.email = email # Always tell NCBI who you are 8 | 9 | try: 10 | esear_handle = Entrez.esearch(db="pubmed", 11 | sort='relevance', 12 | retmax='1', 13 | retmode='xml', 14 | term=query 15 | ) 16 | r1 = Entrez.read(esear_handle) 17 | # print 'results1: ', r1 18 | 19 | if int(r1['Count']) >= 1: 20 | list = r1["IdList"] 21 | 22 | for index in range(0, len(list)): 23 | listid = list[index] 24 | 25 | esum_handle = Entrez.esummary(db="pubmed", 26 | sort='relevance', 27 | retmax='1', 28 | retmode='xml', 29 | id=listid 30 | ) 31 | r2 = Entrez.read(esum_handle) 32 | ''' generates dic entry in the form: {'PMID':'title'}, 33 | (duplicate, leading, trailing) whitespaces are removed 34 | ''' 35 | 
36 | try: 37 | title = "'{}'".format(rem_whitespace(r2[0]['Title'])) 38 | except: 39 | title = None 40 | 41 | return title 42 | 43 | else: 44 | return None 45 | 46 | except: 47 | # in case of server error try to redo it by recursion: 48 | get_title(query, email) -------------------------------------------------------------------------------- /PyperGrabber/mail2pmid/imbox/__init__.py: -------------------------------------------------------------------------------- 1 | from imap import ImapTransport 2 | from parser import parse_email 3 | from query import build_search_query 4 | 5 | import logging 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class Imbox(object): 10 | 11 | def __init__(self, hostname, username=None, password=None, ssl=True, 12 | port=None, ssl_context=None): 13 | 14 | self.server = ImapTransport(hostname, ssl=ssl, port=port) 15 | 16 | self.hostname = hostname 17 | self.username = username 18 | self.password = password 19 | self.connection = self.server.connect(username, password) 20 | logger.info("Connected to IMAP Server with user {username} on {hostname}{ssl}".format( 21 | hostname=hostname, username=username, ssl=(" over SSL" if ssl else ""))) 22 | 23 | def logout(self): 24 | self.connection.close() 25 | self.connection.logout() 26 | logger.info("Disconnected from IMAP Server {username}@{hostname}".format( 27 | hostname=self.hostname, username=self.username)) 28 | 29 | def query_uids(self, **kwargs): 30 | query = build_search_query(**kwargs) 31 | message, data = self.connection.uid('search', None, query) 32 | if data[0] is None: 33 | return [] 34 | return data[0].split() 35 | 36 | def fetch_by_uid(self, uid): 37 | message, data = self.connection.uid('fetch', uid, '(BODY.PEEK[])') 38 | logger.debug("Fetched message for UID {}".format(int(uid))) 39 | raw_email = data[0][1] 40 | 41 | email_object = parse_email(raw_email) 42 | 43 | return email_object 44 | 45 | def fetch_list(self, **kwargs): 46 | uid_list = self.query_uids(**kwargs) 47 | logger.debug("Fetch all messages for UID in {}".format(uid_list)) 48 | 49 | for uid in uid_list: 50 | yield (uid, self.fetch_by_uid(uid)) 51 | 52 | def mark_seen(self, uid): 53 | logger.info("Mark UID {} with \\Seen FLAG".format(int(uid))) 54 | self.connection.uid('STORE', uid, '+FLAGS', '(\\Seen)') 55 | 56 | def delete(self, uid): 57 | logger.info("Mark UID {} with \\Deleted FLAG and expunge.".format(int(uid))) 58 | mov, data = self.connection.uid('STORE', uid, '+FLAGS', '(\\Deleted)') 59 | self.connection.expunge() 60 | 61 | def copy(self, uid, destination_folder): 62 | logger.info("Copy UID {} to {} folder".format(int(uid), str(destination_folder))) 63 | return self.connection.uid('COPY', uid, destination_folder) 64 | 65 | def move(self, uid, destination_folder): 66 | logger.info("Move UID {} to {} folder".format(int(uid), str(destination_folder))) 67 | if self.copy(uid, destination_folder): 68 | self.delete(uid) 69 | 70 | def messages(self, *args, **kwargs): 71 | folder = kwargs.get('folder', False) 72 | msg = "" 73 | 74 | if folder: 75 | self.connection.select(folder) 76 | msg = " from folder '{}'".format(folder) 77 | 78 | logger.info("Fetch list of massages{}".format(msg)) 79 | return self.fetch_list(**kwargs) 80 | 81 | def folders(self): 82 | return self.connection.list() 83 | -------------------------------------------------------------------------------- /PyperGrabber/PyperGrabber.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 
https://github.com/martinrusev/imbox 4 | https://www.rz.ruhr-uni-bochum.de/mitteilungen/faqs/mail-konfiguration.html 5 | https://stackoverflow.com/questions/364802/generator-comprehension 6 | """ 7 | 8 | import sys 9 | from mail2pmid.mail2pmid import mail2pmid 10 | from retriever.retriever import retriever 11 | from config import VERSION, save_dir, tmp_dir 12 | import os 13 | 14 | 15 | import subprocess 16 | import shutil 17 | import time 18 | 19 | from retriever.web2pdf import pdf_print 20 | import datetime 21 | 22 | import glob 23 | from pyPdf import PdfFileReader 24 | 25 | 26 | # CHANGING TERMINAL TITLE 27 | sys.stdout.write("\x1b]2;" + 'PyperGrabber v{}'.format(VERSION) + "\x07") 28 | 29 | # check if folder path is already in use as it would cause trouble: 30 | if os.path.exists(tmp_dir): 31 | # try saving remnant files by moving to lost and found folder 32 | new_dir = save_dir + 'lost_and_found/' 33 | files = os.listdir(tmp_dir) 34 | for f in files: 35 | try: 36 | shutil.move(f, new_dir) 37 | except Exception as e: 38 | print e 39 | # removing tmp_dir: 40 | shutil.rmtree(tmp_dir) 41 | 42 | 43 | # retrieve pubmed's emails and extract PMID's from them 44 | pmids = mail2pmid() 45 | 46 | # counter variables for statistics 47 | abstracts = 0 48 | full_article = 0 49 | 50 | # retrieve paper: 51 | to = time.time() 52 | num_digits = len(str(len(pmids))) # dynamically calculate leading zeros 53 | for i, pmid in enumerate(pmids): 54 | print "Fetching paper {number:0{wd}d} of {tot} with PMID: {id} ...".\ 55 | format(wd=num_digits, number=i, tot=len(pmids), id=pmid), 56 | if not os.path.exists(tmp_dir): 57 | os.makedirs(tmp_dir) 58 | retriever(pmid) 59 | 60 | content = os.listdir(tmp_dir) 61 | if not content: # if dir empty make pdf of pubmed abstract: 62 | pdf_print(pmid, save_dir) 63 | abstracts += 1 64 | else: # if files exist rename them to PMID_i.pdf and move them one path level up 65 | pdfs = glob.glob(tmp_dir + '*.pdf') 66 | for pi, pdf in enumerate(pdfs): 67 | if pi == 0: 68 | index = '' 69 | else: 70 | index = '_{}'.format(pi+1) 71 | new_path = save_dir + pmid + index + '.pdf' 72 | os.rename(pdf, new_path) 73 | full_article += 1 74 | 75 | print "DONE." 76 | 77 | # eliminate any duplicate files and left over empty dirs: 78 | fd_cmd = "fdupes -rdN {} && find . -type d -empty -delete ".format(save_dir) # test manually to check if working 79 | subprocess.call([fd_cmd], shell=True) 80 | # removing pdf files failing integrity check, e.g. 
HTML files in disguise#: 81 | all_pdfs = glob.glob(save_dir + '*.pdf') 82 | for candidate in all_pdfs: 83 | try: 84 | mypdf = PdfFileReader(file( 'filename', 'rb')) 85 | except: 86 | print candidate,' is invalid pdf' 87 | shutil.rmtree(candidate) 88 | 89 | 90 | # finally removing tmp_dir: 91 | # shutil.rmtree(tmp_dir) 92 | 93 | # printing out concluding statistics: 94 | total_time = (time.time() - to)/60 95 | format_time = datetime.timedelta(seconds=total_time) # format to hh:mm:ss format 96 | print("Job took {0:0.0f} minutes to complete".format(format_time)) 97 | total_down = abstracts + full_article 98 | avg_time = datetime.timedelta(seconds=total_time/total_down) 99 | print("Average time to retrieve paper: {}".format(avg_time)) 100 | print('Abstracts are {} of total fetches: {}'.format(abstracts, total_down)) 101 | 102 | 103 | with open('time.txt', 'w') as out_f: 104 | out_f.write(format_time) 105 | -------------------------------------------------------------------------------- /PyperGrabber/retriever/retriever.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | from urllib import quote_plus 5 | from log_this import log_search, log_download 6 | from string_funcs import encode 7 | from config import * 8 | from get_title import get_title 9 | 10 | from easy_parallelize import easy_parallelize 11 | from flatten_list import flatten_list 12 | 13 | import urllib2 14 | from time import sleep 15 | from random import uniform 16 | from bs4 import BeautifulSoup 17 | 18 | from urlparse import urljoin 19 | 20 | import re 21 | import shelve 22 | 23 | 24 | # EMPTYING PERSISTENT STORAGE OF DOWNLOADED PDFs: 25 | db = shelve.open('download_db.db', writeback=True) 26 | try: 27 | for key in db: 28 | del db[key] 29 | finally: 30 | db.close() 31 | 32 | 33 | # mini crawler functions: 34 | def check_db(entry): 35 | db = shelve.open('download_db.db', writeback=True) 36 | try: 37 | if entry in db: 38 | # print '{} found'.format(entry) 39 | exist = True 40 | else: 41 | db[entry] = '' # putting entry into db 42 | exist = False 43 | finally: 44 | db.close() 45 | return exist 46 | 47 | 48 | def get_pdf(pdf_link): 49 | 50 | # check whether value already existing in permanent storage: 51 | pdf_name = pdf_link.rsplit('/', 1)[-1] # set filename according to last element of link 52 | if not check_db(pdf_name) and not check_db(pdf_link): 53 | # print 'Downloading: {}'.format(pdf_link) 54 | try: 55 | opener = urllib2.build_opener() 56 | opener.addheaders = [('User-agent', USER_AGENT)] 57 | 58 | r = opener.open(pdf_link) 59 | 60 | path = tmp_dir + pdf_name 61 | 62 | with open(path, "wb") as code: # 'w' 63 | code.write(r.read()) 64 | 65 | # log successful download: 66 | log_download('DOWNLOADED: {}'.format(pdf_link)) 67 | 68 | except Exception as e: 69 | log_download('FAILURE: {} | {}'.format(pdf_link, e)) 70 | else: 71 | log_download('File already downloaded: {}'.format(pdf_name)) 72 | 73 | 74 | def rem_blacklisted(url_l): 75 | ret_list = [] 76 | 77 | whitelist = [r'ncbi.nlm.nih.gov/pmc/articles', 78 | r'scholar?cluster =' 79 | ] 80 | 81 | blacklist = [r'mailto', # important to prevent crashes 82 | 83 | r'nlm.nih.gov', 84 | r'pubmed.gov', 85 | r'nih.gov', 86 | r'dhhs.gov', 87 | r'usa.gov', 88 | r'youtube.com', 89 | r'facebook.com', 90 | r'twitter.com', 91 | r'sci-hub.cc/donate', 92 | r'vk.com', 93 | r'google', 94 | r'scholar.google', 95 | r'.css', 96 | r'index.html' 97 | ] 98 | 99 | wl_rx = re.compile('.*' + '.*|.*'.join(wl for wl in whitelist) + '.*') 
100 | bl_rx = re.compile('.*' + '.*|.*'.join(bl for bl in blacklist) + '.*') 101 | 102 | for u in url_l: 103 | # if whitelisted or not blacklisted join to return list 104 | if re.match(wl_rx, u) or not re.match(bl_rx, u): 105 | ret_list.append(u) 106 | else: 107 | continue 108 | 109 | return ret_list 110 | 111 | 112 | def get_links(url): 113 | opener = urllib2.build_opener() 114 | opener.addheaders = [('User-agent', USER_AGENT)] 115 | 116 | # print ' get_links working on url: {}'.format(url) 117 | 118 | # if 'sci-hub.cc' in url: # workaround for not having captcha solving right now 119 | # driver = webdriver.Chrome() 120 | # driver.get("http://www.google.com") 121 | 122 | fetch_timeout = 30 123 | try: 124 | response = opener.open(url, timeout=fetch_timeout) 125 | 126 | if response: 127 | soup = BeautifulSoup(response, cr_parser, from_encoding=response.info().getparam('charset')) 128 | href_l = [urljoin(url, h['href']) for h in soup.find_all(href=True)] 129 | # print 'href_l 1: ', href_l 130 | href_l = list(set(href_l)) # removing potential duplicates 131 | href_l = rem_blacklisted(href_l) # removing blacklisted 132 | href_l = map(encode, href_l) # transforming all potential unicode items to stringm 133 | 134 | pdf_l = [p for p in href_l if p.lower().endswith('.pdf')] # picking potential pdf links 135 | 136 | # print 'pdf_l: ', pdf_l 137 | 138 | href_l = list(set(href_l) - set(pdf_l)) # removing pdf links from link list 139 | # print 'href_l 3: ', href_l 140 | 141 | # downloading pdf files: 142 | map(get_pdf, pdf_l) 143 | 144 | return href_l 145 | 146 | except Exception as e: 147 | log_download("ERROR in get_links: {}".format(e)) 148 | return [] 149 | 150 | 151 | def mini_crawler(seed_url): 152 | link_l = [seed_url] # starting link_l, populating with seed_url 153 | visited = [] 154 | # print 'link_l in mini_crawler: ', link_l 155 | max_depths = 2 156 | 157 | for i in range(max_depths): 158 | if not link_l: 159 | break 160 | else: 161 | go_to = set(link_l) - set(visited) # preventing visiting site twice 162 | go_to = list(go_to) 163 | visited.extend(link_l) 164 | 165 | res_x = easy_parallelize(get_links, go_to) 166 | link_l = list(set((flatten_list(res_x)))) 167 | 168 | # print 'link_l: ', link_l 169 | sleep(uniform(1.3, 4.7)) # small randomized pause to be easy on servers 170 | log_download('CRAWLER FINISHED working on seed: {}'.format(seed_url)) 171 | 172 | 173 | # -------------------------------------------------------------------------------------- 174 | ''' MAIN FUNCTION ''' 175 | 176 | 177 | def retriever(pmid): 178 | # print 'threader received data: ', data 179 | log_search('SEARCHING: {}'.format(pmid)) 180 | 181 | # retrievingpaper title via pubmed API: 182 | title = get_title(pmid, email) 183 | 184 | seed_links = [] # seed links, populated with data 185 | 186 | ncbi_url = pubmed_base_url + pmid 187 | scihub_url = scihub_base_url + pmid 188 | seed_links.extend([ncbi_url, scihub_url]) 189 | 190 | if title: 191 | scho_tit = quote_plus(title) 192 | gosch_url = schola_base_url.format(scho_tit) 193 | seed_links.append(gosch_url) 194 | 195 | # print 'seed_links: ', seed_links 196 | # via iteration: 197 | for seed in seed_links: 198 | mini_crawler(seed) 199 | 200 | 201 | 202 | -------------------------------------------------------------------------------- /PyperGrabber/mail2pmid/imbox/parser.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from six import BytesIO, binary_type 3 | 4 | import re 5 | import 
email 6 | import base64 7 | import quopri 8 | import time 9 | from datetime import datetime 10 | from email.header import decode_header 11 | from utils import str_encode, str_decode 12 | 13 | import logging 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | class Struct(object): 18 | def __init__(self, **entries): 19 | self.__dict__.update(entries) 20 | 21 | def keys(self): 22 | return self.__dict__.keys() 23 | 24 | def __repr__(self): 25 | return str(self.__dict__) 26 | 27 | 28 | def decode_mail_header(value, default_charset='us-ascii'): 29 | """ 30 | Decode a header value into a unicode string. 31 | """ 32 | try: 33 | headers = decode_header(value) 34 | except email.errors.HeaderParseError: 35 | return str_decode(str_encode(value, default_charset, 'replace'), default_charset) 36 | else: 37 | for index, (text, charset) in enumerate(headers): 38 | logger.debug("Mail header no. {}: {} encoding {}".format(index, str_decode(text, charset or 'utf-8'), charset)) 39 | try: 40 | headers[index] = str_decode(text, charset or default_charset, 41 | 'replace') 42 | except LookupError: 43 | # if the charset is unknown, force default 44 | headers[index] = str_decode(text, default_charset, 'replace') 45 | 46 | return ''.join(headers) 47 | 48 | 49 | def get_mail_addresses(message, header_name): 50 | """ 51 | Retrieve all email addresses from one message header. 52 | """ 53 | headers = [h for h in message.get_all(header_name, [])] 54 | addresses = email.utils.getaddresses(headers) 55 | 56 | for index, (address_name, address_email) in enumerate(addresses): 57 | addresses[index] = {'name': decode_mail_header(address_name), 58 | 'email': address_email} 59 | logger.debug("{} Mail addressees in message: <{}> {}".format(header_name.upper(), address_name, address_email)) 60 | return addresses 61 | 62 | 63 | def decode_param(param): 64 | name, v = param.split('=', 1) 65 | values = v.split('\n') 66 | value_results = [] 67 | for value in values: 68 | match = re.search(r'=\?((?:\w|-)+)\?(Q|B)\?(.+)\?=', value) 69 | if match: 70 | encoding, type_, code = match.groups() 71 | if type_ == 'Q': 72 | value = quopri.decodestring(code) 73 | elif type_ == 'B': 74 | value = base64.decodestring(code) 75 | value = str_encode(value, encoding) 76 | value_results.append(value) 77 | if value_results: 78 | v = ''.join(value_results) 79 | logger.debug("Decoded parameter {} - {}".format(name, v)) 80 | return name, v 81 | 82 | 83 | def parse_attachment(message_part): 84 | # Check again if this is a valid attachment 85 | content_disposition = message_part.get("Content-Disposition", None) 86 | if content_disposition is not None and not message_part.is_multipart(): 87 | dispositions = content_disposition.strip().split(";") 88 | 89 | if dispositions[0].lower() in ["attachment", "inline"]: 90 | file_data = message_part.get_payload(decode=True) 91 | 92 | attachment = { 93 | 'content-type': message_part.get_content_type(), 94 | 'size': len(file_data), 95 | 'content': BytesIO(file_data) 96 | } 97 | filename = message_part.get_param('name') 98 | if filename: 99 | attachment['filename'] = filename 100 | 101 | for param in dispositions[1:]: 102 | name, value = decode_param(param) 103 | 104 | if 'file' in name: 105 | attachment['filename'] = value 106 | 107 | if 'create-date' in name: 108 | attachment['create-date'] = value 109 | 110 | return attachment 111 | 112 | return None 113 | 114 | 115 | def decode_content(message): 116 | content = message.get_payload(decode=True) 117 | charset = message.get_content_charset('utf-8') 118 | try: 119 | 
return content.decode(charset) 120 | except AttributeError: 121 | return content 122 | 123 | 124 | def parse_email(raw_email): 125 | if isinstance(raw_email, binary_type): 126 | raw_email = str_encode(raw_email, 'utf-8') 127 | try: 128 | email_message = email.message_from_string(raw_email) 129 | except UnicodeEncodeError: 130 | email_message = email.message_from_string(raw_email.encode('utf-8')) 131 | maintype = email_message.get_content_maintype() 132 | parsed_email = {} 133 | 134 | parsed_email['raw_email'] = raw_email 135 | 136 | body = { 137 | "plain": [], 138 | "html": [] 139 | } 140 | attachments = [] 141 | 142 | if maintype in ('multipart', 'image'): 143 | logger.debug("Multipart message. Will process parts.") 144 | for part in email_message.walk(): 145 | content_type = part.get_content_type() 146 | part_maintype = part.get_content_maintype() 147 | content_disposition = part.get('Content-Disposition', None) 148 | if content_disposition or not part_maintype == "text": 149 | content = part.get_payload(decode=True) 150 | else: 151 | content = decode_content(part) 152 | 153 | is_inline = content_disposition is None \ 154 | or content_disposition == "inline" 155 | if content_type == "text/plain" and is_inline: 156 | body['plain'].append(content) 157 | elif content_type == "text/html" and is_inline: 158 | body['html'].append(content) 159 | elif content_disposition: 160 | attachment = parse_attachment(part) 161 | if attachment: 162 | attachments.append(attachment) 163 | 164 | elif maintype == 'text': 165 | payload = decode_content(email_message) 166 | body['plain'].append(payload) 167 | 168 | parsed_email['attachments'] = attachments 169 | 170 | parsed_email['body'] = body 171 | email_dict = dict(email_message.items()) 172 | 173 | parsed_email['sent_from'] = get_mail_addresses(email_message, 'from') 174 | parsed_email['sent_to'] = get_mail_addresses(email_message, 'to') 175 | parsed_email['cc'] = get_mail_addresses(email_message, 'cc') 176 | parsed_email['bcc'] = get_mail_addresses(email_message, 'bcc') 177 | 178 | value_headers_keys = ['subject', 'date', 'message-id'] 179 | key_value_header_keys = ['received-spf', 180 | 'mime-version', 181 | 'x-spam-status', 182 | 'x-spam-score', 183 | 'content-type'] 184 | 185 | parsed_email['headers'] = [] 186 | for key, value in email_dict.items(): 187 | 188 | if key.lower() in value_headers_keys: 189 | valid_key_name = key.lower().replace('-', '_') 190 | parsed_email[valid_key_name] = decode_mail_header(value) 191 | 192 | if key.lower() in key_value_header_keys: 193 | parsed_email['headers'].append({'Name': key, 194 | 'Value': value}) 195 | 196 | if parsed_email.get('date'): 197 | timetuple = email.utils.parsedate(parsed_email['date']) 198 | parsed_date = datetime.fromtimestamp(time.mktime(timetuple)) \ 199 | if timetuple else None 200 | parsed_email['parsed_date'] = parsed_date 201 | 202 | logger.info("Downloaded and parsed mail '{}' with {} attachments".format( 203 | parsed_email.get('subject'), len(parsed_email.get('attachments')))) 204 | return Struct(**parsed_email) 205 | --------------------------------------------------------------------------------
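
A minimal sketch of how the bundled parser might be exercised on its own, assuming Python 2, that the interpreter is started from inside the `imbox/` package directory (so `from parser import parse_email` resolves to the module above, as in `__init__.py`), and a made-up raw message; the attributes read back mirror the `parsed_email` dict built in `parse_email()`:

    # standalone sketch, not a file from the repository; the raw message below is invented for illustration
    from parser import parse_email

    raw_email = (
        "From: PubMed <efback@ncbi.nlm.nih.gov>\r\n"
        "To: you@example.org\r\n"
        "Subject: What's new for your search\r\n"
        "Date: Mon, 02 May 2016 08:00:00 +0000\r\n"
        "Content-Type: text/plain; charset=utf-8\r\n"
        "\r\n"
        "See https://www.ncbi.nlm.nih.gov/pubmed/27123456\r\n"
    )

    msg = parse_email(raw_email)
    print(msg.subject)        # decoded via decode_mail_header()
    print(msg.sent_from)      # list of {'name': ..., 'email': ...} dicts
    print(msg.body['plain'])  # list of text/plain payloads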
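And a compact sketch of the overall pipeline, assuming `config.ini` is filled in and the interpreter is started from the `PyperGrabber/` directory; it mirrors the main loop of `PyperGrabber.py` (PMIDs pulled from the PubMed alert mails, one crawl per PMID, abstract PDF as fallback) while leaving out the renaming, deduplication and statistics steps:

    # pipeline sketch, not a file from the repository
    import os

    from mail2pmid.mail2pmid import mail2pmid   # PMIDs extracted from PubMed alert mails
    from retriever.retriever import retriever   # crawls PubMed / Google Scholar / Sci-Hub
    from retriever.web2pdf import pdf_print     # fallback: save the PubMed abstract page as PDF
    from config import save_dir, tmp_dir        # paths resolved from config.ini

    pmids = mail2pmid()

    for pmid in pmids:
        if not os.path.exists(tmp_dir):
            os.makedirs(tmp_dir)
        retriever(pmid)                 # any PDFs found land in tmp_dir
        if not os.listdir(tmp_dir):     # nothing found: keep at least the abstract
            pdf_print(pmid, save_dir)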