├── PyperGrabber
│   ├── work_n_shutdown.sh
│   ├── retriever
│   │   ├── flatten_list.py
│   │   ├── web2pdf.py
│   │   ├── string_funcs.py
│   │   ├── log_this.py
│   │   ├── easy_parallelize.py
│   │   ├── get_title.py
│   │   └── retriever.py
│   ├── config.ini
│   ├── mail2pmid
│   │   ├── mail2pmid.py
│   │   └── imbox
│   │       ├── utils.py
│   │       ├── imap.py
│   │       ├── query.py
│   │       ├── __init__.py
│   │       └── parser.py
│   ├── config.py
│   └── PyperGrabber.py
├── README.md
└── LICENSE

/PyperGrabber/work_n_shutdown.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # due to the long time to completion - let the script run while you are away, then power off the machine
3 |
4 | ./PyperGrabber.py && sudo poweroff
--------------------------------------------------------------------------------
/PyperGrabber/retriever/flatten_list.py:
--------------------------------------------------------------------------------
1 | def flatten_list(lis):
2 |     """Given a list, possibly nested to any level, return it flattened."""
3 |     new_lis = []
4 |     for item in lis:
5 |         if isinstance(item, list):  # recurse into nested lists
6 |             new_lis.extend(flatten_list(item))
7 |         else:
8 |             new_lis.append(item)
9 |     return new_lis
10 |
--------------------------------------------------------------------------------
/PyperGrabber/retriever/web2pdf.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import subprocess
4 |
5 |
6 | def pdf_print(pmid, save_dir):
7 |     link = 'https://www.ncbi.nlm.nih.gov/pubmed/' + pmid
8 |
9 |     filename = save_dir + pmid + '.pdf'
10 |
11 |     # number of trials:
12 |     trials = 7
13 |     for trial in range(trials):
14 |         try:
15 |             cmd = 'wkhtmltopdf --quiet --page-size A4 '
16 |             if subprocess.call([cmd + link + ' ' + filename], shell=True) == 0: break  # run wkhtmltopdf; stop retrying once it exits cleanly
17 |         except IOError:
18 |             print 'Problem with wkhtmltopdf. Trying again'
19 |
20 |
21 |
22 |
23 |
--------------------------------------------------------------------------------
/PyperGrabber/config.ini:
--------------------------------------------------------------------------------
1 | [general]
2 | version = 0.1
3 |
4 | [paths]
5 | save_to = /Desktop/pulled_pdfs/
6 |
7 | [email_auth]
8 | em_server =
9 | em_usr =
10 | em_pw =
11 | sender = efback@ncbi.nlm.nih.gov
12 |
13 | [crawler]
14 | depth = 2
15 | # choose: lxml or html5lib:
16 | cr_parser = html5lib
17 | usr_agent = Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36
18 | email =
19 | pubmed_base_url = https://www.ncbi.nlm.nih.gov/pubmed/
20 | scihub_base_url = http://www.ncbi.nlm.nih.gov.sci-hub.cc/pubmed/
21 | schola_base_url = https://scholar.google.de/scholar?as_vis=1&q={}&hl=en&as_sdt=1,5
22 |
--------------------------------------------------------------------------------
/PyperGrabber/retriever/string_funcs.py:
--------------------------------------------------------------------------------
1 | from string import punctuation, whitespace
2 | import re
3 |
4 | def rem_whitespace(string):
5 |     """ careful to keep this order of patterns, or duplicate whitespace created in the first round
6 |     will not be removed
7 |     """
8 |     unwanted_chars = punctuation + whitespace
9 |
10 |     pat_l = [r'[' + unwanted_chars + ']',
11 |              r'\s+',
12 |              r' ',
13 |              r' \\',
14 |              r' \ '
15 |              ]
16 |
17 |     for p in pat_l:
18 |         rx = re.compile(p)
19 |         string = re.sub(rx, ' ', string)
20 |
21 |     return string.strip()
22 |
23 |
24 | def encode(ustr):
25 |     return ustr.encode('utf8')
26 |
--------------------------------------------------------------------------------
/PyperGrabber/retriever/log_this.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 |
4 | # logging functions:
5 | def log_search(msg):
6 |     # print msg
7 |     log_filename = 'search.log'
8 |     logging.basicConfig(filename=log_filename,
9 |                         level=logging.INFO,
10 |                         format='%(asctime)s %(message)s'
11 |                         )
12 |     logging.info(' - ' + msg)
13 |
14 |
15 | def log_download(msg):
16 |     # print msg
17 |     LOG_FILENAME = 'download.log'
18 |     logging.basicConfig(filename=LOG_FILENAME,
19 |                         level=logging.INFO,
20 |                         format='%(asctime)s %(message)s'
21 |                         )
22 |     logging.info(' - ' + msg)
23 |
--------------------------------------------------------------------------------
/PyperGrabber/mail2pmid/mail2pmid.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from imbox import Imbox
4 | from config import *
5 | import re
6 | import sys
7 |
8 |
9 | pmids = []
10 | msgs_l = []
11 |
12 |
13 | def mail2pmid():
14 |     imbox = Imbox(em_server, username=em_usr, password=em_pw)
15 |
16 |     print '\n\nFetching emails ...',
17 |     email_gen = (imbox.messages(sent_from=sender))
18 |     while True:
19 |         try:
20 |             msgs_l.append(email_gen.next())
21 |         except StopIteration:
22 |             break
23 |         except Exception as e:
24 |             print(e)
25 |             sys.exit(1)
26 |
27 |     print("Done.\nSuccessfully retrieved {} messages from NCBI.".format(len(msgs_l)))
28 |
29 |     msgs_str = str(msgs_l)
30 |
31 |     pat = re.compile(r'pubmed\/(?P<pmid>\d+)')
32 |
33 |     pmids = re.findall(pat, msgs_str)
34 |
35 |     pmids = list(set(pmids))
36 |
37 |     print('Found {} unique PMIDs.\n'.format(len(pmids)))
38 |
39 |     return pmids
--------------------------------------------------------------------------------
/PyperGrabber/mail2pmid/imbox/utils.py:
--------------------------------------------------------------------------------
1 | from __future__ import unicode_literals
2 | from six import PY3
3 |
4 | import logging
5 | logger = logging.getLogger(__name__)
6 |
7 | if PY3:
8 |     def str_encode(value='', encoding=None, errors='strict'):
9 |         logger.debug("Encode str {} with encoding {} and errors {}".format(value, encoding, errors))
10 |         return str(value, encoding, errors)
11 |
12 |     def str_decode(value='', encoding=None, errors='strict'):
13 |         if isinstance(value, str):
14 |             return bytes(value, encoding, errors).decode('utf-8')
15 |         elif isinstance(value, bytes):
16 |             return value.decode(encoding or 'utf-8', errors=errors)
17 |         else:
18 |             raise TypeError("Cannot decode '{}' object".format(value.__class__))
19 | else:
20 |     def str_encode(string='', encoding=None, errors='strict'):
21 |         return unicode(string, encoding, errors)
22 |
23 |     def str_decode(value='', encoding=None, errors='strict'):
24 |         return value.decode(encoding, errors)
25 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # PyperGrabber
2 | Fetches PubMed article IDs (PMIDs) from your email inbox, then crawls **PubMed**, **Google Scholar** and **Sci-Hub** for the respective PDF files.
3 |
4 |
5 | PubMed can send you regular updates on new articles matching your specified search criteria. PyperGrabber automatically downloads those papers, saving you the time of tracking down and downloading them manually. When no PDF is found, PyperGrabber saves the PubMed abstract of the respective article as a PDF instead. All files are named after their PMID for convenience.
6 |
7 |
8 | ## NOTES:
9 | - _Messy code ahead!_
10 | - The program may halt without an error message. The source of this bug is yet to be determined.
11 | - The web crawler function may be used with other sources of PMIDs than email (e.g. a command-line parameter or a file holding a list of PMIDs)
12 |
13 |
14 | ## Required dependencies:
15 |     sudo apt-get install wkhtmltopdf
16 |     sudo pip install pypdf
17 |
18 | ## USAGE:
19 | - **Step 1** - Enter your email access data in `config.ini` or prepare to be prompted (works with IMAP)
20 | - **Step 2** - Start with: `python ./PyperGrabber.py`
21 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2016 bfelder
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /PyperGrabber/retriever/easy_parallelize.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Pool 2 | # from multiprocessing.dummy import Pool # use this module for threading: default is processing 3 | 4 | 5 | """ 6 | NOTE: WORKS only when only a single argument/parameter needs to be passed to worker func! 7 | 8 | http://chriskiehl.com/article/parallelism-in-one-line/ 9 | https://www.binpress.com/tutorial/simple-python-parallelism/121 10 | 11 | speed of multithreading vs. multiprocessing: 12 | http://eli.thegreenplace.net/2012/01/16/python-parallelizing-cpu-bound-tasks-with-multiprocessing/ 13 | 14 | multiple arguments: 15 | https://stackoverflow.com/questions/5442910/python-multiprocessing-pool-map-for-multiple-arguments 16 | """ 17 | 18 | 19 | def easy_parallelize(func, data, pool_size=None): 20 | if pool_size is None or pool_size < 1: # make number of workers fit size of input data, if not specified otherwise 21 | pool = Pool(processes=len(data)) 22 | else: 23 | pool = Pool(processes=pool_size) 24 | 25 | results = pool.map(func, data) 26 | 27 | cleaned = filter(None, results) # cleaning out None results 28 | 29 | pool.close() 30 | pool.join() 31 | 32 | return cleaned 33 | 34 | 35 | -------------------------------------------------------------------------------- /PyperGrabber/mail2pmid/imbox/imap.py: -------------------------------------------------------------------------------- 1 | from imaplib import IMAP4, IMAP4_SSL 2 | 3 | import logging 4 | import ssl as pythonssllib 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class ImapTransport(object): 10 | 11 | def __init__(self, hostname, port=None, ssl=True): 12 | self.hostname = hostname 13 | self.port = port 14 | kwargs = {} 15 | 16 | if ssl: 17 | self.transport = IMAP4_SSL 18 | if not self.port: 19 | self.port = 993 20 | 21 | else: 22 | self.transport = IMAP4 23 | if not self.port: 24 | self.port = 143 25 | 26 | self.server = self.transport(self.hostname, self.port) 27 | logger.debug("Created IMAP4 transport for {host}:{port}" 28 | .format(host=self.hostname, port=self.port)) 29 | 30 | def list_folders(self): 31 | logger.debug("List all folders in mailbox") 32 | return self.server.list() 33 | 34 | def connect(self, username, password): 35 | self.server.login(username, password) 36 | self.server.select() 37 | logger.debug("Logged into server {} and selected mailbox 'INBOX'" 38 | .format(self.hostname)) 39 | return self.server 40 | -------------------------------------------------------------------------------- /PyperGrabber/config.py: -------------------------------------------------------------------------------- 1 | from ConfigParser import SafeConfigParser 2 | from os.path import expanduser 3 | import getpass 4 | 5 | parser = SafeConfigParser() 6 | parser.read('config.ini') 7 | 8 | VERSION = parser.get('general', 'version') 9 | 10 | # email access: 11 | em_server = parser.get('email_auth', 'em_server') 12 | em_usr = parser.get('email_auth', 'em_usr') 13 | em_pw = parser.get('email_auth', 'em_pw') 14 | sender = parser.get('email_auth', 'sender') 15 | 16 | # prompting for email user name and password in 
case not provided by config.ini 17 | if len(em_usr) == 0: 18 | em_usr = getpass.getpass(prompt='Input username for email account: ') 19 | if len(em_pw) == 0: 20 | em_pw = getpass.getpass() 21 | 22 | 23 | 24 | # most common user agent according to: https://techblog.willshouse.com/2012/01/03/most-common-user-agents/ 25 | USER_AGENT = parser.get('crawler', 'usr_agent') 26 | email = parser.get('crawler', 'email') 27 | cr_parser = parser.get('crawler', 'cr_parser') 28 | pubmed_base_url = parser.get('crawler', 'pubmed_base_url') 29 | scihub_base_url = parser.get('crawler', 'scihub_base_url') 30 | schola_base_url = parser.get('crawler', 'schola_base_url') 31 | 32 | # dir to save files to: 33 | home = expanduser("~") 34 | save_dir = home + parser.get('paths', 'save_to') 35 | tmp_dir = save_dir + '.tmp/' # hidden 36 | -------------------------------------------------------------------------------- /PyperGrabber/mail2pmid/imbox/query.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import logging 3 | # TODO - Validate query arguments 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | IMAP_MONTHS = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", 8 | "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"] 9 | 10 | 11 | def format_date(date): 12 | 13 | return "%s-%s-%s" % (date.day, IMAP_MONTHS[date.month - 1], date.year) 14 | 15 | 16 | def build_search_query(**kwargs): 17 | 18 | # Parse keyword arguments 19 | unread = kwargs.get('unread', False) 20 | sent_from = kwargs.get('sent_from', False) 21 | sent_to = kwargs.get('sent_to', False) 22 | date__gt = kwargs.get('date__gt', False) 23 | if type(date__gt) is datetime.date: 24 | date__gt = format_date(date__gt) 25 | date__lt = kwargs.get('date__lt', False) 26 | if type(date__lt) is datetime.date: 27 | date__lt = format_date(date__lt) 28 | subject = kwargs.get('subject') 29 | 30 | query = [] 31 | 32 | if unread: 33 | query.append("(UNSEEN)") 34 | 35 | if sent_from: 36 | query.append('(FROM "%s")' % sent_from) 37 | 38 | if sent_to: 39 | query.append('(TO "%s")' % sent_to) 40 | 41 | if date__gt: 42 | query.append('(SINCE "%s")' % date__gt) 43 | 44 | if date__lt: 45 | query.append('(BEFORE "%s")' % date__lt) 46 | 47 | if subject is not None: 48 | query.append('(SUBJECT "%s")' % subject) 49 | 50 | if query: 51 | logger.debug("IMAP query: {}".format(" ".join(query))) 52 | return " ".join(query) 53 | 54 | logger.debug("IMAP query: {}".format("(ALL)")) 55 | return "(ALL)" 56 | -------------------------------------------------------------------------------- /PyperGrabber/retriever/get_title.py: -------------------------------------------------------------------------------- 1 | from Bio import Entrez 2 | from string_funcs import rem_whitespace 3 | 4 | 5 | # get title by polling NVBI with PMID as input 6 | def get_title(query, email): 7 | Entrez.email = email # Always tell NCBI who you are 8 | 9 | try: 10 | esear_handle = Entrez.esearch(db="pubmed", 11 | sort='relevance', 12 | retmax='1', 13 | retmode='xml', 14 | term=query 15 | ) 16 | r1 = Entrez.read(esear_handle) 17 | # print 'results1: ', r1 18 | 19 | if int(r1['Count']) >= 1: 20 | list = r1["IdList"] 21 | 22 | for index in range(0, len(list)): 23 | listid = list[index] 24 | 25 | esum_handle = Entrez.esummary(db="pubmed", 26 | sort='relevance', 27 | retmax='1', 28 | retmode='xml', 29 | id=listid 30 | ) 31 | r2 = Entrez.read(esum_handle) 32 | ''' generates dic entry in the form: {'PMID':'title'}, 33 | (duplicate, leading, trailing) whitespaces are removed 34 | ''' 35 | 
36 | try: 37 | title = "'{}'".format(rem_whitespace(r2[0]['Title'])) 38 | except: 39 | title = None 40 | 41 | return title 42 | 43 | else: 44 | return None 45 | 46 | except: 47 | # in case of server error try to redo it by recursion: 48 | get_title(query, email) -------------------------------------------------------------------------------- /PyperGrabber/mail2pmid/imbox/__init__.py: -------------------------------------------------------------------------------- 1 | from imap import ImapTransport 2 | from parser import parse_email 3 | from query import build_search_query 4 | 5 | import logging 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class Imbox(object): 10 | 11 | def __init__(self, hostname, username=None, password=None, ssl=True, 12 | port=None, ssl_context=None): 13 | 14 | self.server = ImapTransport(hostname, ssl=ssl, port=port) 15 | 16 | self.hostname = hostname 17 | self.username = username 18 | self.password = password 19 | self.connection = self.server.connect(username, password) 20 | logger.info("Connected to IMAP Server with user {username} on {hostname}{ssl}".format( 21 | hostname=hostname, username=username, ssl=(" over SSL" if ssl else ""))) 22 | 23 | def logout(self): 24 | self.connection.close() 25 | self.connection.logout() 26 | logger.info("Disconnected from IMAP Server {username}@{hostname}".format( 27 | hostname=self.hostname, username=self.username)) 28 | 29 | def query_uids(self, **kwargs): 30 | query = build_search_query(**kwargs) 31 | message, data = self.connection.uid('search', None, query) 32 | if data[0] is None: 33 | return [] 34 | return data[0].split() 35 | 36 | def fetch_by_uid(self, uid): 37 | message, data = self.connection.uid('fetch', uid, '(BODY.PEEK[])') 38 | logger.debug("Fetched message for UID {}".format(int(uid))) 39 | raw_email = data[0][1] 40 | 41 | email_object = parse_email(raw_email) 42 | 43 | return email_object 44 | 45 | def fetch_list(self, **kwargs): 46 | uid_list = self.query_uids(**kwargs) 47 | logger.debug("Fetch all messages for UID in {}".format(uid_list)) 48 | 49 | for uid in uid_list: 50 | yield (uid, self.fetch_by_uid(uid)) 51 | 52 | def mark_seen(self, uid): 53 | logger.info("Mark UID {} with \\Seen FLAG".format(int(uid))) 54 | self.connection.uid('STORE', uid, '+FLAGS', '(\\Seen)') 55 | 56 | def delete(self, uid): 57 | logger.info("Mark UID {} with \\Deleted FLAG and expunge.".format(int(uid))) 58 | mov, data = self.connection.uid('STORE', uid, '+FLAGS', '(\\Deleted)') 59 | self.connection.expunge() 60 | 61 | def copy(self, uid, destination_folder): 62 | logger.info("Copy UID {} to {} folder".format(int(uid), str(destination_folder))) 63 | return self.connection.uid('COPY', uid, destination_folder) 64 | 65 | def move(self, uid, destination_folder): 66 | logger.info("Move UID {} to {} folder".format(int(uid), str(destination_folder))) 67 | if self.copy(uid, destination_folder): 68 | self.delete(uid) 69 | 70 | def messages(self, *args, **kwargs): 71 | folder = kwargs.get('folder', False) 72 | msg = "" 73 | 74 | if folder: 75 | self.connection.select(folder) 76 | msg = " from folder '{}'".format(folder) 77 | 78 | logger.info("Fetch list of massages{}".format(msg)) 79 | return self.fetch_list(**kwargs) 80 | 81 | def folders(self): 82 | return self.connection.list() 83 | -------------------------------------------------------------------------------- /PyperGrabber/PyperGrabber.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 
https://github.com/martinrusev/imbox 4 | https://www.rz.ruhr-uni-bochum.de/mitteilungen/faqs/mail-konfiguration.html 5 | https://stackoverflow.com/questions/364802/generator-comprehension 6 | """ 7 | 8 | import sys 9 | from mail2pmid.mail2pmid import mail2pmid 10 | from retriever.retriever import retriever 11 | from config import VERSION, save_dir, tmp_dir 12 | import os 13 | 14 | 15 | import subprocess 16 | import shutil 17 | import time 18 | 19 | from retriever.web2pdf import pdf_print 20 | import datetime 21 | 22 | import glob 23 | from pyPdf import PdfFileReader 24 | 25 | 26 | # CHANGING TERMINAL TITLE 27 | sys.stdout.write("\x1b]2;" + 'PyperGrabber v{}'.format(VERSION) + "\x07") 28 | 29 | # check if folder path is already in use as it would cause trouble: 30 | if os.path.exists(tmp_dir): 31 | # try saving remnant files by moving to lost and found folder 32 | new_dir = save_dir + 'lost_and_found/' 33 | files = os.listdir(tmp_dir) 34 | for f in files: 35 | try: 36 | shutil.move(f, new_dir) 37 | except Exception as e: 38 | print e 39 | # removing tmp_dir: 40 | shutil.rmtree(tmp_dir) 41 | 42 | 43 | # retrieve pubmed's emails and extract PMID's from them 44 | pmids = mail2pmid() 45 | 46 | # counter variables for statistics 47 | abstracts = 0 48 | full_article = 0 49 | 50 | # retrieve paper: 51 | to = time.time() 52 | num_digits = len(str(len(pmids))) # dynamically calculate leading zeros 53 | for i, pmid in enumerate(pmids): 54 | print "Fetching paper {number:0{wd}d} of {tot} with PMID: {id} ...".\ 55 | format(wd=num_digits, number=i, tot=len(pmids), id=pmid), 56 | if not os.path.exists(tmp_dir): 57 | os.makedirs(tmp_dir) 58 | retriever(pmid) 59 | 60 | content = os.listdir(tmp_dir) 61 | if not content: # if dir empty make pdf of pubmed abstract: 62 | pdf_print(pmid, save_dir) 63 | abstracts += 1 64 | else: # if files exist rename them to PMID_i.pdf and move them one path level up 65 | pdfs = glob.glob(tmp_dir + '*.pdf') 66 | for pi, pdf in enumerate(pdfs): 67 | if pi == 0: 68 | index = '' 69 | else: 70 | index = '_{}'.format(pi+1) 71 | new_path = save_dir + pmid + index + '.pdf' 72 | os.rename(pdf, new_path) 73 | full_article += 1 74 | 75 | print "DONE." 76 | 77 | # eliminate any duplicate files and left over empty dirs: 78 | fd_cmd = "fdupes -rdN {} && find . -type d -empty -delete ".format(save_dir) # test manually to check if working 79 | subprocess.call([fd_cmd], shell=True) 80 | # removing pdf files failing integrity check, e.g. 
HTML files in disguise#: 81 | all_pdfs = glob.glob(save_dir + '*.pdf') 82 | for candidate in all_pdfs: 83 | try: 84 | mypdf = PdfFileReader(file( 'filename', 'rb')) 85 | except: 86 | print candidate,' is invalid pdf' 87 | shutil.rmtree(candidate) 88 | 89 | 90 | # finally removing tmp_dir: 91 | # shutil.rmtree(tmp_dir) 92 | 93 | # printing out concluding statistics: 94 | total_time = (time.time() - to)/60 95 | format_time = datetime.timedelta(seconds=total_time) # format to hh:mm:ss format 96 | print("Job took {0:0.0f} minutes to complete".format(format_time)) 97 | total_down = abstracts + full_article 98 | avg_time = datetime.timedelta(seconds=total_time/total_down) 99 | print("Average time to retrieve paper: {}".format(avg_time)) 100 | print('Abstracts are {} of total fetches: {}'.format(abstracts, total_down)) 101 | 102 | 103 | with open('time.txt', 'w') as out_f: 104 | out_f.write(format_time) 105 | -------------------------------------------------------------------------------- /PyperGrabber/retriever/retriever.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | from urllib import quote_plus 5 | from log_this import log_search, log_download 6 | from string_funcs import encode 7 | from config import * 8 | from get_title import get_title 9 | 10 | from easy_parallelize import easy_parallelize 11 | from flatten_list import flatten_list 12 | 13 | import urllib2 14 | from time import sleep 15 | from random import uniform 16 | from bs4 import BeautifulSoup 17 | 18 | from urlparse import urljoin 19 | 20 | import re 21 | import shelve 22 | 23 | 24 | # EMPTYING PERSISTENT STORAGE OF DOWNLOADED PDFs: 25 | db = shelve.open('download_db.db', writeback=True) 26 | try: 27 | for key in db: 28 | del db[key] 29 | finally: 30 | db.close() 31 | 32 | 33 | # mini crawler functions: 34 | def check_db(entry): 35 | db = shelve.open('download_db.db', writeback=True) 36 | try: 37 | if entry in db: 38 | # print '{} found'.format(entry) 39 | exist = True 40 | else: 41 | db[entry] = '' # putting entry into db 42 | exist = False 43 | finally: 44 | db.close() 45 | return exist 46 | 47 | 48 | def get_pdf(pdf_link): 49 | 50 | # check whether value already existing in permanent storage: 51 | pdf_name = pdf_link.rsplit('/', 1)[-1] # set filename according to last element of link 52 | if not check_db(pdf_name) and not check_db(pdf_link): 53 | # print 'Downloading: {}'.format(pdf_link) 54 | try: 55 | opener = urllib2.build_opener() 56 | opener.addheaders = [('User-agent', USER_AGENT)] 57 | 58 | r = opener.open(pdf_link) 59 | 60 | path = tmp_dir + pdf_name 61 | 62 | with open(path, "wb") as code: # 'w' 63 | code.write(r.read()) 64 | 65 | # log successful download: 66 | log_download('DOWNLOADED: {}'.format(pdf_link)) 67 | 68 | except Exception as e: 69 | log_download('FAILURE: {} | {}'.format(pdf_link, e)) 70 | else: 71 | log_download('File already downloaded: {}'.format(pdf_name)) 72 | 73 | 74 | def rem_blacklisted(url_l): 75 | ret_list = [] 76 | 77 | whitelist = [r'ncbi.nlm.nih.gov/pmc/articles', 78 | r'scholar?cluster =' 79 | ] 80 | 81 | blacklist = [r'mailto', # important to prevent crashes 82 | 83 | r'nlm.nih.gov', 84 | r'pubmed.gov', 85 | r'nih.gov', 86 | r'dhhs.gov', 87 | r'usa.gov', 88 | r'youtube.com', 89 | r'facebook.com', 90 | r'twitter.com', 91 | r'sci-hub.cc/donate', 92 | r'vk.com', 93 | r'google', 94 | r'scholar.google', 95 | r'.css', 96 | r'index.html' 97 | ] 98 | 99 | wl_rx = re.compile('.*' + '.*|.*'.join(wl for wl in whitelist) + '.*') 
100 | bl_rx = re.compile('.*' + '.*|.*'.join(bl for bl in blacklist) + '.*') 101 | 102 | for u in url_l: 103 | # if whitelisted or not blacklisted join to return list 104 | if re.match(wl_rx, u) or not re.match(bl_rx, u): 105 | ret_list.append(u) 106 | else: 107 | continue 108 | 109 | return ret_list 110 | 111 | 112 | def get_links(url): 113 | opener = urllib2.build_opener() 114 | opener.addheaders = [('User-agent', USER_AGENT)] 115 | 116 | # print ' get_links working on url: {}'.format(url) 117 | 118 | # if 'sci-hub.cc' in url: # workaround for not having captcha solving right now 119 | # driver = webdriver.Chrome() 120 | # driver.get("http://www.google.com") 121 | 122 | fetch_timeout = 30 123 | try: 124 | response = opener.open(url, timeout=fetch_timeout) 125 | 126 | if response: 127 | soup = BeautifulSoup(response, cr_parser, from_encoding=response.info().getparam('charset')) 128 | href_l = [urljoin(url, h['href']) for h in soup.find_all(href=True)] 129 | # print 'href_l 1: ', href_l 130 | href_l = list(set(href_l)) # removing potential duplicates 131 | href_l = rem_blacklisted(href_l) # removing blacklisted 132 | href_l = map(encode, href_l) # transforming all potential unicode items to stringm 133 | 134 | pdf_l = [p for p in href_l if p.lower().endswith('.pdf')] # picking potential pdf links 135 | 136 | # print 'pdf_l: ', pdf_l 137 | 138 | href_l = list(set(href_l) - set(pdf_l)) # removing pdf links from link list 139 | # print 'href_l 3: ', href_l 140 | 141 | # downloading pdf files: 142 | map(get_pdf, pdf_l) 143 | 144 | return href_l 145 | 146 | except Exception as e: 147 | log_download("ERROR in get_links: {}".format(e)) 148 | return [] 149 | 150 | 151 | def mini_crawler(seed_url): 152 | link_l = [seed_url] # starting link_l, populating with seed_url 153 | visited = [] 154 | # print 'link_l in mini_crawler: ', link_l 155 | max_depths = 2 156 | 157 | for i in range(max_depths): 158 | if not link_l: 159 | break 160 | else: 161 | go_to = set(link_l) - set(visited) # preventing visiting site twice 162 | go_to = list(go_to) 163 | visited.extend(link_l) 164 | 165 | res_x = easy_parallelize(get_links, go_to) 166 | link_l = list(set((flatten_list(res_x)))) 167 | 168 | # print 'link_l: ', link_l 169 | sleep(uniform(1.3, 4.7)) # small randomized pause to be easy on servers 170 | log_download('CRAWLER FINISHED working on seed: {}'.format(seed_url)) 171 | 172 | 173 | # -------------------------------------------------------------------------------------- 174 | ''' MAIN FUNCTION ''' 175 | 176 | 177 | def retriever(pmid): 178 | # print 'threader received data: ', data 179 | log_search('SEARCHING: {}'.format(pmid)) 180 | 181 | # retrievingpaper title via pubmed API: 182 | title = get_title(pmid, email) 183 | 184 | seed_links = [] # seed links, populated with data 185 | 186 | ncbi_url = pubmed_base_url + pmid 187 | scihub_url = scihub_base_url + pmid 188 | seed_links.extend([ncbi_url, scihub_url]) 189 | 190 | if title: 191 | scho_tit = quote_plus(title) 192 | gosch_url = schola_base_url.format(scho_tit) 193 | seed_links.append(gosch_url) 194 | 195 | # print 'seed_links: ', seed_links 196 | # via iteration: 197 | for seed in seed_links: 198 | mini_crawler(seed) 199 | 200 | 201 | 202 | -------------------------------------------------------------------------------- /PyperGrabber/mail2pmid/imbox/parser.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from six import BytesIO, binary_type 3 | 4 | import re 5 | import 
email 6 | import base64 7 | import quopri 8 | import time 9 | from datetime import datetime 10 | from email.header import decode_header 11 | from utils import str_encode, str_decode 12 | 13 | import logging 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | class Struct(object): 18 | def __init__(self, **entries): 19 | self.__dict__.update(entries) 20 | 21 | def keys(self): 22 | return self.__dict__.keys() 23 | 24 | def __repr__(self): 25 | return str(self.__dict__) 26 | 27 | 28 | def decode_mail_header(value, default_charset='us-ascii'): 29 | """ 30 | Decode a header value into a unicode string. 31 | """ 32 | try: 33 | headers = decode_header(value) 34 | except email.errors.HeaderParseError: 35 | return str_decode(str_encode(value, default_charset, 'replace'), default_charset) 36 | else: 37 | for index, (text, charset) in enumerate(headers): 38 | logger.debug("Mail header no. {}: {} encoding {}".format(index, str_decode(text, charset or 'utf-8'), charset)) 39 | try: 40 | headers[index] = str_decode(text, charset or default_charset, 41 | 'replace') 42 | except LookupError: 43 | # if the charset is unknown, force default 44 | headers[index] = str_decode(text, default_charset, 'replace') 45 | 46 | return ''.join(headers) 47 | 48 | 49 | def get_mail_addresses(message, header_name): 50 | """ 51 | Retrieve all email addresses from one message header. 52 | """ 53 | headers = [h for h in message.get_all(header_name, [])] 54 | addresses = email.utils.getaddresses(headers) 55 | 56 | for index, (address_name, address_email) in enumerate(addresses): 57 | addresses[index] = {'name': decode_mail_header(address_name), 58 | 'email': address_email} 59 | logger.debug("{} Mail addressees in message: <{}> {}".format(header_name.upper(), address_name, address_email)) 60 | return addresses 61 | 62 | 63 | def decode_param(param): 64 | name, v = param.split('=', 1) 65 | values = v.split('\n') 66 | value_results = [] 67 | for value in values: 68 | match = re.search(r'=\?((?:\w|-)+)\?(Q|B)\?(.+)\?=', value) 69 | if match: 70 | encoding, type_, code = match.groups() 71 | if type_ == 'Q': 72 | value = quopri.decodestring(code) 73 | elif type_ == 'B': 74 | value = base64.decodestring(code) 75 | value = str_encode(value, encoding) 76 | value_results.append(value) 77 | if value_results: 78 | v = ''.join(value_results) 79 | logger.debug("Decoded parameter {} - {}".format(name, v)) 80 | return name, v 81 | 82 | 83 | def parse_attachment(message_part): 84 | # Check again if this is a valid attachment 85 | content_disposition = message_part.get("Content-Disposition", None) 86 | if content_disposition is not None and not message_part.is_multipart(): 87 | dispositions = content_disposition.strip().split(";") 88 | 89 | if dispositions[0].lower() in ["attachment", "inline"]: 90 | file_data = message_part.get_payload(decode=True) 91 | 92 | attachment = { 93 | 'content-type': message_part.get_content_type(), 94 | 'size': len(file_data), 95 | 'content': BytesIO(file_data) 96 | } 97 | filename = message_part.get_param('name') 98 | if filename: 99 | attachment['filename'] = filename 100 | 101 | for param in dispositions[1:]: 102 | name, value = decode_param(param) 103 | 104 | if 'file' in name: 105 | attachment['filename'] = value 106 | 107 | if 'create-date' in name: 108 | attachment['create-date'] = value 109 | 110 | return attachment 111 | 112 | return None 113 | 114 | 115 | def decode_content(message): 116 | content = message.get_payload(decode=True) 117 | charset = message.get_content_charset('utf-8') 118 | try: 119 | 
return content.decode(charset) 120 | except AttributeError: 121 | return content 122 | 123 | 124 | def parse_email(raw_email): 125 | if isinstance(raw_email, binary_type): 126 | raw_email = str_encode(raw_email, 'utf-8') 127 | try: 128 | email_message = email.message_from_string(raw_email) 129 | except UnicodeEncodeError: 130 | email_message = email.message_from_string(raw_email.encode('utf-8')) 131 | maintype = email_message.get_content_maintype() 132 | parsed_email = {} 133 | 134 | parsed_email['raw_email'] = raw_email 135 | 136 | body = { 137 | "plain": [], 138 | "html": [] 139 | } 140 | attachments = [] 141 | 142 | if maintype in ('multipart', 'image'): 143 | logger.debug("Multipart message. Will process parts.") 144 | for part in email_message.walk(): 145 | content_type = part.get_content_type() 146 | part_maintype = part.get_content_maintype() 147 | content_disposition = part.get('Content-Disposition', None) 148 | if content_disposition or not part_maintype == "text": 149 | content = part.get_payload(decode=True) 150 | else: 151 | content = decode_content(part) 152 | 153 | is_inline = content_disposition is None \ 154 | or content_disposition == "inline" 155 | if content_type == "text/plain" and is_inline: 156 | body['plain'].append(content) 157 | elif content_type == "text/html" and is_inline: 158 | body['html'].append(content) 159 | elif content_disposition: 160 | attachment = parse_attachment(part) 161 | if attachment: 162 | attachments.append(attachment) 163 | 164 | elif maintype == 'text': 165 | payload = decode_content(email_message) 166 | body['plain'].append(payload) 167 | 168 | parsed_email['attachments'] = attachments 169 | 170 | parsed_email['body'] = body 171 | email_dict = dict(email_message.items()) 172 | 173 | parsed_email['sent_from'] = get_mail_addresses(email_message, 'from') 174 | parsed_email['sent_to'] = get_mail_addresses(email_message, 'to') 175 | parsed_email['cc'] = get_mail_addresses(email_message, 'cc') 176 | parsed_email['bcc'] = get_mail_addresses(email_message, 'bcc') 177 | 178 | value_headers_keys = ['subject', 'date', 'message-id'] 179 | key_value_header_keys = ['received-spf', 180 | 'mime-version', 181 | 'x-spam-status', 182 | 'x-spam-score', 183 | 'content-type'] 184 | 185 | parsed_email['headers'] = [] 186 | for key, value in email_dict.items(): 187 | 188 | if key.lower() in value_headers_keys: 189 | valid_key_name = key.lower().replace('-', '_') 190 | parsed_email[valid_key_name] = decode_mail_header(value) 191 | 192 | if key.lower() in key_value_header_keys: 193 | parsed_email['headers'].append({'Name': key, 194 | 'Value': value}) 195 | 196 | if parsed_email.get('date'): 197 | timetuple = email.utils.parsedate(parsed_email['date']) 198 | parsed_date = datetime.fromtimestamp(time.mktime(timetuple)) \ 199 | if timetuple else None 200 | parsed_email['parsed_date'] = parsed_date 201 | 202 | logger.info("Downloaded and parsed mail '{}' with {} attachments".format( 203 | parsed_email.get('subject'), len(parsed_email.get('attachments')))) 204 | return Struct(**parsed_email) 205 | --------------------------------------------------------------------------------
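
A minimal sketch of how the bundled parser might be exercised on its own, assuming Python 2, that the interpreter is started from inside the `imbox/` package directory (so `from parser import parse_email` resolves to the module above, as in `__init__.py`), and a made-up raw message; the attributes read back mirror the `parsed_email` dict built in `parse_email()`:

    # standalone sketch, not a file from the repository; the raw message below is invented for illustration
    from parser import parse_email

    raw_email = (
        "From: PubMed <efback@ncbi.nlm.nih.gov>\r\n"
        "To: you@example.org\r\n"
        "Subject: What's new for your search\r\n"
        "Date: Mon, 02 May 2016 08:00:00 +0000\r\n"
        "Content-Type: text/plain; charset=utf-8\r\n"
        "\r\n"
        "See https://www.ncbi.nlm.nih.gov/pubmed/27123456\r\n"
    )

    msg = parse_email(raw_email)
    print(msg.subject)        # decoded via decode_mail_header()
    print(msg.sent_from)      # list of {'name': ..., 'email': ...} dicts
    print(msg.body['plain'])  # list of text/plain payloads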
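And a compact sketch of the overall pipeline, assuming `config.ini` is filled in and the interpreter is started from the `PyperGrabber/` directory; it mirrors the main loop of `PyperGrabber.py` (PMIDs pulled from the PubMed alert mails, one crawl per PMID, abstract PDF as fallback) while leaving out the renaming, deduplication and statistics steps:

    # pipeline sketch, not a file from the repository
    import os

    from mail2pmid.mail2pmid import mail2pmid   # PMIDs extracted from PubMed alert mails
    from retriever.retriever import retriever   # crawls PubMed / Google Scholar / Sci-Hub
    from retriever.web2pdf import pdf_print     # fallback: save the PubMed abstract page as PDF
    from config import save_dir, tmp_dir        # paths resolved from config.ini

    pmids = mail2pmid()

    for pmid in pmids:
        if not os.path.exists(tmp_dir):
            os.makedirs(tmp_dir)
        retriever(pmid)                 # any PDFs found land in tmp_dir
        if not os.listdir(tmp_dir):     # nothing found: keep at least the abstract
            pdf_print(pmid, save_dir)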