├── .github └── workflows │ └── build.yml ├── .gitignore ├── LICENSE ├── README.md ├── examples ├── fetch_abstracts.py └── pubmedflow.ipynb ├── pubmedflow ├── __init__.py ├── pubmedflow.py └── utils.py └── setup.py /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Build 5 | 6 | on: 7 | push: 8 | branches: [ "main" ] 9 | pull_request: 10 | branches: [ "main" ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | strategy: 17 | fail-fast: true 18 | matrix: 19 | python-version: ["3.8", "3.9"] 20 | os: [ubuntu-latest, macos-latest] 21 | 22 | steps: 23 | - uses: actions/checkout@v3 24 | - name: Set up Python ${{ matrix.python-version }} 25 | uses: actions/setup-python@v3 26 | with: 27 | python-version: ${{ matrix.python-version }} 28 | - name: Install dependencies 29 | run: | 30 | python -m pip install --upgrade pip 31 | python -m pip install flake8 pytest 32 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 33 | - name: Lint with flake8 34 | run: | 35 | # stop the build if there are Python syntax errors or undefined names 36 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 37 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 38 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 39 | - name: Test with pytest 40 | run: | 41 | pip install --upgrade pip 42 | python setup.py install 43 | 44 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | .DS_Store 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | pip-wheel-metadata/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # pipenv 89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 92 | # install all needed dependencies. 93 | #Pipfile.lock 94 | 95 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 96 | __pypackages__/ 97 | 98 | # Celery stuff 99 | celerybeat-schedule 100 | celerybeat.pid 101 | 102 | # SageMath parsed files 103 | *.sage.py 104 | 105 | # Environments 106 | .env 107 | .venv 108 | env/ 109 | venv/ 110 | ENV/ 111 | env.bak/ 112 | venv.bak/ 113 | 114 | # Spyder project settings 115 | .spyderproject 116 | .spyproject 117 | 118 | # Rope project settings 119 | .ropeproject 120 | 121 | # mkdocs documentation 122 | /site 123 | 124 | # mypy 125 | .mypy_cache/ 126 | .dmypy.json 127 | dmypy.json 128 | 129 | # Pyre type checker 130 | .pyre/ 131 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 NFFLOW 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

# PUBMED-FLOW
2 | ### Open source data collection tool to fetch data from pubmed
3 | ### Contribute and Support
4 | 
5 | 
6 | [![License:MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
7 | [![GitHub commit](https://img.shields.io/github/last-commit/nfflow/pubmedflow)](https://github.com/nfflow/pubmedflow/commits/main)
8 | [![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg?style=flat-square)](http://makeapullrequest.com)
9 | [![Open All Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1mjlnHAb7aqwfDEylo05z3RdIyyaNRoQ5?usp=sharing)
10 | 
11 | 
12 | ## 🎮 Features
13 | 
14 | - Fetch PubMed IDs (PMIDs) for a keyword query (multi-keyword queries are supported)
15 | - Fetch the abstracts of research papers from PubMed for those PMIDs
16 | - Download the full PDF for each PMID when it is available on PubMed Central (PMC)
17 | - If a PDF is not available on PMC, it can be fetched via Sci-Hub (when `scihub=True`)
18 | 
19 | 
20 | ## How to obtain an NCBI API key?
21 | 
22 | - Follow this [tutorial](https://ncbiinsights.ncbi.nlm.nih.gov/2017/11/02/new-api-keys-for-the-e-utilities/#:~:text=To%20create%20the%20key%2C%20go,and%20copy%20the%20resulting%20key)
23 | 
24 | ## Installation
25 | ### From PyPI
26 | 
27 | ```
28 | pip install pubmedflow
29 | ```
30 | 
31 | The NLP helpers (`pubmed_train`, `pubmed_qa`, `pubmed_summarize`, `pubmed_entity_extraction`) also need the optional `nfmodelapis` dependency: `pip install pubmedflow[qa]`.
32 | 
33 | ### From source
34 | ```
35 | python setup.py install
36 | ```
37 | OR
38 | ```
39 | pip install git+https://github.com/nfflow/pubmedflow
40 | ```
41 | 
42 | ## How to use the API?
43 | 
44 | `LazyPubmed` arguments:
45 | Name | Input | Description
46 | ----------- | ----------- | -----------
47 | title_query | Required, str | keyword query to search PubMed with
48 | folder_name | Optional, str | path to store output data (default: `pubmed_data`)
49 | api_key | Optional, str | NCBI E-utilities API key
50 | max_documents | Optional, int | maximum number of documents to fetch
51 | download_pdf | Optional, bool | also download full-text PDFs (default: `True`)
52 | scihub | Optional, bool | fall back to Sci-Hub when a PDF is not freely available (default: `False`)
53 | 
54 | 
55 | ## Quick Start:
56 | 
57 | ### Download PubMed articles as PDFs and a DataFrame -
58 | 
59 | ```python
60 | from pubmedflow import LazyPubmed
61 | 
62 | 
63 | pb = LazyPubmed('chronic disease',
64 |                 folder_name='pubmed_data',
65 |                 api_key='',
66 |                 max_documents=None,
67 |                 download_pdf=True,
68 |                 scihub=False)
69 | 
70 | df_result = pb.final_df  # metadata, abstracts and extracted PDF text
71 | ```
72 | 
73 | ### Perform unsupervised learning to make a pre-trained model from the collected data:
74 | 
75 | ```python
76 | pb.pubmed_train(model_name='sentence-transformers/all-mpnet-base-v2',
77 |                 model_output_path='pubmedflow_model',
78 |                 model_architecture='ct')
79 | ```
80 | 
81 | ### Do question answering on the downloaded text to get answer spans from each article:
82 | 
83 | ```python
84 | qa_results = pb.pubmed_qa(qa_query='What are the chronic diseases')
85 | print(qa_results)
86 | ```
87 | 
88 | ### Summarize each of them
89 | 
90 | ```python
91 | summ_results = pb.pubmed_summarize()
92 | print(summ_results)
93 | ```
94 | 
95 | ### Perform entity extraction on each of them
96 | 
97 | ```python
98 | ents = pb.pubmed_entity_extraction()
99 | print(ents)
100 | ```
101 | 
-------------------------------------------------------------------------------- /examples/fetch_abstracts.py: --------------------------------------------------------------------------------
1 | from pubmedflow import LazyPubmed
2 | 
3 | # fetch abstracts only (PDF download disabled) for a keyword query
4 | pb = LazyPubmed('lncRNA',
5 |                 api_key='your_api_key',
6 |                 max_documents=5,
7 |                 download_pdf=False)
8 | 
9 | result = pb.final_df
-------------------------------------------------------------------------------- /examples/pubmedflow.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 5, 6 | "id": "d7aa088f", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "from pubmedflow import LazyPubmed\n", 11 | "\n", 12 | "pb = LazyPubmed()\n", 13 | "df_result = 
pb.pubmed_search(query = 'Chronic',\n", 14 | " key = \"your_api_key\",\n", 15 | " max_documents = 10,\n", 16 | " download_pdf = True, \n", 17 | " scihub = False)" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 2, 23 | "id": "e14c8487", 24 | "metadata": {}, 25 | "outputs": [ 26 | { 27 | "data": { 28 | "text/html": [ 29 | "
\n", 30 | "\n", 43 | "\n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | "
titleissuepagesabstractjournalauthorspubdatepmidmesh_termspublication_types...referencesdeleteaffiliationspmcother_idmedline_tanlm_unique_idissn_linkingcountrypdf_content
0Assessing the experience of person-centred coo...25(3)1069-1080BACKGROUND\\nCountries are adapting their healt...Health expectations : an international journal...Rijken|Mieke|M|https://orcid.org/0000-0001-607...202235318778D000328:Adult; D000368:Aged; D000369:Aged, 80 ...D016428:Journal Article...29444767;29166917;8870135;15804318;22778146;18...FalseNivel (Netherlands Institute for Health Servic...NaNNaNHealth Expect98159261369-6513EnglandReceived 11 August 2021 Revised 28 November 20...
1Association Between Systolic Blood Pressure Va...11(11)e025513Background Whether visit-to-visit systolic blo...Journal of the American Heart AssociationPark|Cheol Ho|CH|0000-0003-4636-5745;Kim|Hyung...202235656977D001794:Blood Pressure; D002318:Cardiovascular...D016428:Journal Article...NaNFalseDepartment of Internal Medicine College of Med...NaNNaNJ Am Heart Assoc1015805242047-9980EnglandNaN
2Comorbidity progression patterns of major chro...NaN17423953221087647OBJECTIVE\\nThe presence of one chronic disease...Chronic illnessUddin|Shahadat|S|https://orcid.org/0000-0003-0...202235306857NaND016428:Journal Article...NaNFalseFaculty of Engineering, 4334The University of ...NaNNaNChronic Illn1012530191742-3953United StatesNaN
3A Review of Laser Therapy and Low-Intensity Ul...26(1)57-63PURPOSE OF REVIEW\\nChronic pain management the...Current pain and headache reportsChen|Frank R|FR|;Manzi|Joseph E|JE|;Mehta|Neel...202235133560D059350:Chronic Pain; D006801:Humans; D053685:...D016428:Journal Article; D016454:Review...32880358;25824429;31726927;30443883;12605432;2...FalseDepartment of Anesthesiology, Hospital of the ...NaNNaNCurr Pain Headache Rep1009706661534-3081United StatesNaN
4Patients' and healthcare providers' perception...22(1)9BACKGROUND\\nTelehealth and online health infor...BMC geriatricsJiang|Yuyu|Y|;Sun|Pingping|P|;Chen|Zhongyi|Z|;...202234979967D000368:Aged; D019468:Disease Management; D006...D016428:Journal Article; D013485:Research Supp......32512462;16867972;32314971;12020305;33687342;2...FalseResearch office of chronic disease management ...NaNNaNBMC Geriatr1009685481471-2318EnglandJiang et al BMC Geriatrics 2022 22 9 https doi...
5A Preliminary Study of Provider Burden in the ...22(11)1408-1417This study compared perceptions of the burden ...The journal of painTait|Raymond C|RC|;Chibnall|John T|JT|;Kalauok...202133989786D000328:Adult; D001291:Attitude of Health Pers...D016428:Journal Article; D013485:Research Supp......NaNFalseDepartment of Psychiatry and Behavioral Neuros...NaNNaNJ Pain1008986571526-5900United StatesNaN
6\"A little bit of a guidance and a little bit o...43(23)3347-3356PURPOSE\\nTo understand preferences, barriers, ...Disability and rehabilitationDnes|Natalie|N|;Coley|Bridget|B|;Frisby|Kaitly...202132223460D000293:Adolescent; D000328:Adult; D059350:Chr...D016428:Journal Article; D013485:Research Supp......NaNFalseDepartment of Physical Therapy, University of ...NaNNaNDisabil Rehabil92071790963-8288EnglandNaN
7Chronic disease health literacy in First Natio...30(17-18)2683-2695AIM\\nTo explore chronic disease education, sel...Journal of clinical nursingRheault|Haunnah|H|https://orcid.org/0000-0001-...202134180097D000328:Adult; D001315:Australia; D002908:Chro...D016428:Journal Article...NaNFalseSchool of Nursing, Queensland University of Te...NaNNaNJ Clin Nurs92073020962-1067EnglandNaN
8Patient Perceptions of Physician Burden in the...22(9)1060-1071While patient perceptions of burden to caregiv...The journal of painTait|Raymond C|RC|;Chibnall|John T|JT|;Kalauok...202133727158D000328:Adult; D059350:Chronic Pain; D002983:C...D016428:Journal Article...NaNFalseDepartment of Psychiatry and Behavioral Neuros...NaNNaNJ Pain1008986571526-5900United StatesNaN
9The relationship between the perception of chr...NaN17423953211039792OBJECTIVES\\nIn this study, it was aimed to det...Chronic illnessAkca|Nesrin|N|;Saygili|Meltem|M|;Ture|Aysun Ka...202134569319NaND016428:Journal Article...NaNFalse52977Kirikkale University, Faculty of Health S...NaNNaNChronic Illn1012530191742-3953United StatesNaN
\n", 313 | "

10 rows × 23 columns

\n", 314 | "
" 315 | ], 316 | "text/plain": [ 317 | " title issue \\\n", 318 | "0 Assessing the experience of person-centred coo... 25(3) \n", 319 | "1 Association Between Systolic Blood Pressure Va... 11(11) \n", 320 | "2 Comorbidity progression patterns of major chro... NaN \n", 321 | "3 A Review of Laser Therapy and Low-Intensity Ul... 26(1) \n", 322 | "4 Patients' and healthcare providers' perception... 22(1) \n", 323 | "5 A Preliminary Study of Provider Burden in the ... 22(11) \n", 324 | "6 \"A little bit of a guidance and a little bit o... 43(23) \n", 325 | "7 Chronic disease health literacy in First Natio... 30(17-18) \n", 326 | "8 Patient Perceptions of Physician Burden in the... 22(9) \n", 327 | "9 The relationship between the perception of chr... NaN \n", 328 | "\n", 329 | " pages abstract \\\n", 330 | "0 1069-1080 BACKGROUND\\nCountries are adapting their healt... \n", 331 | "1 e025513 Background Whether visit-to-visit systolic blo... \n", 332 | "2 17423953221087647 OBJECTIVE\\nThe presence of one chronic disease... \n", 333 | "3 57-63 PURPOSE OF REVIEW\\nChronic pain management the... \n", 334 | "4 9 BACKGROUND\\nTelehealth and online health infor... \n", 335 | "5 1408-1417 This study compared perceptions of the burden ... \n", 336 | "6 3347-3356 PURPOSE\\nTo understand preferences, barriers, ... \n", 337 | "7 2683-2695 AIM\\nTo explore chronic disease education, sel... \n", 338 | "8 1060-1071 While patient perceptions of burden to caregiv... \n", 339 | "9 17423953211039792 OBJECTIVES\\nIn this study, it was aimed to det... \n", 340 | "\n", 341 | " journal \\\n", 342 | "0 Health expectations : an international journal... \n", 343 | "1 Journal of the American Heart Association \n", 344 | "2 Chronic illness \n", 345 | "3 Current pain and headache reports \n", 346 | "4 BMC geriatrics \n", 347 | "5 The journal of pain \n", 348 | "6 Disability and rehabilitation \n", 349 | "7 Journal of clinical nursing \n", 350 | "8 The journal of pain \n", 351 | "9 Chronic illness \n", 352 | "\n", 353 | " authors pubdate pmid \\\n", 354 | "0 Rijken|Mieke|M|https://orcid.org/0000-0001-607... 2022 35318778 \n", 355 | "1 Park|Cheol Ho|CH|0000-0003-4636-5745;Kim|Hyung... 2022 35656977 \n", 356 | "2 Uddin|Shahadat|S|https://orcid.org/0000-0003-0... 2022 35306857 \n", 357 | "3 Chen|Frank R|FR|;Manzi|Joseph E|JE|;Mehta|Neel... 2022 35133560 \n", 358 | "4 Jiang|Yuyu|Y|;Sun|Pingping|P|;Chen|Zhongyi|Z|;... 2022 34979967 \n", 359 | "5 Tait|Raymond C|RC|;Chibnall|John T|JT|;Kalauok... 2021 33989786 \n", 360 | "6 Dnes|Natalie|N|;Coley|Bridget|B|;Frisby|Kaitly... 2021 32223460 \n", 361 | "7 Rheault|Haunnah|H|https://orcid.org/0000-0001-... 2021 34180097 \n", 362 | "8 Tait|Raymond C|RC|;Chibnall|John T|JT|;Kalauok... 2021 33727158 \n", 363 | "9 Akca|Nesrin|N|;Saygili|Meltem|M|;Ture|Aysun Ka... 2021 34569319 \n", 364 | "\n", 365 | " mesh_terms \\\n", 366 | "0 D000328:Adult; D000368:Aged; D000369:Aged, 80 ... \n", 367 | "1 D001794:Blood Pressure; D002318:Cardiovascular... \n", 368 | "2 NaN \n", 369 | "3 D059350:Chronic Pain; D006801:Humans; D053685:... \n", 370 | "4 D000368:Aged; D019468:Disease Management; D006... \n", 371 | "5 D000328:Adult; D001291:Attitude of Health Pers... \n", 372 | "6 D000293:Adolescent; D000328:Adult; D059350:Chr... \n", 373 | "7 D000328:Adult; D001315:Australia; D002908:Chro... \n", 374 | "8 D000328:Adult; D059350:Chronic Pain; D002983:C... \n", 375 | "9 NaN \n", 376 | "\n", 377 | " publication_types ... \\\n", 378 | "0 D016428:Journal Article ... \n", 379 | "1 D016428:Journal Article ... 
\n", 380 | "2 D016428:Journal Article ... \n", 381 | "3 D016428:Journal Article; D016454:Review ... \n", 382 | "4 D016428:Journal Article; D013485:Research Supp... ... \n", 383 | "5 D016428:Journal Article; D013485:Research Supp... ... \n", 384 | "6 D016428:Journal Article; D013485:Research Supp... ... \n", 385 | "7 D016428:Journal Article ... \n", 386 | "8 D016428:Journal Article ... \n", 387 | "9 D016428:Journal Article ... \n", 388 | "\n", 389 | " references delete \\\n", 390 | "0 29444767;29166917;8870135;15804318;22778146;18... False \n", 391 | "1 NaN False \n", 392 | "2 NaN False \n", 393 | "3 32880358;25824429;31726927;30443883;12605432;2... False \n", 394 | "4 32512462;16867972;32314971;12020305;33687342;2... False \n", 395 | "5 NaN False \n", 396 | "6 NaN False \n", 397 | "7 NaN False \n", 398 | "8 NaN False \n", 399 | "9 NaN False \n", 400 | "\n", 401 | " affiliations pmc other_id \\\n", 402 | "0 Nivel (Netherlands Institute for Health Servic... NaN NaN \n", 403 | "1 Department of Internal Medicine College of Med... NaN NaN \n", 404 | "2 Faculty of Engineering, 4334The University of ... NaN NaN \n", 405 | "3 Department of Anesthesiology, Hospital of the ... NaN NaN \n", 406 | "4 Research office of chronic disease management ... NaN NaN \n", 407 | "5 Department of Psychiatry and Behavioral Neuros... NaN NaN \n", 408 | "6 Department of Physical Therapy, University of ... NaN NaN \n", 409 | "7 School of Nursing, Queensland University of Te... NaN NaN \n", 410 | "8 Department of Psychiatry and Behavioral Neuros... NaN NaN \n", 411 | "9 52977Kirikkale University, Faculty of Health S... NaN NaN \n", 412 | "\n", 413 | " medline_ta nlm_unique_id issn_linking country \\\n", 414 | "0 Health Expect 9815926 1369-6513 England \n", 415 | "1 J Am Heart Assoc 101580524 2047-9980 England \n", 416 | "2 Chronic Illn 101253019 1742-3953 United States \n", 417 | "3 Curr Pain Headache Rep 100970666 1534-3081 United States \n", 418 | "4 BMC Geriatr 100968548 1471-2318 England \n", 419 | "5 J Pain 100898657 1526-5900 United States \n", 420 | "6 Disabil Rehabil 9207179 0963-8288 England \n", 421 | "7 J Clin Nurs 9207302 0962-1067 England \n", 422 | "8 J Pain 100898657 1526-5900 United States \n", 423 | "9 Chronic Illn 101253019 1742-3953 United States \n", 424 | "\n", 425 | " pdf_content \n", 426 | "0 Received 11 August 2021 Revised 28 November 20... \n", 427 | "1 NaN \n", 428 | "2 NaN \n", 429 | "3 NaN \n", 430 | "4 Jiang et al BMC Geriatrics 2022 22 9 https doi... 
\n", 431 | "5 NaN \n", 432 | "6 NaN \n", 433 | "7 NaN \n", 434 | "8 NaN \n", 435 | "9 NaN \n", 436 | "\n", 437 | "[10 rows x 23 columns]" 438 | ] 439 | }, 440 | "execution_count": 2, 441 | "metadata": {}, 442 | "output_type": "execute_result" 443 | } 444 | ], 445 | "source": [ 446 | "df_result" 447 | ] 448 | } 449 | ], 450 | "metadata": { 451 | "kernelspec": { 452 | "display_name": "Python 3 (ipykernel)", 453 | "language": "python", 454 | "name": "python3" 455 | }, 456 | "language_info": { 457 | "codemirror_mode": { 458 | "name": "ipython", 459 | "version": 3 460 | }, 461 | "file_extension": ".py", 462 | "mimetype": "text/x-python", 463 | "name": "python", 464 | "nbconvert_exporter": "python", 465 | "pygments_lexer": "ipython3", 466 | "version": "3.8.9" 467 | } 468 | }, 469 | "nbformat": 4, 470 | "nbformat_minor": 5 471 | } 472 | -------------------------------------------------------------------------------- /pubmedflow/__init__.py: -------------------------------------------------------------------------------- 1 | from .pubmedflow import * -------------------------------------------------------------------------------- /pubmedflow/pubmedflow.py: -------------------------------------------------------------------------------- 1 | """ 2 | This class is to implement the core pubmed functions, 3 | download the articles and query based on keywords 4 | @author: Aaditya(Ankit) 5 | @date created: 27/06/2022 6 | @date last modified: 02/08/2022 7 | """ 8 | 9 | from pathlib import Path 10 | import pandas as pd 11 | import uuid 12 | from .utils import fetch, xml2df, get_pdf, get_final_data 13 | 14 | 15 | class LazyPubmed(object): 16 | 17 | def __init__(self, title_query, 18 | folder_name='pubmed_data', 19 | api_key='', 20 | max_documents=None, 21 | download_pdf=True, 22 | scihub=False): 23 | 24 | # creating folders for storing data 25 | # --------------------------------------------------------- 26 | 27 | self.folder_uuid = str(uuid.uuid4()) 28 | self.folder_name = folder_name 29 | self.raw_pdf_path = f'{self.folder_name}/{self.folder_uuid}/raw_pdfs/' 30 | self.final_df = f'{self.folder_name}/{self.folder_uuid}/final_df/' 31 | self.raw_abs_path = f'{self.folder_name}/{self.folder_uuid}/raw_abstracts/' 32 | self.meta_data_path = f'{self.folder_name}/{self.folder_uuid}/meta_data/' 33 | self.xml2pdf_path = f'{self.folder_name}/{self.folder_uuid}/xml2df/' 34 | self.key = api_key 35 | 36 | Path(self.raw_pdf_path).mkdir(parents=True, 37 | exist_ok=True) 38 | Path(self.raw_abs_path).mkdir(parents=True, 39 | exist_ok=True) 40 | Path(self.meta_data_path).mkdir(parents=True, 41 | exist_ok=True) 42 | Path(self.xml2pdf_path).mkdir(parents=True, 43 | exist_ok=True) 44 | Path(self.final_df).mkdir(parents=True, 45 | exist_ok=True) 46 | # --------------------------------------------------------- 47 | self.user_agent_list = [ 48 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15', 49 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0', 50 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36', 51 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0', 52 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36', 53 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36', 54 | 'Mozilla/5.0 
(Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.4 Safari/605.1.15', 55 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:101.0) Gecko/20100101 Firefox/101.0', 56 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.61 Safari/537.36 OPR/88.0.4412.27', 57 | ] 58 | 59 | final_df = self.download_articles(title_query, 60 | max_documents=max_documents, 61 | download_pdf=download_pdf, 62 | scihub=scihub) 63 | self.final_df = final_df 64 | 65 | 66 | def download_articles(self, query, 67 | max_documents=None, 68 | download_pdf=True, 69 | scihub=False, 70 | ): 71 | """function to fetch ids, fetch abstracts and fetch respective pdf files""" 72 | 73 | fetch_ids = fetch(self,query, max_documents=max_documents) 74 | final_df = xml2df(self.raw_abs_path, self.xml2pdf_path) 75 | final_df = final_df[final_df['pmid'].notna()] 76 | final_df['pmid'] = final_df['pmid'].apply(lambda x: int(x)) 77 | 78 | pids = list(set(list(final_df['pmid']))) 79 | 80 | if download_pdf: 81 | get_pdf(self, pids, save=True, scihub=scihub) 82 | dart = get_final_data(self.raw_pdf_path) 83 | dart = dart[dart['pmid'].notna()] 84 | dart['pmid'] = dart['pmid'].apply(lambda x: int(x)) 85 | final_df = pd.merge(final_df, dart, on='pmid', how='left') 86 | 87 | final_df.to_csv(f'{self.final_df}final_df.csv') 88 | return final_df 89 | 90 | def pubmed_train(self, 91 | model_name='sentence-transformers/all-mpnet-base-v2', 92 | model_output_path='pubmedflow_model', 93 | model_architecture='ct'): 94 | 95 | final_df = self.final_df 96 | pdf_content = [i for i in final_df['pdf_content'] if isinstance(i, 97 | str)] 98 | import nltk 99 | nltk.download('punkt') 100 | from nltk.tokenize import sent_tokenize 101 | train_sentences = [] 102 | for text in pdf_content: 103 | train_sentences += sent_tokenize(text) 104 | 105 | if len(train_sentences) > 0: 106 | print(train_sentences) 107 | train_df = pd.DataFrame({'text': train_sentences}) 108 | 109 | from nfmodelapis.text.SentenceEmbedder import ModelSelect 110 | trainer = ModelSelect(model_name, 111 | model_output_path, 112 | model_architecture=model_architecture 113 | ).return_trainer() 114 | trainer.train(data=train_df) 115 | else: 116 | raise Exception('''No data collected to train. 
117 | Check the search parameters to 118 | collect more data''') 119 | 120 | def pubmed_entity_extraction(self): 121 | final_df = self.final_df 122 | from nfmodelapis.text.ner import NERPipeline 123 | ner = NERPipeline(final_df) 124 | ents = ner.batch_ner('pdf_content') 125 | return ents 126 | 127 | def pubmed_qa(self, 128 | qa_query): 129 | 130 | final_df = self.final_df 131 | 132 | from nfmodelapis.text.question_answering import QAPipeline 133 | pipe = QAPipeline(final_df) 134 | res = pipe.batch_qa(qa_query, 'pdf_content') 135 | return res 136 | 137 | def pubmed_summarize(self): 138 | final_df = self.final_df 139 | 140 | from nfmodelapis.text.summarization import SummarizationPipeline 141 | pipe = SummarizationPipeline(final_df) 142 | res = pipe.batch_summarize('pdf_content') 143 | return res 144 | 145 | 146 | -------------------------------------------------------------------------------- /pubmedflow/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | This class is to implement the util functions for pubmed main class 3 | @author: Aaditya(Ankit) 4 | @date created: 27/06/2022 5 | @date last modified: 02/08/2022 6 | """ 7 | import re 8 | import pubmed_parser as pp 9 | import pandas as pd 10 | from tqdm import tqdm 11 | import io 12 | import uuid 13 | import glob 14 | 15 | from pdfminer3.layout import LAParams 16 | from pdfminer3.pdfpage import PDFPage 17 | from pdfminer3.pdfinterp import PDFResourceManager 18 | from pdfminer3.pdfinterp import PDFPageInterpreter 19 | from pdfminer3.converter import TextConverter 20 | 21 | import random 22 | import requests 23 | 24 | import json 25 | 26 | from datetime import date 27 | from metapub import FindIt 28 | from bs4 import BeautifulSoup 29 | from scidownl import scihub_download 30 | 31 | 32 | def preprocess_text(sentence): 33 | """Remove punctuations and extra spaces""" 34 | 35 | sentence = re.sub('[^a-zA-Z0-9]', ' ', sentence) 36 | sentence = re.sub(r'\s+', ' ', sentence) 37 | 38 | return sentence 39 | 40 | 41 | def pdf_in(x): 42 | """Read the text content of a pdf""" 43 | 44 | resource_manager = PDFResourceManager() 45 | fake_file_handle = io.StringIO() 46 | converter = TextConverter( 47 | resource_manager, fake_file_handle, laparams=LAParams()) 48 | page_interpreter = PDFPageInterpreter(resource_manager, converter) 49 | 50 | with open(x, 'rb') as fh: 51 | 52 | for page in PDFPage.get_pages(fh, 53 | caching=True, 54 | check_extractable=True): 55 | page_interpreter.process_page(page) 56 | 57 | text = fake_file_handle.getvalue() 58 | 59 | # close open handles 60 | converter.close() 61 | fake_file_handle.close() 62 | 63 | return text 64 | 65 | 66 | def get_pdftext_content(pdf_name): 67 | """read multiple pdfs from a folder""" 68 | 69 | result_data = pdf_in(pdf_name) 70 | t = [k for k in result_data.split('\n') if k != ''] 71 | t_join = " ".join(t) 72 | if 'References' in t_join: 73 | t_join = "".join(t_join.split('References')[:-1]) 74 | 75 | return t_join 76 | 77 | 78 | def get_final_data(folder_name): 79 | """convert pdf text content into a pd.dataframe""" 80 | 81 | df_data = {'pmid': [], 'pdf_content': []} 82 | 83 | pdfs = [pdf_file 84 | for pdf_file in glob.glob(f'{folder_name}*')] 85 | 86 | for single_pdf in tqdm(range(len(pdfs))): 87 | 88 | fname = pdfs[single_pdf] 89 | pdf_name = fname.split('/')[3].split('.pdf')[0] 90 | df_data['pmid'].append(pdf_name) 91 | 92 | try: 93 | pdf_content = get_pdftext_content(fname) 94 | 95 | if pdf_content != ' ': 96 | 
df_data['pdf_content'].append(pdf_content) 97 | else: 98 | df_data['pdf_content'].append('') 99 | 100 | except Exception as e: 101 | pass 102 | 103 | df_data = pd.DataFrame(df_data) 104 | return df_data 105 | 106 | 107 | def parse_xml(file): 108 | """parse the xml file""" 109 | 110 | dicts_out = pp.parse_medline_xml(file) 111 | return dicts_out 112 | 113 | 114 | def xml2df(folder_name, save_path): 115 | """xml data to pd.dataframe""" 116 | 117 | all_files = glob.glob(f"./{folder_name}*xml") 118 | u_id = str(uuid.uuid4()) 119 | 120 | df_list = [] 121 | 122 | try: 123 | for i in tqdm(all_files): 124 | raw_df = parse_xml(i) 125 | df = pd.DataFrame(raw_df) 126 | df_list.append(df) 127 | 128 | final_df = pd.concat(df_list) 129 | 130 | final_df.to_csv(f'./{save_path}{u_id}.csv', index=False) 131 | final_df = pd.read_csv(f'./{save_path}{u_id}.csv') 132 | 133 | # it will replace blank will NaN now load again and select without NaN 134 | final_df = final_df[final_df['abstract'].notna() 135 | ].reset_index(drop=True) 136 | final_df.to_csv(f'{save_path}{u_id}.csv', index=False) 137 | return final_df 138 | 139 | except Exception as e: 140 | print(e) 141 | pass 142 | 143 | 144 | def request_head(self, url): 145 | """Request function for urls""" 146 | 147 | headers = requests.utils.default_headers() 148 | headers['User-Agent'] = random.choice(self.user_agent_list) 149 | r = requests.get(url, headers=headers, 150 | allow_redirects=True, 151 | verify=False) 152 | return r 153 | 154 | 155 | def pdf_links(self, pmid): 156 | """Get pdf links from Pubmed website""" 157 | 158 | data = {} 159 | url = f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/" 160 | url_request = request_head(self, url) 161 | soup = BeautifulSoup(url_request.text, 'html.parser') 162 | 163 | try: 164 | full_text_class = soup.find_all( 165 | "div", {"class": "full-text-links"}) 166 | full_content = full_text_class[0].find_all( 167 | "div", {"class": "full-text-links-list"}) 168 | all_links = full_content[0].findAll("a", href=True) 169 | 170 | for single_link in all_links: 171 | data[single_link.text.strip()] = single_link['href'] 172 | return data 173 | except Exception as e: 174 | print(e) 175 | return {'data': None} 176 | 177 | 178 | def get_date(self): 179 | currentDate = date.today() 180 | today = currentDate.strftime('%Y/%m/%d') 181 | return today 182 | 183 | 184 | def pmc(self, url): 185 | """Download pdf from pmc website""" 186 | 187 | url_request = request_head(self, url) 188 | soup = BeautifulSoup(url_request.text, 'html.parser') 189 | full_text_class = soup.find_all( 190 | "ul", {"class": "pmc-sidebar__formats"}) 191 | link_ = full_text_class[0].find_all( 192 | "li", {"class": "pdf-link other_item"})[0].findAll("a", href=True)[0] 193 | link_url = f"https://www.ncbi.nlm.nih.gov{link_['href']}" 194 | 195 | return link_url 196 | 197 | 198 | def save_pdf(self, pmid, pdf_url): 199 | """save pdf in local folder""" 200 | 201 | pdf_request = request_head(self, pdf_url) 202 | with open(f'{self.raw_pdf_path}{pmid}.pdf', 'wb') as f: 203 | f.write(pdf_request.content) 204 | return 0 205 | 206 | 207 | def scihub_mode(self, pmid): 208 | """scihub search""" 209 | 210 | paper_type = "pmid" 211 | out = f'{self.raw_pdf_path}{pmid}.pdf' 212 | scihub_download(pmid, paper_type=paper_type, out=out) 213 | 214 | 215 | def get_pdf(self, pmids, save=False, scihub=True): 216 | """Main function to download and search pdfs -> save in local folder""" 217 | 218 | downloadble_url = {} 219 | not_downloaded = {} 220 | 221 | pdf_count = len(pmids) 222 | print(f"Total pdf 
downloading : {pdf_count}.. \n") 223 | 224 | for pmid in tqdm(pmids): 225 | 226 | try: 227 | pdf_source = pdf_links(self, pmid) 228 | if 'Free PMC article' in pdf_source.keys(): 229 | pdf_url = pmc(self, pdf_source['Free PMC article']) 230 | downloadble_url[pmid] = pdf_url 231 | if save: 232 | save_pdf(self, pmid, pdf_url) 233 | 234 | else: 235 | if FindIt(pmid).url: 236 | print("saving from findit") 237 | findit_url = FindIt(pmid).url 238 | save_pdf(self, pmid, findit_url) 239 | downloadble_url[pmid] = findit_url 240 | 241 | elif scihub: 242 | print("saving from scihub") 243 | scihub_mode(self, pmid) 244 | downloadble_url[pmid] = 'sci_hub' 245 | else: 246 | not_downloaded[pmid] = pdf_source 247 | 248 | except Exception as e: 249 | pass 250 | 251 | return json.dumps({ 252 | 'downloaded': downloadble_url, 253 | 'not_downloaded': not_downloaded 254 | }, indent=3) 255 | 256 | 257 | def write_json(self, path_name, data, name): 258 | """Write json data""" 259 | 260 | with open(f'{path_name}{name}.json', 'w', encoding='utf-8') as f: 261 | json.dump(data, f, ensure_ascii=False, indent=4) 262 | return 0 263 | 264 | 265 | def get_records(self, query=None): 266 | """get fetch result and ids from ncbi website using api""" 267 | 268 | if query: 269 | search_url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?sort=relevance&db=pubmed&term={query}&mindate=1800/01/01&maxdate={get_date(self)}&usehistory=y&retmode=json" 270 | else: 271 | search_url = f"https://eutils.ncbi.nlm.nih.gov/entrdefez/eutils/esearch.fcgi?db=pubmed&mindate=1800/01/01&maxdate={self.get_date()}&usehistory=y&retmode=json" 272 | 273 | search_r = requests.post(search_url, verify=False) 274 | search_data = search_r.json() 275 | 276 | webenv = search_data["esearchresult"]['webenv'] 277 | total_records = int(search_data["esearchresult"]['count']) 278 | 279 | return {'total_records': total_records, 280 | 'webenv': webenv, 281 | 'search_data': search_data} 282 | 283 | 284 | def fetch(self, query, 285 | max_documents=None): 286 | """function to do multi task -> fetch ids, based on ids fetch abstracts""" 287 | 288 | all_records = get_records(self, query) 289 | webenv = all_records['webenv'] 290 | all_rec = all_records['total_records'] 291 | 292 | if max_documents: 293 | all_rec = max_documents 294 | fetch_url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&api_key={self.key}&retmax={max_documents}&retmode=xml&query_key=1&webenv="+webenv 295 | else: 296 | fetch_url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&api_key={self.key}&retmax=9999&retmode=xml&query_key=1&webenv="+webenv 297 | 298 | print("-------------------------------------------\n") 299 | print(f" Fetching total documents -> {all_rec}..\n") 300 | print("-------------------------------------------\n") 301 | 302 | for i in tqdm(range(0, all_rec, 10000)): 303 | try: 304 | meta_data = {} 305 | u_id = str(uuid.uuid4()) 306 | payload = fetch_url+"&retstart="+str(i) 307 | 308 | print(f"Getting this URL: {payload} \n") 309 | 310 | fetch_r = requests.post(payload, verify=False) 311 | pre_name = f'{self.raw_abs_path}/pubmed_batch_{u_id}_{str(i)}_to_{str(i+all_rec)}.xml' 312 | 313 | f = open(pre_name, 'wb') 314 | f.write(fetch_r.content) 315 | f.close() 316 | 317 | meta_data['uid'] = u_id 318 | meta_data['query'] = query 319 | meta_data['url'] = payload 320 | meta_data['total'] = all_rec 321 | meta_data['iter'] = i 322 | 323 | write_json(self, self.meta_data_path, meta_data, u_id) 324 | 325 | except Exception as e: 326 | with 
open('exceptions', 'a') as f:
327 |                 f.write(f" fetch_exception {e} number {i} \n")
328 |             pass
329 | 
330 |     return 0
331 | 
-------------------------------------------------------------------------------- /setup.py: --------------------------------------------------------------------------------
1 | import setuptools
2 | 
3 | with open("README.md", "r") as fh:
4 |     long_description = fh.read()
5 | 
6 | install_requires = ["shutup",
7 |                     "numpy",
8 |                     "pandas",
9 |                     "requests",
10 |                     "tqdm",
11 |                     "bs4",
12 |                     "metapub",
13 |                     "scidownl",
14 |                     "pdfminer3",
15 |                     "pubmed_parser"]
16 | setuptools.setup(
17 |     name="pubmedflow",
18 |     version="0.0.2",
19 |     author="Aditya Ura",
20 |     author_email="aadityaura@gmail.com",
21 |     description="Data collection from PubMed made easy",
22 |     long_description=long_description,
23 |     long_description_content_type="text/markdown",
24 |     license='MIT License',
25 |     url="https://github.com/nfflow/pubmedflow",
26 |     install_requires=install_requires,
27 |     packages=setuptools.find_packages(),
28 |     python_requires='>=3.6',
29 |     include_package_data=True,
30 |     extras_require={"qa": ["nfmodelapis"]},
31 | )
32 | 
--------------------------------------------------------------------------------