├── src
    ├── __init__.py
    ├── fetch
    │   ├── __init__.py
    │   └── fetch.py
    ├── parse
    │   ├── __init__.py
    │   └── parse.py
    ├── version.py
    ├── providers
    │   ├── __init__.py
    │   ├── arxiv.py
    │   ├── scidb.py
    │   └── scihub.py
    └── papers_dl.py
├── test.sh
├── tests
    ├── documents
    │   ├── ids.txt
    │   ├── reyes-rendering.html
    │   ├── real-time-rendering.html
    │   ├── b-tree-techniques.html
    │   ├── bsp-tree.html
    │   ├── arxiv.html
    │   ├── superscalar-cisc.html
    │   ├── scihub.html
    │   └── scidb.html
    ├── __init__.py
    ├── test_fetch.py
    ├── test_cli.py
    └── test_parse.py
├── LICENSE
├── requirements.txt
├── pyproject.toml
├── README.md
└── .gitignore
/src/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | 
--------------------------------------------------------------------------------
/src/fetch/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | 
--------------------------------------------------------------------------------
/src/parse/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | 
--------------------------------------------------------------------------------
/src/version.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.0.25"
2 | 
--------------------------------------------------------------------------------
/test.sh:
--------------------------------------------------------------------------------
1 | python -m unittest discover
2 | 
--------------------------------------------------------------------------------
/tests/documents/ids.txt:
--------------------------------------------------------------------------------
1 | https://www.cell.com/current-biology/fulltext/S0960-9822(19)31469-1
2 | 10.1016/j.cub.2019.11.030
3 | 10.1107/s0907444905036693
4 | 
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | 
4 | sys.path.insert(
5 |     0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "src"))
6 | )
7 | 
--------------------------------------------------------------------------------
/src/providers/__init__.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | 
4 | sys.path.insert(
5 |     0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "fetch"))
6 | )
7 | 
--------------------------------------------------------------------------------
/tests/documents/reyes-rendering.html:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | 
4 | 
5 | 
6 | https://doi.org/10.1145/37402.37414 7 |
8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /src/providers/arxiv.py: -------------------------------------------------------------------------------- 1 | # from urllib.parse import urljoin 2 | 3 | # from loguru import logger 4 | from parse.parse import parse_ids_from_text 5 | 6 | 7 | async def get_url(identifier): 8 | is_arxiv = parse_ids_from_text(identifier, ["arxiv"]) 9 | if is_arxiv: 10 | pdf_url = f"https://arxiv.org/pdf/{identifier}.pdf" 11 | return pdf_url 12 | 13 | return None 14 | -------------------------------------------------------------------------------- /tests/documents/real-time-rendering.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |
https://doi.org/10.1201/9781315365459
6 | 9781315365459 7 | 8 | 9 | -------------------------------------------------------------------------------- /tests/documents/b-tree-techniques.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |
6 | Goetz Graefe (2011), "Modern B-Tree Techniques", Foundations and Trends® in Databases: Vol. 3: No. 4, pp 203-402. http://dx.doi.org/10.1561/1900000028 7 |
8 |
10.1561/1900000028
9 |
978-1-60198-482-1
10 |
978-1-60198-483-8
11 | 12 | 13 | -------------------------------------------------------------------------------- /tests/test_fetch.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import aiohttp 4 | import asyncio 5 | 6 | from src.providers.scihub import get_available_scihub_urls 7 | 8 | 9 | class TestSciHub(unittest.IsolatedAsyncioTestCase): 10 | async def test_scihub_up(self): 11 | """ 12 | Test to verify that `scihub.now.sh` is available 13 | """ 14 | urls = await get_available_scihub_urls() 15 | self.assertIsNotNone(urls, "Failed to find Sci-Hub domains") 16 | -------------------------------------------------------------------------------- /tests/documents/bsp-tree.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 8 | bsp-tree 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /tests/documents/arxiv.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Example Domain 5 | 6 | 7 | 8 | 9 | 10 |
11 |

arXiv:2407.13619

12 |

arXiv:1608.00878

13 |

arXiv:q-bio/0512009

14 |

arXiv:math/0601009

15 |

arXiv:hep-th/0512302

16 |

arXiv:cond-mat/0512295

17 |

arXiv:quant-ph/0511150

18 |

arXiv:1605.04938

19 |
20 | 21 | 22 | -------------------------------------------------------------------------------- /tests/documents/superscalar-cisc.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /tests/documents/scihub.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Parrots Voluntarily Help Each Other to Obtain Food Rewards | 10.1016/j.cub.2019.11.030_Science Hub 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 |
14 | 15 | 16 |
17 | 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /tests/documents/scidb.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /src/providers/scidb.py: -------------------------------------------------------------------------------- 1 | from urllib.parse import urljoin 2 | 3 | from loguru import logger 4 | from parse.parse import find_pdf_url, parse_ids_from_text 5 | 6 | 7 | async def get_url(session, identifier): 8 | base_url = "https://annas-archive.org/scidb/" 9 | # TODO: add support for .se and .li base_urls 10 | 11 | is_doi = parse_ids_from_text(identifier, ["doi"]) 12 | if is_doi: 13 | url = urljoin(base_url, identifier) 14 | logger.info("searching SciDB: {}", url) 15 | try: 16 | res = await session.get(url) 17 | except Exception as e: 18 | logger.error("Couldn't connect to SciDB: {}", e) 19 | return None 20 | pdf_url = find_pdf_url(await res.read()) 21 | if pdf_url is None: 22 | logger.info("No direct link to PDF found from SciDB") 23 | return pdf_url 24 | 25 | return None 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Ben Muthalaly 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.9.5 2 | aiosignal==1.3.1 3 | attrs==23.2.0 4 | beautifulsoup4==4.12.3 5 | bs4==0.0.2 6 | build==1.2.1 7 | certifi==2024.2.2 8 | cffi==1.16.0 9 | charset-normalizer==3.3.2 10 | cloudpickle==3.0.0 11 | cryptography==42.0.5 12 | docutils==0.21.1 13 | easygui==0.98.3 14 | feedparser==6.0.11 15 | frozenlist==1.4.1 16 | google==3.0.0 17 | idna==3.6 18 | importlib_metadata==7.1.0 19 | jaraco.classes==3.4.0 20 | jaraco.context==5.3.0 21 | jaraco.functools==4.0.0 22 | Jinja2==3.1.4 23 | keyring==25.1.0 24 | loguru==0.7.2 25 | markdown-it-py==3.0.0 26 | MarkupSafe==2.1.5 27 | mdurl==0.1.2 28 | more-itertools==10.2.0 29 | multidict==6.0.5 30 | nh3==0.2.17 31 | packaging==24.0 32 | pdf2doi==1.5.1 33 | pdfminer.six==20221105 34 | pdftitle==0.11 35 | pkginfo==1.10.0 36 | psutil==5.9.8 37 | pycparser==2.21 38 | Pygments==2.17.2 39 | PyMuPDF==1.23.26 40 | PyMuPDFb==1.23.22 41 | pynvml==11.5.0 42 | PyPDF2==2.0.0 43 | pyperclip==1.8.2 44 | pyproject_hooks==1.0.0 45 | readme_renderer==43.0 46 | requests==2.31.0 47 | requests-toolbelt==1.0.0 48 | retrying==1.3.4 49 | rfc3986==2.0.0 50 | rich==13.7.1 51 | sgmllib3k==1.0.0 52 | six==1.16.0 53 | soupsieve==2.5 54 | twine==5.0.0 55 | urllib3==2.2.1 56 | w3lib==2.1.2 57 | wheel==0.43.0 58 | yarl==1.9.4 59 | zipp==3.18.1 60 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "papers-dl" 7 | authors = [ 8 | { name="Ben Muthalaly", email="benmuthalaly@gmail.com" }, 9 | ] 10 | description = "A command line application for downloading scientific papers" 11 | readme = "README.md" 12 | requires-python = ">=3.8" 13 | dynamic=["version"] 14 | classifiers = [ 15 | "Programming Language :: Python :: 3", 16 | "License :: OSI Approved :: MIT License", 17 | "Operating System :: OS Independent", 18 | ] 19 | dependencies = [ 20 | "aiohttp==3.9.5", 21 | "beautifulsoup4==4.12.3", 22 | "bs4==0.0.2", 23 | "certifi==2024.2.2", 24 | "cffi>=1.16.0", 25 | "charset-normalizer==3.3.2", 26 | "cryptography==42.0.5", 27 | "easygui==0.98.3", 28 | "feedparser==6.0.11", 29 | "google==3.0.0", 30 | "idna==3.6", 31 | "loguru==0.7.2", 32 | "pdf2doi==1.5.1", 33 | "pdfminer.six==20221105", 34 | "pdftitle==0.11", 35 | "pycparser==2.21", 36 | "PyMuPDF>=1.23.26", 37 | "PyMuPDFb>=1.23.22", 38 | "PyPDF2==2.0.0", 39 | "pyperclip==1.8.2", 40 | "requests==2.31.0", 41 | "retrying==1.3.4", 42 | "sgmllib3k==1.0.0", 43 | "six==1.16.0", 44 | "soupsieve==2.5", 45 | "urllib3==2.2.1", 46 | "w3lib==2.1.2", 47 | ] 48 | 49 | [project.scripts] 50 | papers-dl = "papers_dl:main" 51 | 52 | [project.urls] 53 | Homepage = "https://github.com/benmuth/papers-dl" 54 | Issues = "https://github.com/benmuth/papers-dl/issues" 55 | 56 | [tool.setuptools.dynamic] 57 | version = {attr = "version.__version__"} 58 | -------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import subprocess 3 | import sys 4 | 5 | test_paper_id = "10.1016/j.cub.2019.11.030" 6 | test_paper_title = "Parrots Voluntarily Help Each Other to Obtain Food Rewards" 7 | 
8 | 9 | class TestCLI(unittest.TestCase): 10 | def test_parse_command_doi_csv(self): 11 | result = subprocess.run( 12 | [ 13 | sys.executable, 14 | "src/papers_dl.py", 15 | "parse", 16 | "-m", 17 | "doi", 18 | "-p", 19 | "tests/documents/bsp-tree.html", 20 | "-f", 21 | "csv", 22 | ], 23 | capture_output=True, 24 | text=True, 25 | ) 26 | self.assertIn("10.1109/83.544569,doi", result.stdout) 27 | 28 | def test_parse_command_doi_jsonl(self): 29 | result = subprocess.run( 30 | [ 31 | sys.executable, 32 | "src/papers_dl.py", 33 | "parse", 34 | "-m", 35 | "doi", 36 | "-f", 37 | "jsonl", 38 | "-p", 39 | "tests/documents/bsp-tree.html", 40 | ], 41 | capture_output=True, 42 | text=True, 43 | ) 44 | self.assertIn('{"id": "10.1109/83.544569", "type": "doi"}', result.stdout) 45 | 46 | def test_parse_command_isbn_raw(self): 47 | result = subprocess.run( 48 | [ 49 | sys.executable, 50 | "src/papers_dl.py", 51 | "parse", 52 | "-m", 53 | "isbn", 54 | "-f", 55 | "raw", 56 | "-p", 57 | "tests/documents/b-tree-techniques.html", 58 | ], 59 | capture_output=True, 60 | text=True, 61 | ) 62 | self.assertIn("978-1-60198-482-1", result.stdout) 63 | self.assertIn("978-1-60198-483-8", result.stdout) 64 | 65 | def test_parse_command_cli(self): 66 | args = [ 67 | sys.executable, 68 | "src/papers_dl.py", 69 | "parse", 70 | "-f", 71 | "jsonl", 72 | "-m", 73 | "isbn", 74 | ] 75 | 76 | input_data = "978-1-60198-482-1 978-1-60198-483-8" 77 | 78 | result = subprocess.run(args, input=input_data, capture_output=True, text=True) 79 | self.assertIn('{"id": "978-1-60198-482-1", "type": "isbn"}', result.stdout) 80 | self.assertIn('{"id": "978-1-60198-483-8", "type": "isbn"}', result.stdout) 81 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ### Overview 2 | `papers-dl` is a command line application for downloading scientific papers. 3 | 4 | ### Installation 5 | ```shell 6 | # install with uv 7 | uv tool install papers-dl 8 | 9 | # install with pip 10 | pip install papers-dl 11 | ``` 12 | 13 | ### Usage 14 | ```shell 15 | # parse DOI identifiers from a file: 16 | papers-dl parse -m doi --path pages/my-paper.html 17 | 18 | # parse ISBN identifiers from a file, output matches as CSV: 19 | papers-dl parse -m isbn --path pages/my-paper.html -f csv 20 | 21 | # fetch paper with given identifier from any known provider: 22 | papers-dl fetch "10.1016/j.cub.2019.11.030" 23 | 24 | # fetch paper from any known Sci-Hub URL with verbose logging on, and store in "papers" directory: 25 | papers-dl -v fetch -p "scihub" -o "papers" "10.1107/s0907444905036693" 26 | 27 | # fetch paper from specific Sci-Hub URL: 28 | papers-dl fetch -p "sci-hub.ee" "10.1107/s0907444905036693" 29 | 30 | # fetch paper from SciDB (Anna's Archive): 31 | papers-dl fetch -p "scidb" "10.1107/s0907444905036693" 32 | ``` 33 | 34 | ### About 35 | 36 | `papers-dl` attempts to be a comprehensive tool for gathering research papers from popular open libraries. There are other solutions for this (see "Other tools" below), but `papers-dl` is trying to fill its own niche: 37 | 38 | - comprehensive: other tools usually work with a single library, while `papers-dl` is trying to support a collection of popular libraries. 39 | - performant: `papers-dl` tries to improve search and retrieval times by making use of concurrency where possible. 40 | 41 | That said, `papers-dl` may not be the best choice for your specific use case right now. 
For example, if you require features supported by a specific library, one of the more mature and specialized tools listed below may be a better option. 42 | 43 | `papers-dl` was initially created to serve as an extractor for [ArchiveBox](https://archivebox.io), a powerful solution for self-hosted web archiving. 44 | 45 | This project started as a fork of [scihub.py](https://github.com/zaytoun/scihub.py). 46 | 47 | ### Other tools 48 | 49 | - [Scidownl](https://pypi.org/project/scidownl/) 50 | - [arxiv-dl](https://pypi.org/project/arxiv-dl/) 51 | - [Anna's Archive API](https://github.com/dheison0/annas-archive-api) 52 | 53 | ### Roadmap 54 | 55 | `papers-dl`'s CLI is not yet stable. 56 | 57 | Short-term roadmap: 58 | 59 | **parsing** 60 | - add support for parsing more identifier types like PMID and ISSN 61 | 62 | **fetching** 63 | - add support for downloading formats other than PDFs, like HTML or epub 64 | 65 | **searching** 66 | - add a CLI command for searching libraries for papers and metadata 67 | 68 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 | 
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 | 
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 | 
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 | 
119 | # SageMath parsed files
120 | *.sage.py
121 | 
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 | 
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 | 
135 | # Rope project settings
136 | .ropeproject
137 | 
138 | # mkdocs documentation
139 | /site
140 | 
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 | 
146 | # Pyre type checker
147 | .pyre/
148 | 
149 | # pytype static type analyzer
150 | .pytype/
151 | 
152 | # Cython debug symbols
153 | cython_debug/
154 | 
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 | .DS_Store
162 | output
163 | *.pdf
164 | 
--------------------------------------------------------------------------------
/src/providers/scihub.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import enum
3 | import re
4 | from urllib.parse import urljoin
5 | 
6 | import aiohttp
7 | from bs4 import BeautifulSoup
8 | from loguru import logger
9 | from parse.parse import find_pdf_url
10 | 
11 | # URL-DIRECT - openly accessible paper
12 | # URL-NON-DIRECT - pay-walled paper
13 | # PMID - PubMed ID
14 | # DOI - digital object identifier
15 | IDClass = enum.Enum("identifier", ["URL-DIRECT", "URL-NON-DIRECT", "PMID", "DOI"])
16 | 
17 | DEFAULT_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.3 Safari/605.1.15"
18 | 
19 | 
20 | class IdentifierNotFoundError(Exception):
21 |     pass
22 | 
23 | 
24 | async def get_available_scihub_urls() -> list[str]:
25 |     """
26 |     Finds available Sci-Hub urls via https://sci-hub.now.sh/
27 |     """
28 | 
29 |     # NOTE: This misses some valid URLs. Alternatively, we could parse
30 |     # the HTML more finely by navigating the parsed DOM, instead of relying
31 |     # on filtering. That might be more brittle in case the HTML changes.
32 |     # Generally, we don't need to get all URLs.
33 | scihub_domain = re.compile(r"^http[s]*://sci.hub", flags=re.IGNORECASE) 34 | urls = [] 35 | 36 | try: 37 | async with aiohttp.request("GET", "https://sci-hub.now.sh/") as res: 38 | s = BeautifulSoup(await res.text(), "html.parser") 39 | except Exception as e: 40 | logger.info("Couldn't find Sci-Hub URLs: {}", e) 41 | return [] 42 | 43 | text_matches = s.find_all( 44 | "a", 45 | href=True, 46 | string=re.compile(scihub_domain), 47 | ) 48 | 49 | href_matches = s.find_all( 50 | "a", 51 | re.compile(scihub_domain), 52 | href=True, 53 | ) 54 | 55 | full_match_set = set(text_matches) | set(href_matches) 56 | for a in full_match_set: 57 | if "sci" in a or "sci" in a["href"]: 58 | urls.append(a["href"]) 59 | 60 | return urls 61 | 62 | 63 | async def get_direct_urls( 64 | session, 65 | identifier: str, 66 | base_urls: list[str] | None = None, 67 | ) -> list[str]: 68 | """ 69 | Finds the direct source url for a given identifier. 70 | """ 71 | 72 | if base_urls is None: 73 | base_urls = await get_available_scihub_urls() 74 | 75 | logger.info("searching Sci-Hub urls: {}", base_urls) 76 | 77 | # catch exceptions so that they don't cancel the task group 78 | async def get_wrapper(url): 79 | try: 80 | return await session.get(url) 81 | except Exception as e: 82 | logger.info("Couldn't connect to {}: {}", url, e) 83 | return None 84 | 85 | if classify(identifier) == IDClass["URL-DIRECT"]: 86 | return [identifier] 87 | 88 | async with asyncio.TaskGroup() as tg: 89 | tasks = [ 90 | tg.create_task(get_wrapper(urljoin(base_url, identifier))) 91 | for base_url in base_urls 92 | ] 93 | 94 | direct_urls = [] 95 | try: 96 | for task in tasks: 97 | res = await task 98 | if res is None: 99 | continue 100 | path = find_pdf_url(await res.text()) 101 | if isinstance(path, list): 102 | path = path[0] 103 | if isinstance(path, str) and path.startswith("//"): 104 | direct_urls.append("https:" + path) 105 | elif isinstance(path, str) and path.startswith("/"): 106 | direct_urls.append(urljoin(res.url.human_repr(), path)) 107 | 108 | except Exception as err: 109 | logger.error("Error while looking for PDF urls: {}", err) 110 | 111 | if not direct_urls: 112 | logger.info("No direct link to PDF found from Sci-Hub") 113 | 114 | return list(set(direct_urls)) 115 | 116 | 117 | def classify(identifier) -> IDClass: 118 | """ 119 | Classify the type of identifier: 120 | url-direct - openly accessible paper 121 | url-non-direct - pay-walled paper 122 | pmid - PubMed ID 123 | doi - digital object identifier 124 | """ 125 | if identifier.startswith("http") or identifier.startswith("https"): 126 | if identifier.endswith("pdf"): 127 | return IDClass["URL-DIRECT"] 128 | else: 129 | return IDClass["URL-NON-DIRECT"] 130 | elif identifier.isdigit(): 131 | return IDClass["PMID"] 132 | else: 133 | return IDClass["DOI"] 134 | -------------------------------------------------------------------------------- /src/papers_dl.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import asyncio 3 | import os 4 | import sys 5 | 6 | import aiohttp 7 | from loguru import logger 8 | from fetch import fetch 9 | from parse.parse import format_output, id_patterns, parse_file, parse_ids_from_text 10 | 11 | 12 | async def fetch_paper(args) -> str: 13 | providers = args.providers 14 | id = args.query 15 | out = args.output 16 | 17 | headers = None 18 | if args.user_agent is not None: 19 | headers = { 20 | "User-Agent": args.user_agent, 21 | } 22 | 23 | async with aiohttp.ClientSession(headers=headers) 
as sess: 24 | result = await fetch.fetch(sess, id, providers) 25 | 26 | if result is None: 27 | return None 28 | 29 | pdf_content, url = result 30 | 31 | path = os.path.join(out, fetch.generate_name(pdf_content)) 32 | fetch.save(pdf_content, path) 33 | new_path = fetch.rename(out, path) 34 | return f"Successfully downloaded paper from {url}.\n Saved to {new_path}" 35 | 36 | 37 | def parse_ids(args) -> str: 38 | output = None 39 | if hasattr(args, "path") and args.path: 40 | output = parse_file(args.path, args.match) 41 | else: 42 | # if a path isn't passed or is empty, read from stdin 43 | output = parse_ids_from_text(sys.stdin.read(), args.match) 44 | return format_output(output, args.format) 45 | 46 | 47 | async def run(): 48 | name = "papers-dl" 49 | parser = argparse.ArgumentParser( 50 | prog=name, 51 | description="Download scientific papers from the command line", 52 | ) 53 | 54 | from version import __version__ 55 | 56 | parser.add_argument( 57 | "--version", "-V", action="version", version=f"{name} {__version__}" 58 | ) 59 | 60 | parser.add_argument( 61 | "--verbose", "-v", action="store_true", help="increase verbosity" 62 | ) 63 | 64 | subparsers = parser.add_subparsers() 65 | 66 | # FETCH 67 | parser_fetch = subparsers.add_parser( 68 | "fetch", help="try to download a paper with the given identifier" 69 | ) 70 | 71 | parser_fetch.add_argument( 72 | "query", 73 | metavar="(DOI|PMID|URL)", 74 | type=str, 75 | help="the identifier to try to download", 76 | ) 77 | 78 | parser_fetch.add_argument( 79 | "-o", 80 | "--output", 81 | metavar="path", 82 | help="optional output directory for downloaded papers", 83 | default=".", 84 | type=str, 85 | ) 86 | 87 | parser_fetch.add_argument( 88 | "-p", 89 | "--providers", 90 | help="comma separated list of providers to try fetching from", 91 | default="all", 92 | type=str, 93 | ) 94 | 95 | parser_fetch.add_argument( 96 | "-A", 97 | "--user-agent", 98 | help="", 99 | default=None, 100 | type=str, 101 | ) 102 | 103 | # PARSE 104 | parser_parse = subparsers.add_parser( 105 | "parse", help="parse identifiers from a file or stdin" 106 | ) 107 | parser_parse.add_argument( 108 | "-m", 109 | "--match", 110 | metavar="type", 111 | help="the type of identifier to search for", 112 | type=str, 113 | choices=id_patterns.keys(), 114 | action="append", 115 | ) 116 | parser_parse.add_argument( 117 | "-p", 118 | "--path", 119 | help="the path of the file to parse", 120 | type=str, 121 | ) 122 | parser_parse.add_argument( 123 | "-f", 124 | "--format", 125 | help="the output format for printing", 126 | metavar="fmt", 127 | default="raw", 128 | choices=["raw", "jsonl", "csv"], 129 | nargs="?", 130 | ) 131 | 132 | parser_fetch.set_defaults(func=fetch_paper) 133 | parser_parse.set_defaults(func=parse_ids) 134 | 135 | args = parser.parse_args() 136 | 137 | logger.remove(0) 138 | if args.verbose: 139 | logger.add(sys.stderr, level="INFO", enqueue=True, format="{message}") 140 | else: 141 | logger.add(sys.stderr, level="ERROR", enqueue=True, format="{message}") 142 | 143 | if hasattr(args, "func"): 144 | if asyncio.iscoroutinefunction(args.func): 145 | result = await args.func(args) 146 | else: 147 | result = args.func(args) 148 | 149 | if result: 150 | print(result) 151 | else: 152 | # TODO: change this to be more general 153 | print("No papers found") 154 | else: 155 | parser.print_help() 156 | 157 | 158 | def main(): 159 | asyncio.run(run()) 160 | 161 | 162 | if __name__ == "__main__": 163 | asyncio.run(run()) 164 | 
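# Note: main() is the entry point that pyproject.toml exposes as the "papers-dl" console script.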
-------------------------------------------------------------------------------- /tests/test_parse.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | from parse import parse 5 | 6 | target_ids = ("doi", "pmid", "isbn", "issn", "url", "arxiv") 7 | 8 | 9 | class TestParser(unittest.TestCase): 10 | @classmethod 11 | def setUpClass(cls): 12 | cls.test_material_dir = "tests/documents" 13 | cls.valid_id_types = parse.id_patterns.keys() 14 | for id_type in target_ids: 15 | if id_type not in cls.valid_id_types: 16 | print(f"Skipping testing for {id_type} parsing") 17 | 18 | def test_parse_text_ids(self): 19 | "Test to parse identifiers from a set of files." 20 | 21 | # NOTE: this test does not fail on false positive matches 22 | # for file in test_document_ids: 23 | for file in test_document_ids: 24 | print(f"testing {file}") 25 | with open(os.path.join(TestParser.test_material_dir, file)) as f: 26 | file_content = f.read() 27 | 28 | parsed_results = parse.parse_ids_from_text(file_content) 29 | 30 | # just include the matching id, not the type 31 | parsed_results = [result["id"] for result in parsed_results] 32 | 33 | expected_ids = [] 34 | for type in test_document_ids[file]: 35 | if type in parse.id_patterns: 36 | for id in test_document_ids[file][type]: 37 | expected_ids.append(id) 38 | 39 | if not expected_ids: 40 | print("No expected IDs for this file") 41 | continue 42 | 43 | for expected_id in expected_ids: 44 | self.assertIn( 45 | expected_id, 46 | parsed_results, 47 | f"ID {expected_id} not found in {file}", 48 | ) 49 | 50 | def test_parse_text_pdfs(self): 51 | for file, expected_url in test_document_links: 52 | with open(os.path.join(TestParser.test_material_dir, file), "rt") as f: 53 | html_content = f.read() 54 | pdf_url = parse.find_pdf_url(html_content) 55 | self.assertEqual(pdf_url, expected_url) 56 | 57 | 58 | test_document_ids = { 59 | "ids.txt": { 60 | "url": [ 61 | "https://www.cell.com/current-biology/fulltext/S0960-9822(19)31469-1", 62 | ], 63 | "doi": [ 64 | "10.1016/j.cub.2019.11.030", 65 | "10.1107/s0907444905036693", 66 | ], 67 | }, 68 | "bsp-tree.html": { 69 | "doi": [ 70 | "10.1109/83.544569", 71 | ], 72 | "issn": [ 73 | "1057-7149", 74 | "1941-0042", 75 | ], 76 | }, 77 | "reyes-rendering.html": { 78 | "doi": [ 79 | "10.1145/37402.37414", 80 | ], 81 | }, 82 | "superscalar-cisc.html": { 83 | "doi": [ 84 | "10.1109/HPCA.2006.1598111", 85 | ], 86 | "issn": [ 87 | "1530-0897", 88 | "2378-203X", 89 | ], 90 | }, 91 | "b-tree-techniques.html": { 92 | "doi": [ 93 | "10.1561/1900000028", 94 | ], 95 | "url": [ 96 | "http://dx.doi.org/10.1561/1900000028", 97 | ], 98 | "isbn": [ 99 | "978-1-60198-482-1", 100 | "978-1-60198-483-8", 101 | ], 102 | }, 103 | "real-time-rendering.html": { 104 | "url": [ 105 | "https://doi.org/10.1201/9781315365459", 106 | ], 107 | "isbn": [ 108 | "9781315365459", 109 | ], 110 | }, 111 | "arxiv.html": { 112 | "url": [ 113 | "https://arxiv.org/abs/1605.04938", 114 | ], 115 | "arxiv": [ 116 | # identifiers after March 2007 117 | "arXiv:2407.13619", 118 | "arXiv:1608.00878", 119 | "arXiv:1605.04938", 120 | # identifiers before March 2007 121 | "arXiv:q-bio/0512009", 122 | "arXiv:math/0601009", 123 | "arXiv:hep-th/0512302", 124 | "arXiv:cond-mat/0512295", 125 | "arXiv:quant-ph/0511150", 126 | ], 127 | }, 128 | } 129 | 130 | test_document_links = [ 131 | ( 132 | "scidb.html", 133 | 
"https://wbsg8v.xyz/d3/x/1719017408/134/i/scimag/80500000/80542000/10.1016/j.cub.2019.11.030.pdf~/Avtp6y0GwksOGlfLFy9d9Q/Parrots%20Voluntarily%20Help%20Each%20Other%20to%20Obtain%20Food%20Rewards%20--%20Brucks%2C%20D%C3%A9sir%C3%A9e%3B%20von%20Bayern%2C%20Auguste%20M_P_%20--%20Current%20Biology%2C%20%232%2C%2030%2C%20pages%20292-297_e5%2C%20--%2010_1016%2Fj_cub_2019_11_030%20--%20c28dc1242df6f931c29b9cd445a55597%20--%20Anna%E2%80%99s%20Archive.pdf", 134 | ), 135 | ("scihub.html", "https://sci.bban.top/pdf/10.1016/j.cub.2019.11.030.pdf"), 136 | ] 137 | -------------------------------------------------------------------------------- /src/fetch/fetch.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import hashlib 3 | import json 4 | import os 5 | from typing import Iterable 6 | 7 | import aiohttp 8 | import pdf2doi 9 | import providers.scidb as scidb 10 | import providers.scihub as scihub 11 | import providers.arxiv as arxiv 12 | from loguru import logger 13 | 14 | all_providers = ["scihub", "scidb", "arxiv"] 15 | 16 | 17 | def match_available_providers( 18 | providers, available_providers: Iterable[str] | None = None 19 | ) -> list[str]: 20 | "Find the providers that are included in available_providers" 21 | if not available_providers: 22 | available_providers = all_providers 23 | matching_providers = [] 24 | for provider in providers: 25 | for available_provider in available_providers: 26 | # a user-supplied provider might be a substring of a supported 27 | # provider (e.g. sci-hub.ee instead of https://sci-hub.ee) 28 | if provider in available_provider: 29 | matching_providers.append(available_provider) 30 | return matching_providers 31 | 32 | 33 | async def get_urls(session, identifier, providers): 34 | urls = [] 35 | if providers == "all": 36 | urls.append(await scidb.get_url(session, identifier)) 37 | urls.extend(await scihub.get_direct_urls(session, identifier)) 38 | urls.append(await arxiv.get_url(identifier)) 39 | return urls 40 | 41 | providers = [provider.strip() for provider in providers.split(",")] 42 | logger.info(f"given providers: {providers}") 43 | 44 | matching_providers = match_available_providers(providers) 45 | logger.info(f"matching providers: {matching_providers}") 46 | for mp in matching_providers: 47 | if mp == "scihub": 48 | urls.extend(await scihub.get_direct_urls(session, identifier)) 49 | if mp == "scidb": 50 | urls.append(await scidb.get_url(session, identifier)) 51 | if mp == "arxiv": 52 | urls.append(await arxiv.get_url(identifier)) 53 | 54 | # if the catch-all "scihub" provider isn't given, we look for 55 | # specific Sci-Hub urls. 
if we find specific Sci-Hub URLs in the 56 | # user input, we only use those 57 | if "scihub" not in providers: 58 | matching_scihub_urls = match_available_providers( 59 | providers, await scihub.get_available_scihub_urls() 60 | ) 61 | logger.info(f"matching scihub urls: {matching_scihub_urls}") 62 | if len(matching_scihub_urls) > 0: 63 | urls.extend( 64 | await scihub.get_direct_urls( 65 | session, identifier, base_urls=matching_scihub_urls 66 | ) 67 | ) 68 | 69 | return urls 70 | 71 | 72 | async def fetch(session, identifier, providers) -> tuple | None: 73 | # catch exceptions so that they don't cancel the task group 74 | async def get_wrapper(url): 75 | try: 76 | return await session.get(url) 77 | except Exception as e: 78 | logger.error("error: {}", e) 79 | return None 80 | 81 | urls = await get_urls(session, identifier, providers) 82 | 83 | urls = [url for url in urls if url is not None] 84 | 85 | if len(urls) > 0: 86 | logger.info("PDF urls: {}", "\n".join(urls)) 87 | tasks = [get_wrapper(url) for url in urls if url] 88 | for item in zip(asyncio.as_completed(tasks), urls): 89 | res = await item[0] 90 | if res is None or res.content_type != "application/pdf": 91 | logger.info("couldn't find url at {}", item[1]) 92 | continue 93 | return (await res.read(), item[1]) 94 | return None 95 | 96 | 97 | def save(data, path): 98 | """ 99 | Save a file give data and a path. 100 | """ 101 | try: 102 | logger.info(f"Saving file to {path}") 103 | 104 | with open(path, "wb") as f: 105 | f.write(data) 106 | except Exception as e: 107 | logger.error(f"Failed to write to {path} {e}") 108 | raise e 109 | 110 | 111 | def generate_name(content): 112 | "Generate unique filename for paper" 113 | 114 | pdf_hash = hashlib.md5(content).hexdigest() 115 | return f"{pdf_hash}" + ".pdf" 116 | 117 | 118 | def rename(out_dir, path, name=None) -> str: 119 | """ 120 | Renames a PDF to either the given name or its appropriate title, if 121 | possible. Adds the PDF extension. Returns the new path if renaming was 122 | successful, or the original path if not. 123 | """ 124 | 125 | logger.info("Finding paper title") 126 | pdf2doi.config.set("verbose", False) 127 | 128 | try: 129 | if name is None: 130 | result_info = pdf2doi.pdf2doi(path) 131 | if not result_info: 132 | return path 133 | raw_validation_info = result_info["validation_info"] 134 | if isinstance(raw_validation_info, (str, bytes, bytearray)): 135 | validation_info = json.loads(raw_validation_info) 136 | else: 137 | validation_info = raw_validation_info 138 | name = validation_info.get("title") 139 | 140 | if name: 141 | name += ".pdf" 142 | new_path = os.path.join(out_dir, name) 143 | os.rename(path, new_path) 144 | logger.info(f"File renamed to {new_path}") 145 | return new_path 146 | else: 147 | return path 148 | except Exception as e: 149 | logger.error(f"Couldn't get paper title from PDF at {path}: {e}") 150 | return path 151 | -------------------------------------------------------------------------------- /src/parse/parse.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | 4 | from bs4 import BeautifulSoup 5 | from loguru import logger 6 | 7 | 8 | # from https://isbn-checker.netlify.app 9 | def valid_isbn(subject): 10 | "Check if the subject is a valid ISBN" 11 | 12 | isbn_regex = re.compile( 13 | r"^(?:ISBN(?:-1[03])?:? 
)?(?=[0-9X]{10}$|(?=(?:[0-9]+[- ]){3})[- 0-9X]{13}$|97[89][0-9]{10}$|(?=(?:[0-9]+[- ]){4})[- 0-9]{17}$)(?:97[89][- ]?)?[0-9]{1,5}[- ]?[0-9]+[- ]?[0-9]+[- ]?[0-9X]$" 14 | ) 15 | 16 | # Check if the subject matches the ISBN pattern 17 | if isbn_regex.match(subject): 18 | chars = re.sub(r"[- ]|^ISBN(?:-1[03])?:?", "", subject) 19 | chars = list(chars) 20 | last = chars.pop() 21 | sum = 0 22 | check = 0 23 | 24 | if len(chars) == 9: 25 | chars.reverse() 26 | for i in range(len(chars)): 27 | sum += (i + 2) * int(chars[i]) 28 | check = 11 - (sum % 11) 29 | if check == 10: 30 | check = "X" 31 | elif check == 11: 32 | check = "0" 33 | else: 34 | for i in range(len(chars)): 35 | sum += (i % 2 * 2 + 1) * int(chars[i]) 36 | check = 10 - (sum % 10) 37 | if check == 10: 38 | check = "0" 39 | 40 | if str(check) == last: 41 | return True 42 | else: 43 | return False 44 | else: 45 | return False 46 | 47 | 48 | # these are the currently supported identifier types that we can parse, along 49 | # with their regex patterns 50 | id_patterns = { 51 | # These come from https://gist.github.com/oscarmorrison/3744fa216dcfdb3d0bcb 52 | "isbn": [ 53 | r"(?:ISBN(?:-10)?:?\ )?(?=[0-9X]{10}|(?=(?:[0-9]+[-\ ]){3})[-\ 0-9X]{13})[0-9]{1,5}[-\ ]?[0-9]+[-\ ]?[0-9]+[-\ ]?[0-9X]", 54 | r"(?:ISBN(?:-13)?:?\ )?(?=[0-9]{13}|(?=(?:[0-9]+[-\ ]){4})[-\ 0-9]{17})97[89][-\ ]?[0-9]{1,5}[-\ ]?[0-9]+[-\ ]?[0-9]+[-\ ]?[0-9]", 55 | ], 56 | # doi regexes taken from https://www.crossref.org/blog/dois-and-matching-regular-expressions/ 57 | # listed in decreasing order of goodness. Not fully tested yet. 58 | "doi": [ 59 | r"10.\d{4,9}\/[-._;()\/:A-Z0-9]+", 60 | r"10.1002\/[^\s]+", 61 | r"10.\d{4}\/\d+-\d+X?(\d+)\d+<[\d\w]+:[\d\w]*>\d+.\d+.\w+;\d", 62 | r"10.1021\/\w\w\d++", 63 | r"10.1207/[\w\d]+\&\d+_\d+", 64 | ], 65 | # arXiv ids: https://info.arxiv.org/help/arxiv_identifier.html 66 | "arxiv": [ 67 | # identifiers since March 2007 68 | r"arXiv:\d{4}\.\d{4,5}(v\d+)?", 69 | # identifiers before March 2007 70 | r"arXiv:[A-Za-z-]{3,10}(\.[A-Z]{2})?\/\d{4,8}", 71 | ], 72 | } 73 | 74 | # these can eliminate false positives 75 | # TODO: remove duplication of validation logic and parsing logic 76 | id_validators = { 77 | "isbn": valid_isbn, 78 | } 79 | 80 | 81 | def find_pdf_url(html_content) -> str | None: 82 | "Given HTML content, find an embedded link to a PDF." 83 | 84 | s = BeautifulSoup(html_content, "html.parser") 85 | 86 | # look for a dynamically loaded PDF 87 | script_element = s.find("script", string=re.compile("PDFObject.embed")) 88 | 89 | if script_element: 90 | match = re.search(r'PDFObject\.embed\("([^"]+)"', script_element.string) 91 | if match: 92 | return match.group(1) 93 | 94 | # look for the "" element (scihub) 95 | embed_element = s.find("embed", {"id": "pdf", "type": "application/pdf"}) 96 | 97 | if embed_element: 98 | direct_url = embed_element["src"] 99 | if isinstance(direct_url, list): 100 | direct_url = direct_url[0] 101 | if direct_url: 102 | return direct_url 103 | 104 | # look for an iframe 105 | iframe = s.find("iframe", {"type": "application/pdf"}) 106 | 107 | if iframe: 108 | logger.info(f"found iframe: {iframe}") 109 | direct_url = iframe.get("src") 110 | if isinstance(direct_url, list): 111 | direct_url = direct_url[0] 112 | if direct_url: 113 | return direct_url 114 | 115 | return None 116 | 117 | 118 | def parse_ids_from_text( 119 | s: str, id_types: list[str] | None = None 120 | ) -> list[dict[str, str]]: 121 | """ 122 | Find all matches for the given id types in a string. 
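Each match is returned as a dict with "id" and "type" keys, e.g. {"id": "10.1016/j.cub.2019.11.030", "type": "doi"}.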
If id_types isn't 123 | given, it will parse all the types in id_patterns by default. 124 | """ 125 | 126 | # we look for all ID patterns by default 127 | if id_types is None: 128 | id_types = list(id_patterns) 129 | 130 | seen = set() 131 | matches = [] 132 | for id_type in id_types: 133 | validator = id_validators.get(id_type) 134 | for regex in id_patterns[id_type]: 135 | for match in re.finditer(regex, s, re.IGNORECASE): 136 | mg = match.group() 137 | valid_id = validator(mg) if validator else True 138 | if mg not in seen and valid_id: 139 | matches.append({"id": mg, "type": id_type}) 140 | seen.add(mg) 141 | return matches 142 | 143 | 144 | def parse_file(path, id_types: list[str] | None = None): 145 | """ 146 | Find all matches for the given id types in a file. If id_types isn't given, 147 | defaults to the types in id_patterns. 148 | """ 149 | 150 | matches = [] 151 | try: 152 | with open(path) as f: 153 | content = f.read() 154 | matches = parse_ids_from_text(content, id_types) 155 | except Exception as e: 156 | print(f"Error: {e}") 157 | 158 | return matches 159 | 160 | 161 | def format_output(output: list[dict[str, str]], format: str = "raw") -> str: 162 | """ 163 | Formats a list of dicts of ids and id types into a string according to the 164 | given format type. 'raw' formats ids by line, ignoring type. 'jsonl' and 165 | 'csv' formats ids and types. 166 | """ 167 | 168 | lines: list[str] = [] 169 | if format == "raw": 170 | lines = [line["id"] for line in output] 171 | elif format == "jsonl": 172 | lines = [json.dumps(line) for line in output] 173 | elif format == "csv": 174 | lines = [f"{line['id']},{line['type']}" for line in output] 175 | else: 176 | raise Exception(f"invalid format {format}") 177 | return "\n".join(lines) 178 | --------------------------------------------------------------------------------