├── docs ├── _static │ └── .gitignore ├── guide.rst ├── license.rst ├── requirements.txt ├── notebook │ ├── clean.sh │ ├── to-rst.sh │ ├── server.sh │ └── correct_nbconvert.py ├── index.rst ├── Makefile ├── api.rst ├── make.bat ├── conf.py └── cli.rst ├── dev ├── test_data │ ├── map │ │ ├── empty.json │ │ ├── pubchem.json │ │ └── module.json │ ├── module-entry-ids.txt │ ├── glycan-pubchem-entry-ids.txt │ ├── pathway-module-entry-ids.txt │ ├── ddi-output.txt │ ├── brite-entries │ │ ├── pull-results.json │ │ ├── br_br08902.txt │ │ └── br_br08005.txt │ ├── all-brite-entry-ids.txt │ └── drug-entry-ids.txt ├── pytest.ini ├── test.sh ├── install.sh ├── README.md ├── conftest.py ├── test_utils.py ├── test_entry_ids_cli.py ├── test_pathway_organizer_cli.py ├── test_entry_ids.py ├── test_map_cli.py ├── test_pathway_organizer.py ├── utils.py ├── test_pull_cli.py ├── test_rest_cli.py ├── test_main.py ├── test_rest.py ├── test_kegg_url.py └── test_map.py ├── requirements.txt ├── .gitignore ├── src └── kegg_pull │ ├── __init__.py │ ├── __main__.py │ ├── entry_ids_cli.py │ ├── pathway_organizer_cli.py │ ├── map_cli.py │ ├── entry_ids.py │ ├── _utils.py │ ├── pull_cli.py │ ├── rest_cli.py │ ├── pathway_organizer.py │ └── rest.py ├── CITATION.cff ├── .github └── workflows │ ├── main.yml │ └── build_documentation.yml ├── setup.py ├── LICENSE └── README.rst /docs/_static/.gitignore: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dev/test_data/map/empty.json: -------------------------------------------------------------------------------- 1 | {} -------------------------------------------------------------------------------- /dev/test_data/module-entry-ids.txt: -------------------------------------------------------------------------------- 1 | md:M00050 2 | md:M00959 -------------------------------------------------------------------------------- /requirements.txt: 
-------------------------------------------------------------------------------- 1 | docopt 2 | requests 3 | tqdm 4 | jsonschema 5 | -------------------------------------------------------------------------------- /docs/guide.rst: -------------------------------------------------------------------------------- 1 | Guide 2 | ===== 3 | 4 | .. include:: ../README.rst 5 | -------------------------------------------------------------------------------- /docs/license.rst: -------------------------------------------------------------------------------- 1 | License 2 | ======= 3 | 4 | .. include:: ../LICENSE 5 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx>=5.3.0 2 | sphinx_rtd_theme 3 | tqdm 4 | jsonschema 5 | -------------------------------------------------------------------------------- /dev/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | markers = 3 | disable_mock_organism_set: Disable mocking the AbstractKEGGurl._get_organism_set method 4 | -------------------------------------------------------------------------------- /dev/test_data/glycan-pubchem-entry-ids.txt: -------------------------------------------------------------------------------- 1 | gl:G13143 pubchem:405226698 2 | gl:G13141 pubchem:405226697 3 | gl:G13139 pubchem:405226696 4 | -------------------------------------------------------------------------------- /dev/test_data/map/pubchem.json: -------------------------------------------------------------------------------- 1 | { 2 | "cpd:C00001": [ 3 | "pubchem:3303" 4 | ], 5 | "cpd:C00002": [ 6 | "pubchem:3304" 7 | ] 8 | } -------------------------------------------------------------------------------- /dev/test_data/map/module.json: -------------------------------------------------------------------------------- 1 | { 2 | "md:M00965": [ 3 | 
"ko:K12696", 4 | "ko:K22365" 5 | ], 6 | "md:M00962": [ 7 | "ko:K22435" 8 | ] 9 | } -------------------------------------------------------------------------------- /dev/test_data/pathway-module-entry-ids.txt: -------------------------------------------------------------------------------- 1 | md:M00575 path:map05133 2 | md:M00574 path:map05133 3 | md:M00363 path:map05130 4 | md:M00363 path:map05131 5 | -------------------------------------------------------------------------------- /dev/test_data/ddi-output.txt: -------------------------------------------------------------------------------- 1 | D00100 D00564 CI,P unclassified 2 | D00109 D00564 P unclassified 3 | D00564 D00100 CI,P unclassified 4 | D00564 D00109 P unclassified 5 | -------------------------------------------------------------------------------- /dev/test.sh: -------------------------------------------------------------------------------- 1 | source .env/bin/activate || source .env/Scripts/activate # Windows has Scripts instead of bin 2 | python3 -m pytest dev --cov --cov-branch --cov-report=term-missing 3 | -------------------------------------------------------------------------------- /docs/notebook/clean.sh: -------------------------------------------------------------------------------- 1 | rm -rf brite-entries/ brite-entries/ brite-entry-ids.txt compound-entries compound-entries.zip hierarchy-nodes.json mapping.json pull-entries/ pull-results.json standard_input.txt 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .coverage 2 | .env/ 3 | .pypi-env/ 4 | .idea/ 5 | __pycache__/ 6 | src/kegg_pull.egg-info/ 7 | docs/_build 8 | docs/notebook/.ipynb_checkpoints/ 9 | docs/notebook/tutorial.ipynb 10 | dist/ 11 | -------------------------------------------------------------------------------- /docs/notebook/to-rst.sh: 
-------------------------------------------------------------------------------- 1 | # Note this will only work if pandoc is installed separately via "sudo dnf install pandoc" 2 | jupyter nbconvert --to rst tutorial.ipynb 3 | python3 correct_nbconvert.py 4 | mv tutorial.rst ../tutorial.rst 5 | -------------------------------------------------------------------------------- /src/kegg_pull/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This package has the following modules: 3 | 4 | ``pull`` 5 | 6 | ``entry_ids`` 7 | 8 | ``map`` 9 | 10 | ``pathway_organizer`` 11 | 12 | ``rest`` 13 | 14 | ``kegg_url`` 15 | """ 16 | __version__ = '3.1.0' 17 | -------------------------------------------------------------------------------- /dev/test_data/brite-entries/pull-results.json: -------------------------------------------------------------------------------- 1 | { 2 | "percent-success": 100.0, 3 | "pull-minutes": 0.03, 4 | "num-successful": 1, 5 | "num-failed": 0, 6 | "num-timed-out": 0, 7 | "num-total": 1, 8 | "successful-entry-ids": [ 9 | "br:br08902" 10 | ], 11 | "failed-entry-ids": [], 12 | "timed-out-entry-ids": [] 13 | } -------------------------------------------------------------------------------- /docs/notebook/server.sh: -------------------------------------------------------------------------------- 1 | # This will start the server and print the URL that you need to copy and paste into your web browser. 2 | # Note that if you're port forwarding, you need to replace the port number in the URL with the one that you're forwarding to localhost (your local computer) rather than the one you're running on a lab machine. 
3 | jupyter notebook 4 | 5 | -------------------------------------------------------------------------------- /docs/notebook/correct_nbconvert.py: -------------------------------------------------------------------------------- 1 | with open('tutorial.rst', 'r') as file: 2 | contents: str = file.read() 3 | 4 | contents: str = contents.replace('ipython3', 'python3') 5 | contents: str = contents.replace('python3\n\n !', 'none\n\n !') 6 | contents: str = contents.replace('! ', '% ') 7 | 8 | with open('tutorial.rst', 'w') as file: 9 | file.write(contents) 10 | 11 | -------------------------------------------------------------------------------- /dev/install.sh: -------------------------------------------------------------------------------- 1 | echo "Removing previous .env/ directory if it exists..." 2 | rm -rf .env/ 3 | echo "Creating new .env/ directory..." 4 | python3 -m venv .env/ 5 | source .env/bin/activate || source .env/Scripts/activate # Windows has Scripts instead of bin 6 | python3 -m pip install --upgrade pip 7 | python3 -m pip install pytest pytest-mock pytest-cov sphinx sphinx-rtd-theme notebook 8 | python3 -m pip install -e . 9 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. kegg_pull documentation master file, created by 2 | sphinx-quickstart on Tue Aug 30 16:32:53 2022. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to kegg_pull's documentation! 7 | ===================================== 8 | 9 | .. 
toctree:: 10 | :maxdepth: 2 11 | :caption: Contents: 12 | 13 | guide 14 | tutorial 15 | cli 16 | api 17 | license 18 | 19 | Indices and tables 20 | ================== 21 | 22 | * :ref:`genindex` 23 | * :ref:`modindex` 24 | -------------------------------------------------------------------------------- /dev/README.md: -------------------------------------------------------------------------------- 1 | # Local Development 2 | ## Installing testing dependencies and kegg_pull as a package 3 | With the root of the repository as the working directory, run the following: 4 | ``` 5 | bash dev/install.sh # Installs testing dependencies and the kegg_pull package 6 | bash dev/test.sh # Runs tests on the kegg_pull package 7 | ``` 8 | ## Preventing the "module not found" error in PyCharm 9 | * After installing `kegg_pull`, a file at `src/kegg_pull.egg-info/PKG-INFO` is generated. 10 | * Go into that file and change `kegg-pull` (with a dash) to `kegg_pull` (with an underscore). 11 | * Restart PyCharm 12 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this software, please cite this article in the reference section." 3 | authors: 4 | - family-names: "Huckvale" 5 | given-names: "Erik" 6 | - family-names: "Moseley" 7 | given-names: "Hunter" 8 | title: "kegg-pull" 9 | version: 3.0.0 10 | date-released: 2023-02-15 11 | url: "https://github.com/MoseleyBioinformaticsLab/kegg_pull" 12 | references: 13 | - authors: 14 | - family-names: "Huckvale" 15 | given-names: "Erik" 16 | - family-names: "Moseley" 17 | given-names: "Hunter" 18 | type: article 19 | doi: "https://doi.org/10.1101/2022.11.03.515120" 20 | journal: "BMC Bioinformatics" 21 | title: "kegg_pull: a Software Package for the RESTful Access and Pulling from The Kyoto Encyclopedia of Gene and Genomes" 22 | volume: 24 23 | year: 2023 24 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | 3 | name: build 4 | 5 | on: 6 | push: 7 | branches: 8 | - main 9 | - dev 10 | pull_request: 11 | branches: 12 | - main 13 | - dev 14 | workflow_dispatch: 15 | 16 | jobs: 17 | build: 18 | 19 | strategy: 20 | matrix: 21 | python-version: ["3.10", "3.11"] 22 | os: [ ubuntu-latest, windows-latest ] 23 | runs-on: ${{matrix.os}} 24 | 25 | steps: 26 | - uses: actions/checkout@v3 27 | - name: Set up Python ${{ matrix.python-version }} 28 | uses: actions/setup-python@v3 29 | with: 30 | python-version: ${{ matrix.python-version }} 31 | - name: Install testing environment and kegg_pull package 32 | run: bash dev/install.sh 33 | 
- name: Test with pytest 34 | run: bash dev/test.sh 35 | -------------------------------------------------------------------------------- /docs/api.rst: -------------------------------------------------------------------------------- 1 | .. |Functionality| replace:: Provides API functionality 2 | .. |Interface for| replace:: Provides wrapper methods for 3 | 4 | API 5 | === 6 | **Note:** Many KEGG entry IDs contain colons and ``kegg_pull`` saves KEGG entry files with their ID in the file name. When running on Windows, all file names with colons will have their colons replaced with underscores. 7 | 8 | .. automodule:: kegg_pull 9 | 10 | .. automodule:: kegg_pull.pull 11 | :members: 12 | :undoc-members: 13 | 14 | .. automodule:: kegg_pull.entry_ids 15 | :members: 16 | :undoc-members: 17 | 18 | .. automodule:: kegg_pull.map 19 | :members: 20 | :undoc-members: 21 | 22 | .. automodule:: kegg_pull.pathway_organizer 23 | :members: 24 | :undoc-members: 25 | 26 | .. automodule:: kegg_pull.rest 27 | :members: 28 | :undoc-members: 29 | 30 | .. automodule:: kegg_pull.kegg_url 31 | :members: 32 | :undoc-members: 33 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 
21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools as st 2 | import re 3 | 4 | 5 | requirements = [ 6 | 'docopt', 7 | 'requests', 8 | 'tqdm', 9 | 'jsonschema' 10 | ] 11 | 12 | 13 | def _readme() -> str: 14 | with open('README.rst') as readme_file: 15 | return readme_file.read() 16 | 17 | 18 | def _get_version() -> str: 19 | with open('src/kegg_pull/__init__.py', 'r') as fd: 20 | version: str = re.search(r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]', fd.read(), re.MULTILINE).group(1) 21 | if not version: 22 | raise RuntimeError('Cannot find version information') 23 | return version 24 | 25 | 26 | st.setup( 27 | name='kegg_pull', 28 | version=_get_version(), 29 | package_dir={'': 'src'}, 30 | packages=st.find_packages('src', exclude=['dev', 'docs']), 31 | install_requires=requirements, 32 | entry_points={'console_scripts': ['kegg_pull = kegg_pull.__main__:main']}, 33 | author='Erik Huckvale', 34 | author_email='edhu227@g.uky.edu', 35 | url='https://github.com/MoseleyBioinformaticsLab/KEGGpull', 36 | description='Pulls any and all entries from any and all KEGG databases, pulls KEGG entry IDs, and wraps all the KEGG REST API operations in both Python API and the command line.', 37 | long_description_content_type='text/x-rst', 38 | long_description=_readme()) 39 | -------------------------------------------------------------------------------- /dev/conftest.py: -------------------------------------------------------------------------------- 1 | # noinspection PyPackageRequirements 2 | 
import pytest as pt 3 | import os 4 | import shutil as sh 5 | import kegg_pull.kegg_url as ku 6 | 7 | 8 | @pt.fixture(autouse=True) 9 | def mock_organism_set(mocker, request): 10 | if 'disable_mock_organism_set' not in request.keywords: 11 | organism_set_mock = {'organism-code', 'organism-T-number'} 12 | mocker.patch.object(ku.AbstractKEGGurl, 'organism_set', organism_set_mock) 13 | 14 | 15 | @pt.fixture(name='output_file', params=['dir/subdir/file.txt', 'dir/file.txt', './file.txt', 'file.txt']) 16 | def get_output_file(request): 17 | output_file: str = request.param 18 | yield output_file 19 | os.remove(output_file) 20 | sh.rmtree('dir', ignore_errors=True) 21 | 22 | 23 | @pt.fixture(name='zip_archive_data', params=['file.txt', 'dir/file.txt', '/file.txt', '/dir/file.txt']) 24 | def get_zip_archive_data(request): 25 | zip_file_name: str = request.param 26 | zip_archive_path = 'archive.zip' 27 | yield zip_archive_path, zip_file_name 28 | os.remove(zip_archive_path) 29 | 30 | 31 | @pt.fixture(name='json_file_path', params=[ 32 | 'dir/subdir/file.json', 'dir/file.json', './file.json', 'file.json', 'archive.zip:file.json', 'archive.zip:dir/file.json']) 33 | def get_json_file_path(request): 34 | json_file_path: str = request.param 35 | yield json_file_path 36 | if '.zip:' in json_file_path: 37 | os.remove('archive.zip') 38 | else: 39 | os.remove(json_file_path) 40 | sh.rmtree('dir', ignore_errors=True) 41 | -------------------------------------------------------------------------------- /.github/workflows/build_documentation.yml: -------------------------------------------------------------------------------- 1 | name: Build Documentation 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | jobs: 9 | build: 10 | 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - uses: actions/checkout@v3 15 | - name: Set up Python 3.11 16 | uses: actions/setup-python@v4 17 | with: 18 | python-version: '3.11' 19 | - name: Upgrade pip, install package, install requirements, build 
docs 20 | run: | 21 | pip install --upgrade pip 22 | pip install -r ./docs/requirements.txt 23 | sphinx-build docs ./docs/_build/html/ 24 | # Create an artifact of the html output. 25 | - uses: actions/upload-artifact@v3 26 | with: 27 | name: DocumentationHTML 28 | path: docs/_build/html/ 29 | # Publish built docs to gh-pages branch. 30 | # =============================== 31 | - name: Commit documentation changes 32 | env: 33 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 34 | run: | 35 | git config --global user.name "${GITHUB_ACTOR}" 36 | git config --global user.email "${GITHUB_ACTOR}@users.noreply.github.com" 37 | git clone "https://token:${GITHUB_TOKEN}@github.com/${GITHUB_REPOSITORY}.git" --branch gh-pages --single-branch gh-pages 38 | cd gh-pages/ 39 | git rm -r . 40 | cp -r ../docs/_build/html/* . 41 | touch .nojekyll 42 | git add . 43 | git commit -m "Update documentation." -a || true 44 | # The above command will fail if no changes were present, so we ignore 45 | # that. 46 | - name: Push changes 47 | uses: ad-m/github-push-action@master 48 | with: 49 | branch: gh-pages 50 | directory: gh-pages 51 | github_token: ${{ secrets.GITHUB_TOKEN }} 52 | # =============================== 53 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The Clear BSD License with Extra Clause 2 | 3 | Copyright (c) 2022, Erik Huckvale, Hunter N.B. Moseley 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without modification, 7 | are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
10 | * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 11 | * All advertising materials mentioning features or use of this software must display the following acknowledgement: This product includes software developed by the copyright holder. 12 | * Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 13 | * If the source code is used in a published work, then proper citation of the source code must be included with the published work. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS AS IS AND 16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE 19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 21 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 22 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 23 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 24 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | -------------------------------------------------------------------------------- /src/kegg_pull/__main__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | kegg_pull -h | --help Show this help message. 4 | kegg_pull -v | --version Displays the package version. 5 | kegg_pull --full-help Show the help message of all sub commands. 6 | kegg_pull pull ... 
Pull, separate, and store an arbitrary number of KEGG entries to the local file system. 7 | kegg_pull entry-ids ... Obtain a list of KEGG entry IDs. 8 | kegg_pull map ... Obtain a mapping of entry IDs (KEGG or outside databases) to the IDs of related entries. 9 | kegg_pull pathway-organizer ... Creates a flattened version of a pathways Brite hierarchy. 10 | kegg_pull rest ... Executes one of the KEGG REST API operations. 11 | """ 12 | import sys 13 | from . import __version__ 14 | from . import pull_cli as p_cli 15 | from . import entry_ids_cli as ei_cli 16 | from . import map_cli as map_cli 17 | from . import pathway_organizer_cli as po_cli 18 | from . import rest_cli as r_cli 19 | 20 | 21 | def main() -> None: 22 | first_arg: str = sys.argv[1] if len(sys.argv) > 1 else None 23 | if first_arg == 'pull': 24 | p_cli.main() 25 | elif first_arg == 'entry-ids': 26 | ei_cli.main() 27 | elif first_arg == 'map': 28 | map_cli.main() 29 | elif first_arg == 'pathway-organizer': 30 | po_cli.main() 31 | elif first_arg == 'rest': 32 | r_cli.main() 33 | elif first_arg == '--full-help': 34 | separator = '-'*80 35 | print(__doc__) 36 | print(separator) 37 | print(p_cli.__doc__) 38 | print(separator) 39 | print(ei_cli.__doc__) 40 | print(separator) 41 | print(map_cli.__doc__) 42 | print(separator) 43 | print(po_cli.__doc__) 44 | print(separator) 45 | print(r_cli.__doc__) 46 | elif first_arg == '--version' or first_arg == '-v': 47 | print(__version__) 48 | else: 49 | print(__doc__) 50 | 51 | 52 | if __name__ == '__main__': # pragma: no cover 53 | main() # pragma: no cover 54 | -------------------------------------------------------------------------------- /dev/test_utils.py: -------------------------------------------------------------------------------- 1 | # noinspection PyPackageRequirements 2 | import pytest as pt 3 | # noinspection PyProtectedMember 4 | import kegg_pull._utils as utils 5 | import dev.utils as u 6 | import kegg_pull.pull as p 7 | import kegg_pull.rest as r 8 | 
import kegg_pull.pathway_organizer as po 9 | 10 | 11 | @pt.mark.parametrize('comma_separated_list', [',,', ',', '']) 12 | def test_parse_input_sequence_comma_exception(comma_separated_list: str): 13 | with pt.raises(ValueError) as error: 14 | utils.parse_input_sequence(input_source=comma_separated_list) 15 | expected_message = f'Empty list provided from comma separated list: "{comma_separated_list}"' 16 | u.assert_exception(expected_message=expected_message, exception=error) 17 | 18 | 19 | @pt.mark.parametrize('stdin_input', ['', '\n', '\t\t', '\n\n', '\t \n \t', ' \n \n\t\t \t\n']) 20 | def test_parse_input_sequence_stdin_exception(mocker, stdin_input: str): 21 | stdin_mock: mocker.MagicMock = mocker.patch('kegg_pull._utils.sys.stdin.read', return_value=stdin_input) 22 | with pt.raises(ValueError) as error: 23 | utils.parse_input_sequence(input_source='-') 24 | stdin_mock.assert_called_once_with() 25 | expected_message = 'Empty list provided from standard input' 26 | u.assert_exception(expected_message=expected_message, exception=error) 27 | 28 | 29 | def test_get_range_values_exception(): 30 | with pt.raises(ValueError) as error: 31 | utils._get_range_values(range_values=['1', '2', '3'], value_type=int) 32 | expected_message = f'Range can only be specified by two values but 3 values were provided: 1, 2, 3' 33 | u.assert_exception(expected_message=expected_message, exception=error) 34 | 35 | 36 | @pt.mark.parametrize( 37 | 'NonInstantiable,kwargs', [(p.PullResult, {}), (r.KEGGresponse, {'status': None, 'kegg_url': None}), (po.PathwayOrganizer, {})]) 38 | def test_non_instantiable(NonInstantiable: type, kwargs: dict): 39 | expected_error_message = f'The class "{NonInstantiable.__name__}" cannot be instantiated outside of its module.' 
40 | with pt.raises(RuntimeError) as error: 41 | NonInstantiable(**kwargs) 42 | u.assert_exception(expected_message=expected_error_message, exception=error) 43 | -------------------------------------------------------------------------------- /dev/test_data/all-brite-entry-ids.txt: -------------------------------------------------------------------------------- 1 | br:br08901 2 | br:br08902 3 | br:br08904 4 | br:br08906 5 | br:ko00001 6 | br:ko00002 7 | br:ko00003 8 | br:br08907 9 | br:ko01000 10 | br:ko01001 11 | br:ko01009 12 | br:ko01002 13 | br:ko01003 14 | br:ko01005 15 | br:ko01011 16 | br:ko01004 17 | br:ko01008 18 | br:ko01006 19 | br:ko01007 20 | br:ko00199 21 | br:ko00194 22 | br:ko03000 23 | br:ko03021 24 | br:ko03019 25 | br:ko03041 26 | br:ko03011 27 | br:ko03009 28 | br:ko03016 29 | br:ko03012 30 | br:ko03110 31 | br:ko04131 32 | br:ko04121 33 | br:ko03051 34 | br:ko03032 35 | br:ko03036 36 | br:ko03400 37 | br:ko03029 38 | br:ko02000 39 | br:ko02044 40 | br:ko02042 41 | br:ko02022 42 | br:ko02035 43 | br:ko03037 44 | br:ko04812 45 | br:ko04147 46 | br:ko02048 47 | br:ko04030 48 | br:ko04050 49 | br:ko04054 50 | br:ko03310 51 | br:ko04040 52 | br:ko04031 53 | br:ko04052 54 | br:ko04515 55 | br:ko04090 56 | br:ko01504 57 | br:ko00535 58 | br:ko00536 59 | br:ko00537 60 | br:ko04091 61 | br:ko04990 62 | br:ko03200 63 | br:ko03210 64 | br:ko03100 65 | br:br08001 66 | br:br08002 67 | br:br08003 68 | br:br08005 69 | br:br08006 70 | br:br08007 71 | br:br08009 72 | br:br08021 73 | br:br08120 74 | br:br08201 75 | br:br08202 76 | br:br08204 77 | br:br08203 78 | br:br08303 79 | br:br08302 80 | br:br08301 81 | br:br08313 82 | br:br08312 83 | br:br08304 84 | br:br08305 85 | br:br08331 86 | br:br08330 87 | br:br08332 88 | br:br08310 89 | br:br08307 90 | br:br08327 91 | br:br08311 92 | br:br08403 93 | br:br08402 94 | br:br08401 95 | br:br08411 96 | br:br08410 97 | br:br08420 98 | br:br08601 99 | br:br08610 100 | br:br08611 101 | br:br08612 102 | br:br08613 103 | 
br:br08614 104 | br:br08615 105 | br:br08620 106 | br:br08621 107 | br:br08605 108 | br:br03220 109 | br:br03222 110 | br:br03223 111 | br:br01610 112 | br:br01611 113 | br:br01612 114 | br:br01613 115 | br:br01601 116 | br:br01602 117 | br:br01600 118 | br:br01620 119 | br:br01553 120 | br:br01554 121 | br:br01556 122 | br:br01555 123 | br:br01557 124 | br:br01800 125 | br:br01810 126 | br:br08011 127 | br:br08020 128 | br:br08012 129 | br:br08110 130 | br:br08319 131 | br:br08329 132 | br:br08318 133 | br:br08328 134 | br:br08309 135 | br:br08341 136 | br:br08324 137 | br:br08317 138 | br:br08315 139 | br:br08314 140 | br:br08442 141 | br:br08441 142 | br:br08431 -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # For the full list of built-in configuration values, see the documentation: 4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 5 | 6 | # Add the package to the python path so autodoc can import modules so doc strings can be included in the documentation 7 | import os 8 | import sys 9 | sys.path.insert(0, os.path.abspath('../src')) 10 | 11 | # It's recommended that you import the project version from your package's __init__.py file 12 | from kegg_pull import __version__ 13 | 14 | def skip_organism_set(app, what, name, obj, skip, options) -> bool: 15 | if name in {'organism_set'}: 16 | return True 17 | 18 | def setup(app): 19 | app.connect('autodoc-skip-member', skip_organism_set) 20 | 21 | # -- Project information ----------------------------------------------------- 22 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information 23 | 24 | project = 'kegg_pull' 25 | copyright = '2022, Erik Huckvale' 26 | author = 'Erik Huckvale' 27 | 28 | version = __version__ 29 | release = __version__ 30 | 31 | # -- General 
configuration --------------------------------------------------- 32 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration 33 | 34 | extensions = [ 35 | 'sphinx.ext.autodoc', 36 | 'sphinx.ext.doctest', 37 | 'sphinx.ext.intersphinx', 38 | 'sphinx.ext.todo', 39 | 'sphinx.ext.coverage', 40 | 'sphinx.ext.viewcode', 41 | 'sphinx.ext.githubpages', 42 | ] 43 | 44 | templates_path = ['_templates'] 45 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 46 | latex_elements = {'preamble': r'\usepackage{pmboxdraw}'} 47 | 48 | # -- Options for HTML output ------------------------------------------------- 49 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output 50 | 51 | autodoc_typehints = 'both' 52 | autoclass_content = 'both' 53 | autodoc_member_order = 'bysource' 54 | html_theme = 'sphinx_rtd_theme' 55 | html_static_path = ['_static'] 56 | 57 | # -- Options for intersphinx extension --------------------------------------- 58 | # https://www.sphinx-doc.org/en/master/usage/extensions/intersphinx.html#configuration 59 | 60 | intersphinx_mapping = { 61 | 'python': ('https://docs.python.org/3', None), 62 | } 63 | 64 | # -- Options for todo extension ---------------------------------------------- 65 | # https://www.sphinx-doc.org/en/master/usage/extensions/todo.html#configuration 66 | 67 | todo_include_todos = True 68 | -------------------------------------------------------------------------------- /src/kegg_pull/entry_ids_cli.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | kegg_pull entry-ids -h | --help 4 | kegg_pull entry-ids database [--output=] 5 | kegg_pull entry-ids keywords [--output=] 6 | kegg_pull entry-ids molec-attr (--formula=|--em=...|--mw=...) [--output=] 7 | 8 | Options: 9 | -h --help Show this help message. 10 | database Pulls all the entry IDs within a given database. 
11 | The KEGG database from which to pull a list of entry IDs. 12 | --output= Path to the file (either in a directory or ZIP archive) to store the output (1 entry ID per line). Prints to the console if not specified. If a ZIP archive, the file path must be in the form of /path/to/zip-archive.zip:/path/to/file (e.g. ./archive.zip:file.txt). 13 | keywords Searches for entries within a database based on provided keywords. 14 | Comma separated list of keywords to search entries with (e.g. kw1,kw2,kw3 etc.). Or if equal to "-", keywords are read from standard input, one keyword per line; Press CTRL+D to finalize input or pipe (e.g. cat file.txt | kegg_pull rest find brite - ...). 15 | molec-attr Searches a database of molecule-type KEGG entries by molecular attributes. 16 | --formula= Sequence of atoms in a chemical formula format to search for (e.g. "O5C7" searches for molecule entries containing 5 oxygen atoms and/or 7 carbon atoms). 17 | --em= Either a single number (e.g. "--em=155.5") or two numbers (e.g. "--em=155.5 --em=244.4"). If a single number, searches for molecule entries with an exact mass equal to that value rounded by the last decimal point. If two numbers, searches for molecule entries with an exact mass within the two values (a range). 18 | --mw= Same as "--em=" but searches based on the molecular weight. 19 | """ 20 | import docopt as d 21 | from . import entry_ids as ei 22 | from . 
def main() -> None:
    """ Entry point of the ``entry-ids`` subcommand.

    Parses the command line with docopt, pulls the entry IDs with the function of the ``entry_ids`` module that matches the
    chosen mode (``database``, ``keywords`` or, otherwise, molecular attributes) and hands the IDs, one per line, to
    ``print_or_save`` together with the ``--output`` option.
    """
    cli_args = d.docopt(__doc__)
    # NOTE(review): the empty-string keys below look like docopt placeholder names that lost their text - confirm against the usage string.
    db_name: str = cli_args['']
    if cli_args['database']:
        pulled_ids = ei.from_database(database=db_name)
    elif cli_args['keywords']:
        search_terms: list = u.parse_input_sequence(input_source=cli_args[''])
        pulled_ids = ei.from_keywords(database=db_name, keywords=search_terms)
    else:
        # Neither "database" nor "keywords" was chosen, so the molecular attribute options drive the search.
        formula, exact_mass, molecular_weight = u.get_molecular_attribute_args(args=cli_args)
        pulled_ids = ei.from_molecular_attribute(
            database=db_name, formula=formula, exact_mass=exact_mass, molecular_weight=molecular_weight)
    u.print_or_save(output_target=cli_args['--output'], output_content='\n'.join(pulled_ids))
When running on Windows, all file names with colons will have their colons replaced with underscores. 27 | 28 | **Note:** If ``py`` is not installed on Windows (e.g. Python was installed via the Windows store rather than from the official Python website), the installation command is the same as Linux and Mac OS X. 29 | 30 | **Note:** If the ``kegg_pull`` console script is not found on Windows, the CLI can be used via ``python3 -m kegg_pull`` or ``py -3 -m kegg_pull`` or ``path\to\console\script\kegg_pull.exe``. Alternatively, the directory where the console script is located can be added to the Path environment variable. For example, the console script may be installed at: 31 | 32 | .. parsed-literal:: 33 | c:\\users\\<username>\\appdata\\local\\programs\\python\\python310\\Scripts\\ 34 | 35 | PyPI 36 | ~~~~ 37 | See our PyPI page `here <https://pypi.org/project/kegg-pull/>`__. 38 | 39 | Questions, Feature Requests, and Bug Reports 40 | -------------------------------------------- 41 | Please submit any questions or feature requests you may have and report any potential bugs/errors you observe on `our GitHub issues page <https://github.com/MoseleyBioinformaticsLab/kegg_pull/issues>`__. 42 | 43 | Dependencies 44 | ------------ 45 | Note, the ``pip`` command will install dependencies automatically. 46 | 47 | .. parsed-literal:: 48 | docopt 49 | requests 50 | tqdm 51 | jsonschema 52 | 53 | Get the source code 54 | ------------------- 55 | Code is available on GitHub: https://github.com/MoseleyBioinformaticsLab/kegg_pull. 56 | 57 | You can clone the repository via: 58 | 59 | .. parsed-literal:: 60 | git clone https://github.com/MoseleyBioinformaticsLab/kegg_pull.git 61 | 62 | Once you have a copy of the source, you can embed it in your own Python package, or install it into your system site-packages easily: 63 | 64 | Linux, Mac OS X 65 | ~~~~~~~~~~~~~~~ 66 | .. parsed-literal:: 67 | python3 setup.py install 68 | 69 | Windows 70 | ~~~~~~~ 71 | .. 
def test_help(mocker):
    """ Delegates to the shared ``assert_help`` check for the ``entry-ids`` subcommand of the ``entry_ids_cli`` module. """
    u.assert_help(subcommand='entry-ids', module=ei_cli, mocker=mocker)
# noinspection DuplicatedCode
@pt.mark.parametrize('args,method,kwargs,stdin_mock', test_data)
def test_print(mocker, args: list, method: str, kwargs: dict, stdin_mock: str):
    """ Forwards each parametrized case of ``test_data`` to the shared ``u.test_print`` helper.

    The patched method returns ``entry_ids_mock`` and the expected output is those IDs joined with newlines.
    """
    shared_check_kwargs = {
        'mocker': mocker, 'argv_mock': args, 'stdin_mock': stdin_mock, 'method': method,
        'method_return_value': entry_ids_mock, 'method_kwargs': kwargs, 'module': ei_cli,
        'expected_output': expected_output}
    u.test_print(**shared_check_kwargs)
cat nodes.txt | kegg_pull pathway-organizer --tln=- ...). If both "--tln" and "--fn" are set as "-", one of the lines must be the delimiter "---" without quotes in order to distinguish the input, with the top level nodes first and filter nodes second. 8 | --fn= Names (not keys) of nodes to exclude from the mapping of node key to node info. Neither these nodes nor any of their children will be included. If not set, no nodes will be excluded. Either a comma separated list (e.g. node1,node2,node3 etc.) or if equal to "-", read from standard input one node per line; Press CTRL+D to finalize input or pipe (e.g. cat nodes.txt | kegg_pull pathway-organizer --fn=- ...). If both "--tln" and "--fn" are set as "-", one of the lines must be the delimiter "---" without quotes in order to distinguish the input, with the top level nodes first and filter nodes second. 9 | --output= The file to store the flattened Brite hierarchy as a JSON structure with node keys mapping to node info, either a JSON file or ZIP archive. Prints to the console if not set. If saving to a ZIP archive, the file path must be in the form of /path/to/zip-archive.zip:/path/to/file (e.g. ./archive.zip:mapping.json). 10 | """ 11 | import docopt as d 12 | import sys 13 | from . import pathway_organizer as po 14 | from . 
def _split_piped_node_names(piped_text: str) -> tuple[str, str]:
    """ Splits standard input carrying both node lists into two comma separated lists.

    :param piped_text: The text read from standard input: the top level nodes, a line equal to "---", then the filter nodes (one node per line).
    :return: The comma separated top level nodes followed by the comma separated filter nodes.
    :raises ValueError: Raised if the text does not contain exactly one delimiter line.
    """
    # Stripping every line makes the delimiter match regardless of "\r\n" line endings or stray spaces/tabs.
    lines = [line.strip() for line in piped_text.split('\n')]
    # The delimiter must be a whole line; a node name that merely ends in "---" is not a delimiter.
    delimiter_indices = [index for index, line in enumerate(lines) if line == '---']
    if len(delimiter_indices) != 1:
        raise ValueError(
            f'Expected exactly one "---" delimiter line in standard input to separate the top level nodes from the filter '
            f'nodes, but found {len(delimiter_indices)}')
    [split_at] = delimiter_indices
    top_level_nodes = ','.join(line for line in lines[:split_at] if line)
    filter_nodes = ','.join(line for line in lines[split_at + 1:] if line)
    return top_level_nodes, filter_nodes


def main() -> None:
    """ Entry point of the ``pathway-organizer`` subcommand.

    Resolves the ``--tln`` and ``--fn`` options into sets of node names (left untouched when an option is not provided),
    loads a ``PathwayOrganizer`` from KEGG with them and passes its string form to ``print_or_save`` along with ``--output``.

    :raises ValueError: Raised if both options are "-" and standard input does not contain exactly one "---" delimiter line.
    """
    args = d.docopt(__doc__)
    if args['--tln'] == '-' and args['--fn'] == '-':
        # Both lists arrive on the same standard input stream, so it is read once here and divided at the delimiter line.
        top_level_nodes, filter_nodes = _split_piped_node_names(piped_text=sys.stdin.read())
        top_level_nodes = set(u.parse_input_sequence(input_source=top_level_nodes))
        filter_nodes = set(u.parse_input_sequence(input_source=filter_nodes))
    else:
        top_level_nodes: str | set[str] | None = args['--tln']
        filter_nodes: str | set[str] | None = args['--fn']
        # An option that was not provided is forwarded unchanged rather than parsed.
        if top_level_nodes:
            top_level_nodes = set(u.parse_input_sequence(input_source=top_level_nodes))
        if filter_nodes:
            filter_nodes = set(u.parse_input_sequence(input_source=filter_nodes))
    pathway_organizer = po.PathwayOrganizer.load_from_kegg(top_level_nodes=top_level_nodes, filter_nodes=filter_nodes)
    hierarchy_nodes_json_string = str(pathway_organizer)
    u.print_or_save(output_target=args['--output'], output_content=hierarchy_nodes_json_string)
literalinclude:: ../src/kegg_pull/__main__.py 13 | :start-at: Usage: 14 | :end-before: """ 15 | :language: none 16 | 17 | .. include:: ../src/kegg_pull/pull.py 18 | :start-after: """ 19 | :end-before: """ 20 | 21 | A JSON file, called ``pull-results.json``, is saved, describing the results of the pull. Below is the interpretation of each of the fields: 22 | 23 | **percent-success:** The percentage of the requested entries that were successfully pulled and saved in a file. 24 | 25 | **pull-minutes:** The number of minutes that the pull took to complete. 26 | 27 | **num-successful:** The number of entries that were successfully pulled and saved in a file. 28 | 29 | **num-failed:** The number of entries that failed to be pulled. 30 | 31 | **num-timed-out:** The number of entries that timed out when requested. 32 | 33 | **num-total:** The number of total entry IDs requested. 34 | 35 | **successful-entry-ids:** The list of successful entry IDs. 36 | 37 | **failed-entry-ids:** The list of failed entry IDs. 38 | 39 | **timed-out-entry-ids:** The list of timed out entry IDs. 40 | 41 | If the ``--unsuccessful-threshold`` option is set and surpassed, an ``aborted-pull-results.json`` file is instead output with the following fields: 42 | 43 | **num-remaining-entry-ids:** The number of requested entries remaining after the process aborted. The process aborted before ``kegg_pull`` could even try to pull these entries. 44 | 45 | **num-successful:** The number of entries that were successfully pulled before the process aborted. 46 | 47 | **num-failed:** The number of entries that failed by the time the process aborted. 48 | 49 | **num-timed-out:** The number of entries that timed out by the time the process aborted. 50 | 51 | **remaining-entry-ids:** The IDs of the remaining entries. 52 | 53 | **successful-entry-ids:** The IDs of the successful entries. 54 | 55 | **failed-entry-ids:** The IDs of the failed entries. 
def test_help(mocker):
    """ Delegates to the shared ``assert_help`` check for the ``pathway-organizer`` subcommand of the ``pathway_organizer_cli`` module. """
    u.assert_help(subcommand='pathway-organizer', module=po_cli, mocker=mocker)
def _get_mock_pathway_org_and_expected_output(mocker):
    """ Builds the ``PathwayOrganizer`` stand-in shared by the CLI tests along with the JSON text expected from it.

    :param mocker: The pytest-mock fixture, forwarded to ``u.mock_non_instantiable``.
    :return: The organizer carrying a single mock hierarchy node and that node mapping dumped as JSON with an indent of 2.
    """
    # Must run before the constructor call below.
    u.mock_non_instantiable(mocker=mocker)
    single_node: po.HierarchyNodes = {'a': {'name': 'b', 'level': 1, 'parent': 'c', 'children': ['a'], 'entry_id': 'd'}}
    organizer = po.PathwayOrganizer()
    organizer.hierarchy_nodes = single_node
    return organizer, json.dumps(single_node, indent=2)
@pt.mark.parametrize('args,kwargs,stdin_mock', test_data)
def test_zip_archive(mocker, args: list, kwargs: dict, stdin_mock: str, zip_archive_data: tuple):
    """ Forwards each parametrized case of ``test_data`` to the shared ``u.test_zip_archive`` helper.

    The patched ``load_from_kegg`` returns the mock organizer and the expected output is its hierarchy nodes as JSON.
    """
    organizer_mock, json_output = _get_mock_pathway_org_and_expected_output(mocker=mocker)
    shared_check_kwargs = {
        'mocker': mocker, 'argv_mock': args, 'zip_archive_data': zip_archive_data, 'stdin_mock': stdin_mock,
        'method': method, 'method_return_value': organizer_mock, 'method_kwargs': kwargs, 'module': po_cli,
        'expected_output': json_output}
    u.test_zip_archive(**shared_check_kwargs)
def test_from_file(file_info: tuple[str, bool]):
    """ Checks ``ei.from_file`` against the file written by the ``file_info`` fixture.

    :param file_info: The fixture value: the path of the mock file paired with whether that file was written empty.
    """
    file_name, is_empty = file_info
    if is_empty:
        # An empty file must be rejected with the message that names the offending path.
        with pt.raises(ValueError) as error:
            ei.from_file(file_path=file_name)
        u.assert_exception(expected_message=f'Attempted to load entry IDs from {file_name}. But the file is empty', exception=error)
    else:
        actual_entry_ids: list = ei.from_file(file_path=file_name)
        # The mock file contains a blank line, which must not show up among the loaded IDs.
        expected_entry_ids = [
            'cpd:C22501', 'cpd:C22502', 'cpd:C22500', 'cpd:C22504', 'cpd:C22506', 'cpd:C22507', 'cpd:C22509', 'cpd:C22510',
            'cpd:C22511', 'cpd:C22512', 'cpd:C22513', 'cpd:C22514']
        assert actual_entry_ids == expected_entry_ids
def _prepare_input(args: list, method: str) -> tuple[list, str, str]:
    """ Completes a parametrized case for the ``map`` CLI tests.

    :param args: The command line arguments following the ``map`` subcommand.
    :param method: The bare name of the ``kmap`` function the case patches.
    :return: The arguments prefixed with ``map`` (as a new list), the patch path of the method and the JSON text expected for ``mapping_mock``.
    """
    return (
        ['map', *args],
        f'map_cli.kmap.{method}',
        '{\n "k1": [\n "v1"\n ],\n "k2": [\n "v1",\n "v2"\n ],\n "k3": [\n "v3",\n "v4"\n ]\n}')
@pt.mark.parametrize('args,method,kwargs,stdin_mock', test_data)
def test_file(mocker, args: list, method: str, kwargs: dict, stdin_mock: str, output_file: str):
    """ Forwards each parametrized case of ``test_data`` to the shared ``u.test_file`` helper.

    The case is first completed by ``_prepare_input``; the patched method returns ``mapping_mock``.
    """
    cli_args, patched_method, json_output = _prepare_input(args=args, method=method)
    shared_check_kwargs = {
        'mocker': mocker, 'argv_mock': cli_args, 'output_file': output_file, 'stdin_mock': stdin_mock,
        'method': patched_method, 'method_return_value': mapping_mock, 'method_kwargs': kwargs, 'module': map_cli,
        'expected_output': json_output}
    u.test_file(**shared_check_kwargs)
15 | --output= The location (either a directory or ZIP archive) of the JSON file to store the mapping. If not set, prints a JSON representation of the mapping to the console. If a ZIP archive, the file path must be in the form of /path/to/zip-archive.zip:/path/to/file (e.g. ./archive.zip:mapping.json). 16 | link Converts the output of the KEGG "link" operation into a JSON mapping. 17 | The name of the database with entry IDs mapped to the target database. 18 | The name of the database with entry IDs mapped from the source database. 19 | --deduplicate Some mappings including pathway entry IDs result in half beginning with the normal "path:map" prefix but the other half with a different prefix. If set, removes the IDs corresponding to identical entries but with a different prefix. Raises an exception if neither the source nor the target database are "pathway". 20 | --add-glycans Whether to add the corresponding compound IDs of equivalent glycan entries. Logs a warning if neither the source nor the target database are "compound". 21 | --add-drugs Whether to add the corresponding compound IDs of equivalent drug entries. Logs a warning if neither the source nor the target database are "compound". 22 | entry-ids Create a mapping to a target database from a list of specific entry IDs. 23 | Comma separated list of entry IDs (e.g. Id1,Id2,Id3 etc.). Or if equal to "-", entry IDs are read from standard input, one entry ID per line; Press CTRL+D to finalize input or pipe (e.g. cat file.txt | kegg_pull map entry-ids drug - ...). 24 | The name of an intermediate KEGG database with which to find cross-references to cross-references e.g. "kegg_pull map link ko reaction compound" creates a mapping from ko-to-compound via ko-to-reaction cross-references connected to reaction-to-compound cross-references. 25 | """ 26 | import docopt as doc 27 | from . import map as kmap 28 | from . 
def main() -> None:
    """ Entry point of the ``map`` subcommand.

    Parses the command line with docopt, builds a mapping with the ``map`` module function that matches the requested
    form (indirect link, entry IDs link/conv, database link or database conv), converts the mapping to a JSON string and
    hands it to ``print_or_save`` together with the ``--output`` option.
    """
    cli_args = doc.docopt(__doc__)
    # NOTE(review): the empty-string keys below look like docopt placeholder names that lost their text - confirm against the usage string.
    source_database: str = cli_args['']
    intermediate_database: str = cli_args['']
    target_database: str = cli_args['']
    reverse: bool = cli_args['--reverse']
    # The same three flags are accepted by both the direct and the indirect database link.
    link_options = {
        'deduplicate': cli_args['--deduplicate'], 'add_glycans': cli_args['--add-glycans'],
        'add_drugs': cli_args['--add-drugs']}
    if intermediate_database:
        mapping: kmap.KEGGmapping = kmap.indirect_link(
            source_database=source_database, intermediate_database=intermediate_database,
            target_database=target_database, **link_options)
    elif cli_args['entry-ids']:
        entry_ids = u.parse_input_sequence(input_source=cli_args[''])
        map_entries = kmap.entries_link if cli_args['link'] else kmap.entries_conv
        mapping = map_entries(entry_ids=entry_ids, target_database=target_database, reverse=reverse)
    elif cli_args['link']:
        mapping = kmap.database_link(source_database=source_database, target_database=target_database, **link_options)
    else:
        mapping = kmap.database_conv(kegg_database=cli_args[''], outside_database=cli_args[''], reverse=reverse)
    u.print_or_save(output_target=cli_args['--output'], output_content=kmap.to_json_string(mapping=mapping))
import kegg_url as ku 8 | 9 | 10 | def from_database(database: str, kegg_rest: r.KEGGrest | None = None) -> list[str]: 11 | """ Pulls the KEGG entry IDs of a given database. 12 | 13 | :param database: The KEGG database to pull the entry IDs from. If equal to "brite", the "br:" prefix is prepended to each entry ID such that they succeed if used in downstream use of the KEGG "get" operation (e.g. for the "pull" API module or CLI subcommand). 14 | :param kegg_rest: The KEGGrest object to request the entry IDs. If None, one is created with the default parameters. 15 | :return: The list of resulting entry IDs. 16 | :raises RuntimeError: Raised if the request to the KEGG REST API fails or times out. 17 | """ 18 | entry_ids = _process_response(KEGGurl=ku.ListKEGGurl, kegg_rest=kegg_rest, database=database) 19 | if database == 'brite': 20 | entry_ids = [entry_id if entry_id.startswith('br:') else f'br:{entry_id}' for entry_id in entry_ids] 21 | return entry_ids 22 | 23 | 24 | def _process_response(KEGGurl: type[ku.AbstractKEGGurl], kegg_rest: r.KEGGrest | None, **kwargs) -> list[str]: 25 | """ Extracts the entry IDs from a KEGG response if successful, else raises an exception. The KEGG response arrives from making 26 | an entry IDs related request with a KEGGrest object. 27 | 28 | :param KEGGurl: The URL class for the request. 29 | :param kegg_rest: The KEGGrest object to make the request with. If None, one is created with the default parameters. 30 | :param kwargs: The arguments to pass into the KEGGrest method. 31 | :return: The list of KEGG entry IDs. 32 | :raises RuntimeError: Raised if the KEGG response indicates a failure or time out. 33 | """ 34 | kegg_response: r.KEGGresponse = r.request_and_check_error(kegg_rest=kegg_rest, KEGGurl=KEGGurl, **kwargs) 35 | return _parse_entry_ids_string(entry_ids_string=kegg_response.text_body) 36 | 37 | 38 | def _parse_entry_ids_string(entry_ids_string: str) -> list[str]: 39 | """ Parses the entry IDs contained in a string. 
40 | 41 | :param entry_ids_string: The string containing the entry IDs. 42 | :return: The list of parsed entry IDs. 43 | """ 44 | entry_ids = entry_ids_string.strip().split('\n') 45 | return [entry_id.split('\t')[0].strip() for entry_id in entry_ids if entry_id.strip() != ''] 46 | 47 | 48 | def from_file(file_path: str) -> list[str]: 49 | """ Loads KEGG entry IDs that are listed in a file with one entry ID on each line. 50 | 51 | :param file_path: The path to the file containing the entry IDs. 52 | :return: The list of entry IDs. 53 | :raises ValueError: Raised if the file is empty. 54 | """ 55 | with open(file_path, 'r') as file: 56 | entry_ids = file.read() 57 | if entry_ids == '': 58 | raise ValueError(f'Attempted to load entry IDs from {file_path}. But the file is empty') 59 | return _parse_entry_ids_string(entry_ids_string=entry_ids) 60 | 61 | 62 | def from_keywords(database: str, keywords: list[str], kegg_rest: r.KEGGrest | None = None) -> list[str]: 63 | """ Pulls entry IDs from a KEGG database based on keywords searched in the entries. 64 | 65 | :param database: The name of the database to pull entry IDs from. 66 | :param keywords: The keywords to search entries in the database with. 67 | :param kegg_rest: The KEGGrest object to request the entry IDs. If None, one is created with the default parameters. 68 | :return: The list of entry IDs. 69 | :raises RuntimeError: Raised if the request to the KEGG REST API fails or times out. 70 | """ 71 | return _process_response(KEGGurl=ku.KeywordsFindKEGGurl, kegg_rest=kegg_rest, database=database, keywords=keywords) 72 | 73 | 74 | def from_molecular_attribute( 75 | database: str, formula: str | None = None, exact_mass: float | tuple[float, float] | None = None, 76 | molecular_weight: int | tuple[int, int] | None = None, kegg_rest: r.KEGGrest | None = None) -> list[str]: 77 | """ Pulls entry IDs from a KEGG database containing chemical entries based on one (and only one) of three molecular attributes of the entries. 
78 | 79 | :param database: The name of the database containing chemical entries. 80 | :param formula: The chemical formula to search for. 81 | :param exact_mass: The exact mass of the compound to search for (a single value or a range). 82 | :param molecular_weight: The molecular weight of the compound to search for (a single value or a range). 83 | :param kegg_rest: The KEGGrest object to request the entry IDs. If None, one is created with the default parameters. 84 | :return: The list of entry IDs. 85 | :raises RuntimeError: Raised if the request to the KEGG REST API fails or times out. 86 | """ 87 | return _process_response( 88 | KEGGurl=ku.MolecularFindKEGGurl, kegg_rest=kegg_rest, database=database, formula=formula, exact_mass=exact_mass, 89 | molecular_weight=molecular_weight) 90 | -------------------------------------------------------------------------------- /dev/test_data/brite-entries/br_br08902.txt: -------------------------------------------------------------------------------- 1 | +C Br number 2 | ! 
3 | APathway and Brite 4 | B Pathway maps 5 | C br08901 KEGG pathway maps 6 | B Brite files 7 | C br08902 BRITE hierarchy files 8 | C br08904 BRITE table files 9 | C br08906 BRITE binary relation files 10 | AGenes and Proteins 11 | B Orthologs, modules and networks 12 | C ko00001 KEGG Orthology (KO) 13 | C ko00002 KEGG modules 14 | C ko00003 KEGG reaction modules 15 | C br08907 KEGG networks 16 | B Protein families: metabolism 17 | C ko01000 Enzymes 18 | C ko01001 Protein kinases 19 | C ko01009 Protein phosphatases and associated proteins 20 | C ko01002 Peptidases and inhibitors 21 | C ko01003 Glycosyltransferases 22 | C ko01005 Lipopolysaccharide biosynthesis proteins 23 | C ko01011 Peptidoglycan biosynthesis and degradation proteins 24 | C ko01004 Lipid biosynthesis proteins 25 | C ko01008 Polyketide biosynthesis proteins 26 | C ko01006 Prenyltransferases 27 | C ko01007 Amino acid related enzymes 28 | C ko00199 Cytochrome P450 29 | C ko00194 Photosynthesis proteins 30 | B Protein families: genetic information processing 31 | C ko03000 Transcription factors 32 | C ko03021 Transcription machinery 33 | C ko03019 Messenger RNA biogenesis 34 | C ko03041 Spliceosome 35 | C ko03011 Ribosome 36 | C ko03009 Ribosome biogenesis 37 | C ko03016 Transfer RNA biogenesis 38 | C ko03012 Translation factors 39 | C ko03110 Chaperones and folding catalysts 40 | C ko04131 Membrane trafficking 41 | C ko04121 Ubiquitin system 42 | C ko03051 Proteasome 43 | C ko03032 DNA replication proteins 44 | C ko03036 Chromosome and associated proteins 45 | C ko03400 DNA repair and recombination proteins 46 | C ko03029 Mitochondrial biogenesis 47 | B Protein families: signaling and cellular processes 48 | C ko02000 Transporters 49 | C ko02044 Secretion system 50 | C ko02042 Bacterial toxins 51 | C ko02022 Two-component system 52 | C ko02035 Bacterial motility proteins 53 | C ko03037 Cilium and associated proteins 54 | C ko04812 Cytoskeleton proteins 55 | C ko04147 Exosome 56 | C ko02048 
Prokaryotic defense system 57 | C ko04030 G protein-coupled receptors 58 | C ko04050 Cytokine receptors 59 | C ko04054 Pattern recognition receptors 60 | C ko03310 Nuclear receptors 61 | C ko04040 Ion channels 62 | C ko04031 GTP-binding proteins 63 | C ko04052 Cytokines and growth factors 64 | C ko04515 Cell adhesion molecules 65 | C ko04090 CD molecules 66 | C ko01504 Antimicrobial resistance genes 67 | C ko00535 Proteoglycans 68 | C ko00536 Glycosaminoglycan binding proteins 69 | C ko00537 Glycosylphosphatidylinositol (GPI)-anchored proteins 70 | C ko04091 Lectins 71 | C ko04990 Domain-containing proteins not elsewhere classified 72 | B Viral protein families 73 | C ko03200 Viral proteins 74 | C ko03210 Viral fusion proteins 75 | B RNA family 76 | C ko03100 Non-coding RNAs 77 | ACompounds and Reactions 78 | B Compounds 79 | C br08001 Compounds with biological roles 80 | C br08002 Lipids 81 | C br08003 Phytochemical compounds 82 | C br08005 Bioactive peptides 83 | C br08006 Endocrine disrupting compounds 84 | C br08007 Pesticides 85 | C br08009 Natural toxins 86 | C br08021 Glycosides 87 | B Glycans 88 | C br08120 O-antigens 89 | B Reactions 90 | C br08201 Enzymatic reactions 91 | C br08202 IUBMB reaction hierarchy 92 | C br08204 Reaction class 93 | C br08203 Glycosyltransferase reactions 94 | ADrugs 95 | B Drug classifications 96 | C br08303 Anatomical Therapeutic Chemical (ATC) classification 97 | C br08302 USP drug classification 98 | C br08301 Therapeutic category of drugs in Japan 99 | C br08313 Classification of Japanese OTC drugs 100 | C br08312 Risk category of Japanese OTC drugs 101 | C br08304 Traditional Chinese Medicine in Japan 102 | C br08305 Crude drugs 103 | C br08331 Animal drugs in Japan 104 | B Drug information 105 | C br08330 Drug groups 106 | C br08332 Drug classes 107 | C br08310 Target-based classification of drugs 108 | C br08307 Antimicrobials 109 | C br08327 Antimicrobials abbreviations 110 | C br08311 Drugs listed in the Japanese 
Pharmacopoeia 111 | ADiseases 112 | B Human diseases 113 | C br08403 Human diseases in ICD-11 classification 114 | C br08402 Pathway-based classification of diseases 115 | C br08401 Genome-based classification of infectious diseases 116 | C br08411 ICD-11 International Classification of Diseases 117 | C br08410 ICD-10 International Classification of Diseases 118 | C br08420 ICD-O-3 International Classification of Diseases for Oncology 119 | AOrganisms and Viruses 120 | B Taxonomy 121 | C br08601 KEGG organisms 122 | C br08610 KEGG organisms in the NCBI taxonomy 123 | C br08611 KEGG organisms in taxonomic ranks 124 | C br08612 KEGG organisms: animals 125 | C br08613 KEGG organisms: plants 126 | C br08614 KEGG organisms: fungi 127 | C br08615 KEGG organisms: protists 128 | C br08620 KEGG viruses in the NCBI taxonomy 129 | C br08621 KEGG viruses in taxonomic ranks 130 | B Organism information 131 | C br08605 Plant pathogens 132 | ! 133 | # 134 | #Last updated: July 10, 2023 135 | #» Japanese version 136 | -------------------------------------------------------------------------------- /src/kegg_pull/_utils.py: -------------------------------------------------------------------------------- 1 | import logging as log 2 | import typing as t 3 | import zipfile as zf 4 | import os 5 | import sys 6 | import json 7 | import jsonschema as js 8 | import inspect as ins 9 | 10 | 11 | def get_molecular_attribute_args(args: dict) -> tuple[str | None, float | tuple[float, float] | None, int | tuple[int, int] | None]: 12 | formula: str | None = args['--formula'] 13 | exact_mass: list[str] | None = args['--em'] 14 | molecular_weight: list[str] | None = args['--mw'] 15 | # exact_mass and molecular_weight will be [] (empty list) if not specified in the commandline args 16 | if exact_mass: 17 | exact_mass: float | tuple[float, float] = _get_range_values(range_values=exact_mass, value_type=float) 18 | else: 19 | exact_mass = None 20 | if molecular_weight: 21 | molecular_weight: int | 
tuple[int, int] = _get_range_values(range_values=molecular_weight, value_type=int) 22 | else: 23 | molecular_weight = None 24 | return formula, exact_mass, molecular_weight 25 | 26 | 27 | def _get_range_values( 28 | range_values: list[str], value_type: type[int | float]) -> int | float | tuple[int, int] | tuple[float, float]: 29 | if len(range_values) == 1: 30 | [val] = range_values 31 | return value_type(val) 32 | elif len(range_values) == 2: 33 | [min_val, max_val] = range_values 34 | return value_type(min_val), value_type(max_val) 35 | else: 36 | raise ValueError( 37 | f'Range can only be specified by two values but {len(range_values)} values were provided: ' 38 | f'{", ".join(range_value for range_value in range_values)}') 39 | 40 | 41 | def load_json_file(file_path: str, json_schema: dict, validation_error_message: str) -> dict: 42 | if '.zip:' in file_path: 43 | [file_location, file_name] = file_path.split('.zip:') 44 | file_location = file_location + '.zip' 45 | with zf.ZipFile(file_location, 'r') as zip_file: 46 | json_object: bytes = zip_file.read(file_name) 47 | json_object: dict = json.loads(s=json_object) 48 | else: 49 | with open(file_path, 'r') as file: 50 | json_object: dict = json.load(file) 51 | validate_json_object(json_object=json_object, json_schema=json_schema, validation_error_message=validation_error_message) 52 | return json_object 53 | 54 | 55 | def validate_json_object(json_object: dict, json_schema: dict, validation_error_message: str) -> None: 56 | try: 57 | js.validate(json_object, json_schema) 58 | except js.exceptions.ValidationError as e: 59 | log.error(validation_error_message) 60 | raise e 61 | 62 | 63 | def parse_input_sequence(input_source: str) -> list[str]: 64 | if input_source == '-': 65 | # Read from standard input 66 | inputs: str = sys.stdin.read() 67 | inputs: list = inputs.strip().split('\n') 68 | else: 69 | # Split a comma separated list 70 | inputs: list = input_source.split(',') 71 | inputs: list = 
[input_string.strip() for input_string in inputs if input_string.strip() != ''] 72 | # If the inputs end up being an empty list 73 | if not inputs: 74 | input_source = 'standard input' if input_source == '-' else f'comma separated list: "{input_source}"' 75 | raise ValueError(f'Empty list provided from {input_source}') 76 | return inputs 77 | 78 | 79 | def print_or_save(output_target: str, output_content: str | bytes) -> None: 80 | if output_target is None: 81 | if type(output_content) is bytes: 82 | log.warning('Printing binary output...') 83 | print(output_content) 84 | else: 85 | save_output(output_target=output_target, output_content=output_content) 86 | 87 | 88 | def save_output(output_target: str, output_content: str | bytes) -> None: 89 | if '.zip:' in output_target: 90 | [file_location, file_name] = output_target.split('.zip:') 91 | file_location: str = file_location + '.zip' 92 | else: 93 | file_location, file_name = os.path.split(output_target) 94 | file_location = '.' if file_location == '' else file_location 95 | save_file(file_location=file_location, file_content=output_content, file_name=file_name) 96 | 97 | 98 | def save_file(file_location: str, file_content: str | bytes, file_name: str) -> None: 99 | if os.name == 'nt': # pragma: no cover 100 | # If the OS is Windows, replace colons with underscores (Windows does not support colons in file names). 
101 | file_name = file_name.replace(':', '_') # pragma: no cover 102 | if file_location.endswith('.zip'): 103 | with zf.ZipFile(file_location, 'a') as zip_file: 104 | zip_file.writestr(file_name, file_content) 105 | else: 106 | if not os.path.isdir(file_location): 107 | os.makedirs(file_location) 108 | file_path = os.path.join(file_location, file_name) 109 | save_type = 'wb' if type(file_content) is bytes else 'w' 110 | encoding: str | None = None if type(file_content) is bytes else 'utf-8' 111 | with open(file_path, save_type, encoding=encoding) as file: 112 | file.write(file_content) 113 | 114 | 115 | class NonInstantiable: 116 | """Subclasses of this class are only instantiable in the same module that they are defined in.""" 117 | @classmethod 118 | def __init__(cls) -> None: 119 | caller_module_path = ins.stack()[2].filename 120 | class_module_path = ins.getfile(cls) 121 | # Ensure the python module of the caller matches that of the class 122 | # This ensures the class is only instantiated in the same module that it's defined in 123 | if caller_module_path != class_module_path: 124 | raise RuntimeError(f'The class "{cls.__name__}" cannot be instantiated outside of its module.') 125 | 126 | 127 | class staticproperty(staticmethod): 128 | def __get__(self, *_) -> t.Any: 129 | return self.__func__() 130 | -------------------------------------------------------------------------------- /dev/test_data/drug-entry-ids.txt: -------------------------------------------------------------------------------- 1 | dr:D00227 2 | dr:D00240 3 | dr:D00246 4 | dr:D00262 5 | dr:D00277 6 | dr:D00328 7 | dr:D00356 8 | dr:D00383 9 | dr:D00400 10 | dr:D00523 11 | dr:D00585 12 | dr:D00603 13 | dr:D00631 14 | dr:D00637 15 | dr:D00650 16 | dr:D00651 17 | dr:D00657 18 | dr:D00663 19 | dr:D00702 20 | dr:D00716 21 | dr:D00718 22 | dr:D00725 23 | dr:D00752 24 | dr:D00824 25 | dr:D00874 26 | dr:D00892 27 | dr:D00935 28 | dr:D00946 29 | dr:D00948 30 | dr:D00949 31 | dr:D00955 32 | dr:D00961 
33 | dr:D00983 34 | dr:D00986 35 | dr:D01003 36 | dr:D01018 37 | dr:D01042 38 | dr:D01044 39 | dr:D01056 40 | dr:D01069 41 | dr:D01103 42 | dr:D01117 43 | dr:D01124 44 | dr:D01131 45 | dr:D01229 46 | dr:D01289 47 | dr:D01297 48 | dr:D01315 49 | dr:D01332 50 | dr:D01402 51 | dr:D01408 52 | dr:D01451 53 | dr:D01452 54 | dr:D01472 55 | dr:D01477 56 | dr:D01483 57 | dr:D01490 58 | dr:D01551 59 | dr:D01616 60 | dr:D01619 61 | dr:D01640 62 | dr:D01692 63 | dr:D01745 64 | dr:D01773 65 | dr:D01836 66 | dr:D01858 67 | dr:D01872 68 | dr:D01922 69 | dr:D01926 70 | dr:D01976 71 | dr:D02071 72 | dr:D02110 73 | dr:D02114 74 | dr:D02190 75 | dr:D02196 76 | dr:D02203 77 | dr:D02212 78 | dr:D02222 79 | dr:D02238 80 | dr:D02239 81 | dr:D02241 82 | dr:D02293 83 | dr:D02307 84 | dr:D02332 85 | dr:D02345 86 | dr:D02429 87 | dr:D02463 88 | dr:D02556 89 | dr:D02575 90 | dr:D02605 91 | dr:D02635 92 | dr:D02651 93 | dr:D02656 94 | dr:D02666 95 | dr:D02675 96 | dr:D02683 97 | dr:D02715 98 | dr:D02733 99 | dr:D02775 100 | dr:D02880 101 | dr:D02889 102 | dr:D02899 103 | dr:D02903 104 | dr:D02942 105 | dr:D03005 106 | dr:D03051 107 | dr:D03064 108 | dr:D03088 109 | dr:D03134 110 | dr:D03147 111 | dr:D03214 112 | dr:D03321 113 | dr:D03499 114 | dr:D03517 115 | dr:D03547 116 | dr:D03558 117 | dr:D03607 118 | dr:D03630 119 | dr:D03652 120 | dr:D03654 121 | dr:D03681 122 | dr:D03704 123 | dr:D03752 124 | dr:D03772 125 | dr:D03775 126 | dr:D03788 127 | dr:D03792 128 | dr:D03797 129 | dr:D03807 130 | dr:D03809 131 | dr:D03810 132 | dr:D03848 133 | dr:D03850 134 | dr:D03892 135 | dr:D03895 136 | dr:D03910 137 | dr:D03971 138 | dr:D04009 139 | dr:D04018 140 | dr:D04023 141 | dr:D04033 142 | dr:D04040 143 | dr:D04066 144 | dr:D04112 145 | dr:D04179 146 | dr:D04190 147 | dr:D04222 148 | dr:D04225 149 | dr:D04226 150 | dr:D04452 151 | dr:D04512 152 | dr:D04517 153 | dr:D04586 154 | dr:D04611 155 | dr:D04627 156 | dr:D04646 157 | dr:D04696 158 | dr:D04735 159 | dr:D04758 160 | dr:D04774 161 | dr:D04789 
162 | dr:D04871 163 | dr:D04949 164 | dr:D05077 165 | dr:D05116 166 | dr:D05120 167 | dr:D05131 168 | dr:D05149 169 | dr:D05194 170 | dr:D05290 171 | dr:D05308 172 | dr:D05339 173 | dr:D05375 174 | dr:D05426 175 | dr:D05451 176 | dr:D05477 177 | dr:D05498 178 | dr:D05507 179 | dr:D05613 180 | dr:D05649 181 | dr:D05700 182 | dr:D05709 183 | dr:D05718 184 | dr:D05719 185 | dr:D05802 186 | dr:D05806 187 | dr:D05834 188 | dr:D05895 189 | dr:D05897 190 | dr:D05911 191 | dr:D05915 192 | dr:D05939 193 | dr:D05981 194 | dr:D06029 195 | dr:D06102 196 | dr:D06134 197 | dr:D06138 198 | dr:D06154 199 | dr:D06160 200 | dr:D06187 201 | dr:D06331 202 | dr:D06334 203 | dr:D06342 204 | dr:D06395 205 | dr:D06579 206 | dr:D06618 207 | dr:D06877 208 | dr:D06883 209 | dr:D07072 210 | dr:D07084 211 | dr:D07086 212 | dr:D07093 213 | dr:D07096 214 | dr:D07099 215 | dr:D07143 216 | dr:D07236 217 | dr:D07288 218 | dr:D07319 219 | dr:D07355 220 | dr:D07438 221 | dr:D07453 222 | dr:D07472 223 | dr:D07499 224 | dr:D07503 225 | dr:D07545 226 | dr:D07547 227 | dr:D07609 228 | dr:D07636 229 | dr:D07650 230 | dr:D07675 231 | dr:D07702 232 | dr:D07730 233 | dr:D07733 234 | dr:D07736 235 | dr:D07741 236 | dr:D07761 237 | dr:D07796 238 | dr:D07863 239 | dr:D07865 240 | dr:D07879 241 | dr:D07915 242 | dr:D07927 243 | dr:D07932 244 | dr:D07977 245 | dr:D07980 246 | dr:D08001 247 | dr:D08019 248 | dr:D08039 249 | dr:D08117 250 | dr:D08146 251 | dr:D08160 252 | dr:D08168 253 | dr:D08212 254 | dr:D08227 255 | dr:D08236 256 | dr:D08277 257 | dr:D08296 258 | dr:D08313 259 | dr:D08316 260 | dr:D08371 261 | dr:D08397 262 | dr:D08410 263 | dr:D08457 264 | dr:D08498 265 | dr:D08508 266 | dr:D08551 267 | dr:D08574 268 | dr:D08595 269 | dr:D08616 270 | dr:D08618 271 | dr:D08656 272 | dr:D08660 273 | dr:D08662 274 | dr:D08688 275 | dr:D08757 276 | dr:D08845 277 | dr:D08851 278 | dr:D08872 279 | dr:D08890 280 | dr:D08894 281 | dr:D08940 282 | dr:D08949 283 | dr:D08970 284 | dr:D09003 285 | dr:D09026 286 | dr:D09028 
287 | dr:D09341 288 | dr:D09344 289 | dr:D09360 290 | dr:D09369 291 | dr:D09389 292 | dr:D09393 293 | dr:D09402 294 | dr:D09567 295 | dr:D09572 296 | dr:D09645 297 | dr:D09671 298 | dr:D09702 299 | dr:D09730 300 | dr:D09732 301 | dr:D09772 302 | dr:D09787 303 | dr:D09816 304 | dr:D09861 305 | dr:D09919 306 | dr:D09922 307 | dr:D09925 308 | dr:D09931 309 | dr:D09962 310 | dr:D09976 311 | dr:D09992 312 | dr:D09997 313 | dr:D10008 314 | dr:D10014 315 | dr:D10019 316 | dr:D10020 317 | dr:D10073 318 | dr:D10084 319 | dr:D10157 320 | dr:D10180 321 | dr:D10198 322 | dr:D10309 323 | dr:D10313 324 | dr:D10322 325 | dr:D10330 326 | dr:D10345 327 | dr:D10370 328 | dr:D10381 329 | dr:D10389 330 | dr:D10397 331 | dr:D10426 332 | dr:D10549 333 | dr:D10594 334 | dr:D10624 335 | dr:D10631 336 | dr:D10648 337 | dr:D10658 338 | dr:D10661 339 | dr:D10669 340 | dr:D10674 341 | dr:D10678 342 | dr:D10692 343 | dr:D10703 344 | dr:D10725 345 | dr:D10730 346 | dr:D10747 347 | dr:D10750 348 | dr:D10833 349 | dr:D10871 350 | dr:D10883 351 | dr:D10924 352 | dr:D10959 353 | dr:D11038 354 | dr:D11049 355 | dr:D11051 356 | dr:D11137 357 | dr:D11156 358 | dr:D11245 359 | dr:D11259 360 | dr:D11300 361 | dr:D11316 362 | dr:D11326 363 | dr:D11371 364 | dr:D11409 365 | dr:D11437 366 | dr:D11446 367 | dr:D11465 368 | dr:D11499 369 | dr:D11509 370 | dr:D11585 371 | dr:D11602 372 | dr:D11622 373 | dr:D11658 374 | dr:D11667 375 | dr:D11691 376 | dr:D11754 377 | dr:D11781 378 | dr:D11788 379 | dr:D11791 380 | dr:D11817 381 | dr:D11824 382 | dr:D11842 383 | dr:D11864 384 | dr:D11889 385 | dr:D11915 386 | dr:D12053 387 | dr:D12120 388 | dr:D12160 389 | dr:D12238 390 | dr:D12251 391 | dr:D12273 392 | dr:D12369 393 | dr:D12384 394 | dr:D12390 395 | dr:D12391 396 | dr:D12423 397 | dr:D12467 398 | dr:D12559 399 | dr:D12604 400 | dr:D12646 -------------------------------------------------------------------------------- /dev/test_pathway_organizer.py: 
-------------------------------------------------------------------------------- 1 | # noinspection PyPackageRequirements 2 | import pytest as pt 3 | import json 4 | import typing as t 5 | import kegg_pull.pathway_organizer as po 6 | import dev.utils as u 7 | 8 | 9 | def test_load_from_kegg_warning(mocker, caplog): 10 | get_mock: mocker.MagicMock = _get_get_mock(mocker=mocker) 11 | parse_hierarchy_spy: mocker.MagicMock = mocker.spy(po.PathwayOrganizer, '_parse_hierarchy') 12 | pathway_org: po.PathwayOrganizer = po.PathwayOrganizer.load_from_kegg(top_level_nodes={'invalid-top-level-node'}) 13 | get_mock.assert_called_once_with(entry_ids=['br:br08901'], entry_field='json') 14 | u.assert_warning( 15 | message='Top level node name "invalid-top-level-node" is not recognized and will be ignored. Valid values are: "Cellular ' 16 | 'Processes, Drug Development, Environmental Information Processing, Genetic Information Processing, ' 17 | 'Human Diseases, Metabolism, Organismal Systems"', caplog=caplog) 18 | parse_hierarchy_spy.assert_called_once_with(pathway_org, level=1, raw_hierarchy_nodes=[], parent_name=None) 19 | assert pathway_org.hierarchy_nodes == dict() 20 | 21 | 22 | def _get_get_mock(mocker): 23 | def get_mock(**_) -> mocker.MagicMock: 24 | with open('dev/test_data/pathway-organizer/pathway-hierarchy.json', 'r') as file_: 25 | text_body_mock: str = file_.read() 26 | kegg_response_mock = mocker.MagicMock(text_body=text_body_mock) 27 | return kegg_response_mock 28 | return mocker.patch('kegg_pull.pathway_organizer.r.KEGGrest.get', wraps=get_mock) 29 | 30 | 31 | test_load_from_kegg_data = [ 32 | (None, None, 'all-nodes.json'), 33 | ({'Metabolism', 'Genetic Information Processing'}, None, 'top-level-nodes.json'), 34 | (None, {'Genetic Information Processing', 'Global and overview maps', '00010 Glycolysis / Gluconeogenesis'}, 'filter-nodes.json')] 35 | 36 | 37 | @pt.mark.parametrize('top_level_nodes,filter_nodes,hierarchy_nodes_file', test_load_from_kegg_data) 38 | 
def test_load_from_kegg(mocker, top_level_nodes: set, filter_nodes: set, hierarchy_nodes_file: str): 39 | get_mock: mocker.MagicMock = _get_get_mock(mocker=mocker) 40 | pathway_organizer = po.PathwayOrganizer.load_from_kegg(top_level_nodes=top_level_nodes, filter_nodes=filter_nodes) 41 | get_mock.assert_called_once_with(entry_ids=['br:br08901'], entry_field='json') 42 | if top_level_nodes is not None: 43 | actual_top_level_nodes = {node_key for node_key, node_val in pathway_organizer.hierarchy_nodes.items() if node_val['level'] == 1} 44 | assert actual_top_level_nodes == top_level_nodes 45 | if filter_nodes is not None: 46 | for filter_node in filter_nodes: 47 | assert filter_node not in pathway_organizer.hierarchy_nodes.keys() 48 | expected_hierarchy_nodes: dict = _get_expected_hierarchy_nodes(hierarchy_nodes_file=hierarchy_nodes_file) 49 | assert pathway_organizer.hierarchy_nodes == expected_hierarchy_nodes 50 | 51 | 52 | def _get_expected_hierarchy_nodes(hierarchy_nodes_file: str) -> dict: 53 | with open(f'dev/test_data/pathway-organizer/{hierarchy_nodes_file}') as file: 54 | expected_hierarchy_nodes: dict = json.load(file) 55 | return expected_hierarchy_nodes 56 | 57 | 58 | def test_save_to_json(mocker, json_file_path: str): 59 | u.mock_non_instantiable(mocker=mocker) 60 | pathway_organizer = po.PathwayOrganizer() 61 | pathway_organizer.hierarchy_nodes = _get_expected_hierarchy_nodes(hierarchy_nodes_file='top-level-nodes.json') 62 | pathway_organizer.save_to_json(file_path=json_file_path) 63 | u.test_save_to_json(json_file_path=json_file_path, expected_saved_json_object=pathway_organizer.hierarchy_nodes) 64 | 65 | 66 | def test_load_from_json(json_file_path: str): 67 | expected_hierarchy_nodes: dict = _get_expected_hierarchy_nodes(hierarchy_nodes_file='top-level-nodes.json') 68 | u.test_load_from_json( 69 | json_file_path=json_file_path, saved_object=expected_hierarchy_nodes, method=po.PathwayOrganizer.load_from_json, 70 | 
expected_loaded_object=expected_hierarchy_nodes, loaded_object_attribute='hierarchy_nodes') 71 | 72 | 73 | test_invalid_load_from_json_data = [ 74 | 1, 'a', [], [1, 2], ['a', 'b'], [[], []], [[1], [2]], [['a'], ['b']], [{}, {}], [{'a': {}, 'b': []}], {}, {'a': []}, {'a': {}}, 75 | {'a': {'b': 1}}, {'a': {'name': 'b'}}, {'a': {'level': 1, 'b': 'c'}}, 76 | {'a': {'name': 'b', 'level': 1, 'parent': 'c', 'children': None, 'entry_id': 'x'}, 77 | '': {'name': 'b', 'level': 1, 'parent': 'c', 'children': ['d'], 'entry_id': None}}, 78 | {'a': {'name': 'b', 'level': 1, 'parent': 'c', 'children': None, 'entry_id': None, 'x': 'y'}}, 79 | {'a': {'name': 2, 'level': 1, 'parent': 'c', 'children': None, 'entry_id': None}}, 80 | {'a': {'name': '', 'level': 1, 'parent': 'c', 'children': None, 'entry_id': None}}, 81 | {'a': {'name': None, 'level': 1, 'parent': 'c', 'children': None, 'entry_id': None}}, 82 | {'a': {'name': 'b', 'level': '1', 'parent': 'c', 'children': None, 'entry_id': None}}, 83 | {'a': {'name': 'b', 'level': None, 'parent': 'c', 'children': None, 'entry_id': None}}, 84 | {'a': {'name': 'b', 'level': 0, 'parent': 'c', 'children': None, 'entry_id': None}}, 85 | {'a': {'name': 'b', 'level': 1, 'parent': '', 'children': None, 'entry_id': None}}, 86 | {'a': {'name': 'b', 'level': 1, 'parent': 2, 'children': None, 'entry_id': None}}, 87 | {'a': {'name': 'b', 'level': 1, 'parent': 'c', 'children': [], 'entry_id': None}}, 88 | {'a': {'name': 'b', 'level': 1, 'parent': 'c', 'children': [1], 'entry_id': None}}, 89 | {'a': {'name': 'b', 'level': 1, 'parent': 'c', 'children': [''], 'entry_id': None}}, 90 | {'a': {'name': 'b', 'level': 1, 'parent': 'c', 'children': ['a'], 'entry_id': 1}}, 91 | {'a': {'name': 'b', 'level': 1, 'parent': 'c', 'children': ['a'], 'entry_id': ''}}] 92 | 93 | 94 | @pt.mark.parametrize('invalid_json_object', test_invalid_load_from_json_data) 95 | def test_invalid_load_from_json(caplog, json_file_path: str, invalid_json_object: list | dict | int | float 
| str): 96 | expected_error_message = f'Failed to load the hierarchy nodes. The pathway organizer JSON file at {json_file_path} is ' \ 97 | f'corrupted and will need to be re-created.' 98 | u.test_invalid_load_from_json( 99 | json_file_path=json_file_path, invalid_json_object=invalid_json_object, method=po.PathwayOrganizer.load_from_json, 100 | expected_error_message=expected_error_message, caplog=caplog) 101 | -------------------------------------------------------------------------------- /dev/utils.py: -------------------------------------------------------------------------------- 1 | # noinspection PyPackageRequirements 2 | import pytest as pt 3 | import zipfile as zf 4 | import typing as t 5 | import json 6 | import jsonschema as js 7 | import os 8 | 9 | 10 | def assert_exception(expected_message: str, exception: pt.ExceptionInfo): 11 | actual_message = str(exception.value) 12 | assert actual_message == expected_message 13 | 14 | 15 | def assert_warning(message: str, caplog): 16 | [record] = caplog.records 17 | assert record.levelname == 'WARNING' 18 | assert record.message == message 19 | 20 | 21 | def assert_error(message: str, caplog): 22 | [record] = caplog.records 23 | assert record.levelname == 'ERROR' 24 | assert record.message == message 25 | 26 | 27 | def assert_help(mocker, module, subcommand: str): 28 | for help_arg in ['-h', '--help']: 29 | mocker.patch('sys.argv', ['kegg_pull', subcommand, help_arg]) 30 | print_mock: mocker.MagicMock = mocker.patch('builtins.print') 31 | with pt.raises(SystemExit): 32 | module.main() 33 | print_mock.assert_any_call(module.__doc__.strip('\n')) 34 | 35 | 36 | def assert_call_args(function_mock, expected_call_args_list: list, do_kwargs: bool): 37 | actual_call_args_list = function_mock.call_args_list 38 | for actual_call_args, expected_call_args in zip(actual_call_args_list, expected_call_args_list): 39 | if do_kwargs: 40 | assert actual_call_args.kwargs == expected_call_args 41 | else: 42 | assert 
actual_call_args.args == expected_call_args 43 | 44 | 45 | def _test_main(mocker, argv_mock: list, stdin_mock: str, method: str, method_return_value: object, method_kwargs: dict, module): 46 | argv_mock: list = ['kegg_pull'] + argv_mock 47 | mocker.patch('sys.argv', argv_mock) 48 | stdin_mock: mocker.MagicMock = mocker.patch('kegg_pull._utils.sys.stdin.read', return_value=stdin_mock) if stdin_mock else None 49 | method_mock: mocker.MagicMock = mocker.patch(f'kegg_pull.{method}', return_value=method_return_value) 50 | module.main() 51 | method_mock.assert_called_once_with(**method_kwargs) 52 | if stdin_mock: 53 | stdin_mock.assert_called_once_with() 54 | 55 | 56 | def test_print( 57 | mocker, argv_mock: list, stdin_mock: str, method: str, method_return_value: object, method_kwargs: dict, module, 58 | expected_output: str | bytes, is_binary: bool = False, caplog=None): 59 | print_mock: mocker.MagicMock = mocker.patch('builtins.print') 60 | _test_main( 61 | mocker=mocker, argv_mock=argv_mock, stdin_mock=stdin_mock, method=method, method_return_value=method_return_value, 62 | method_kwargs=method_kwargs, module=module) 63 | if is_binary: 64 | assert_warning(message='Printing binary output...', caplog=caplog) 65 | print_mock.assert_called_once_with(expected_output) 66 | 67 | 68 | def test_file( 69 | mocker, argv_mock: list, output_file: str, stdin_mock: str, method: str, method_return_value: object, method_kwargs: dict, module, 70 | expected_output: str | bytes, is_binary: bool = False): 71 | argv_mock: list = argv_mock + [f'--output={output_file}'] 72 | _test_main( 73 | mocker=mocker, argv_mock=argv_mock, stdin_mock=stdin_mock, method=method, method_return_value=method_return_value, 74 | method_kwargs=method_kwargs, module=module) 75 | read_type: str = 'rb' if is_binary else 'r' 76 | with open(output_file, read_type) as file: 77 | actual_output: str | bytes = file.read() 78 | assert actual_output == expected_output 79 | 80 | 81 | def test_zip_archive( 82 | mocker, 
argv_mock: list, zip_archive_data: tuple, stdin_mock: str, method: str, method_return_value: object, method_kwargs: dict, 83 | module, expected_output: str | bytes, is_binary: bool = False): 84 | zip_archive_path, zip_file_name = zip_archive_data 85 | argv_mock: list = argv_mock + [f'--output={zip_archive_path}:{zip_file_name}'] 86 | _test_main( 87 | mocker=mocker, argv_mock=argv_mock, stdin_mock=stdin_mock, method=method, method_return_value=method_return_value, 88 | method_kwargs=method_kwargs, module=module) 89 | with zf.ZipFile(zip_archive_path, 'r') as zip_file: 90 | actual_output: bytes = zip_file.read(zip_file_name) 91 | if not is_binary: 92 | actual_output: str = actual_output.decode() 93 | assert actual_output == expected_output 94 | 95 | 96 | def test_save_to_json(json_file_path: str, expected_saved_json_object: dict): 97 | if '.zip:' in json_file_path: 98 | with zf.ZipFile('archive.zip', 'r') as zip_file: 99 | json_file_name: str = 'dir/file.json' if 'dir/' in json_file_path else 'file.json' 100 | actual_saved_mapping: dict = json.loads(zip_file.read(name=json_file_name)) 101 | else: 102 | with open(json_file_path, 'r') as file: 103 | actual_saved_mapping: dict = json.load(file) 104 | assert actual_saved_mapping == expected_saved_json_object 105 | 106 | 107 | def test_load_from_json( 108 | json_file_path: str, saved_object: dict, method: t.Callable, expected_loaded_object: dict, loaded_object_attribute: str = None): 109 | _write_test_json_object(json_file_path=json_file_path, test_object=saved_object) 110 | actual_loaded_object = method(file_path=json_file_path) 111 | if loaded_object_attribute is not None: 112 | actual_loaded_object: dict = actual_loaded_object.__getattribute__(loaded_object_attribute) 113 | assert actual_loaded_object == expected_loaded_object 114 | 115 | 116 | def _write_test_json_object(json_file_path: str, test_object: list | dict | int | float | str) -> None: 117 | if '.zip:' in json_file_path: 118 | with zf.ZipFile('archive.zip', 
'w') as zip_file: 119 | json_file_name: str = 'dir/file.json' if 'dir/' in json_file_path else 'file.json' 120 | zip_file.writestr(json_file_name, json.dumps(test_object, indent=2)) 121 | else: 122 | if json_file_path.startswith('dir'): 123 | directory, _ = os.path.split(json_file_path) 124 | os.makedirs(directory) 125 | with open(json_file_path, 'w') as file: 126 | file.write(json.dumps(test_object, indent=2)) 127 | 128 | 129 | def test_invalid_load_from_json( 130 | json_file_path: str, invalid_json_object: dict, method: t.Callable, expected_error_message: str, caplog): 131 | _write_test_json_object(json_file_path=json_file_path, test_object=invalid_json_object) 132 | with pt.raises(js.exceptions.ValidationError): 133 | method(file_path=json_file_path) 134 | assert_error(message=expected_error_message, caplog=caplog) 135 | 136 | 137 | def mock_non_instantiable(mocker): 138 | mocker.patch('kegg_pull._utils.NonInstantiable.__init__') 139 | -------------------------------------------------------------------------------- /src/kegg_pull/pull_cli.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | kegg_pull pull -h | --help 4 | kegg_pull pull database [--force-single-entry] [--multi-process] [--n-workers=] [--output=] [--print] [--sep=] [--entry-field=] [--n-tries=] [--time-out=] [--sleep-time=] [--ut=] 5 | kegg_pull pull entry-ids [--force-single-entry] [--multi-process] [--n-workers=] [--output=] [--print] [--sep=] [--entry-field=] [--n-tries=] [--time-out=] [--sleep-time=] [--ut=] 6 | 7 | Options: 8 | -h --help Show this help message. 9 | database Pulls all the entries in a KEGG database. 10 | The KEGG database from which to pull entries. 11 | --force-single-entry Forces pulling only one entry at a time for every request to the KEGG web API. This flag is automatically set if is "brite". 12 | --multi-process If set, the entries are pulled across multiple processes to increase speed. 
Otherwise, the entries are pulled sequentially in a single process. 13 | --n-workers= The number of sub-processes to create when pulling. Defaults to the number of cores available. Ignored if --multi-process is not set. 14 | --output= The directory where the pulled KEGG entries will be stored. Defaults to the current working directory. If ends in ".zip", entries are saved to a ZIP archive instead of a directory. Ignored if --print is set. 15 | --print If set, prints the entries to the screen rather than saving them to the file system. Separates entries by the --sep option if set. 16 | --sep= The string that separates the entries which are printed to the screen when the --print option is set. Ignored if the --print option is not set. Defaults to printing the entry id, followed by the entry, followed by a newline. 17 | --entry-field= Optional field to extract from the entries pulled rather than the standard flat file format (or "htext" in the case of brite entries). 18 | --n-tries= The number of times to attempt a KEGG request before marking it as timed out or failed. Defaults to 3. 19 | --time-out= The number of seconds to wait for a KEGG request before marking it as timed out. Defaults to 60. 20 | --sleep-time= The amount of time to wait after a KEGG request times out (or potentially blacklists with a 403 error code) before attempting it again. Defaults to 5.0. 21 | --ut= If set, the ratio of unsuccessful entry IDs (failed or timed out) to total entry IDs at which kegg_pull quits. Valid values are between 0.0 and 1.0 non-inclusive. 22 | entry-ids Pulls entries specified by a comma separated list. Or from standard input: one entry ID per line; Press CTRL+D to finalize input or pipe (e.g. cat file.txt | kegg_pull pull entry-ids - ...). 23 | Comma separated list of entry IDs to pull (e.g. id1,id2,id3 etc.). Or if equal to "-", entry IDs are read from standard input. Will likely need to set --force-single-entry if any of the entries are from the brite database. 
def main():
    """ Entry point of the "kegg_pull pull" subcommand.

    Parses the command line with docopt, pulls the requested entries (all entries of a database or an explicit list of entry IDs),
    either printing them or saving them under --output, and always writes a summary of the pull to 'pull-results.json' in the
    current working directory.
    """
    args = d.docopt(__doc__)
    # Options that were not provided stay None so that the called constructors receive None rather than a value chosen here.
    n_tries = int(args['--n-tries']) if args['--n-tries'] is not None else None
    time_out = int(args['--time-out']) if args['--time-out'] is not None else None
    sleep_time = float(args['--sleep-time']) if args['--sleep-time'] is not None else None
    kegg_rest = r.KEGGrest(n_tries=n_tries, time_out=time_out, sleep_time=sleep_time)
    output = args['--output'] if args['--output'] is not None else '.'
    print_to_screen: bool = args['--print']
    entry_field: str = args['--entry-field']
    force_single_entry: bool = args['--force-single-entry']
    if args['database']:
        # docopt stores the value of a positional argument under its "<name>" key.
        database: str = args['<database>']
        if database == 'brite':
            # The usage text documents that single-entry pulling is automatically forced for the brite database.
            force_single_entry = True
        entry_ids = ei.from_database(database=database)
    else:
        entry_ids = u.parse_input_sequence(input_source=args['<entry-ids>'])
    unsuccessful_threshold = float(args['--ut']) if args['--ut'] is not None else None
    if args['--multi-process']:
        n_workers = int(args['--n-workers']) if args['--n-workers'] is not None else None
        multiple_pull = p.MultiProcessMultiplePull(kegg_rest=kegg_rest, unsuccessful_threshold=unsuccessful_threshold, n_workers=n_workers)
    else:
        multiple_pull = p.SingleProcessMultiplePull(kegg_rest=kegg_rest, unsuccessful_threshold=unsuccessful_threshold)
    time1 = _testable_time()
    if print_to_screen:
        pull_result, kegg_entry_mapping = multiple_pull.pull_dict(
            entry_ids=entry_ids, entry_field=entry_field, force_single_entry=force_single_entry)
        if ku.GetKEGGurl.is_binary(entry_field=entry_field):
            log.warning('Printing binary output...')
        print_separator: str = args['--sep']
        if print_separator:
            # str.join() only accepts str items; converting each entry mirrors the implicit formatting of the f-string in the
            # branch below, so an entry that is not a str (presumably bytes for binary entry fields) cannot raise a TypeError.
            print(f'\n{print_separator}\n'.join(str(entry) for entry in kegg_entry_mapping.values()))
        else:
            for entry_id, entry in kegg_entry_mapping.items():
                print(entry_id)
                print(f'{entry}\n')
    else:
        pull_result = multiple_pull.pull(entry_ids=entry_ids, output=output, entry_field=entry_field, force_single_entry=force_single_entry)
    time2 = _testable_time()
    n_successful = len(pull_result.successful_entry_ids)
    n_failed = len(pull_result.failed_entry_ids)
    n_timed_out = len(pull_result.timed_out_entry_ids)
    n_total_entry_ids = n_successful + n_failed + n_timed_out
    # Nothing pulled at all (e.g. no entry IDs to pull) is reported as 0% instead of raising a ZeroDivisionError.
    percent_success = n_successful / n_total_entry_ids * 100 if n_total_entry_ids else 0.0
    pull_results = {
        'percent-success': float(f'{percent_success:.2f}'),
        'pull-minutes': float(f'{(time2 - time1) / 60:.2f}'),
        'num-successful': n_successful,
        'num-failed': n_failed,
        'num-timed-out': n_timed_out,
        'num-total': n_total_entry_ids,
        'successful-entry-ids': pull_result.successful_entry_ids,
        'failed-entry-ids': pull_result.failed_entry_ids,
        'timed-out-entry-ids': pull_result.timed_out_entry_ids}
    with open('pull-results.json', 'w') as file:
        json.dump(pull_results, file, indent=0)
teardown(): 15 | yield 16 | os.remove('pull-results.json') 17 | 18 | 19 | test_data = [ 20 | (['database', 'db-mock', '--print'], {'n_tries': None, 'time_out': None, 'sleep_time': None}, 'ei.from_database', 21 | {'database': 'db-mock'}, 'SingleProcessMultiplePull', {'unsuccessful_threshold': None}, 22 | {'force_single_entry': False, 'entry_field': None}, True, None), 23 | (['database', 'db-mock', '--print', '--sep=#####', '--force-single-entry', '--ut=0.1', '--multi-process', '--entry-field=image'], 24 | {'n_tries': None, 'time_out': None, 'sleep_time': None}, 'ei.from_database', {'database': 'db-mock'}, 'MultiProcessMultiplePull', 25 | {'unsuccessful_threshold': 0.1, 'n_workers': None}, {'force_single_entry': True, 'entry_field': 'image'}, True, '#####'), 26 | (['entry-ids', '-'], {'n_tries': None, 'time_out': None, 'sleep_time': None}, 'u.parse_input_sequence', {'input_source': '-'}, 27 | 'SingleProcessMultiplePull', {'unsuccessful_threshold': None}, {'output': '.', 'force_single_entry': False, 'entry_field': None}, 28 | False, None), 29 | (['entry-ids', '1,2', '--output=out-dir/', '--sleep-time=10.1'], {'n_tries': None, 'time_out': None, 'sleep_time': 10.1}, 30 | 'u.parse_input_sequence', {'input_source': '1,2'}, 'SingleProcessMultiplePull', {'unsuccessful_threshold': None}, 31 | {'output': 'out-dir/', 'force_single_entry': False, 'entry_field': None}, False, None), 32 | (['entry-ids', '1,2', '--n-tries=4', '--time-out=50', '--entry-field=mol'], {'n_tries': 4, 'time_out': 50, 'sleep_time': None}, 33 | 'u.parse_input_sequence', {'input_source': '1,2'}, 'SingleProcessMultiplePull', {'unsuccessful_threshold': None}, 34 | {'output': '.', 'force_single_entry': False, 'entry_field': 'mol'}, False, None), 35 | (['entry-ids', '-', '--entry-field=mol'], {'n_tries': None, 'time_out': None, 'sleep_time': None}, 36 | 'u.parse_input_sequence', {'input_source': '-'}, 'SingleProcessMultiplePull', {'unsuccessful_threshold': None}, 37 | {'output': '.', 'force_single_entry': 
@pt.mark.parametrize(
    'args,kegg_rest_kwargs,entry_ids_method,entry_ids_kwargs,multiple_pull_class,multiple_pull_kwargs,pull_kwargs,print_to_screen,separator',
    test_data)
def test_main(
        mocker, _, args: list, kegg_rest_kwargs: dict, entry_ids_method: str, entry_ids_kwargs: dict, multiple_pull_class: str,
        multiple_pull_kwargs: dict, pull_kwargs: dict, print_to_screen: bool, separator: str | None, caplog):
    """ Runs the pull CLI with every collaborator mocked and checks both the calls it makes and the pull-results.json it writes.

    :param mocker: The pytest-mock fixture used for all patching.
    :param _: The teardown fixture; requested only so that pull-results.json is removed after the test.
    :param args: The command line arguments following "kegg_pull pull".
    :param kegg_rest_kwargs: The keyword arguments the KEGGrest constructor is expected to receive.
    :param entry_ids_method: The name, relative to kegg_pull.pull_cli, of the function expected to provide the entry IDs.
    :param entry_ids_kwargs: The keyword arguments that function is expected to receive.
    :param multiple_pull_class: The name of the multiple pull class expected to be instantiated.
    :param multiple_pull_kwargs: The keyword arguments (besides kegg_rest) its constructor is expected to receive.
    :param pull_kwargs: The keyword arguments (besides entry_ids) expected by pull() or pull_dict().
    :param print_to_screen: Whether the arguments request printing, in which case pull_dict() rather than pull() is expected.
    :param separator: The expected print separator or None if none is given.
    :param caplog: The pytest log capturing fixture used to check the binary output warning.
    """
    args = ['kegg_pull', 'pull'] + args
    mocker.patch('sys.argv', args)
    kegg_rest_mock = mocker.MagicMock()
    KEGGrestMock = mocker.patch('kegg_pull.pull.r.KEGGrest', return_value=kegg_rest_mock)
    # 4 successful, 2 failed and 0 timed out entry IDs: the source of the 66.67 percent success expected below.
    pull_result_mock = mocker.MagicMock(
        successful_entry_ids=('a', 'b', 'c', 'x'), failed_entry_ids=('y', 'z'), timed_out_entry_ids=())
    pull_dict_return_value = pull_result_mock, {'a': 'x', 'b': 'y', 'c': 'z', 'x': 'abc123'}
    entry_ids_mock = ['1', '2']
    entry_ids_method_mock: mocker.MagicMock = mocker.patch(
        f'kegg_pull.pull_cli.{entry_ids_method}', return_value=entry_ids_mock)
    multiple_pull_mock = mocker.MagicMock(
        pull=mocker.MagicMock(return_value=pull_result_mock),
        pull_dict=mocker.MagicMock(return_value=pull_dict_return_value))
    MultiplePullMock: mocker.MagicMock = mocker.patch(
        f'kegg_pull.pull_cli.p.{multiple_pull_class}', return_value=multiple_pull_mock)
    # The two mocked timestamps are 68 seconds apart: the source of the 1.13 pull minutes expected below.
    time_mock: mocker.MagicMock = mocker.patch('kegg_pull.pull_cli._testable_time', side_effect=[26, 94])
    print_mock: mocker.MagicMock = mocker.patch('builtins.print')
    p_cli.main()
    KEGGrestMock.assert_called_once_with(**kegg_rest_kwargs)
    assert time_mock.call_count == 2
    MultiplePullMock.assert_called_once_with(kegg_rest=kegg_rest_mock, **multiple_pull_kwargs)
    if print_to_screen:
        multiple_pull_mock.pull_dict.assert_called_once_with(entry_ids=entry_ids_mock, **pull_kwargs)
        # The only printing case in test_data with an entry field uses "image", for which the binary warning is expected.
        if pull_kwargs['entry_field'] is not None:
            u.assert_warning(message='Printing binary output...', caplog=caplog)
        if separator is not None:
            # With a separator, all the entries are printed in a single call, joined by the separator on its own line.
            print_mock.assert_called_once_with(f'\n{separator}\n'.join(['x', 'y', 'z', 'abc123']))
        else:
            # Without a separator, each entry ID is printed followed by its entry with a trailing newline.
            u.assert_call_args(
                function_mock=print_mock, expected_call_args_list=[(arg,) for arg in ['a', 'x\n', 'b', 'y\n', 'c', 'z\n', 'x', 'abc123\n']],
                do_kwargs=False)
    else:
        multiple_pull_mock.pull.assert_called_once_with(entry_ids=entry_ids_mock, **pull_kwargs)
    entry_ids_method_mock.assert_called_with(**entry_ids_kwargs)
    expected_pull_results = {
        'percent-success': 66.67, 'pull-minutes': 1.13, 'num-successful': 4, 'num-failed': 2, 'num-timed-out': 0, 'num-total': 6,
        'successful-entry-ids': ['a', 'b', 'c', 'x'], 'failed-entry-ids': ['y', 'z'], 'timed-out-entry-ids': []}
    with open('pull-results.json', 'r') as file:
        actual_pull_results: dict = json.load(file)
    assert actual_pull_results == expected_pull_results
    # The file is also compared as raw text to pin down the exact layout written with indent=0 (one item per line, unindented).
    expected_pull_results_text: str = '\n'.join([
        '{',
        '"percent-success": 66.67,',
        '"pull-minutes": 1.13,',
        '"num-successful": 4,',
        '"num-failed": 2,',
        '"num-timed-out": 0,',
        '"num-total": 6,',
        '"successful-entry-ids": [',
        '"a",',
        '"b",',
        '"c",',
        '"x"',
        '],',
        '"failed-entry-ids": [',
        '"y",',
        '"z"',
        '],',
        '"timed-out-entry-ids": []',
        '}'])
    with open('pull-results.json', 'r') as file:
        actual_pull_results_text: str = file.read()
    assert expected_pull_results_text == actual_pull_results_text
Neuropeptide S 40 | B Endogenous opioid peptide 41 | C C16037 Leumorphin 42 | C C01574 Dynorphin A 43 | C C16135 Dynorphin B 44 | C C16039 Neoendorphin alpha 45 | C C16040 Neoendorphin beta 46 | C C16041 Leu-enkephalin 47 | C C11684 Met-enkephalin 48 | C C16042 Met-enkephalin-Arg-Gly-Leu 49 | C C16043 Met-enkephalin-Arg-Phe 50 | C C16108 Adrenorphin 51 | C C15890 Endomorphin-1 52 | C C15891 Endomorphin-2 53 | C C16044 Nociceptin 54 | C C15871 Nocistatin 55 | B Other neuropeptides 56 | C C15863 Cerebellin 57 | C C16109 RFamide-related peptide 1 58 | C C16045 RFamide-related peptide 2 59 | C C16110 RFamide-related peptide 3 60 | C C16111 Neuropeptide AF 61 | C C16112 Neuropeptide FF 62 | ACardiovascular peptides 63 | B Angiotensin 64 | C C00873 Angiotensin I 65 | C C02135 Angiotensin II 66 | C C15848 Angiotensin III 67 | C C15849 Angiotensin IV 68 | C C15851 Angiotensin (1-9) 69 | C C15850 Angiotensin (1-7) 70 | C C15852 Angiotensin (1-5) 71 | C C20970 Angiotensin A 72 | C C20971 Alamandine 73 | B Bradykinin 74 | C C16008 T-kinin 75 | C C01505 Kallidin 76 | C C00306 Bradykinin 77 | B Fibrinopeptide 78 | C C00952 Fibrinopeptide A 79 | C C02404 Fibrinopeptide B 80 | B Natriuretic peptide 81 | C C16000 Urodilatin 82 | C C16003 Atrial natriuretic peptide 83 | C C16004 Brain natriuretic peptide 84 | C C16005 C-Type natriuretic peptide 85 | B Guanylin 86 | C C16006 Uroguanylin 87 | C C16007 Guanylin 88 | B Endothelin 89 | C C16009 Big endothelin 90 | C C16010 Endothelin-1 91 | C C16012 Endothelin-2 92 | C C16013 Endothelin-3 93 | B Urotensin 94 | C C16076 Urotensin I 95 | C C16016 Urotensin II 96 | B Adrenomedullin 97 | C C16127 Adrenomedullin 98 | C C16128 Adrenomedullin-2 99 | C C18198 Proadrenomedullin N-terminal 20 peptide 100 | ACalcium-regulating peptides 101 | B Thyroid peptide hormone 102 | C C06865 Calcitonin 103 | C C16125 Calcitonin gene-related peptide 1 104 | C C16126 Calcitonin gene-related peptide 2 105 | B parathyroid peptide hormone 106 | C C16051 
Parathyroid hormone 107 | B Others 108 | C C15876 Katacalcin 109 | C C16129 Calcitonin receptor-stimulating peptide 1 110 | C C16052 Parathyroid hormone-related peptide (1-36) 111 | C C16053 Tuberoinfundibular peptide of 39 residues 112 | APituitary hormones 113 | B Anterior pituitary hormone 114 | C C18181 Growth hormone 115 | C C18182 Thyroid stimulating hormone 116 | C C18183 Luteinizing hormone 117 | C C18184 Follicle stimulating hormone 118 | C C18201 Prolactin 119 | B Posterior pituitary hormone 120 | C C00746 Oxytocin 121 | C C13662 Arg-vasopressin 122 | C C07105 Lys-vasopressin 123 | C C16077 Neurophysin I 124 | C C16078 Neurophysin II 125 | B Proopiomelanocortin-derived peptide 126 | C C02017 Corticotropin 127 | C C16134 Corticotropin-like intermediary peptide 128 | C C16017 Endorphin alpha 129 | C C02210 Endorphin beta 130 | C C16018 Endorphin gamma 131 | C C16019 Lipotropin beta 132 | C C16020 Lipotropin gamma 133 | C C02758 Melanotropin alpha 134 | C C16136 Melanotropin beta 135 | C C16137 Melanotropin gamma 136 | AHypothalamic hormones 137 | B Corticotropin-releasing hormone 138 | C C16079 Corticotropin releasing hormone 139 | C C16080 Urocortin I 140 | C C16081 Urocortin II 141 | C C16082 Urocortin III 142 | B Gonadotropin-releasing hormone 143 | C C07607 Gonadotropin-releasing hormone I 144 | C C16084 Gonadotropin-releasing hormone II 145 | B Growth hormone-releasing hormone / Somatostatin 146 | C C16085 Growth hormone-releasing hormone 147 | C C16021 Somatostatin-28 148 | C C16022 Somatostatin-14 149 | C C16023 Cortistatin-29 150 | C C16024 Cortistatin-17 151 | B Thyrotropin-releasing hormone 152 | C C03958 Thyrotropin-releasing hormone 153 | B Prolactin-releasing peptide 154 | C C16086 Prolactin-releasing peptide-31 155 | C C16087 Prolactin-releasing peptide-20 156 | B Pituitary adenylate cyclase-activating peptide 157 | C C16088 Pituitary adenylate cyclase-activating peptide-38 158 | C C16089 Pituitary adenylate cyclase-activating peptide-27 159 | 
B Metastin 160 | C C16090 Metastin 161 | C C16091 Kisspeptin-14 162 | C C16092 Kisspeptin-13 163 | C C16093 Kisspeptin-10 164 | APancreatic peptides 165 | B Insulin / C-peptide 166 | C C00723 Insulin 167 | C C16120 C-peptide 168 | C C16131 Insulin-like growth factor I 169 | C C16132 Insulin-like growth factor II 170 | B Relaxin 171 | C C16121 Relaxin-1 172 | C C16122 Relaxin-2 173 | C C16123 Relaxin-3 174 | C C16124 Relaxin-like factor 175 | C C16178 Insulin-like peptide 5 176 | B Glucagon 177 | C C01501 Glucagon 178 | C C16048 Glucagon-like peptide 1 179 | C C16049 Glucagon-like peptide 2 180 | C C16050 Glicentin 181 | C C18197 Oxyntomodulin 182 | B Amylin 183 | C C16130 Amylin 184 | AGonadal peptides 185 | B Placental peptide 186 | C C18185 Chorionic gonadotropin 187 | C C17813 Activin A 188 | C C18208 Activin AB 189 | C C18209 Activin B 190 | C C18210 Placental lactogen 191 | ANon-endocrine glands secretion peptides 192 | B Gut peptide 193 | C C18186 Gastrin-14 194 | C C16113 Gastrin-17 195 | C C18187 Gastrin-34 196 | C C16133 Cholecystokinin-33 197 | C C16114 Cholecystokinin-8 198 | C C16047 Motilin 199 | C C15906 Gastric inhibitory polypeptide 200 | C C16117 Pancreatic polypeptide 201 | C C16118 Peptide YY 202 | C C16119 Vasoactive intestinal peptide 203 | C C13523 Secretin 204 | C C15856 Apelin-36 205 | C C15855 Apelin-13 206 | B Renal peptide 207 | C C18200 Erythropoietin 208 | B Adipocytes secretion peptide 209 | C C18188 Leptin 210 | C C18189 Adiponectin 211 | C C18190 Resistin 212 | AOthers 213 | B Microbicidal and cytotoxic peptide 214 | C C16054 Defensin alpha-1 215 | C C16055 Defensin alpha-2 216 | C C16056 Defensin alpha-3 217 | C C16057 Defensin alpha-4 218 | C C16058 Defensin alpha-5 219 | C C16059 Defensin beta-1 220 | C C16060 Defensin beta-2 221 | C C16061 Defensin beta-3 222 | C C16062 Defensin beta-4 223 | C C16063 Liver-expressed antimicrobial peptide 1 224 | C C16064 Liver-expressed antimicrobial peptide 2 225 | C C18313 Cathelicidin LL-37 
@pt.mark.parametrize('expected_message,status', test_exception_data)
def test_exception(mocker, expected_message: str, status):
    """ Checks that the rest CLI raises a RuntimeError with the expected message when the KEGG response is unsuccessful.

    :param mocker: The pytest-mock fixture used for patching.
    :param expected_message: The message the raised RuntimeError is expected to carry.
    :param status: The unsuccessful status given to the mocked KEGG response.
    """
    mocker.patch('sys.argv', ['kegg_pull', 'rest', 'info', 'db-name'])
    # The mocked response exposes the status under test and the URL that appears in the expected message.
    kegg_url_mock = mocker.MagicMock(url='url/mock')
    kegg_response_mock = mocker.MagicMock(status=status, kegg_url=kegg_url_mock)
    mocker.patch('kegg_pull.rest.KEGGrest.info', return_value=kegg_response_mock)
    with pt.raises(RuntimeError) as error:
        r_cli.main()
    u.assert_exception(expected_message=expected_message, exception=error)
'source-db'], 35 | ['rest', 'link', 'entry-ids', ',x,,,y', 'target-db'], ['rest', 'ddi', 'de1,de2,de3'], ['rest', 'get', '-'], 36 | ['rest', 'find', 'pathway', '-'], ['rest', 'conv', 'entry-ids', '-', 'genes'], ['rest', 'link', 'entry-ids', '-', 'target-db'], 37 | ['rest', 'ddi', '-']] 38 | test_kwargs = [ 39 | {'database': 'ligand'}, {'database': 'module'}, {'entry_ids': ['x', 'y', 'z'], 'entry_field': None}, 40 | {'entry_ids': ['a'], 'entry_field': 'image'}, {'database': 'pathway', 'keywords': ['a', 'b', 'c']}, 41 | {'database': 'drug', 'formula': 'CO2', 'exact_mass': None, 'molecular_weight': None}, 42 | {'database': 'drug', 'formula': None, 'exact_mass': 20.2, 'molecular_weight': None}, 43 | {'database': 'drug', 'formula': None, 'exact_mass': None, 'molecular_weight': 202}, 44 | {'database': 'drug', 'formula': None, 'exact_mass': (20.2, 30.3), 'molecular_weight': None}, 45 | {'database': 'drug', 'formula': None, 'exact_mass': None, 'molecular_weight': (202, 303)}, 46 | {'kegg_database': 'kegg-db', 'outside_database': 'out-db'}, 47 | {'target_database': 'genes', 'entry_ids': ['eid1', 'eid2']}, 48 | {'target_database': 'target-db', 'source_database': 'source-db'}, 49 | {'target_database': 'target-db', 'entry_ids': ['x', 'y']}, {'drug_entry_ids': ['de1', 'de2', 'de3']}] 50 | test_data = [ 51 | ('rest_cli.r.KEGGrest.info', test_args[0], test_kwargs[0], False, None), 52 | ('rest_cli.r.KEGGrest.list', test_args[1], test_kwargs[1], False, None), 53 | ('rest_cli.r.KEGGrest.get', test_args[2], test_kwargs[2], False, None), 54 | ('rest_cli.r.KEGGrest.get', test_args[3], test_kwargs[3], True, None), 55 | ('rest_cli.r.KEGGrest.keywords_find', test_args[4], test_kwargs[4], False, None), 56 | ('rest_cli.r.KEGGrest.molecular_find', test_args[5], test_kwargs[5], False, None), 57 | ('rest_cli.r.KEGGrest.molecular_find', test_args[6], test_kwargs[6], False, None), 58 | ('rest_cli.r.KEGGrest.molecular_find', test_args[7], test_kwargs[7], False, None), 59 | 
def _get_kegg_response_mock_and_expected_output(mocker, is_binary: bool) -> tuple:
    """ Creates a mock of a successful KEGG response along with the output a test should expect from it.

    :param mocker: The pytest-mock fixture used to create the mock.
    :param is_binary: If True, the expected output is the binary body of the mocked response; otherwise, it is the text body.
    :return: The mocked KEGG response and the expected output.
    """
    response_mock = mocker.MagicMock(
        status=r.KEGGresponse.Status.SUCCESS, text_body='text body mock', binary_body=b'binary body mock')
    expected = response_mock.binary_body if is_binary else response_mock.text_body
    return response_mock, expected
@pt.mark.parametrize('rest_method,args,kwargs,is_binary,stdin_mock', test_data)
def test_zip_archive(mocker, rest_method: str, args: list, kwargs: dict, is_binary: bool, zip_archive_data: tuple, stdin_mock: str):
    """ Checks that the rest CLI saves the body of the KEGG response into a ZIP archive.

    :param mocker: The pytest-mock fixture used for patching.
    :param rest_method: The name, relative to kegg_pull, of the KEGGrest method expected to be called.
    :param args: The command line arguments following "kegg_pull".
    :param kwargs: The keyword arguments the KEGGrest method is expected to receive.
    :param is_binary: Whether the binary rather than the text body of the response is the expected output.
    :param zip_archive_data: Fixture value passed through to the shared helper, which unpacks it as (archive path, file name).
    :param stdin_mock: The text to provide on standard input or None if standard input is not read.
    """
    response_and_output: tuple = _get_kegg_response_mock_and_expected_output(mocker=mocker, is_binary=is_binary)
    method_return_value, expected_output = response_and_output
    # The shared helper runs the CLI with --output pointing into the archive and compares the archived file with the expected output.
    u.test_zip_archive(
        mocker=mocker, module=r_cli, method=rest_method, method_kwargs=kwargs, method_return_value=method_return_value,
        argv_mock=args, stdin_mock=stdin_mock, zip_archive_data=zip_archive_data, expected_output=expected_output,
        is_binary=is_binary)
(ei_cli.__doc__,), (delimiter,), (map_cli.__doc__,), 23 | (delimiter,), (po_cli.__doc__,), (delimiter,), (r_cli.__doc__,)] 24 | u.assert_call_args(function_mock=print_mock, expected_call_args_list=expected_print_call_args, do_kwargs=False) 25 | for help_arg in (['--help'], ['-h'], []): 26 | help_args = ['kegg_pull'] 27 | help_args.extend(help_arg) 28 | mocker.patch('sys.argv', help_args) 29 | print_mock.reset_mock() 30 | m.main() 31 | print_mock.assert_called_once_with(m.__doc__) 32 | 33 | 34 | def test_version(mocker): 35 | mocker.patch('sys.argv', ['kegg_pull', '--version']) 36 | version_mock = 'version mock' 37 | mocker.patch('kegg_pull.__main__.__version__', version_mock) 38 | print_mock: mocker.MagicMock = mocker.patch('builtins.print') 39 | m.main() 40 | print_mock.assert_called_once_with(version_mock) 41 | print_mock.reset_mock() 42 | mocker.patch('sys.argv', ['kegg_pull', '-v']) 43 | m.main() 44 | print_mock.assert_called_once_with(version_mock) 45 | 46 | 47 | @pt.fixture(name='print_output', params=[True, False]) 48 | def print_output_fixture(request): 49 | print_output: bool = request.param 50 | yield print_output 51 | if not print_output: 52 | os.remove('output.txt') 53 | 54 | 55 | test_entry_ids_data = [ 56 | (['database', 'brite'], 'dev/test_data/all-brite-entry-ids.txt'), 57 | (['keywords', 'module', 'Guanine,ribonucleotide'], 'dev/test_data/module-entry-ids.txt'), 58 | (['molec-attr', 'drug', '--em=420', '--em=440'], 'dev/test_data/drug-entry-ids.txt')] 59 | 60 | 61 | @pt.mark.parametrize('args,expected_output', test_entry_ids_data) 62 | def test_entry_ids(mocker, args: list, expected_output: str, print_output: bool): 63 | args: list = ['kegg_pull', 'entry-ids'] + args 64 | _test_output(mocker=mocker, args=args, expected_output=expected_output, print_output=print_output) 65 | 66 | 67 | def _test_output(mocker, args: list, expected_output: str, print_output: bool, json_output: bool = False): 68 | print_mock = None 69 | if print_output: 70 | 
print_mock: mocker.MagicMock = mocker.patch('builtins.print') 71 | else: 72 | args += ['--output=output.txt'] 73 | mocker.patch('sys.argv', args) 74 | m.main() 75 | with open(expected_output, 'r') as file: 76 | expected_output: str = file.read() 77 | if print_output: 78 | if json_output: 79 | expected_json: dict = json.loads(expected_output) 80 | [[actual_json], _] = print_mock.call_args 81 | actual_json: str = actual_json 82 | actual_json: dict = json.loads(actual_json) 83 | assert actual_json == expected_json 84 | else: 85 | print_mock.assert_called_once_with(expected_output) 86 | else: 87 | with open('output.txt', 'r') as file: 88 | actual_output: str = file.read() 89 | if json_output: 90 | actual_json: dict = json.loads(actual_output) 91 | expected_json: dict = json.loads(expected_output) 92 | assert actual_json == expected_json 93 | else: 94 | assert actual_output == expected_output 95 | 96 | 97 | test_rest_data = [ 98 | (['conv', 'glycan', 'pubchem'], 'dev/test_data/glycan-pubchem-conv.txt'), 99 | (['conv', 'entry-ids', 'gl:G13143,gl:G13141,gl:G13139', 'pubchem'], 'dev/test_data/glycan-pubchem-entry-ids.txt'), 100 | (['link', 'module', 'pathway'], 'dev/test_data/module-pathway-link.txt'), 101 | (['link', 'entry-ids', 'md:M00575,md:M00574,md:M00363', 'pathway'], 'dev/test_data/pathway-module-entry-ids.txt'), 102 | (['ddi', 'D00564,D00100,D00109'], 'dev/test_data/ddi-output.txt')] 103 | 104 | 105 | @pt.mark.parametrize('args,expected_output', test_rest_data) 106 | def test_rest(mocker, args: list, expected_output: str, print_output: bool): 107 | args = ['kegg_pull', 'rest'] + args 108 | _test_output(mocker=mocker, args=args, expected_output=expected_output, print_output=print_output) 109 | 110 | 111 | @pt.fixture(name='output', params=['brite-entries.zip', 'brite-entries']) 112 | def pull_output(request): 113 | output: str = request.param 114 | yield output 115 | if output == 'brite-entries.zip' and os.path.isfile(output): 116 | os.remove(output) 117 | else: 
118 | sh.rmtree(output, ignore_errors=True) 119 | os.remove('pull-results.json') 120 | 121 | 122 | test_pull_data = [ 123 | ['--force-single-entry', '--multi-process', '--n-workers=2'], ['--print'], ['--print', '--multi-process'], 124 | ['--multi-process', '--n-workers=2'], ['--force-single-entry']] 125 | 126 | 127 | @pt.mark.parametrize('args', test_pull_data) 128 | def test_pull(mocker, args: list, output: str): 129 | stdin_mock = """ 130 | br:br08005 131 | br:br08902 132 | br:br08431 133 | 134 | br:br03220 135 | br:br03222 136 | """ 137 | stdin_mock: mocker.MagicMock = mocker.patch('kegg_pull._utils.sys.stdin.read', return_value=stdin_mock) 138 | successful_entry_ids = ['br:br08005', 'br:br08902', 'br:br08431'] 139 | # The expected output file names have underscores instead of colons in case testing on Windows. 140 | expected_output_files = [entry_id.replace(':', '_') for entry_id in successful_entry_ids] 141 | expected_pull_results = { 142 | 'successful-entry-ids': successful_entry_ids, 143 | 'failed-entry-ids': ['br:br03220', 'br:br03222'], 144 | 'timed-out-entry-ids': [], 145 | 'num-successful': 3, 146 | 'num-failed': 2, 147 | 'num-timed-out': 0, 148 | 'num-total': 5, 149 | 'percent-success': 60.0, 150 | 'pull-minutes': 1.0} 151 | args = ['kegg_pull', 'pull', 'entry-ids', '-'] + args + [f'--output={output}'] 152 | mocker.patch('sys.argv', args) 153 | time_mock: mocker.MagicMock = mocker.patch('kegg_pull.pull_cli._testable_time', side_effect=[30, 90]) 154 | print_mock = mocker.patch('builtins.print') 155 | m.main() 156 | stdin_mock.assert_called_once_with() 157 | assert time_mock.call_count == 2 158 | # If running on Windows, change the actual files names to have underscores instead of colons. 
159 | if os.name == 'nt': # pragma: no cover 160 | expected_output_files = expected_output_files[:-1] # The last brite gives different output on Windows 161 | successful_entry_ids = expected_output_files # pragma: no cover 162 | for successful_entry_id, expected_output_file in zip(successful_entry_ids, expected_output_files): 163 | with open(f'dev/test_data/brite-entries/{expected_output_file}.txt') as expected_file: 164 | expected_entry: str = expected_file.read() 165 | if '--print' in args: 166 | print_mock.assert_any_call(successful_entry_id.replace('_', ':')) 167 | print_mock.assert_any_call(f'{expected_entry}\n') 168 | else: 169 | if output.endswith('.zip'): 170 | with zf.ZipFile(output, 'r') as actual_zip: 171 | actual_entry: str = actual_zip.read(successful_entry_id + '.txt').decode() 172 | else: 173 | with open(f'{output}/{successful_entry_id}.txt') as actual_file: 174 | actual_entry: str = actual_file.read() 175 | assert actual_entry == expected_entry 176 | with open('pull-results.json', 'r') as file: 177 | actual_pull_results: dict = json.load(file) 178 | assert actual_pull_results == expected_pull_results 179 | 180 | 181 | test_map_data = [ 182 | (['link', 'entry-ids', 'mmu:620551', 'reaction'], None, 'empty'), 183 | (['conv', 'mmu', 'ncbi-geneid', '--reverse'], None, 'mmu-ncbi'), 184 | (['conv', 'entry-ids', 'cpd:C00001,cpd:C00002', 'pubchem'], None, 'pubchem'), 185 | (['link', 'entry-ids', '-', 'module', '--reverse'], '\nK12696\nK22365\nK22435\t', 'module'), 186 | (['link', 'pathway', 'ko', '--deduplicate'], None, 'pathway-gene'), 187 | (['link', 'compound', 'reaction', 'pathway', '--add-glycans', '--add-drugs', '--deduplicate'], None, 'compound-reaction-pathway')] 188 | 189 | 190 | @pt.mark.parametrize('args,stdin_mock_str,expected_output', test_map_data) 191 | @pt.mark.disable_mock_organism_set 192 | def test_map(mocker, print_output: bool, args: list, stdin_mock_str: str, expected_output: str): 193 | args: list = ['kegg_pull', 'map'] + args 194 | 
stdin_mock = None 195 | if stdin_mock_str: 196 | stdin_mock: mocker.MagicMock = mocker.patch('kegg_pull._utils.sys.stdin.read', return_value=stdin_mock_str) 197 | _test_output( 198 | mocker=mocker, args=args, expected_output=f'dev/test_data/map/{expected_output}.json', print_output=print_output, 199 | json_output=True) 200 | if stdin_mock: 201 | stdin_mock.assert_called_once_with() 202 | 203 | 204 | def test_pathway_organizer(mocker, print_output: bool): 205 | args = ['kegg_pull', 'pathway-organizer', '--tln=Metabolism', '--fn=Global and overview maps'] 206 | _test_output( 207 | mocker=mocker, args=args, expected_output='dev/test_data/pathway-organizer/metabolic-pathways.json', 208 | print_output=print_output, json_output=True) 209 | -------------------------------------------------------------------------------- /dev/test_rest.py: -------------------------------------------------------------------------------- 1 | # noinspection PyPackageRequirements 2 | import pytest as pt 3 | import typing as t 4 | import requests as rq 5 | import kegg_pull.rest as r 6 | import kegg_pull.kegg_url as ku 7 | import dev.utils as u 8 | 9 | 10 | test_kegg_response_exception_data = [ 11 | ({'status': r.KEGGresponse.Status.SUCCESS, 'kegg_url': None}, 12 | 'A KEGG response cannot be marked as successful if its response body is empty')] 13 | 14 | 15 | @pt.mark.parametrize('kwargs,expected_message', test_kegg_response_exception_data) 16 | def test_kegg_response_exception(mocker, kwargs: dict, expected_message: str): 17 | u.mock_non_instantiable(mocker=mocker) 18 | with pt.raises(ValueError) as error: 19 | r.KEGGresponse(**kwargs) 20 | u.assert_exception(expected_message=expected_message, exception=error) 21 | 22 | 23 | def test_kegg_rest_exception(): 24 | with pt.raises(ValueError) as error: 25 | r.KEGGrest(n_tries=0) 26 | expected_message = '0 is not a valid number of tries to make a KEGG request.' 
27 | u.assert_exception(expected_message=expected_message, exception=error) 28 | 29 | 30 | def test_kegg_rest(): 31 | kegg_rest = r.KEGGrest(n_tries=2, time_out=30, sleep_time=0.5) 32 | assert kegg_rest._n_tries == 2 33 | assert kegg_rest._time_out == 30 34 | assert kegg_rest._sleep_time == 0.5 35 | kegg_rest = r.KEGGrest(n_tries=None, time_out=None, sleep_time=None) 36 | assert kegg_rest._n_tries == 3 37 | assert kegg_rest._time_out == 60 38 | assert kegg_rest._sleep_time == 5.0 39 | 40 | 41 | def test_request_and_test_success(mocker): 42 | kegg_rest = r.KEGGrest() 43 | text_mock = 'text mock' 44 | content_mock = b'content mock' 45 | response_mock = mocker.MagicMock(text=text_mock, content=content_mock, status_code=200) 46 | get_mock: mocker.MagicMock = mocker.patch('kegg_pull.rest.rq.get', return_value=response_mock) 47 | url_mock = 'url mock' 48 | kegg_url_mock = mocker.MagicMock(url=url_mock) 49 | create_url_spy = mocker.spy(r.KEGGrest, '_get_kegg_url') 50 | kegg_response: r.KEGGresponse = kegg_rest.request(kegg_url=kegg_url_mock) 51 | create_url_spy.assert_called_once_with(KEGGurl=None, kegg_url=kegg_url_mock) 52 | get_mock.assert_called_once_with(url=url_mock, timeout=60) 53 | assert kegg_response.status == r.KEGGresponse.Status.SUCCESS 54 | assert kegg_response.text_body == text_mock 55 | assert kegg_response.binary_body == content_mock 56 | assert kegg_response.kegg_url == kegg_url_mock 57 | head_mock: mocker.MagicMock = mocker.patch('kegg_pull.rest.rq.head', return_value=response_mock) 58 | success: bool = kegg_rest.test(kegg_url=kegg_url_mock) 59 | head_mock.assert_called_once_with(url=url_mock, timeout=60) 60 | assert success 61 | 62 | 63 | def test_request_and_test_failed(mocker): 64 | n_tries = 4 65 | kegg_rest = r.KEGGrest(n_tries=4) 66 | url_mock = 'url mock' 67 | kegg_url_mock = mocker.MagicMock(url=url_mock) 68 | failed_status_code = 403 69 | response_mock = mocker.MagicMock(text='', content=b'', status_code=failed_status_code) 70 | get_mock: 
mocker.MagicMock = mocker.patch('kegg_pull.rest.rq.get', return_value=response_mock) 71 | sleep_mock: mocker.MagicMock = mocker.patch('kegg_pull.rest.time.sleep') 72 | kegg_response: r.KEGGresponse = kegg_rest.request(kegg_url=kegg_url_mock) 73 | get_mock.assert_has_calls(mocker.call(url=url_mock, timeout=60) for _ in range(n_tries)) 74 | sleep_mock.assert_has_calls(mocker.call(5.0) for _ in range(n_tries)) 75 | assert kegg_response.status == r.KEGGresponse.Status.FAILED 76 | assert kegg_response.kegg_url == kegg_url_mock 77 | assert kegg_response.text_body is None 78 | assert kegg_response.binary_body is None 79 | head_mock: mocker.MagicMock = mocker.patch('kegg_pull.rest.rq.head', return_value=response_mock) 80 | success: bool = kegg_rest.test(kegg_url=kegg_url_mock) 81 | head_mock.assert_has_calls(mocker.call(url=url_mock, timeout=60) for _ in range(n_tries)) 82 | assert not success 83 | 84 | 85 | def test_request_and_test_timeout(mocker): 86 | n_tries = 2 87 | time_out = 30 88 | sleep_time = 10.5 89 | kegg_rest = r.KEGGrest(n_tries=n_tries, time_out=time_out, sleep_time=sleep_time) 90 | url_mock = 'url mock' 91 | kegg_url_mock = mocker.MagicMock(url=url_mock) 92 | get_mock: mocker.MagicMock = mocker.patch('kegg_pull.rest.rq.get', side_effect=rq.exceptions.Timeout()) 93 | sleep_mock: mocker.MagicMock = mocker.patch('kegg_pull.rest.time.sleep') 94 | kegg_response: r.KEGGresponse = kegg_rest.request(kegg_url=kegg_url_mock) 95 | get_mock.assert_has_calls(mocker.call(url=url_mock, timeout=time_out) for _ in range(n_tries)) 96 | sleep_mock.assert_has_calls(mocker.call(sleep_time) for _ in range(n_tries)) 97 | assert kegg_response.status == r.KEGGresponse.Status.TIMEOUT 98 | assert kegg_response.kegg_url == kegg_url_mock 99 | assert kegg_response.text_body is None 100 | assert kegg_response.binary_body is None 101 | sleep_mock.reset_mock() 102 | head_mock: mocker.MagicMock = mocker.patch('kegg_pull.rest.rq.head', side_effect=rq.exceptions.Timeout()) 103 | success: 
bool = kegg_rest.test(kegg_url=kegg_url_mock) 104 | head_mock.assert_has_calls(mocker.call(url=url_mock, timeout=time_out) for _ in range(n_tries)) 105 | sleep_mock.assert_has_calls(mocker.call(sleep_time) for _ in range(n_tries)) 106 | assert not success 107 | 108 | 109 | test_rest_method_data = [ 110 | (ku.ListKEGGurl, r.KEGGrest.list, {'database': 'module'}), 111 | (ku.GetKEGGurl, r.KEGGrest.get, {'entry_ids': ['xyz'], 'entry_field': None}), 112 | (ku.InfoKEGGurl, r.KEGGrest.info, {'database': 'pathway'}), 113 | (ku.KeywordsFindKEGGurl, r.KEGGrest.keywords_find, {'database': '', 'keywords': ['a', 'b']}), 114 | (ku.MolecularFindKEGGurl, r.KEGGrest.molecular_find, {'database': '', 'formula': 'abc', 'exact_mass': None, 'molecular_weight': None}), 115 | (ku.DatabaseConvKEGGurl, r.KEGGrest.database_conv, {'kegg_database': 'a', 'outside_database': 'b'}), 116 | (ku.EntriesConvKEGGurl, r.KEGGrest.entries_conv, {'target_database': 'module', 'entry_ids': ['123', 'abc']}), 117 | (ku.DatabaseLinkKEGGurl, r.KEGGrest.database_link, {'target_database': 'x', 'source_database': 'y'}), 118 | (ku.EntriesLinkKEGGurl, r.KEGGrest.entries_link, {'target_database': '123', 'entry_ids': ['x', 'y']}), 119 | (ku.DdiKEGGurl, r.KEGGrest.ddi, {'drug_entry_ids': ['1', '2']})] 120 | 121 | 122 | @pt.mark.parametrize('KEGGurl,method,kwargs', test_rest_method_data) 123 | def test_rest_method(mocker, KEGGurl: type, method: t.Callable, kwargs: dict): 124 | kegg_rest = r.KEGGrest() 125 | request_spy = mocker.spy(kegg_rest, 'request') 126 | create_url_spy = mocker.spy(r.KEGGrest, '_get_kegg_url') 127 | kegg_url_mock = mocker.MagicMock() 128 | KEGGurlMock: mocker.MagicMock = mocker.patch(f'kegg_pull.rest.ku.{KEGGurl.__name__}', return_value=kegg_url_mock) 129 | getmro_mock: mocker.MagicMock = mocker.patch(f'kegg_pull.rest.ins.getmro', return_value={ku.AbstractKEGGurl}) 130 | mocker.patch('kegg_pull.rest.rq.get', return_value=mocker.MagicMock(status_code=200)) 131 | kegg_response = 
method(self=kegg_rest, **kwargs) 132 | request_spy.assert_called_once_with(KEGGurl=KEGGurlMock, **kwargs) 133 | create_url_spy.assert_called_once_with(KEGGurl=KEGGurlMock, kegg_url=None, **kwargs) 134 | KEGGurlMock.assert_called_once_with(**kwargs) 135 | getmro_mock.assert_called_once_with(KEGGurlMock) 136 | assert create_url_spy.spy_return == kegg_url_mock 137 | assert request_spy.spy_return == kegg_response 138 | assert kegg_response.kegg_url == kegg_url_mock 139 | 140 | 141 | test_get_kegg_url_exception_data = [ 142 | ({'KEGGurl': None, 'kegg_url': None}, 143 | 'Either an instantiated kegg_url object must be provided or an extended class of AbstractKEGGurl along with the' 144 | ' corresponding kwargs for its constructor.'), 145 | ({'KEGGurl': r.KEGGrest, 'kegg_url': None}, 146 | 'The value for KEGGurl must be an inherited class of AbstractKEGGurl. The class "KEGGrest" is not.')] 147 | 148 | 149 | @pt.mark.parametrize('kwargs,expected_message', test_get_kegg_url_exception_data) 150 | def test_get_kegg_url_exception(kwargs: dict, expected_message: str): 151 | with pt.raises(ValueError) as error: 152 | r.KEGGrest._get_kegg_url(**kwargs) 153 | u.assert_exception(expected_message=expected_message, exception=error) 154 | 155 | 156 | def test_get_kegg_url_warning(mocker, caplog): 157 | kegg_url_mock = mocker.MagicMock() 158 | kegg_url = r.KEGGrest._get_kegg_url(KEGGurl=ku.InfoKEGGurl, kegg_url=kegg_url_mock, database='database mock') 159 | u.assert_warning( 160 | message='Both an instantiated kegg_url object and KEGGurl class are provided. 
Using the instantiated object...', caplog=caplog) 161 | assert kegg_url == kegg_url_mock 162 | 163 | 164 | test_request_and_check_error_data = [ 165 | ('The KEGG request failed with the following URL: url/mock', r.KEGGresponse.Status.FAILED), 166 | ('The KEGG request timed out with the following URL: url/mock', r.KEGGresponse.Status.TIMEOUT)] 167 | 168 | 169 | @pt.mark.parametrize('expected_message,status', test_request_and_check_error_data) 170 | def test_request_and_check_error(mocker, expected_message: str, status: r.KEGGresponse.Status): 171 | kegg_url_mock = mocker.MagicMock(url='url/mock') 172 | kegg_response_mock = mocker.MagicMock(kegg_url=kegg_url_mock, status=status) 173 | request_mock: mocker.MagicMock = mocker.patch('kegg_pull.rest.KEGGrest.request', return_value=kegg_response_mock) 174 | with pt.raises(RuntimeError) as error: 175 | r.request_and_check_error(kegg_url=kegg_url_mock, kwarg1='val1', kwarg2='val2') 176 | request_mock.assert_called_once_with(KEGGurl=None, kegg_url=kegg_url_mock, kwarg1='val1', kwarg2='val2') 177 | u.assert_exception(expected_message=expected_message, exception=error) 178 | -------------------------------------------------------------------------------- /src/kegg_pull/rest_cli.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | kegg_pull rest -h | --help 4 | kegg_pull rest info [--test] [--output=] 5 | kegg_pull rest list [--test] [--output=] 6 | kegg_pull rest get [--entry-field=] [--test] [--output=] 7 | kegg_pull rest find [--test] [--output=] 8 | kegg_pull rest find (--formula=|--em=...|--mw=...) [--test] [--output=] 9 | kegg_pull rest conv [--test] [--output=] 10 | kegg_pull rest conv entry-ids [--test] [--output=] 11 | kegg_pull rest link [--test] [--output=] 12 | kegg_pull rest link entry-ids [--test] [--output=] 13 | kegg_pull rest ddi [--test] [--output=] 14 | 15 | Options: 16 | -h --help Show this help message. 
17 | info Executes the "info" KEGG API operation, pulling information about a KEGG database. 18 | The name of the database to pull information about or entry IDs from. 19 | --test If set, test the request to ensure it works rather than sending it. Print True if the request would succeed and False if the request would fail. Ignores --output if this option is set along with --test. 20 | --output= Path to the file (either in a directory or ZIP archive) to store the response body from the KEGG web API operation. Prints to the console if not specified. If a ZIP archive, the file path must be in the form of /path/to/zip-archive.zip:/path/to/file (e.g. ./archive.zip:file.txt). 21 | list Executes the "list" KEGG API operation, pulling the entry IDs of the provided database. 22 | get Executes the "get" KEGG API operation, pulling the entries of the provided entry IDs. 23 | Comma separated list of entry IDs (e.g. id1,id2,id3 etc.). Or if equal to "-", entry IDs are read from standard input, one entry ID per line; Press CTRL+D to finalize input or pipe (e.g. cat file.txt | kegg_pull rest get - ...). 24 | --entry-field= Optional field to extract from an entry instead of the default entry info (i.e. flat file or htext in the case of brite entries). 25 | find Executes the "find" KEGG API operation, finding entry IDs based on provided queries. 26 | Comma separated list of keywords to search entries with (e.g. kw1,kw2,kw3 etc.). Or if equal to "-", keywords are read from standard input, one keyword per line; Press CTRL+D to finalize input or pipe (e.g. cat file.txt | kegg_pull rest find brite - ...). 27 | --formula= Sequence of atoms in a chemical formula format to search for (e.g. "O5C7" searches for molecule entries containing 5 oxygen atoms and/or 7 carbon atoms). 28 | --em= Either a single number (e.g. --em=155.5) or two numbers (e.g. --em=155.5 --em=244.4).
If a single number, searches for molecule entries with an exact mass equal to that value rounded by the last decimal point. If two numbers, searches for molecule entries with an exact mass within the two values (a range). 29 | --mw= Same as --em but searches based on the molecular weight. 30 | conv Executes the "conv" KEGG API operation, converting entry IDs from an outside database to those of a KEGG database and vice versa. 31 | The name of the KEGG database from which to view equivalent outside database entry IDs. 32 | The name of the non-KEGG database from which to view equivalent KEGG database entry IDs. 33 | entry-ids Perform the "conv" or "link" operation of the form that maps specific provided entry IDs to a target database. 34 | link Executes the "link" KEGG API operation, showing the IDs of entries that are connected/related to entries of other databases. 35 | The name of the database that the entry IDs of the source database or provided entry IDs are mapped to. 36 | The name of the database from which cross-references are found in the target database. 37 | ddi Executes the "ddi" KEGG API operation, searching for drug to drug interactions. Providing one entry ID reports all known interactions, while providing multiple checks if any drug pair in a given set of drugs is CI or P. If providing multiple, all entries must belong to the same database. 38 | Comma separated list of drug entry IDs from the following databases: drug, ndc, or yj (e.g. id1,id2,id3 etc.). Or if equal to "-", entry IDs are read from standard input, one entry ID per line; Press CTRL+D to finalize input or pipe (e.g. cat file.txt | kegg_pull rest ddi - ...). 39 | """ 40 | import docopt as d 41 | from . import kegg_url as ku 42 | from . import rest as r 43 | from . 
import _utils as u 44 | 45 | 46 | def main(): 47 | args = d.docopt(__doc__) 48 | database: str = args[''] 49 | entry_ids: str | list[str] = args[''] 50 | target_database: str = args[''] 51 | test: bool = args['--test'] 52 | is_binary = False 53 | test_result: bool | None = None 54 | kegg_response: r.KEGGresponse | None = None 55 | kegg_rest = r.KEGGrest() 56 | if args['info']: 57 | if test: 58 | test_result = kegg_rest.test(KEGGurl=ku.InfoKEGGurl, database=database) 59 | else: 60 | kegg_response = kegg_rest.info(database=database) 61 | elif args['list']: 62 | if test: 63 | test_result = kegg_rest.test(KEGGurl=ku.ListKEGGurl, database=database) 64 | else: 65 | kegg_response = kegg_rest.list(database=database) 66 | elif args['get']: 67 | entry_ids = u.parse_input_sequence(input_source=entry_ids) 68 | entry_field: str = args['--entry-field'] 69 | if test: 70 | test_result = kegg_rest.test(KEGGurl=ku.GetKEGGurl, entry_ids=entry_ids, entry_field=entry_field) 71 | else: 72 | if ku.GetKEGGurl.is_binary(entry_field=entry_field): 73 | is_binary = True 74 | kegg_response = kegg_rest.get(entry_ids=entry_ids, entry_field=entry_field) 75 | elif args['find']: 76 | if args['']: 77 | keywords = u.parse_input_sequence(input_source=args['']) 78 | if test: 79 | test_result = kegg_rest.test(KEGGurl=ku.KeywordsFindKEGGurl, database=database, keywords=keywords) 80 | else: 81 | kegg_response = kegg_rest.keywords_find(database=database, keywords=keywords) 82 | else: 83 | formula, exact_mass, molecular_weight = u.get_molecular_attribute_args(args=args) 84 | if test: 85 | test_result = kegg_rest.test( 86 | KEGGurl=ku.MolecularFindKEGGurl, database=database, formula=formula, 87 | exact_mass=exact_mass, molecular_weight=molecular_weight) 88 | else: 89 | kegg_response = kegg_rest.molecular_find( 90 | database=database, formula=formula, exact_mass=exact_mass, molecular_weight=molecular_weight) 91 | elif args['conv']: 92 | if args['entry-ids']: 93 | entry_ids = 
u.parse_input_sequence(input_source=entry_ids) 94 | if test: 95 | test_result = kegg_rest.test(KEGGurl=ku.EntriesConvKEGGurl, target_database=target_database, entry_ids=entry_ids) 96 | else: 97 | kegg_response = kegg_rest.entries_conv(target_database=target_database, entry_ids=entry_ids) 98 | else: 99 | kegg_database = args[''] 100 | outside_database = args[''] 101 | if test: 102 | test_result = kegg_rest.test( 103 | KEGGurl=ku.DatabaseConvKEGGurl, kegg_database=kegg_database, outside_database=outside_database) 104 | else: 105 | kegg_response = kegg_rest.database_conv(kegg_database=kegg_database, outside_database=outside_database) 106 | elif args['link']: 107 | if args['entry-ids']: 108 | entry_ids = u.parse_input_sequence(input_source=entry_ids) 109 | if test: 110 | test_result = kegg_rest.test(KEGGurl=ku.EntriesLinkKEGGurl, target_database=target_database, entry_ids=entry_ids) 111 | else: 112 | kegg_response = kegg_rest.entries_link(target_database=target_database, entry_ids=entry_ids) 113 | else: 114 | source_database: str = args[''] 115 | if test: 116 | test_result = kegg_rest.test( 117 | KEGGurl=ku.DatabaseLinkKEGGurl, target_database=target_database, source_database=source_database) 118 | else: 119 | kegg_response = kegg_rest.database_link( 120 | target_database=target_database, source_database=source_database) 121 | else: 122 | drug_entry_ids = u.parse_input_sequence(input_source=args['']) 123 | if test: 124 | test_result = kegg_rest.test(KEGGurl=ku.DdiKEGGurl, drug_entry_ids=drug_entry_ids) 125 | else: 126 | kegg_response = kegg_rest.ddi(drug_entry_ids=drug_entry_ids) 127 | if test: 128 | print(test_result) 129 | else: 130 | if kegg_response.status == r.KEGGresponse.Status.FAILED: 131 | raise RuntimeError( 132 | f'The request to the KEGG web API failed with the following URL: {kegg_response.kegg_url.url}') 133 | elif kegg_response.status == r.KEGGresponse.Status.TIMEOUT: 134 | raise RuntimeError( 135 | f'The request to the KEGG web API timed out with the 
following URL: {kegg_response.kegg_url.url}') 136 | if is_binary: 137 | response_body: bytes = kegg_response.binary_body 138 | else: 139 | response_body: str = kegg_response.text_body 140 | u.print_or_save(output_target=args['--output'], output_content=response_body) 141 | -------------------------------------------------------------------------------- /src/kegg_pull/pathway_organizer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Flattening A Pathways Brite Hierarchy 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | |Functionality| for flattening a pathways Brite hierarchy (ID: 'br:br08901') into a collection of its nodes, mapping a node ID to information about it, enabling combinations with other KEGG data. 5 | """ 6 | from __future__ import annotations 7 | import json 8 | import logging as log 9 | import typing as t 10 | from . import rest as r 11 | from . import _utils as u 12 | 13 | 14 | class HierarchyNode(t.TypedDict): 15 | """A dictionary with the following keys:""" 16 | name: str 17 | """The name of the node obtained directly from the Brite hierarchy.""" 18 | level: int 19 | """The level that the node appears in the hierarchy.""" 20 | parent: str | None 21 | """The key (not the name) of the parent node (None if top level node).""" 22 | children: list[str] | None 23 | """The keys (not the names) of the node's children (None if leaf node).""" 24 | entry_id: str | None 25 | """The entry ID of the node (None if the node does not correspond to a KEGG entry).""" 26 | 27 | 28 | HierarchyNodes = dict[str, HierarchyNode] 29 | _RawHierarchyNode = t.TypedDict('_RawHierarchyNode', {'name': str, 'children': list[dict] | None}) 30 | 31 | 32 | class PathwayOrganizer(u.NonInstantiable): 33 | """ 34 | Contains methods for managing a mapping of node keys to node information, these nodes coming from a pathways Brite hierarchy. 
35 | An instantiated ``PathwayOrganizer`` object must be returned from either ``PathwayOrganizer.load_from_kegg`` or 36 | ``PathwayOrganizer.load_from_json``. The ``__init__`` is not meant to be called directly. The ``__str__`` method returns a JSON 37 | string of ``hierarchy_nodes``. 38 | 39 | :ivar dict[str, HierarchyNode] hierarchy_nodes: The mapping of node keys to node information managed by the PathwayOrganizer. 40 | """ 41 | def __init__(self) -> None: 42 | super(PathwayOrganizer, self).__init__() 43 | self.hierarchy_nodes: HierarchyNodes | None = None 44 | self._filter_nodes: set[str] | None = None 45 | 46 | @staticmethod 47 | def load_from_kegg( 48 | top_level_nodes: set[str] | None = None, filter_nodes: set[str] | None = None, 49 | kegg_rest: r.KEGGrest | None = None) -> PathwayOrganizer: 50 | """ Pulls the Brite hierarchy from the KEGG REST API and converts it to the ``hierarchy_nodes`` mapping. 51 | 52 | :param top_level_nodes: Node names in the highest level of the hierarchy to select from. If None, all top level nodes are traversed to create the ``hierarchy_nodes``. 53 | :param filter_nodes: Names (not keys) of nodes to exclude from the ``hierarchy_nodes`` mapping. Neither these nodes nor any of their children will be included. 54 | :param kegg_rest: Optional KEGGrest object for obtaining the Brite hierarchy. A new KEGGrest object is created by default. 55 | :returns: The resulting PathwayOrganizer object. 56 | """ 57 | pathway_org = PathwayOrganizer() 58 | pathway_org.hierarchy_nodes = HierarchyNodes() 59 | pathway_org._filter_nodes = filter_nodes 60 | hierarchy = PathwayOrganizer._get_hierarchy(kegg_rest=kegg_rest) 61 | valid_top_level_nodes = sorted(top_level_node['name'] for top_level_node in hierarchy) 62 | if top_level_nodes is not None: 63 | for top_level_node in list(top_level_nodes): 64 | if top_level_node not in valid_top_level_nodes: 65 | log.warning( 66 | f'Top level node name "{top_level_node}" is not recognized and will be ignored. 
Valid values are: ' 67 | f'"{", ".join(valid_top_level_nodes)}"') 68 | top_level_nodes.remove(top_level_node) 69 | hierarchy = [top_level_node for top_level_node in hierarchy if top_level_node['name'] in top_level_nodes] 70 | pathway_org._parse_hierarchy(level=1, raw_hierarchy_nodes=hierarchy, parent_name=None) 71 | return pathway_org 72 | 73 | @staticmethod 74 | def _get_hierarchy(kegg_rest: r.KEGGrest | None) -> list[_RawHierarchyNode]: 75 | """ Pulls the Brite hierarchy (to be converted to hierarchy_nodes) from the KEGG REST API. 76 | 77 | :return: The list of top level nodes that branch out into the rest of the hierarchy until reaching leaf nodes. 78 | """ 79 | kegg_rest = kegg_rest if kegg_rest is not None else r.KEGGrest() 80 | kegg_response = kegg_rest.get(entry_ids=['br:br08901'], entry_field='json') 81 | text_body = kegg_response.text_body.strip() 82 | brite_hierarchy: dict = json.loads(s=text_body) 83 | return brite_hierarchy['children'] 84 | 85 | def _parse_hierarchy(self, level: int, raw_hierarchy_nodes: list[_RawHierarchyNode], parent_name: str | None) -> set[str]: 86 | """ Recursively traverses the Brite hierarchy to create the hierarchy_nodes mapping. 87 | 88 | :param level: The current level of recursion representing the level of the node in the hierarchy. 89 | :param raw_hierarchy_nodes: The list of nodes in the current branch of the hierarchy being traversed. 90 | :param parent_name: The node key of the parent node of the current branch of the hierarchy. 91 | :return: The keys of the nodes added to the hierarchy_nodes property representing the children of the parent node. 
def _add_hierarchy_node(
        self, name: str, level: int, parent: str | None, children: set[str] | None, entry_id: str | None) -> str:
    """ Adds a Brite hierarchy node representation to the hierarchy_nodes property.

    :param name: The name of the node obtained directly from the Brite hierarchy.
    :param level: The level that the node appears in the hierarchy.
    :param parent: The key of the parent node (None if top level node).
    :param children: The keys of the node's children (None if leaf node).
    :param entry_id: The entry ID of the node; string if it represents a KEGG pathway mapping, else None.
    :return: The key chosen for the node, equal to its entry ID if not None, else the name of the Node.
    """
    key = entry_id if entry_id is not None else name
    # Internal invariant: every node key must be unique within the hierarchy.
    assert key not in self.hierarchy_nodes, f'Duplicate brite hierarchy node name {key}'
    # Children are stored as a sorted list so that the mapping serializes to JSON deterministically.
    sorted_children = sorted(children) if children is not None else None
    self.hierarchy_nodes[key] = HierarchyNode(
        name=name, level=level, parent=parent, children=sorted_children, entry_id=entry_id)
    return key
def save_to_json(self, file_path: str) -> None:
    """ Caches the ``hierarchy_nodes`` mapping by writing its JSON string form to a file.

    :param file_path: The path to the JSON file to save the ``hierarchy_nodes`` mapping. If saving in a ZIP archive, the file path must be in the following format: /path/to/zip-archive.zip:/path/to/file (e.g. ./archive.zip:hierarchy-nodes.json).
    """
    # str(self) yields the JSON serialization of the hierarchy nodes.
    u.save_output(output_target=file_path, output_content=str(self))
Where is an organism code or T number.'), 15 | (ku.InfoKEGGurl, {'database': 'organism'}, 16 | 'Invalid database name: "organism". Valid values are: , ag, brite, compound, dgroup, disease, drug, ' 17 | 'enzyme, genes, genome, glycan, kegg, ko, ligand, module, network, pathway, rclass, reaction, variant, vg, vp.' 18 | ' Where is an organism code or T number.'), 19 | (ku.GetKEGGurl, {'entry_ids': [], 'entry_field': None}, 'Entry IDs must be specified for the KEGG get operation'), 20 | (ku.GetKEGGurl, {'entry_ids': ['x'], 'entry_field': 'invalid-entry-field'}, 21 | 'Invalid KEGG entry field: "invalid-entry-field". Valid values are: aaseq, conf, image, json, kcf, kgml, mol, ' 22 | 'ntseq.'), 23 | (ku.GetKEGGurl, {'entry_ids': ['x', 'y'], 'entry_field': 'json'}, 24 | 'The KEGG entry field: "json" only supports requests of one KEGG entry at a time but 2 entry IDs are provided'), 25 | (ku.GetKEGGurl, {'entry_ids': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11']}, 26 | f'The maximum number of entry IDs is {ku.GetKEGGurl.MAX_ENTRY_IDS_PER_URL} but 11 were provided'), 27 | (ku.KeywordsFindKEGGurl, {'database': 'not-brite', 'keywords': []}, 'No search keywords specified'), 28 | (ku.KeywordsFindKEGGurl, {'database': 'brite', 'keywords': ['x']}, 29 | 'Invalid database name: "brite". Valid values are: , ag, atc, brite_ja, compound, compound_ja, dgroup, ' 30 | 'dgroup_ja, disease, disease_ja, drug, drug_ja, enzyme, genes, genome, glycan, jtc, ko, ligand, module, ndc, ' 31 | 'network, pathway, rclass, reaction, variant, vg, vp, yj. Where is an organism code or T number.'), 32 | (ku.MolecularFindKEGGurl, {'database': 'glycan'}, 'Invalid molecular database name: "glycan". 
Valid values are: compound, drug.'), 33 | (ku.MolecularFindKEGGurl, {'database': 'drug'}, 'Must provide either a chemical formula, exact mass, or molecular weight option'), 34 | (ku.MolecularFindKEGGurl, {'database': 'compound', 'exact_mass': ()}, 35 | 'Exact mass range can only be constructed from 2 values but 0 are provided: '), 36 | (ku.MolecularFindKEGGurl, {'database': 'compound', 'exact_mass': (1.1, 2.2, 3.3)}, 37 | 'Exact mass range can only be constructed from 2 values but 3 are provided: 1.1, 2.2, 3.3'), 38 | (ku.MolecularFindKEGGurl, {'database': 'compound', 'molecular_weight': ()}, 39 | 'Molecular weight range can only be constructed from 2 values but 0 are provided: '), 40 | (ku.MolecularFindKEGGurl, {'database': 'compound', 'molecular_weight': (10, 20, 30)}, 41 | 'Molecular weight range can only be constructed from 2 values but 3 are provided: 10, 20, 30'), 42 | (ku.MolecularFindKEGGurl, {'database': 'drug', 'exact_mass': (30.3, 20.2)}, 43 | 'The first value in the range must be less than the second. Values provided: 30.3-20.2'), 44 | (ku.MolecularFindKEGGurl, {'database': 'drug', 'exact_mass': (10.1, 10.1)}, 45 | 'The first value in the range must be less than the second. Values provided: 10.1-10.1'), 46 | (ku.MolecularFindKEGGurl, {'database': 'drug', 'molecular_weight': (303, 202)}, 47 | 'The first value in the range must be less than the second. Values provided: 303-202'), 48 | (ku.MolecularFindKEGGurl, {'database': 'drug', 'molecular_weight': (101, 101)}, 49 | 'The first value in the range must be less than the second. Values provided: 101-101'), 50 | (ku.DatabaseConvKEGGurl, {'kegg_database': 'genes', 'outside_database': ''}, 51 | 'Invalid KEGG database: "genes". Valid values are: , compound, drug, glycan. Where is an organism ' 52 | 'code or T number.'), 53 | (ku.DatabaseConvKEGGurl, {'kegg_database': 'drug', 'outside_database': 'glycan'}, 54 | 'Invalid outside database: "glycan". 
Valid values are: chebi, ncbi-geneid, ncbi-proteinid, pubchem, uniprot.'), 55 | (ku.DatabaseConvKEGGurl, {'kegg_database': 'organism-T-number', 'outside_database': 'pubchem'}, 56 | 'KEGG database "organism-T-number" is a gene database but outside database "pubchem" is not.'), 57 | (ku.DatabaseConvKEGGurl, {'kegg_database': 'compound', 'outside_database': 'ncbi-geneid'}, 58 | 'KEGG database "compound" is a molecule database but outside database "ncbi-geneid" is not.'), 59 | (ku.EntriesConvKEGGurl, {'target_database': 'rclass', 'entry_ids': []}, 60 | 'Invalid target database: "rclass". Valid values are: , chebi, compound, drug, genes, glycan, ncbi-geneid,' 61 | ' ncbi-proteinid, pubchem, uniprot. Where is an organism code or T number.'), 62 | (ku.EntriesConvKEGGurl, {'target_database': 'chebi', 'entry_ids': []}, 63 | 'Entry IDs must be specified for this KEGG "conv" operation'), 64 | (ku.DatabaseLinkKEGGurl, {'target_database': 'genes', 'source_database': ''}, 65 | 'Invalid database name: "genes". Valid values are: , ag, atc, brite, compound, dgroup, disease, drug, ' 66 | 'enzyme, genome, glycan, jtc, ko, module, ndc, network, pathway, pubmed, rclass, reaction, variant, vg, vp, yj.' 67 | ' Where is an organism code or T number.'), 68 | (ku.DatabaseLinkKEGGurl, {'target_database': 'ndc', 'source_database': 'kegg'}, 69 | 'Invalid database name: "kegg". Valid values are: , ag, atc, brite, compound, dgroup, disease, drug, ' 70 | 'enzyme, genome, glycan, jtc, ko, module, ndc, network, pathway, pubmed, rclass, reaction, variant, vg, vp, yj.' 71 | ' Where is an organism code or T number.'), 72 | (ku.DatabaseLinkKEGGurl, {'target_database': 'drug', 'source_database': 'drug'}, 73 | 'The source and target database cannot be identical. Database selected: drug.'), 74 | (ku.EntriesLinkKEGGurl, {'target_database': 'ligand', 'entry_ids': []}, 75 | 'Invalid database name: "ligand". 
@pt.mark.parametrize('KEGGurl,kwargs,expected_message', test_validate_exception_data)
def test_validate_exception(KEGGurl: type, kwargs: dict, expected_message: str):
    """ Checks that constructing a KEGG URL from invalid arguments raises a ValueError with the expected message. """
    with pt.raises(ValueError) as raised:
        KEGGurl(**kwargs)
    # Every validation message is prefixed the same way by the URL classes.
    u.assert_exception(expected_message=f'Cannot create URL - {expected_message}', exception=raised)
@pt.mark.parametrize('KEGGurl,kwargs,expected_message,url', test_validate_warning_data)
def test_validate_warning(KEGGurl: type, kwargs: dict, expected_message: str, url: str, caplog):
    """ Checks that ambiguous molecular search options log a warning and still produce the expected URL. """
    created_url: ku.AbstractKEGGurl = KEGGurl(**kwargs)
    u.assert_warning(message=expected_message, caplog=caplog)
    assert created_url.url == f'{ku.BASE_URL}/{url}'
@pt.mark.parametrize('KEGGurl,kwargs,rest_operation,rest_options', test_create_rest_options_data)
def test_create_rest_options(KEGGurl: type, kwargs: dict, rest_operation: str, rest_options: str):
    """ Checks that each KEGG URL class builds the expected URL from valid arguments. """
    kegg_url: ku.AbstractKEGGurl = KEGGurl(**kwargs)
    expected_url = '/'.join([f'{ku.BASE_URL}', rest_operation, rest_options])
    assert str(kegg_url) == kegg_url.url == expected_url
    if KEGGurl == ku.GetKEGGurl:
        # The "get" URL additionally records whether more than one entry ID was requested.
        multiple_entry_ids = kegg_url.__getattribute__('multiple_entry_ids')
        n_entry_ids = len(kegg_url.__getattribute__('entry_ids'))
        assert multiple_entry_ids == (n_entry_ids > 1)
@pt.mark.parametrize('timeout', [True, False])
@pt.mark.disable_mock_organism_set
def test_organism_set_unsuccessful(mocker, timeout: bool, _):
    """ Checks that a timed out or failed organism list request raises a RuntimeError with the expected message. """
    patch_target = 'kegg_pull.kegg_url.rq.get'
    url = f'{ku.BASE_URL}/list/organism'
    message_template = 'The request to the KEGG web API {} while fetching the organism set using the URL: {}'
    if timeout:
        outcome = 'timed out'
        get_mock = mocker.patch(patch_target, side_effect=rq.exceptions.Timeout())
    else:
        failed_status_code = 404
        outcome = f'failed with status code {failed_status_code}'
        get_mock = mocker.patch(patch_target, return_value=mocker.MagicMock(status_code=failed_status_code))
    error_message = message_template.format(outcome, url)
    with pt.raises(RuntimeError) as error:
        ku.AbstractKEGGurl.organism_set()
    get_mock.assert_called_once_with(url=url, timeout=60)
    u.assert_exception(expected_message=error_message, exception=error)
class KEGGresponse(u.NonInstantiable):
    """
    Class containing details of a response from the KEGG REST API.

    :ivar Status status: The status of the KEGG response.
    :ivar AbstractKEGGurl kegg_url: The URL used in the request to the KEGG REST API that resulted in the KEGG response.
    :ivar str text_body: The text version of the response body.
    :ivar bytes binary_body: The binary version of the response body.
    """
    class Status(e.Enum):
        """The status of a KEGG response."""
        SUCCESS = 1
        FAILED = 2
        TIMEOUT = 3

    def __init__(
            self, status: Status, kegg_url: ku.AbstractKEGGurl, text_body: str | None = None,
            binary_body: bytes | None = None) -> None:
        """
        :param status: The status of the KEGG response.
        :param kegg_url: The URL used in the request to the KEGG REST API that resulted in the KEGG response.
        :param text_body: The text version of the response body (None if the request was unsuccessful).
        :param binary_body: The binary version of the response body (None if the request was unsuccessful).
        :raises ValueError: Raised if the status is SUCCESS but either version of the response body is missing or empty.
        """
        super().__init__()
        # A missing (None) or empty ('' / b'') body is falsy, so truthiness covers both cases at once.
        body_is_empty = not text_body or not binary_body
        if status == KEGGresponse.Status.SUCCESS and body_is_empty:
            raise ValueError('A KEGG response cannot be marked as successful if its response body is empty')
        self.status = status
        self.kegg_url = kegg_url
        self.text_body = text_body
        self.binary_body = binary_body
@staticmethod
def _get_kegg_url(
        KEGGurl: type[ku.AbstractKEGGurl] | None = None, kegg_url: ku.AbstractKEGGurl | None = None,
        **kwargs) -> ku.AbstractKEGGurl:
    """ Gets the KEGGurl object to be used to make the request to KEGG.

    :param KEGGurl: Optional KEGGurl class to instantiate a KEGGurl object using keyword arguments.
    :param kegg_url: Instantiated KEGGurl object that's simply returned if provided. It takes precedence over the KEGGurl class if both are provided (a warning is logged in that case).
    :param kwargs: The keyword arguments used to instantiate the KEGGurl object if a KEGGurl class is provided.
    :return: The KEGGurl object.
    :raises ValueError: Raised if neither a class nor an object is provided, or if the value for KEGGurl is not a class that inherits from AbstractKEGGurl.
    """
    if KEGGurl is None and kegg_url is None:
        raise ValueError(
            f'Either an instantiated kegg_url object must be provided or an extended class of '
            f'{ku.AbstractKEGGurl.__name__} along with the corresponding kwargs for its constructor.')
    if kegg_url is not None and KEGGurl is not None:
        log.warning(
            'Both an instantiated kegg_url object and KEGGurl class are provided. Using the instantiated object...')
    if kegg_url is not None:
        return kegg_url
    # Guard with isinstance so that a non-class value results in the documented ValueError rather than an
    # AttributeError from inspecting the method resolution order of something that is not a class.
    is_kegg_url_class = isinstance(KEGGurl, type) and ku.AbstractKEGGurl in ins.getmro(KEGGurl)
    if not is_kegg_url_class:
        provided_name = getattr(KEGGurl, '__name__', repr(KEGGurl))
        raise ValueError(
            f'The value for KEGGurl must be an inherited class of {ku.AbstractKEGGurl.__name__}. '
            f'The class "{provided_name}" is not.')
    return KEGGurl(**kwargs)
def molecular_find(
        self, database: str, formula: str | None = None, exact_mass: float | tuple[float, float] | None = None,
        molecular_weight: int | tuple[int, int] | None = None) -> KEGGresponse:
    """ Executes the "find" KEGG API operation, searching a chemical database for entry IDs by one (and only one) of three molecular attributes.

    :param database: The name of the chemical database to search for entries in.
    :param formula: The chemical formula (one of three choices) of chemical entries to search for.
    :param exact_mass: The exact mass (one of three choices) of chemical entries to search for (single value or range).
    :param molecular_weight: The molecular weight (one of three choices) of chemical entries to search for (single value or range).
    :return: The KEGG response
    """
    # All options are forwarded as keyword arguments to the URL class via the general request method.
    search_options = {
        'database': database, 'formula': formula, 'exact_mass': exact_mass, 'molecular_weight': molecular_weight}
    return self.request(KEGGurl=ku.MolecularFindKEGGurl, **search_options)
def request_and_check_error(
        kegg_rest: KEGGrest | None = None, KEGGurl: type[ku.AbstractKEGGurl] | None = None,
        kegg_url: ku.AbstractKEGGurl = None, **kwargs) -> KEGGresponse:
    """ Performs a general KEGG REST API request and raises an exception naming the URL if it is unsuccessful.
    A KEGGrest object with the default parameters is created if one is not provided.

    :param kegg_rest: The KEGGrest object to perform the request. If None, one is created with the default parameters.
    :param KEGGurl: Optional KEGG URL class (extended from AbstractKEGGurl) that's instantiated with provided keyword arguments.
    :param kegg_url: Optional KEGGurl object that's already instantiated (used if KEGGurl class is not provided).
    :param kwargs: The keyword arguments used to instantiate the KEGGurl class, if provided.
    :return: The KEGG response
    :raises RuntimeError: Raised if the request fails or times out.
    """
    rest = KEGGrest() if kegg_rest is None else kegg_rest
    kegg_response = rest.request(KEGGurl=KEGGurl, kegg_url=kegg_url, **kwargs)
    status = kegg_response.status
    if status == KEGGresponse.Status.FAILED:
        raise RuntimeError(f'The KEGG request failed with the following URL: {kegg_response.kegg_url.url}')
    if status == KEGGresponse.Status.TIMEOUT:
        raise RuntimeError(f'The KEGG request timed out with the following URL: {kegg_response.kegg_url.url}')
    return kegg_response
| a2\tb4 32 | a3\tb3 33 | a4\tb5 34 | a5\tb6 35 | a5\tb7 36 | """ 37 | kwargs_mock = {'kegg_rest': kegg_rest, 'KEGGurl': ku.EntriesLinkKEGGurl, 'k': 'v'} 38 | kegg_response_mock = mocker.MagicMock(text_body=text_body_mock) 39 | request_and_check_error_mock: mocker.MagicMock = mocker.patch( 40 | 'kegg_pull.map.r.request_and_check_error', return_value=kegg_response_mock) 41 | actual_mapping: kmap.KEGGmapping = kmap._to_dict(**kwargs_mock) 42 | request_and_check_error_mock.assert_called_once_with(**kwargs_mock) 43 | expected_mapping = { 44 | 'a1': {'b1', 'b2', 'b3'}, 'a2': {'b1', 'b4'}, 'a3': {'b3'}, 'a4': {'b5'}, 'a5': {'b6', 'b7'}} 45 | assert actual_mapping == expected_mapping 46 | 47 | 48 | test_map_and_reverse_data = [ 49 | ('database_conv', ku.DatabaseConvKEGGurl, {'kegg_database': 'kegg-db', 'outside_database': 'outside-db'}), 50 | ('entries_conv', ku.EntriesConvKEGGurl, {'entry_ids': ['e1', 'e2'], 'target_database': 'x'}), 51 | ('entries_link', ku.EntriesLinkKEGGurl, {'entry_ids': ['e1', 'e2'], 'target_database': 'x'})] 52 | 53 | 54 | @pt.mark.parametrize('method,KEGGurl,kwargs', test_map_and_reverse_data) 55 | def test_map_and_reverse(mocker, method: str, KEGGurl: type, kwargs: dict, reverse: bool, kegg_rest): 56 | expected_mapping = {'k': {'v1', 'v2'}} 57 | to_dict_mock = mocker.patch('kegg_pull.map._to_dict', return_value=expected_mapping) 58 | # noinspection PyUnresolvedReferences 59 | method: t.Callable = kmap.__getattribute__(method) 60 | actual_mapping: kmap.KEGGmapping = method(reverse=reverse, kegg_rest=kegg_rest, **kwargs) 61 | to_dict_mock.assert_called_once_with(KEGGurl=KEGGurl, kegg_rest=kegg_rest, **kwargs) 62 | if reverse: 63 | expected_mapping = kmap.reverse(mapping=expected_mapping) 64 | assert actual_mapping == expected_mapping 65 | 66 | 67 | test_deduplicate_pathway_ids_data = [ 68 | {'source_database': 'pathway', 'target_database': 'x'}, {'source_database': 'x', 'target_database': 'pathway'}] 69 | 70 | 71 | @pt.mark.parametrize('kwargs', 
test_deduplicate_pathway_ids_data) 72 | def test_deduplicate_pathway_ids(mocker, kwargs: dict, kegg_rest): 73 | kwargs['kegg_rest'] = kegg_rest 74 | to_dict_return = {'path:map1': {'x1'}, f'path:ko1': {'x1'}, 'path:map2': {'x2', 'x3'}, 'path:ko2': {'x2', 'x3'}} 75 | pathway_is_target = kwargs['target_database'] == 'pathway' 76 | to_dict_return = kmap.reverse(mapping=to_dict_return) if pathway_is_target else to_dict_return 77 | to_dict_mock = mocker.patch('kegg_pull.map._to_dict', return_value=to_dict_return) 78 | actual_mapping = kmap.database_link(deduplicate=True, **kwargs) 79 | to_dict_mock.assert_called_once_with(KEGGurl=ku.DatabaseLinkKEGGurl, **kwargs) 80 | expected_mapping = {'path:map1': {'x1'}, 'path:map2': {'x2', 'x3'}} 81 | expected_mapping = kmap.reverse(mapping=expected_mapping) if pathway_is_target else expected_mapping 82 | assert actual_mapping == expected_mapping 83 | 84 | 85 | def test_deduplicate_pathway_ids_exception(mocker): 86 | message = f'Cannot deduplicate path:map entry ids when neither the source database nor the target database is set to "pathway".' \ 87 | f' Databases specified: module, ko.' 
88 | mocker.patch('kegg_pull.map._to_dict') 89 | with pt.raises(ValueError) as error: 90 | kmap.database_link(source_database='module', target_database='ko', deduplicate=True) 91 | u.assert_exception(expected_message=message, exception=error) 92 | 93 | 94 | @pt.fixture(name='mapping_data', params=[(True, True), (False, True), (True, False), (False, False)]) 95 | def get_mapping_data(request, mocker): 96 | add_glycans, add_drugs = request.param 97 | 98 | def mapping_data(kegg_rest: mocker.MagicMock | None, kwargs: dict) -> tuple: 99 | compound_is_target = kwargs['target_database'] == 'compound' 100 | expected_call_args_list = [kwargs] 101 | compound_to_x = {'cpd1': {'x1', 'x2'}, 'cpd2': {'x1'}, 'cpd3': {'x2'}, 'cpd4': {'x3'}, 'cpd5': {'x2'}, 'cpd6': {'x4'}} 102 | to_dict_side_effect = [kmap.reverse(mapping=compound_to_x) if compound_is_target else compound_to_x] 103 | if add_glycans: 104 | expected_call_args_list.extend([ 105 | {'source_database': 'compound', 'target_database': 'glycan'}, {'source_database': 'glycan', 'target_database': 'x'}]) 106 | to_dict_side_effect.extend([ 107 | {'cpd1': {'gl1'}, 'cpd7': {'gl1', 'gl3'}, 'cpd8': {'gl2'}, 'cpd9': {'gl2'}, 'cpd10': {'gl3'}, 'cpd11': {'gl4'}}, 108 | {'gl1': {'x1', 'x5'}, 'gl2': {'x2', 'x5'}, 'gl4': {'x3'}, 'gl3': {'x3'}, 'gl5': {'x6'}}]) 109 | if add_drugs: 110 | expected_call_args_list.extend([ 111 | {'source_database': 'compound', 'target_database': 'drug'}, {'source_database': 'drug', 'target_database': 'x'}]) 112 | to_dict_side_effect.extend([ 113 | {'cpd4': {'d1'}, 'cpd3': {'d1'}, 'cpd6': {'d2'}, 'cpd5': {'d2'}, 'cpd12': {'d4'}, 'cpd13': {'d4'}, 'cpd14': {'d5'}}, 114 | {'d1': {'x1', 'x5'}, 'd2': {'x2', 'x5'}, 'd3': {'x3'}, 'd4': {'x3'}, 'd5': {'x6'}, 'd6': {'x6'}}]) 115 | expected_call_args_list = [{ 116 | 'source_database': d['source_database'], 'target_database': d['target_database'], 117 | 'kegg_rest': kegg_rest, 'KEGGurl': ku.DatabaseLinkKEGGurl} for d in expected_call_args_list] 118 | if add_glycans and 
add_drugs: 119 | expected_mapping = { 120 | 'cpd4': {'x1', 'x3', 'x5'}, 'cpd2': {'x1'}, 'cpd3': {'x1', 'x2', 'x5'}, 'cpd1': {'x1', 'x2', 'x5'}, 121 | 'cpd7': {'x1', 'x3', 'x5'}, 'cpd8': {'x2', 'x5'}, 'cpd6': {'x2', 'x4', 'x5'}, 'cpd9': {'x2', 'x5'}, 122 | 'cpd5': {'x2', 'x5'}, 'cpd12': {'x3'}, 'cpd11': {'x3'}, 'cpd10': {'x3'}, 'cpd13': {'x3'}, 'cpd14': {'x6'}} 123 | elif not add_glycans and add_drugs: 124 | expected_mapping = { 125 | 'cpd4': {'x3', 'x5', 'x1'}, 'cpd1': {'x2', 'x1'}, 'cpd3': {'x5', 'x2', 'x1'}, 'cpd2': {'x1'}, 'cpd6': {'x5', 'x4', 'x2'}, 126 | 'cpd5': {'x5', 'x2'}, 'cpd12': {'x3'}, 'cpd13': {'x3'}, 'cpd14': {'x6'}} 127 | elif add_glycans and not add_drugs: 128 | expected_mapping = { 129 | 'cpd7': {'x3', 'x5', 'x1'}, 'cpd1': {'x5', 'x2', 'x1'}, 'cpd2': {'x1'}, 'cpd8': {'x5', 'x2'}, 'cpd3': {'x2'}, 130 | 'cpd9': {'x5', 'x2'}, 'cpd5': {'x2'}, 'cpd11': {'x3'}, 'cpd10': {'x3'}, 'cpd4': {'x3'}, 'cpd6': {'x4'}} 131 | else: 132 | expected_mapping = compound_to_x 133 | expected_mapping = kmap.reverse(mapping=expected_mapping) if compound_is_target else expected_mapping 134 | return add_glycans, add_drugs, expected_call_args_list, to_dict_side_effect, expected_mapping 135 | yield mapping_data 136 | 137 | 138 | test_add_glycans_or_drugs_data = [ 139 | {'source_database': 'compound', 'target_database': 'x'}, {'source_database': 'x', 'target_database': 'compound'}] 140 | 141 | 142 | @pt.mark.parametrize('kwargs', test_add_glycans_or_drugs_data) 143 | def test_add_glycans_or_drugs(mocker, kegg_rest, mapping_data: t.Callable, kwargs: dict): 144 | add_glycans, add_drugs, expected_call_args_list, to_dict_side_effect, expected_mapping = mapping_data( 145 | kegg_rest=kegg_rest, kwargs=kwargs) 146 | to_dict_mock: mocker.MagicMock = mocker.patch('kegg_pull.map._to_dict', side_effect=to_dict_side_effect) 147 | # noinspection PyUnresolvedReferences 148 | actual_mapping: kmap.KEGGmapping = kmap.database_link(add_drugs=add_drugs, add_glycans=add_glycans, 
kegg_rest=kegg_rest, **kwargs) 149 | u.assert_call_args(function_mock=to_dict_mock, expected_call_args_list=expected_call_args_list, do_kwargs=True) 150 | assert actual_mapping == expected_mapping 151 | 152 | 153 | def test_add_glycans_or_drugs_warning(mocker, caplog): 154 | mocker.patch('kegg_pull.map._to_dict') 155 | expected_message = f'Adding compound IDs (corresponding to equivalent glycan and/or drug entries) to a mapping where ' \ 156 | f'neither the source database nor the target database are "compound". Databases specified: reaction, ko.' 157 | kmap.database_link(source_database='reaction', target_database='ko', add_glycans=True) 158 | u.assert_warning(message=expected_message, caplog=caplog) 159 | 160 | 161 | test_indirect_link_data = ['drugs_and_glycans', 'deduplicate', 'drugs_and_glycans_and_deduplicate', ''] 162 | 163 | 164 | @pt.mark.parametrize('test_case', test_indirect_link_data) 165 | def test_indirect_link(mocker, kegg_rest, test_case: str): 166 | kegg_rest = kegg_rest 167 | compound_to_reaction = {'cpd1': {'rn1', 'rn3'}, 'cpd2': {'rn2'}, 'cpd3': {'rn3'}} 168 | pathway_to_reaction = { 169 | 'path:map1': {'rn1', 'rn3'}, 'path:rn1': {'rn1', 'rn3'}, 'path:map2': {'rn2'}, 'path:rn2': {'rn2'}, 170 | 'path:map3': {'rn3'}, 'path:rn3': {'rn3'}} 171 | reaction_to_gene = {'rn1': {'ko1', 'ko2'}, 'rn4': {'ko4', 'ko3'}, 'rn3': {'ko3'}} 172 | compound_to_glycan = {'cpd1': {'gl1'}} 173 | compound_to_drug = {'cpd3': {'d1'}} 174 | compound_to_gene_expected_call_args_list = [ 175 | {'source_database': 'compound', 'target_database': 'reaction'}, 176 | {'source_database': 'reaction', 'target_database': 'ko'}] 177 | if test_case == 'drugs_and_glycans': 178 | glycan_to_gene = {'gl1': {'ko1', 'ko4'}, 'gl2': {'ko5'}} 179 | drug_to_gene = {'d1': {'ko3', 'ko6'}, 'd2': {'ko7'}} 180 | expected_call_args_list = compound_to_gene_expected_call_args_list 181 | expected_call_args_list.extend([ 182 | {'source_database': 'compound', 'target_database': 'glycan'}, 
{'source_database': 'glycan', 'target_database': 'ko'}, 183 | {'source_database': 'compound', 'target_database': 'drug'}, {'source_database': 'drug', 'target_database': 'ko'}]) 184 | side_effect = [ 185 | compound_to_reaction, reaction_to_gene, compound_to_glycan, glycan_to_gene, compound_to_drug, drug_to_gene] 186 | to_dict_mock = mocker.patch('kegg_pull.map._to_dict', side_effect=side_effect) 187 | actual_mapping = kmap.indirect_link( 188 | source_database='compound', intermediate_database='reaction', target_database='ko', add_glycans=True, add_drugs=True, 189 | kegg_rest=kegg_rest) 190 | expected_mapping = {'cpd1': {'ko3', 'ko1', 'ko2', 'ko4'}, 'cpd3': {'ko3', 'ko6'}} 191 | elif test_case == 'deduplicate': 192 | expected_call_args_list = [ 193 | {'source_database': 'pathway', 'target_database': 'reaction'}, {'source_database': 'reaction', 'target_database': 'ko'}] 194 | to_dict_mock = mocker.patch('kegg_pull.map._to_dict', side_effect=[pathway_to_reaction, reaction_to_gene]) 195 | actual_mapping = kmap.indirect_link( 196 | source_database='pathway', intermediate_database='reaction', target_database='ko', deduplicate=True, 197 | kegg_rest=kegg_rest) 198 | expected_mapping = {'path:map1': {'ko3', 'ko1', 'ko2'}, 'path:map3': {'ko3'}} 199 | elif test_case == 'drugs_and_glycans_and_deduplicate': 200 | reaction_to_pathway = kmap.reverse(mapping=pathway_to_reaction) 201 | glycan_to_pathway = {'gl1': {'path:map1', 'path:map4'}, 'gl2': {'path:map5'}} 202 | drug_to_pathway = {'d1': {'path:map3', 'path:map6'}, 'd2': {'path:map7'}} 203 | expected_call_args_list = [ 204 | {'source_database': 'compound', 'target_database': 'reaction'}, {'source_database': 'reaction', 'target_database': 'pathway'}, 205 | {'source_database': 'compound', 'target_database': 'glycan'}, {'source_database': 'glycan', 'target_database': 'pathway'}, 206 | {'source_database': 'compound', 'target_database': 'drug'}, {'source_database': 'drug', 'target_database': 'pathway'}] 207 | side_effect = [ 208 | 
compound_to_reaction, reaction_to_pathway, compound_to_glycan, glycan_to_pathway, compound_to_drug, drug_to_pathway] 209 | to_dict_mock = mocker.patch('kegg_pull.map._to_dict', side_effect=side_effect) 210 | actual_mapping = kmap.indirect_link( 211 | source_database='compound', intermediate_database='reaction', target_database='pathway', 212 | deduplicate=True, add_glycans=True, add_drugs=True, kegg_rest=kegg_rest) 213 | expected_mapping = { 214 | 'cpd1': {'path:map1', 'path:map3', 'path:map4'}, 'cpd2': {'path:map2'}, 'cpd3': {'path:map1', 'path:map3', 'path:map6'}} 215 | else: 216 | expected_call_args_list = compound_to_gene_expected_call_args_list 217 | to_dict_mock = mocker.patch('kegg_pull.map._to_dict', side_effect=[compound_to_reaction, reaction_to_gene]) 218 | actual_mapping = kmap.indirect_link( 219 | source_database='compound', intermediate_database='reaction', target_database='ko', 220 | kegg_rest=kegg_rest) 221 | expected_mapping = {'cpd1': {'ko3', 'ko1', 'ko2'}, 'cpd3': {'ko3'}} 222 | expected_call_args_list = [{ 223 | 'source_database': d['source_database'], 'target_database': d['target_database'], 'kegg_rest': kegg_rest, 224 | 'KEGGurl': ku.DatabaseLinkKEGGurl} for d in expected_call_args_list] 225 | u.assert_call_args(function_mock=to_dict_mock, expected_call_args_list=expected_call_args_list, do_kwargs=True) 226 | assert actual_mapping == expected_mapping 227 | 228 | 229 | test_indirect_link_exception_data = [ 230 | ({'source_database': 'pathway', 'intermediate_database': 'reaction', 'target_database': 'reaction'}, 231 | 'The source, intermediate, and target database must all be unique. Databases specified: pathway, reaction, reaction.'), 232 | ({'source_database': 'reaction', 'intermediate_database': 'reaction', 'target_database': 'reaction'}, 233 | 'The source, intermediate, and target database must all be unique. 
Databases specified: reaction, reaction, reaction.')] 234 | 235 | 236 | @pt.mark.parametrize('kwargs,error_message', test_indirect_link_exception_data) 237 | def test_indirect_link_exception(kwargs: dict, error_message: str): 238 | with pt.raises(ValueError) as error: 239 | kmap.indirect_link(**kwargs) 240 | u.assert_exception(expected_message=error_message, exception=error) 241 | 242 | 243 | def test_combine_mappings(): 244 | mapping1 = {'k1': {'v1'}, 'k4': {'v3', 'v4'}, 'k5': {'v6', 'v7'}} 245 | mapping2 = {'k2': {'v1'}, 'k3': {'v2', 'v3'}, 'k4': {'v3', 'v4'}, 'k5': {'v5', 'v6'}} 246 | actual_combined_mapping = kmap.combine_mappings(mapping1=mapping1, mapping2=mapping2) 247 | expected_combined_mapping = {'k1': {'v1'}, 'k4': {'v3', 'v4'}, 'k5': {'v6', 'v7', 'v5'}, 'k2': {'v1'}, 'k3': {'v2', 'v3'}} 248 | assert actual_combined_mapping == expected_combined_mapping 249 | 250 | 251 | def test_reverse(): 252 | mapping = {'k1': {'v1', 'v2'}, 'k2': {'v1', 'v3', 'v4'}, 'k3': {'v1', 'v2', 'v3', 'v5'}, 'k4': {'v4', 'v5', 'v6'}} 253 | expected_reverse_mapping = { 254 | 'v1': {'k1', 'k2', 'k3'}, 'v2': {'k1', 'k3'}, 'v3': {'k2', 'k3'}, 'v4': {'k2', 'k4'}, 'v5': {'k3', 'k4'}, 'v6': {'k4'}} 255 | actual_reverse_mapping = kmap.reverse(mapping=mapping) 256 | assert actual_reverse_mapping == expected_reverse_mapping 257 | 258 | 259 | def test_to_json_string(): 260 | mapping = {'k1': {'v1'}, 'k2': {'v1', 'v2'}, 'k3': {'v3', 'v4'}} 261 | expected_json_string = '{\n "k1": [\n "v1"\n ],\n "k2": [\n "v1",\n "v2"\n ],\n "k3": [\n "v3",\n "v4"\n ]\n}' 262 | actual_json_string: str = kmap.to_json_string(mapping=mapping) 263 | assert actual_json_string == expected_json_string 264 | 265 | 266 | def test_save_to_json(json_file_path: str): 267 | kmap.save_to_json(mapping={'k1': {'v1'}, 'k2': {'v3', 'v2'}}, file_path=json_file_path) 268 | u.test_save_to_json(json_file_path=json_file_path, expected_saved_json_object={'k1': ['v1'], 'k2': ['v2', 'v3']}) 269 | 270 | 271 | def 
test_load_from_json(json_file_path: str): 272 | u.test_load_from_json( 273 | json_file_path=json_file_path, saved_object={'k1': ['v1'], 'k2': ['v2', 'v3']}, method=kmap.load_from_json, 274 | expected_loaded_object={'k1': {'v1'}, 'k2': {'v3', 'v2'}}) 275 | 276 | 277 | test_invalid_save_to_json_data = [{'a': [1]}, {'a': [1.2]}, {'a': [[], []]}, {'a': {}}, {'a': []}, {'': ['b']}] 278 | expected_error_message = 'The mapping must be a dictionary of entry IDs (strings) mapped to a set of entry IDs' 279 | 280 | 281 | @pt.mark.parametrize('invalid_json_object', test_invalid_save_to_json_data) 282 | def test_invalid_save_to_json(caplog, invalid_json_object: dict): 283 | with pt.raises(js.exceptions.ValidationError): 284 | kmap.save_to_json(mapping=invalid_json_object, file_path='xxx.json') 285 | u.assert_error( 286 | message=expected_error_message, caplog=caplog) 287 | 288 | 289 | test_invalid_load_from_json_data = test_invalid_save_to_json_data.copy() 290 | test_invalid_load_from_json_data.extend([ 291 | ['1', '2'], {'a': 'b'}, {'a': [2]}, 'abc', 123, 123.123, {1: 2}, {1.2: 2.3}, {'a': [{}, {}]}, {'a': ['b', 1]}, 292 | {'a': [1.2, 'b']}]) 293 | 294 | 295 | @pt.mark.parametrize('invalid_json_object', test_invalid_load_from_json_data) 296 | def test_invalid_load_from_json(caplog, json_file_path: str, invalid_json_object: list | dict | int | float | str): 297 | u.test_invalid_load_from_json( 298 | json_file_path=json_file_path, invalid_json_object=invalid_json_object, method=kmap.load_from_json, 299 | expected_error_message=expected_error_message, caplog=caplog) 300 | --------------------------------------------------------------------------------