├── tests
│   ├── __init__.py
│   └── test_oop.py
├── src
│   └── papyrus_scripts
│       ├── utils
│       │   ├── __init__.py
│       │   ├── aliases.json
│       │   ├── UniprotMatch.py
│       │   ├── mol_reader.py
│       │   ├── links.json
│       │   └── IO.py
│       ├── __main__.py
│       ├── __init__.py
│       ├── matchRCSB.py
│       ├── fingerprint.py
│       ├── download.py
│       ├── neuralnet.py
│       └── reader.py
├── figures
│   └── logo
│       ├── Papyrus_trnsp-bg.png
│       ├── Papyrus_trnsp-bg.svg
│       └── Papyrus_trnsp-bg-white.svg
├── setup.py
├── CONTRIBUTING.md
├── notebook_examples
│   └── advanced_querying.ipynb
├── .flake8
├── LICENSE
├── tox.ini
├── setup.cfg
├── .gitignore
└── README.md
/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/papyrus_scripts/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """Utility functions."""
4 |
--------------------------------------------------------------------------------
/figures/logo/Papyrus_trnsp-bg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OlivierBeq/Papyrus-scripts/HEAD/figures/logo/Papyrus_trnsp-bg.png
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """Setup module."""
4 |
5 | import setuptools
6 |
7 | if __name__ == '__main__':
8 | setuptools.setup()
9 |
--------------------------------------------------------------------------------
/src/papyrus_scripts/__main__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """Entrypoint module, in case you use `python -m papyrus`.
4 |
5 | Why does this file exist, and why `__main__`? For more info, read:
6 |
7 | - https://www.python.org/dev/peps/pep-0338/
8 | - https://docs.python.org/3/using/cmdline.html#cmdoption-m
9 | """
10 |
11 |
12 | from .cli import main
13 |
14 | if __name__ == '__main__':
15 | main()
16 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to this repository
2 |
3 | ## Getting started
 4 | - Before contributing, make sure you have a working development environment set up.
5 | ```bash
6 | pip install tox
7 | ```
 8 | A few *tox* environments are defined for easier linting, testing, and documentation generation.
9 |
10 | We enforce strict coding rules:
11 | - To make sure you comply with coding rules use the following command:
12 | ```bash
13 | tox -e isort
14 | tox -e flake8
15 | ```
16 | - Pyroma checks if the installation information is sufficient
17 | ```bash
18 | tox -e pyroma
19 | ```
20 |
21 | **DOES NOT WORK AT THE MOMENT:**
22 | Automatic documentation can be generated like so:
23 | ```
24 | tox -e docs
25 | ```
26 |
27 | For the entire workflow of linting, testing and documentation, run:
28 | ```
29 | tox
30 | ```
--------------------------------------------------------------------------------
/notebook_examples/advanced_querying.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "tags": []
7 | },
8 | "source": [
9 | "# Advanced examples: Using Papyrus scripts"
10 | ]
11 | },
12 | {
13 | "cell_type": "markdown",
14 | "metadata": {},
15 | "source": [
16 | "Coming soon."
17 | ]
18 | }
19 | ],
20 | "metadata": {
21 | "kernelspec": {
22 | "display_name": "Python 3",
23 | "language": "python",
24 | "name": "python3"
25 | },
26 | "language_info": {
27 | "codemirror_mode": {
28 | "name": "ipython",
29 | "version": 3
30 | },
31 | "file_extension": ".py",
32 | "mimetype": "text/x-python",
33 | "name": "python",
34 | "nbconvert_exporter": "python",
35 | "pygments_lexer": "ipython3",
36 | "version": "3.6.13"
37 | }
38 | },
39 | "nbformat": 4,
40 | "nbformat_minor": 4
41 | }
42 |
--------------------------------------------------------------------------------
/.flake8:
--------------------------------------------------------------------------------
1 | ###############################################
2 | # Flake8 Configuration #
3 | # (.flake8) #
4 | # Adapted from https://github.com/pybel/pybel #
5 | ###############################################
6 |
7 | # This config can't go in setup.cfg because Python's ConfigParser
 8 | # used by setup.cfg will interpolate on all of Scott's beautiful % signs
9 | # that make the pretty colored output
10 |
11 | [flake8]
12 | ignore =
13 | # Complains about URLs
14 | S310
15 | exclude =
16 | .tox,
17 | .git,
18 | __pycache__,
19 | docs/source/conf.py,
20 | build,
21 | dist,
22 | tests/fixtures/*,
23 | *.pyc,
24 | *.egg-info,
25 | .cache,
26 | .eggs
27 | max-line-length = 120
28 | # import-order-style = pycharm
29 | application-import-names =
30 | papyrus_scripts
31 | tests
32 | format = ${cyan}%(path)s${reset}:${yellow_bold}%(row)d${reset}:${green_bold}%(col)d${reset}: ${red_bold}%(code)s${reset} %(text)s
--------------------------------------------------------------------------------
/src/papyrus_scripts/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """A collection of scripts to handle the Papyrus bioactivity dataset."""
4 |
5 | from .download import download_papyrus, remove_papyrus
6 | from .reader import (read_papyrus, read_protein_set, read_protein_descriptors,
7 | read_molecular_descriptors, read_molecular_structures)
8 |
9 | from .matchRCSB import update_rcsb_data, get_matches
10 | from .preprocess import (keep_organism, keep_accession, keep_type, keep_source,
11 | keep_protein_class, keep_quality, keep_contains, keep_match,
12 | keep_similar, keep_substructure, keep_not_contains, keep_not_match,
13 | keep_dissimilar, keep_not_substructure, consume_chunks, yscrambling)
14 |
15 | from .modelling import qsar, pcm
16 |
17 | from .utils.mol_reader import MolSupplier
18 | from .utils import IO, UniprotMatch
19 | from .utils.IO import PapyrusVersion
20 |
21 | from .oop import PapyrusDataset
22 |
23 | __version__ = '2.1.2'
24 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 OlivierBeq
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | [tox]
2 | envlist =
3 | pyroma
4 | isort
5 | flake8
6 | tests
7 | requires = tox
8 |
9 | [testenv:tests]
10 | commands =
11 | pytest tests/
12 | conda_deps=
13 | rdkit
14 | openbabel
15 | pandas
16 | deps =
17 | pytest
18 | conda_channels=
19 | conda-forge
20 | rdkit
21 | description = Run tests to check code validity.
22 |
23 | [testenv:pyroma]
24 | commands =
25 | pyroma .
26 | deps =
27 | pyroma
28 | skip_install = true
29 | description = Make sure setup.cfg is properly written out.
30 |
31 | [testenv:isort]
32 | extras = tests
33 | # Needs a full install so isort can determine own/foreign imports.
34 | deps =
35 | isort
36 | commands =
37 | isort setup.py src tests
38 |
39 | [testenv:flake8]
40 | skip_install = true
41 | deps =
42 | flake8
43 | flake8-assertive
44 | flake8-bandit
45 | flake8-bugbear
46 | flake8-builtins
47 | flake8-colors
48 | flake8-commas
49 | flake8-comprehensions
50 | flake8-docstrings
51 | # flake8-import-order
52 | flake8-isort
53 | flake8-print
54 | flake8-use-fstring
55 | pep8-naming
56 | pydocstyle
57 | commands =
58 | flake8 src/ setup.py tests/
59 | description = Run the flake8 tool with several plugins (e.g. bandit, docstrings, isort import order) to check code quality.
60 |
--------------------------------------------------------------------------------
/src/papyrus_scripts/utils/aliases.json:
--------------------------------------------------------------------------------
1 | {
2 | "columns":[
3 | "version",
4 | "alias",
5 | "revision",
6 | "chembl",
7 | "chembl_version",
8 | "excape",
9 | "sharma",
10 | "christmann",
11 | "klaeger",
12 | "merget",
13 | "pickett"
14 | ],
15 | "index":[
16 | 0,
17 | 1,
18 | 2,
19 | 3
20 | ],
21 | "data":[
22 | [
23 | "05.4",
24 | 2022.04,
25 | 2,
26 | true,
27 | 29,
28 | true,
29 | true,
30 | true,
31 | true,
32 | true,
33 | false
34 | ],
35 | [
36 | "05.5",
37 | 2022.08,
38 | 3,
39 | true,
40 | 30,
41 | true,
42 | true,
43 | true,
44 | true,
45 | true,
46 | false
47 | ],
48 | [
49 | "05.6",
50 | 2022.11,
51 | 4,
52 | true,
53 | 31,
54 | true,
55 | true,
56 | true,
57 | true,
58 | true,
59 | false
60 | ],
61 | [
62 | "05.7",
63 | 2024.09,
64 | 2,
65 | true,
66 | 34,
67 | true,
68 | true,
69 | true,
70 | true,
71 | true,
72 | true
73 | ]
74 | ]
75 | }
76 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | name = papyrus_scripts
3 | version = attr: papyrus_scripts.__version__
4 | description = A collection of scripts to handle the Papyrus bioactivity dataset
5 | long_description = file: README.md
6 | long_description_content_type = text/markdown
7 | url = https://github.com/OlivierBeq/Papyrus-scripts
8 | author = Olivier J. M. Béquignon - Brandon J. Bongers - Willem Jespers
9 | author_email = "olivier.bequignon.maintainer@gmail.com"
10 | maintainer = Olivier J. M. Béquignon
11 | maintainer_email = "olivier.bequignon.maintainer@gmail.com"
12 | license_file = LICENSE
13 | classifiers =
14 | Development Status :: 2 - Pre-Alpha
15 | Programming Language :: Python
16 | Programming Language :: Python :: 3.10
17 | Programming Language :: Python :: 3.9
18 | Programming Language :: Python :: 3.8
19 | Programming Language :: Python :: 3.7
20 | Programming Language :: Python :: 3.6
21 | keywords =
22 | bioactivity data
23 | QSAR
24 | proteochemometrics
25 | cheminformatics
26 | modelling
27 | machine learning
28 |
29 |
30 | [options]
31 | include_package_data = True
32 | packages = find:
33 | package_dir =
34 | = src
35 | install_requires =
36 | numpy>=2.0.0
37 | pandas
38 | rdkit
39 | requests
40 | joblib
41 | tqdm
42 | mordred
43 | swifter
44 | scikit-learn
45 | xgboost
46 | pystow
47 | prodec
48 |
49 |
50 | [options.packages.find]
51 | where = src
52 |
53 |
54 | [options.package_data]
55 | * = *.json
56 |
57 |
58 | [options.entry_points]
59 | console_scripts =
60 | papyrus = papyrus_scripts.cli:main
61 |
62 |
63 | [options.extras_require]
64 | docs =
65 | sphinx
66 | sphinx-rtd-theme
67 | sphinx-autodoc-typehints
68 |
69 |
70 | testing =
71 | pytest
72 | parameterized
73 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # PyCharm stuff:
85 | .idea/
86 |
87 | # VSCode stuff:
88 | .vscode/
89 |
90 | # pyenv
91 | .python-version
92 |
93 | # pipenv
94 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
97 | # install all needed dependencies.
98 | #Pipfile.lock
99 |
100 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
101 | __pypackages__/
102 |
103 | # Celery stuff
104 | celerybeat-schedule
105 | celerybeat.pid
106 |
107 | # SageMath parsed files
108 | *.sage.py
109 |
110 | # Environments
111 | .env
112 | .venv
113 | env/
114 | venv/
115 | ENV/
116 | env.bak/
117 | venv.bak/
118 |
119 | # Spyder project settings
120 | .spyderproject
121 | .spyproject
122 |
123 | # Rope project settings
124 | .ropeproject
125 |
126 | # mkdocs documentation
127 | /site
128 |
129 | # mypy
130 | .mypy_cache/
131 | .dmypy.json
132 | dmypy.json
133 |
134 | # Pyre type checker
135 | .pyre/
--------------------------------------------------------------------------------
/src/papyrus_scripts/matchRCSB.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """Match data of the Papyrus dataset with that of the Protein Data Bank."""
4 |
5 | import os
6 | import time
7 | from typing import Iterator, Generator, Optional, Union
8 |
9 | import pystow
10 | from rdkit import Chem
11 | from rdkit import RDLogger
12 | from tqdm.auto import tqdm
13 | import pandas as pd
14 | from pandas.io.parsers import TextFileReader as PandasTextFileReader
15 | import requests
16 |
17 | from .utils import UniprotMatch
18 |
19 |
20 | def update_rcsb_data(root_folder: Optional[str] = None,
21 | overwrite: bool = False,
22 | verbose: bool = True
23 | ) -> pd.DataFrame:
24 | """Update the local data of the RCSB.
25 |
26 | :param root_folder: Directory where Papyrus bioactivity data is stored (default: pystow's home folder)
27 | :param overwrite: Whether to overwrite the local file if already present
28 | (default: False; the cached file is reused if it was downloaded less than 24 hours ago)
29 | :param verbose: Should logging information be printed.
30 | :return: The mapping between PDB and UniProt identifiers
31 | """
32 | # Define output path
33 | if root_folder is not None:
34 | os.environ['PYSTOW_HOME'] = os.path.abspath(root_folder)
35 | root_folder = pystow.module('papyrus')
36 | output_path = root_folder.join('rcsb', name='RCSB_data.tsv.xz')
37 | # Check if file is too recent
38 | if (os.path.isfile(output_path) and (time.time() - os.path.getmtime(output_path)) < 86400) and not overwrite:
39 | if verbose:
40 | print(f'RCSB data was obtained less than 24 hours ago: {output_path}\n'
41 | f'Set overwrite=True to force the fetching of data again.')
42 | return pd.read_csv(output_path, sep='\t')
43 | # Obtain the mapping InChI to PDB ligand code
44 | if verbose:
45 | print(f'Obtaining RCSB compound mappings from InChI to PDB ID')
46 | base_url = 'http://ligand-expo.rcsb.org/dictionaries/{}'
47 | request = requests.get(base_url.format('Components-inchi.ich'))
48 | if request.status_code != 200:
49 | raise IOError(f'resource could not be accessed: {request.url}')
50 | inchi_data = pd.DataFrame([line.split('\t')[:2] for line in request.text.splitlines()],
51 | columns=['InChI', 'PDBID'])
52 | # Process InChI for 2D data
53 | if verbose:
54 | pbar = tqdm(enumerate(inchi_data.InChI), total=inchi_data.shape[0], desc='Converting InChIs', ncols=100)
55 | else:
56 | pbar = enumerate(inchi_data.InChI)
57 | RDLogger.DisableLog('rdApp.*')
58 | for i, inchi in pbar:
59 | mol = Chem.MolFromInchi(inchi)
60 | if mol is not None:
61 | Chem.RemoveStereochemistry(mol)
62 | inchi_data.loc[i, 'InChI_2D'] = Chem.MolToInchi(mol)
63 | RDLogger.EnableLog('rdApp.*')
64 | # Obtain the mapping of PDB ids ligand to proteins structures
65 | if verbose:
66 | print(f'Obtaining RCSB compound mappings from ligand PDB ID to protein PDB ID')
67 | request = requests.get(base_url.format('cc-to-pdb.tdd'))
68 | if request.status_code != 200:
69 | raise IOError(f'resource could not be accessed: {request.url}')
70 | pdbid_data = pd.DataFrame([line.split('\t')[:2] for line in request.text.splitlines()],
71 | columns=['PDBIDlig', 'PDBIDprot'])
72 | # Merge both dataframe
73 | if verbose:
74 | print(f'Combining the data')
75 | pdb_data = inchi_data.merge(pdbid_data, left_on='PDBID', right_on='PDBIDlig')
76 | # Unmerge the data per protein PDB ID
77 | pdb_data.PDBIDprot = pdb_data.PDBIDprot.str.split()
78 | pdb_data = pdb_data.explode('PDBIDprot')
79 | # Map PDBID prot to UniProt acessions
80 | if verbose:
81 | print(f'Obtaining mappings from protein PDB ID to UniProt accessions')
82 | uniprot_mapping = UniprotMatch.uniprot_mappings(pdb_data.PDBIDprot.tolist(),
83 | map_from='PDB',
84 | map_to='UniProtKB_AC-ID') # Forces the use of SIFTS
85 | # Join on the RCSB data
86 | if verbose:
87 | print(f'Combining the data')
88 | pdb_data = pdb_data.merge(uniprot_mapping, left_on='PDBIDprot', right_on='PDB')
89 | # Rename columns
90 | pdb_data = pdb_data.rename(columns={'InChI': 'InChI_3D',
91 | 'PDBIDlig': 'PDBID_ligand',
92 | 'PDBIDprot': 'PDBID_protein',
93 | 'UniProtKB_AC-ID': 'UniProt_accession'})
94 | # Drop duplicate information
95 | pdb_data = pdb_data.drop(columns=['PDBID', 'PDB'])
96 | # Reorder columns
97 | pdb_data = pdb_data[['InChI_3D', 'InChI_2D', 'PDBID_ligand', 'PDBID_protein', 'UniProt_accession']]
98 | # Write to disk and return
99 | if verbose:
100 | print(f'Writing results to disk')
101 | pdb_data.to_csv(output_path, sep='\t', index=False)
102 | return pdb_data
103 |
104 |
105 | def get_matches(data: Union[pd.DataFrame, PandasTextFileReader, Iterator],
106 | root_folder: Optional[str] = None,
107 | verbose: bool = True,
108 | total: Optional[int] = None,
109 | update: bool = True) -> Union[pd.DataFrame, Generator]:
110 | """
111 |
112 | :param data: Papyrus data to be mapped with PDB identifiers
113 | :param root_folder: Directory where Papyrus bioactivity data is stored (default: pystow's home folder)
114 | :param verbose: Show progress if data is an Iterator or a PandasTextFileReader
115 | :param total: Total number of chunks for progress display
116 | :param update: should the local cache of PDB identifiers be updated
117 | :return: The subset of Papyrus data with matching RCSB PDB identifiers
118 | """
119 | if isinstance(data, (PandasTextFileReader, Iterator)):
120 | return _chunked_get_matches(data, root_folder, verbose, total)
121 | if isinstance(data, pd.DataFrame):
122 | if 'connectivity' in data.columns:
123 | identifier = 'InChI_2D'
124 | elif 'InChIKey' in data.columns:
125 | identifier = 'InChI_3D'
126 | elif 'accession' in data.columns:
127 | raise ValueError('data does not contain either connectivity or InChIKey data.')
128 | else:
129 | raise ValueError('data does not contain either connectivity, InChIKey or protein accession data.')
130 | # Update the data if possible
131 | if update:
132 | _ = update_rcsb_data(root_folder, verbose=verbose)
133 | # Set pystow root folder
134 | if root_folder is not None:
135 | os.environ['PYSTOW_HOME'] = os.path.abspath(root_folder)
136 | root_folder = pystow.module('papyrus')
137 | rcsb_data_path = root_folder.join('rcsb', name='RCSB_data.tsv.xz')
138 | # Read the data mapping
139 | rcsb_data = pd.read_csv(rcsb_data_path, sep='\t')
140 | # Process InChI
141 | data = data[data['InChI'].isin(rcsb_data[identifier])]
142 | data = data.merge(rcsb_data, left_on=['InChI', 'accession'], right_on=[identifier, 'UniProt_accession'])
143 | data = data.drop(columns=['InChI_2D', 'InChI_3D', 'UniProt_accession'])
144 | data = data.groupby('Activity_ID').aggregate({column: ';'.join
145 | if column == 'PDBID_protein'
146 | else 'first'
147 | for column in data.columns})
148 | return data
149 | else:
150 | raise TypeError('data can only be a pandas DataFrame, TextFileReader or an Iterator')
151 |
152 |
153 | def _chunked_get_matches(chunks: Union[PandasTextFileReader, Iterator], root_folder: Optional[str], verbose: bool,
154 | total: int):
155 | if verbose:
156 | pbar = tqdm(chunks, total=total, ncols=100)
157 | else:
158 | pbar = chunks
159 | for chunk in pbar:
160 | processed_chunk = get_matches(chunk, root_folder, update=False)
161 | yield processed_chunk
162 |
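
A minimal usage sketch of the two public functions above; the parameter values are illustrative and `read_papyrus` comes from this package's reader module:

```python
from papyrus_scripts import read_papyrus
from papyrus_scripts.matchRCSB import update_rcsb_data, get_matches

# Refresh the local PDB/UniProt mapping (the cache is reused if fetched less than 24 hours ago)
rcsb_mapping = update_rcsb_data(verbose=True)

# Match a chunked read of the Papyrus data against the RCSB mapping
chunks = read_papyrus(version='05.7', plusplus=True, is3d=False, chunksize=100_000)
for matched_chunk in get_matches(chunks, update=False, verbose=True):
    ...  # each yielded chunk keeps only activities with PDB ligand/protein matches
```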
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Papyrus-scripts
2 |
3 | Collection of scripts to interact with the Papyrus bioactivity dataset.
4 |
5 | 
6 |
7 |
8 |
9 | **Associated Article:** 10.1186/s13321-022-00672-x
10 | ```
11 | Béquignon OJM, Bongers BJ, Jespers W, IJzerman AP, van de Water B, van Westen GJP.
12 | Papyrus - A large scale curated dataset aimed at bioactivity predictions.
13 | J Cheminform 15, 3 (2023). https://doi.org/10.1186/s13321-022-00672-x
14 | ```
15 |
16 | **Associated Preprint:** 10.33774/chemrxiv-2021-1rxhk
17 | ```
18 | Béquignon OJM, Bongers BJ, Jespers W, IJzerman AP, van de Water B, van Westen GJP.
19 | Papyrus - A large scale curated dataset aimed at bioactivity predictions.
20 | ChemRxiv. Cambridge: Cambridge Open Engage; 2021;
21 | This content is a preprint and has not been peer-reviewed.
22 | ```
23 |
24 | ## Installation
25 |
26 | ```bash
27 | pip install papyrus-scripts
28 | ```
29 |
30 | :warning: If pip gives the following error, which later results in import errors,
31 | ```bash
32 | Defaulting to user installation because normal site-packages is not writeable
33 | ```
34 | then uninstall and reinstall the library with the following commands:
35 | ```bash
36 | pip uninstall -y papyrus-scripts
37 | python -m pip install papyrus-scripts
38 | ```
39 |
40 | Additional dependencies can be installed to allow:
41 | - similarity and substructure searches
42 | ```bash
43 | conda install FPSim2 openbabel h5py cupy -c conda-forge
44 | ```
45 |
46 | - training DNN models:
47 | ```bash
48 | conda install pytorch torchvision torchaudio cudatoolkit=11.3 -c pytorch
49 | ```
50 |
51 | ## Getting started
52 |
53 | ### The new application programming interface (API)
54 | This new object-oriented API is available since version 2.0.0.
55 |
56 | It allows for easier filtering of the Papyrus data and ensures that any data being queried is downloaded.
57 |
58 | ```python
59 | from papyrus_scripts import PapyrusDataset
60 |
61 | data = (PapyrusDataset(version='05.7', plusplus=True) # Downloads the data if needed
62 | .keep_source(['chembl', 'sharma']) # Keep specific sources
63 | .keep_quality('high')
64 | .proteins() # Get the corresponding protein targets
65 | )
66 | ```
67 |
68 | ### Functional API (legacy)
69 |
70 | The functional API requires the data to be downloaded beforehand.
71 | One can download the dataset either with the functional API itself or the command line interface (CLI).
72 |
73 | #### Downloading with the command line interface (CLI)
74 | The following command will download the Papyrus++ bioactivities and protein targets (high-quality Ki and KD data as well as IC50 and EC50 of reproducible assays) for the latest version.
75 | ```bash
76 | papyrus download -V latest
77 | ```
78 | The following command will download the entire set of high-, medium-, and low-quality bioactivities and protein targets along with all precomputed molecular and protein descriptors for version 05.5.
79 | ```bash
80 | papyrus download -V 05.5 --more --d all
81 | ```
82 | The following command will download Papyrus++ bioactivities, protein targets and compound structures for both version 05.4 and 05.5.
83 | ```bash
84 | papyrus download -V 05.5 -V 05.4 -S
85 | ```
86 |
87 | More options can be found using
88 | ```bash
89 | papyrus download --help
90 | ```
91 |
92 | By default, the data is downloaded to [pystow](https://github.com/cthoyt/pystow)'s default directory.
93 | One can override the folder path by specifying the `-o` switch in the above commands.
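
For example, to download into a custom directory (the path below is purely illustrative):

```bash
papyrus download -V latest -o ~/my_papyrus_data
```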
94 |
95 | #### Downloading with the functional API
96 |
97 | ```python
98 |
99 | from papyrus_scripts import download_papyrus
100 |
101 | # Download the latest version of the entire dataset with all precomputed descriptors
102 | download_papyrus(version='latest', only_pp=False, structures=True, descriptors='all')
103 | ```
104 |
105 | #### Querying with the functional API
106 |
107 | The query detailed above using the object-oriented API is reproduced below using the functional API.
108 |
109 | ```python
110 | from papyrus_scripts import (read_papyrus, read_protein_set,
111 | keep_quality, keep_source, keep_type,
112 | keep_organism, keep_accession, keep_protein_class,
113 | keep_match, keep_contains,
114 | consume_chunks)
115 |
116 | chunk_reader = read_papyrus(version='05.7', plusplus=True, is3d=False, chunksize=1_000_000)
117 | protein_data = read_protein_set(version='05.7')
118 | filter1 = keep_source(data=chunk_reader, source=['chembl', 'sharma'])
119 | filter2 = keep_quality(data=filter1, min_quality='high')
120 | data = consume_chunks(filter2, progress=False)
121 |
122 | protein_data = protein_data.set_index('target_id').loc[data.target_id.unique()].reset_index()
123 | ```
124 |
125 | ## Versions of the Papyrus dataset
126 |
127 | Different online servers host the Papyrus data based on release and ChEMBL version (table below).
128 |
129 |
130 | | Papyrus version | ChEMBL version | Zenodo | 4TU |
131 | |:---------------:|:--------------:|:---------------------------------------------------------:|:---------------------------------------------------------:|
132 | | 05.4 | 29 | [:heavy_check_mark:](https://zenodo.org/records/10943992) | [:heavy_check_mark:](https://doi.org/10.4121/16896406.v2) |
133 | | 05.5 | 30 | [:heavy_check_mark:](https://zenodo.org/records/7019873) | :x: |
134 | | 05.6 | 31 | [:heavy_check_mark:](https://zenodo.org/records/7373213) | :x: |
135 | | 05.7 | 34 | [:heavy_check_mark:](https://zenodo.org/records/13787633) | :x: |
136 |
137 | Precomputed molecular and protein descriptors along with molecular structures (2D for the default set and 3D for the low-quality set with stereochemistry) are not available for version 05.4 from 4TU but are available from Google Drive.
138 |
139 | As stated in the pre-print, **we strongly encourage** the use of the dataset in which stereochemistry was not considered.
140 | This corresponds to files containing the mention "2D" and/or "without_stereochemistry".
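
In practice, the stereochemistry-free set is the one selected with `is3d=False` in both APIs, e.g. (version shown for illustration):

```python
from papyrus_scripts import read_papyrus

# Read the 2D set (without stereochemistry) in chunks
chunk_reader = read_papyrus(version='05.7', plusplus=True, is3d=False, chunksize=1_000_000)
```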
141 |
142 | ## Interconversion of the compressed files
143 |
144 | The available LZMA-compressed files (*.xz*) may not be supported by some software (e.g. Pipeline Pilot).
145 |
145 | **Decompressing the data is strongly discouraged!**
146 | Though Gzip files were made available at 4TU for version 05.4, we now provide a CLI option to locally interconvert from LZMA to Gzip and vice-versa.
147 |
148 | To convert from LZMA to Gzip (or vice-versa) use the following command:
149 | ```bash
150 | papyrus convert -v latest
151 | ```
152 |
153 | ## Removal of the data
154 |
155 | One can remove the Papyrus data using either the CLI or the API.
156 |
157 | The following excerpts exemplify the removal of all Papyrus data files, including the utility files of all versions.
158 | ```bash
159 | papyrus clean --remove_root
160 | ```
161 |
162 | ```python
163 | from papyrus_scripts import remove_papyrus
164 |
165 | remove_papyrus(papyrus_root=True)
166 | ```
167 |
168 |
169 | ## Easy handling of the dataset
170 |
171 | Once installed, the Papyrus-scripts allow for easy filtering of the data.
172 | - Simple examples can be found in the simple_examples.ipynb notebook. [](https://colab.research.google.com/github/OlivierBeq/Papyrus-scripts/blob/master/notebook_examples/simple_examples.ipynb)
173 | - An example of matching data with the Protein Data Bank can be found in the matchRCSB.ipynb notebook. [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/OlivierBeq/Papyrus-scripts/blob/master/notebook_examples/matchRCSB.ipynb)
174 | - More advanced examples will be added to the advanced_querying.ipynb notebook.
175 | ## Reproducing results of the pre-print
176 |
177 | The scripts used to extract subsets, generate models and obtain visualizations can be found here.
178 |
179 | ## Features to come
180 |
181 | - [x] Substructure and similarity molecular searches
182 | - [x] ability to use DNN models
183 | - [x] ability to repeat model training over multiple seeds
184 | - [x] y-scrambling
185 | - [ ] adapt models to QSPRpred
186 |
187 | ## Examples to come
188 |
189 | - Use of custom grouping schemes for training/test set splitting and cross-validation
190 | - Use custom molecular and protein descriptors (either Python function or file on disk)
191 |
192 |
193 | ## Logos
194 |
195 | Logos can be found under **figures/logo**.
196 | Two versions exist depending on the background used.
197 |
198 | :warning: GitHub does not render the white logo properly in the table below, but this should not deter you from using it!
199 |
200 |
201 |
202 | | On white background | On colored background |
203 | |:--------------------------------------------------------------------------------------------------------------------------------------:|:--------------------------------------------------------------------------------------------------------------------------------------------:|
204 | | ![Papyrus_trnsp-bg.svg](https://raw.githubusercontent.com/OlivierBeq/Papyrus-scripts/master/figures/logo/Papyrus_trnsp-bg.svg) | ![Papyrus_trnsp-bg-white.svg](https://raw.githubusercontent.com/OlivierBeq/Papyrus-scripts/master/figures/logo/Papyrus_trnsp-bg-white.svg) |
205 |
206 |
207 |
--------------------------------------------------------------------------------
/src/papyrus_scripts/fingerprint.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, Callable, List
2 | import json
3 | import hashlib
4 | from abc import ABC, abstractmethod
5 |
6 | import numpy as np
7 | from rdkit import Chem
8 | from rdkit import DataStructs
9 | from rdkit.Chem import rdMolDescriptors
10 | from rdkit.Avalon import pyAvalonTools
11 | try:
12 | from openbabel import pybel
13 | except ImportError as e:
14 | pybel = e
15 | try:
16 | import FPSim2
17 | from FPSim2.FPSim2lib.utils import BitStrToIntList, PyPopcount
18 | except ImportError as e:
19 | FPSim2 = e
20 |
21 |
22 | class Fingerprint(ABC):
23 | def __init__(self, name: str, params: Dict, call_func: Callable):
24 | self.name = name
25 | self.params = params
26 | self.func = call_func
27 | # determine length
28 | self.length = None
29 | if "nBits" in params.keys():
30 | self.length = params["nBits"]
31 | elif "fpSize" in params.keys():
32 | self.length = params["fpSize"]
33 | elif self.name == "MACCSKeys":
34 | self.length = 166
35 | elif self.name == "FP2":
36 | self.length = 1024
37 | elif self.name == "FP3":
38 | self.length = 55
39 | elif self.name == "FP4":
40 | self.length = 307
41 | if not self.length:
42 | raise Exception("fingerprint size is not specified")
43 | self._hash = self.name + json.dumps(self.params, sort_keys=True)
44 | self._hash = hashlib.sha256((self._hash).encode()).digest()
45 | self._hash = np.frombuffer(self._hash, dtype=np.int64)
46 | self._hash = abs(np.sum(self._hash)) % 65537
47 | self._hash = f'{hex(self._hash)}'
48 |
49 | def __repr__(self):
50 | return f'{self.name}_{self.length}bits_{self._hash}'
51 |
52 | @classmethod
53 | def derived(cls):
54 | if not cls.__subclasses__():
55 | return cls
56 | subclasses = []
57 | for subclass in cls.__subclasses__():
58 | subclass_derived = subclass.derived()
59 | if isinstance(subclass_derived, list):
60 | subclasses.extend(subclass_derived)
61 | else:
62 | subclasses.append(subclass_derived)
63 | return subclasses
64 |
65 | @abstractmethod
66 | def get(self, mol: Chem.Mol) -> List[int]:
67 | """Get the bistring fingerprint of the molecule"""
68 |
69 |
70 | class RDKitFingerprint(Fingerprint):
71 | def get(self, mol: Chem.Mol) -> List[int]:
72 | """Get the bistring fingerprint of the molecule and popcounts"""
73 | if isinstance(FPSim2, ImportError):
74 | raise ImportError('Some required dependencies are missing:\n\ttables, FPSim2')
75 | fp = BitStrToIntList(self.func(mol, **self.params).ToBitString())
76 | popcnt = PyPopcount(np.array(fp, dtype=np.uint64))
77 | return (*fp, popcnt)
78 |
79 |
80 | class MACCSKeysFingerprint(RDKitFingerprint):
81 | def __init__(self):
82 | super(MACCSKeysFingerprint, self).__init__('MACCSKeys', {}, rdMolDescriptors.GetMACCSKeysFingerprint)
83 |
84 |
85 | class AvalonFingerprint(RDKitFingerprint):
86 | def __init__(self, nBits: int = 512, isQuery: bool = False, resetVect: bool = False, bitFlags: int = 15761407):
87 | super(AvalonFingerprint, self).__init__('Avalon',
88 | {'nBits': nBits,
89 | 'isQuery': isQuery,
90 | 'resetVect': resetVect,
91 | 'bitFlags': bitFlags},
92 | pyAvalonTools.GetAvalonFP)
93 |
94 |
95 | class MorganFingerprint(RDKitFingerprint):
96 | def __init__(self, radius: int = 2, nBits: int = 2048, invariants: list = [], fromAtoms: list = [],
97 | useChirality: bool = False, useBondTypes: bool = True, useFeatures: bool = False):
98 | super(MorganFingerprint, self).__init__('Morgan',
99 | {'radius': radius,
100 | 'nBits': nBits,
101 | 'invariants': invariants,
102 | 'fromAtoms': fromAtoms,
103 | 'useChirality': useChirality,
104 | 'useBondTypes': useBondTypes,
105 | 'useFeatures': useFeatures},
106 | rdMolDescriptors.GetMorganFingerprintAsBitVect)
107 |
108 |
109 | class TopologicalTorsionFingerprint(RDKitFingerprint):
110 | def __init__(self, nBits: int = 2048, targetSize: int = 4, fromAtoms: List = 0,
111 | ignoreAtoms: List = 0, atomInvariants: List = 0, includeChirality: bool = False):
112 | super(TopologicalTorsionFingerprint, self
113 | ).__init__('TopologicalTorsion',
114 | {"nBits": nBits,
115 | "targetSize": targetSize,
116 | "fromAtoms": fromAtoms,
117 | "ignoreAtoms": ignoreAtoms,
118 | "atomInvariants": atomInvariants,
119 | "includeChirality": includeChirality, },
120 | rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect)
121 |
122 |
123 | class AtomPairFingerprint(RDKitFingerprint):
124 | def __init__(self, nBits: int = 2048, minLength: int = 1, maxLength: int = 30,
125 | fromAtoms: List = 0, ignoreAtoms: List = 0, atomInvariants: List = 0,
126 | nBitsPerEntry: int = 4, includeChirality: bool = False,
127 | use2D: bool = True, confId: int = -1):
128 | super(AtomPairFingerprint, self).__init__('AtomPair',
129 | {"nBits": nBits,
130 | "minLength": minLength,
131 | "maxLength": maxLength,
132 | "fromAtoms": fromAtoms,
133 | "ignoreAtoms": ignoreAtoms,
134 | "atomInvariants": atomInvariants,
135 | "nBitsPerEntry": nBitsPerEntry,
136 | "includeChirality": includeChirality,
137 | "use2D": use2D,
138 | "confId": confId},
139 | rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect)
140 |
141 |
142 | class RDKitTopologicalFingerprint(RDKitFingerprint):
143 | def __init__(self, fpSize: int = 2048, minPath: int = 1, maxPath: int = 7, nBitsPerHash: int = 2,
144 | useHs: bool = True, tgtDensity: float = 0.0, minSize: int = 128,
145 | branchedPaths: bool = True, useBondOrder: bool = True, atomInvariants: List = 0,
146 | fromAtoms: List = 0, atomBits: List = None, bitInfo: List = None):
147 | super(RDKitTopologicalFingerprint, self).__init__('RDKFingerprint',
148 | {"minPath": minPath,
149 | "maxPath": maxPath,
150 | "fpSize": fpSize,
151 | "nBitsPerHash": nBitsPerHash,
152 | "useHs": useHs,
153 | "tgtDensity": tgtDensity,
154 | "minSize": minSize,
155 | "branchedPaths": branchedPaths,
156 | "useBondOrder": useBondOrder,
157 | "atomInvariants": atomInvariants,
158 | "fromAtoms": fromAtoms,
159 | "atomBits": atomBits,
160 | "bitInfo": bitInfo},
161 | Chem.RDKFingerprint)
162 |
163 |
164 | class RDKPatternFingerprint(RDKitFingerprint):
165 | def __init__(self, fpSize: int = 2048, atomCounts: list = [], setOnlyBits: list = None):
166 | super(RDKPatternFingerprint, self).__init__('RDKPatternFingerprint',
167 | {'fpSize': fpSize,
168 | 'atomCounts': atomCounts,
169 | 'setOnlyBits': setOnlyBits},
170 | Chem.PatternFingerprint)
171 |
172 |
173 | class OBFingerprint(Fingerprint):
174 | def __init__(self, name: str, params: Dict, call_func: Callable):
175 | if isinstance(pybel, ImportError) and isinstance(FPSim2, ImportError):
176 | raise ImportError('Some required dependencies are missing:\n\topenbabel, FPSim2')
177 | elif isinstance(pybel, ImportError):
178 | raise ImportError('Some required dependencies are missing:\n\topenbabel')
179 | elif isinstance(FPSim2, ImportError):
180 | raise ImportError('Some required dependencies are missing:\n\tFPSim2')
181 | super(OBFingerprint, self).__init__(name, params, call_func)
182 |
183 | def get(self, mol: Chem.Mol) -> List[int]:
184 | """Get the bistring fingerprint of the molecule and popcounts"""
185 | binvec = DataStructs.ExplicitBitVect(self.length)
186 | obmol = pybel.readstring('smi', Chem.MolToSmiles(mol))
187 | binvec.SetBitsFromList([x - 1 for x in obmol.calcfp(self.func).bits])
188 | fp = BitStrToIntList(binvec.ToBitString())
189 | popcnt = PyPopcount(np.array(fp, dtype=np.uint64))
190 | return (*fp, popcnt)
191 |
192 |
193 | class FP2Fingerprint(OBFingerprint):
194 | def __init__(self):
195 | super(FP2Fingerprint, self).__init__('FP2',
196 | {},
197 | 'FP2')
198 |
199 |
200 | class FP3Fingerprint(OBFingerprint):
201 | def __init__(self):
202 | super(FP3Fingerprint, self).__init__('FP3',
203 | {},
204 | 'FP3')
205 |
206 |
207 | class FP4Fingerprint(OBFingerprint):
208 | def __init__(self):
209 | super(FP4Fingerprint, self).__init__('FP4',
210 | {},
211 | 'FP4')
212 |
213 |
214 | def get_fp_from_name(fp_name, **kwargs):
215 | """Get the fingerprint TYPE corresponding to a name
216 | :param fp_name: Name of the fingerprint
217 | :param kwargs: parameters specific to the desired fingerprint
218 | :return: fingerprint instance
219 | """
220 | fps = {fp().name: fp for fp in Fingerprint.derived()}
221 | if fp_name not in fps.keys():
222 | raise ValueError(f'Fingerprint {fp_name} not available')
223 | return fps[fp_name](**kwargs)
224 |
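
An illustrative use of `get_fp_from_name`; note that instantiating the available fingerprint classes assumes the optional openbabel and FPSim2 dependencies mentioned in the README are installed:

```python
from rdkit import Chem
from papyrus_scripts.fingerprint import get_fp_from_name

# Obtain a 1024-bit Morgan fingerprint (radius 2)
fp = get_fp_from_name('Morgan', radius=2, nBits=1024)
print(repr(fp))  # e.g. Morgan_1024bits_0x....

# Bit integers followed by their popcount, as expected by FPSim2
bits = fp.get(Chem.MolFromSmiles('c1ccccc1O'))
```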
--------------------------------------------------------------------------------
/figures/logo/Papyrus_trnsp-bg.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
81 |
--------------------------------------------------------------------------------
/src/papyrus_scripts/utils/UniprotMatch.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """Functions to interact with UniProt."""
4 |
5 | import re
6 | import json
7 | import time
8 | import zlib
9 | from typing import List, Union
10 | from xml.etree import ElementTree
11 | from urllib.parse import urlparse, parse_qs, urlencode
12 |
13 | import pandas as pd
14 | import requests
15 | from requests.adapters import HTTPAdapter, Retry
16 |
17 |
18 | def uniprot_mappings(query: Union[str, List[str]],
19 | map_from: str = 'ID',
20 | map_to: str = 'PDB_ID',
21 | taxon: str = None
22 | ) -> pd.DataFrame:
23 | """Map identifiers using the UniProt identifier mapping tool.
24 |
25 | :param query: list or space delimited string of identifiers
26 | :param map_from: type of input identifiers (default: 'ID', i.e. UniProt accession/entry name)
27 | :param map_to: type of desired output identifiers
28 | (default: PDB identifiers)
29 | :param taxon: taxon to be mapped to if 'map_from' is 'Gene_Name'
30 |
31 | If mapping from {'PDB', 'PDB_ID'} to {'UniProtKB_AC-ID', 'ACC'}
32 | and query is None, then returns all SIFTS mappings.
33 |
34 | See: https://www.uniprot.org/help/api_idmapping
35 | """
36 | if isinstance(query, str):
37 | query = [query]
38 | # If mapping PDB to UniProt, use SIFTS flat files
39 | if map_from in ['PDB', 'PDB_ID'] and map_to in ['UniProtKB_AC-ID', 'ACC']:
40 | # Obtain mappings from SIFTS
41 | data = pd.read_csv('ftp://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/uniprot_pdb.tsv.gz',
42 | sep='\t', skiprows=[0]
43 | ).rename(columns={'SP_PRIMARY': map_to, 'PDB': map_from})
44 | # Reorganize columns
45 | data = data[[map_from, map_to]]
46 | # Split by PDB
47 | data[map_from] = data[map_from].str.split(';')
48 | # Unmerge rows according to PDB
49 | data = data.explode(column=map_from).reset_index(drop=True)
50 | if query is not None:
51 | query = [x.lower() for x in query]
52 | data = data[data[map_from].str.lower().isin(query)]
53 | return data
54 | else:
55 | # Use UniProt API
56 | matching = UniprotMatch()
57 | matches = matching.uniprot_id_mapping(query, map_from, map_to, taxon, verbose=False)
58 | df = pd.DataFrame.from_dict(matches, orient='index')
59 | df = df.reset_index().rename(columns={'index': map_from, 0: map_to})
60 | return df
61 |
62 |
63 | class UniprotMatch:
64 | def __init__(self,
65 | polling_interval: int = 3,
66 | api_url: str = 'https://rest.uniprot.org',
67 | retry: Retry = None):
68 | """Instantiate a class to match UniProt identifiers.
69 |
70 | Based on: https://www.uniprot.org/help/id_mapping#submitting-an-id-mapping-job
71 | """
72 | self._api_url = api_url
73 | self._polling_interval = polling_interval
74 | if retry is None:
75 | self._retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504])
76 | else:
77 | self._retries = retry
78 | self._session = requests.Session()
79 | self._session.mount("https://", HTTPAdapter(max_retries=self._retries))
80 |
81 |
82 | def _submit_id_mapping(self, from_db, to_db, ids, taxon=None):
83 | if from_db == 'Gene_Name' and taxon is None:
84 | raise ValueError('Taxon must be provided when mapping from gene names.')
85 | if taxon is None:
86 | request = requests.post(
87 | f"{self._api_url}/idmapping/run",
88 | data={"from": from_db, "to": to_db, "ids": ",".join(ids)},
89 | )
90 | else:
91 | request = requests.post(
92 | f"{self._api_url}/idmapping/run",
93 | data={"from": from_db, "to": to_db, "ids": ",".join(ids), "taxId": taxon}
94 | )
95 | request.raise_for_status()
96 | return request.json()["jobId"]
97 |
98 | def _get_next_link(self, headers):
99 | re_next_link = re.compile(r'<(.+)>; rel="next"')
100 | if "Link" in headers:
101 | match = re_next_link.match(headers["Link"])
102 | if match:
103 | return match.group(1)
104 |
105 | def _check_id_mapping_results_ready(self, job_id, verbose):
106 | while True:
107 | request = self._session.get(f"{self._api_url}/idmapping/status/{job_id}")
108 | request.raise_for_status()
109 | j = request.json()
110 | if "jobStatus" in j:
111 | if j["jobStatus"] == "RUNNING":
112 | if verbose:
113 | print(f"Retrying in {self._polling_interval}s")
114 | time.sleep(self._polling_interval)
115 | else:
116 | raise Exception(j["jobStatus"])
117 | else:
118 | return bool(j["results"] or j["failedIds"])
119 |
120 | def _get_batch(self, batch_response, file_format, compressed):
121 | batch_url = self._get_next_link(batch_response.headers)
122 | while batch_url:
123 | batch_response = self._session.get(batch_url)
124 | batch_response.raise_for_status()
125 | yield self._decode_results(batch_response, file_format, compressed)
126 | batch_url = self._get_next_link(batch_response.headers)
127 |
128 | def _combine_batches(self, all_results, batch_results, file_format):
129 | if file_format == "json":
130 | for key in ("results", "failedIds"):
131 | if key in batch_results and batch_results[key]:
132 | all_results[key] += batch_results[key]
133 | elif file_format == "tsv":
134 | return all_results + batch_results[1:]
135 | else:
136 | return all_results + batch_results
137 | return all_results
138 |
139 | def _get_id_mapping_results_link(self, job_id):
140 | url = f"{self._api_url}/idmapping/details/{job_id}"
141 | request = self._session.get(url)
142 | request.raise_for_status()
143 | return request.json()["redirectURL"]
144 |
145 | def _decode_results(self, response, file_format, compressed):
146 | if compressed:
147 | decompressed = zlib.decompress(response.content, 16 + zlib.MAX_WBITS)
148 | if file_format == "json":
149 | j = json.loads(decompressed.decode("utf-8"))
150 | return j
151 | elif file_format == "tsv":
152 | return [line for line in decompressed.decode("utf-8").split("\n") if line]
153 | elif file_format == "xlsx":
154 | return [decompressed]
155 | elif file_format == "xml":
156 | return [decompressed.decode("utf-8")]
157 | else:
158 | return decompressed.decode("utf-8")
159 | elif file_format == "json":
160 | return response.json()
161 | elif file_format == "tsv":
162 | return [line for line in response.text.split("\n") if line]
163 | elif file_format == "xlsx":
164 | return [response.content]
165 | elif file_format == "xml":
166 | return [response.text]
167 | return response.text
168 |
169 | def _get_xml_namespace(self, element):
170 | m = re.match(r"\{(.*)\}", element.tag)
171 | return m.groups()[0] if m else ""
172 |
173 | def _merge_xml_results(self, xml_results):
174 | merged_root = ElementTree.fromstring(xml_results[0])
175 | for result in xml_results[1:]:
176 | root = ElementTree.fromstring(result)
177 | for child in root.findall("{http://uniprot.org/uniprot}entry"):
178 | merged_root.insert(-1, child)
179 | ElementTree.register_namespace("", self._get_xml_namespace(merged_root[0]))
180 | return ElementTree.tostring(merged_root, encoding="utf-8", xml_declaration=True)
181 |
182 | def _print_progress_batches(self, batch_index, size, total):
183 | n_fetched = min((batch_index + 1) * size, total)
184 | print(f"Fetched: {n_fetched} / {total}")
185 |
186 | def _get_id_mapping_results_search(self, url, verbose: bool = False):
187 | parsed = urlparse(url)
188 | query = parse_qs(parsed.query)
189 | file_format = query["format"][0] if "format" in query else "json"
190 | if "size" in query:
191 | size = int(query["size"][0])
192 | else:
193 | size = 500
194 | query["size"] = size
195 | compressed = (
196 | query["compressed"][0].lower() == "true" if "compressed" in query else False
197 | )
198 | parsed = parsed._replace(query=urlencode(query, doseq=True))
199 | url = parsed.geturl()
200 | request = self._session.get(url)
201 | request.raise_for_status()
202 | results = self._decode_results(request, file_format, compressed)
203 | total = int(request.headers["x-total-results"])
204 | if verbose:
205 | self._print_progress_batches(0, size, total)
206 | for i, batch in enumerate(self._get_batch(request, file_format, compressed), 1):
207 | results = self._combine_batches(results, batch, file_format)
208 | if verbose:
209 | self._print_progress_batches(i, size, total)
210 | if file_format == "xml":
211 | return self._merge_xml_results(results)
212 | return results
213 |
214 | def _get_id_mapping_results_stream(self, url):
215 | if "/stream/" not in url:
216 | url = url.replace("/results/", "/stream/")
217 | request = self._session.get(url)
218 | request.raise_for_status()
219 | parsed = urlparse(url)
220 | query = parse_qs(parsed.query)
221 | file_format = query["format"][0] if "format" in query else "json"
222 | compressed = (
223 | query["compressed"][0].lower() == "true" if "compressed" in query else False
224 | )
225 | return self._decode_results(request, file_format, compressed)
226 |
227 | def uniprot_id_mapping(self,
228 | ids: list, from_db: str = "UniProtKB_AC-ID", to_db: str = None,
229 | taxon: str = None, verbose: bool = True
230 | ) -> dict:
231 | """
232 | Map UniProt identifiers to other databases.
233 |
234 | For a list of the available identifiers, check the
235 | `To database` list on https://www.uniprot.org/id-mapping
236 |
237 | :param ids: IDs to be mapped from
238 | :param from_db: Type of identifier supplied through 'ids'
239 | :param to_db: Type of identifier to be obtained
240 | :param taxon: Taxon ID of the species if 'from_db' is 'Gene_Name'
241 | :param verbose: Increase verbosity
242 | :return: A dictionary with query ids as keys and the respective mapped results
243 |
244 | Adapted from David Araripe's (@DavidAraripe) original code
245 | """
246 | job_id = self._submit_id_mapping(from_db=from_db, to_db=to_db, ids=ids, taxon=taxon)
247 | if self._check_id_mapping_results_ready(job_id, verbose):
248 | link = self._get_id_mapping_results_link(job_id)
249 | r = self._get_id_mapping_results_search(link)
250 | r_dict = {idx: r["results"][idx] for idx in range(len(r["results"]))}
251 | r_df = pd.DataFrame.from_dict(r_dict, orient="index")
252 | query_to_newIDs = dict()
253 | for id in r_df["from"].unique():
254 | subset_df = r_df[r_df["from"] == id]
255 | if isinstance(subset_df["to"].tolist()[0], str):
256 | query_to_newIDs[id] = " ".join(list(subset_df["to"].unique()))
257 | elif isinstance(subset_df["to"].tolist()[0], dict):
258 | query_to_newIDs[id] = " ".join(set(subset_df["to"].apply(lambda row: row['primaryAccession'])))
259 | return query_to_newIDs
260 |
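
A short usage sketch of `uniprot_mappings`; the identifiers below are only examples:

```python
from papyrus_scripts.utils import UniprotMatch

# PDB -> UniProt accessions is resolved from the SIFTS flat files
pdb_mapping = UniprotMatch.uniprot_mappings(['1err', '3eml'], map_from='PDB', map_to='UniProtKB_AC-ID')

# Other combinations go through the UniProt ID-mapping REST API
gene_mapping = UniprotMatch.uniprot_mappings('P00533', map_from='UniProtKB_AC-ID', map_to='Gene_Name')
```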
--------------------------------------------------------------------------------
/tests/test_oop.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import unittest
4 | from itertools import product
5 |
6 | import numpy as np
7 | import pandas as pd
8 | from parameterized import parameterized, parameterized_class
9 |
10 | from src.papyrus_scripts import PapyrusDataset
11 | from src.papyrus_scripts import reader, preprocess
12 |
13 |
14 | # Size of chunks of raw file to read
15 | CHUNKSIZE = int(1e6)
16 | # Path root directory of raw files (None = pystow's default)
17 | SOURCE_PATH = None
18 |
19 |
20 | def parametrized_test_name_func(testcase_func, _, param):
21 | return "%s_%s" %(
22 | testcase_func.__name__,
23 | parameterized.to_safe_name("_".join(str(x) for x in param.args)),
24 | )
25 |
26 |
27 | def parametrized_testclass_name_func(cls, _, params_dict):
28 | return "{}_{}".format(
29 | cls.__name__,
30 | parameterized.to_safe_name("_".join(f'{k}_{v}' for k, v in params_dict.items())),
31 | )
32 |
33 |
34 | @parameterized_class(
35 | ('stereo', 'version', 'plusplus'),
36 | list(product(
37 | [True, False],
38 | ['05.4', '05.5', '05.6'],
39 | [True, False]
40 | )), class_name_func=parametrized_testclass_name_func)
41 | class TestPapyrusDataset(unittest.TestCase):
42 |
43 | def setUp(self):
44 | pass
45 |
46 | def assertDataFrameEqual(self, df1: pd.DataFrame, df2: pd.DataFrame):
47 | # Ensure NaN values can be compared
48 | df1.fillna('NaN', inplace=True)
49 | df2.fillna('NaN', inplace=True)
50 | # Ensure dataframes are not empty
51 | self.assertFalse(df1.empty)
52 | self.assertFalse(df2.empty)
53 | # Check number of lines
54 | self.assertEqual(len(df1), len(df2))
55 | # Check number of columns
56 | self.assertEqual(df1.shape[1], df2.shape[1])
57 | # Check column names
58 | self.assertTrue((df1.columns == df2.columns).all())
59 | # Check content column by columns
60 | for j_col in range(df1.shape[1]):
61 | # First check dtype
62 | self.assertEqual(df1.iloc[:, j_col].dtype, df2.iloc[:, j_col].dtype)
63 | # Check content
64 | self.assertEqual(df1.iloc[:, j_col].tolist(),
65 | df2.iloc[:, j_col].tolist())
66 |
67 | def test_medium_quality_kinase(self):
68 | if self.plusplus and self.stereo:
69 | # No chiral data in the Papyrus++
70 | with self.assertRaises(ValueError):
71 | reader.read_papyrus(is3d=self.stereo, version=self.version, plusplus=self.plusplus,
72 | chunksize=CHUNKSIZE, source_path=SOURCE_PATH)
73 | return
74 | # 1) Obtain data through the functional API
75 | fn_data = reader.read_papyrus(is3d=self.stereo, version=self.version, plusplus=self.plusplus,
76 | chunksize=CHUNKSIZE, source_path=SOURCE_PATH)
77 | # Read protein targets
78 | fn_protein_data = reader.read_protein_set(version=self.version, source_path=SOURCE_PATH)
79 | # Keep up to medium quality data (Papyrus++ only contains high quality)
80 | fn_filter1 = preprocess.keep_quality(fn_data, 'medium')
81 | # Keep kinases
82 | fn_filter2 = preprocess.keep_protein_class(fn_filter1, fn_protein_data,
83 | classes={'l2': 'Kinase'})
84 | # Aggregate the data
85 | fn_data_agg = preprocess.consume_chunks(fn_filter2, progress=(not self.plusplus))
86 | # 2) Obtain data through the object-oriented API
87 | oop_data_agg = (PapyrusDataset(is3d=self.stereo, version=self.version, plusplus=self.plusplus,
88 | chunksize=CHUNKSIZE, source_path=SOURCE_PATH)
89 | .keep_quality('medium')
90 | .keep_protein_class({'l2': 'Kinase'})
91 | .aggregate(progress=(not self.plusplus)))
92 | # 3) Ensure datasets are equal
93 | self.assertDataFrameEqual(fn_data_agg, oop_data_agg)
94 | del fn_protein_data
95 | # 4) Test values
96 | for quality in oop_data_agg.Quality.unique():
97 | self.assertIn(quality.lower(), ['high', 'medium'])
98 | self.assertEqual(oop_data_agg.Classification.str.split('->').str[1].unique(), ['Kinase'])
99 |
100 | def test_all_quality_human_adenosine_receptors_ic50(self):
101 | if self.plusplus and self.stereo:
102 | # No chiral data in the Papyrus++
103 | with self.assertRaises(ValueError):
104 | reader.read_papyrus(is3d=self.stereo, version=self.version, plusplus=self.plusplus,
105 | chunksize=CHUNKSIZE, source_path=SOURCE_PATH)
106 | return
107 | # 1) Obtain data through the functional API
108 | fn_data = reader.read_papyrus(is3d=self.stereo, version=self.version, plusplus=self.plusplus,
109 | chunksize=CHUNKSIZE, source_path=SOURCE_PATH)
110 | # Read protein targets
111 | fn_protein_data = reader.read_protein_set(version=self.version, source_path=SOURCE_PATH)
112 | # Keep human targets
113 | fn_filter1 = preprocess.keep_organism(fn_data, fn_protein_data,
114 | organism='Homo sapiens (Human)')
115 | # Keep adenosine receptors
116 | fn_filter2 = preprocess.keep_protein_class(fn_filter1, fn_protein_data,
117 | classes={'l5': 'Adenosine receptor'})
118 | # Keep IC50
119 | fn_filter3 = preprocess.keep_type(fn_filter2, activity_types='ic50')
120 | # Aggregate the data
121 | fn_data_agg = preprocess.consume_chunks(fn_filter3, progress=(not self.plusplus))
122 | # 2) Obtain data through the object-oriented API
123 | oop_data_agg = (PapyrusDataset(is3d=self.stereo, version=self.version, plusplus=self.plusplus,
124 | chunksize=CHUNKSIZE, source_path=SOURCE_PATH)
125 | .keep_organism('Homo sapiens (Human)')
126 | .keep_protein_class({'l5': 'Adenosine receptor'})
127 | .keep_activity_type('ic50')
128 | .aggregate(progress=(not self.plusplus)))
129 | # 3) Ensure datasets are equal
130 | self.assertDataFrameEqual(fn_data_agg, oop_data_agg)
131 | del fn_data_agg
132 | # 4) Test values
133 | self.assertEqual(oop_data_agg.Classification.str.split('->').str[4].unique(), ['Adenosine receptor'])
134 | self.assertEqual(oop_data_agg.type_IC50.astype(int).unique().tolist(), [1])
135 | oop_data_proteins = (PapyrusDataset.from_dataframe(oop_data_agg, self.stereo, self.version, self.plusplus)
136 | .proteins(progress=True)
137 | .to_dataframe(False))
138 | self.assertEqual(len(oop_data_agg.accession.unique()), len(oop_data_proteins))
139 | self.assertEqual(oop_data_proteins.Organism.unique().tolist(), ['Homo sapiens (Human)'])
140 |
141 | def test_chembl_mouse_cc_chemokine_receptors_ki_and_kd(self):
142 | if self.plusplus and self.stereo:
143 | with self.assertRaises(ValueError):
144 | # No chiral data in the Papyrus++
145 | reader.read_papyrus(is3d=self.stereo, version=self.version, plusplus=self.plusplus,
146 | chunksize=CHUNKSIZE, source_path=SOURCE_PATH)
147 | return
148 | # 1) Obtain data through the functional API
149 | fn_data = reader.read_papyrus(is3d=self.stereo, version=self.version, plusplus=self.plusplus,
150 | chunksize=CHUNKSIZE, source_path=SOURCE_PATH)
151 | # Read protein targets
152 | fn_protein_data = reader.read_protein_set(version=self.version, source_path=SOURCE_PATH)
153 | # Keep ChEMBL data
154 | fn_filter1 = preprocess.keep_source(fn_data, 'chembl')
155 | # Keep mouse targets
156 | fn_filter2 = preprocess.keep_organism(fn_filter1, fn_protein_data,
157 | organism='Mus musculus (Mouse)')
158 | # Keep C-C chemokine receptors
159 | fn_filter3 = preprocess.keep_protein_class(fn_filter2, fn_protein_data,
160 | classes={'l5': 'CC chemokine receptor'})
161 | # Drop CCL2 and CCL5
162 | fn_filter4 = preprocess.keep_not_match(fn_filter3, 'accession', ['P13500', 'P13501'])
163 | # Keep Ki and Kd
164 | fn_filter5 = preprocess.keep_type(fn_filter4, activity_types=['ki', 'kd'])
165 | # Aggregate the data
166 | fn_data_agg = preprocess.consume_chunks(fn_filter5, progress=(not self.plusplus))
167 | # 2) Obtain data through the object-oriented API
168 | oop_data_agg = (PapyrusDataset(is3d=self.stereo, version=self.version, plusplus=self.plusplus,
169 | chunksize=CHUNKSIZE, source_path=SOURCE_PATH)
170 | .keep_source('chembl')
171 | .keep_organism('Mus musculus (Mouse)')
172 | .keep_protein_class({'l5': 'CC chemokine receptor'})
173 | .not_isin('accession', ['P13500', 'P13501'])
174 | .keep_activity_type(['ki', 'kd'])
175 | .aggregate(progress=(not self.plusplus)))
176 | # 3) Ensure datasets are equal
177 | self.assertDataFrameEqual(fn_data_agg, oop_data_agg)
178 | del fn_data_agg
179 | # 4) Test values
180 | self.assertEqual(len(oop_data_agg.source.unique()), 1)
181 | self.assertTrue(oop_data_agg.source.unique().item().lower().startswith('chembl'))
182 | self.assertTrue(oop_data_agg.type_IC50.dropna().astype(int).unique().item() == 0)
183 | self.assertTrue(oop_data_agg.type_EC50.dropna().astype(int).unique().item() == 0)
184 | self.assertTrue(oop_data_agg.type_other.replace({'NA': np.nan, 'NaN': np.nan, 'nan': np.nan})
185 | .dropna().empty or (oop_data_agg.type_other.replace({'NA': np.nan, 'NaN': np.nan, 'nan': np.nan})
186 | .dropna().astype(int).unique().item() == 0))
187 | self.assertEqual((oop_data_agg[['type_KD', 'type_Ki']]
188 | .astype(int)
189 | .drop_duplicates()
190 | .apply(lambda x: sorted(x), axis=1)
191 | .tolist()),
192 | [[0, 1], [0, 1]]
193 | )
194 | self.assertEqual(oop_data_agg.Classification.str.split('->').str[4].unique(), ['CC chemokine receptor'])
195 | for accession in oop_data_agg.accession.unique():
196 | self.assertNotIn(accession, ['P13500', 'P13501'])
197 | oop_data_proteins = (PapyrusDataset.from_dataframe(oop_data_agg, self.stereo, self.version, self.plusplus)
198 | .proteins(progress=True)
199 | .to_dataframe(False))
200 | self.assertEqual(oop_data_proteins.Organism.unique().tolist(), ['Mus musculus (Mouse)'])
201 |
202 | def test_sharma_klaeger_christman_egfr_specific_mutants_no_chirality(self):
203 | if self.plusplus and self.stereo:
204 | # No chiral data in the Papyrus++
205 | with self.assertRaises(ValueError):
206 | reader.read_papyrus(is3d=self.stereo, version=self.version, plusplus=self.plusplus,
207 | chunksize=CHUNKSIZE, source_path=SOURCE_PATH)
208 | return
209 | # 1) Obtain data through the functional API
210 | fn_data = reader.read_papyrus(is3d=self.stereo, version=self.version, plusplus=self.plusplus,
211 | chunksize=CHUNKSIZE, source_path=SOURCE_PATH)
212 | # Keep data related to the human EGFR from its accession
213 | fn_filter1 = preprocess.keep_accession(fn_data, 'P00533')
214 | # Keep specific mutants
215 | fn_filter2 = preprocess.keep_match(fn_filter1, 'target_id', ['P00533_L858R', 'P00533_L861Q'])
216 | # Keep only molecules without chiral centers
217 | fn_filter3 = preprocess.keep_contains(fn_filter2, 'InChIKey', 'UHFFFAOYSA')
218 | # Keep data from the Sharma, Klaeger and Christmann-Franck datasets
219 | fn_filter4 = preprocess.keep_source(fn_filter3, ['sharma', 'klaeger', 'christman'])
220 | # Drop entries whose InChIKey ends in '-O'
221 | fn_filter5 = preprocess.keep_not_contains(fn_filter4, 'InChIKey', '-O$', regex=True)
222 | # Aggregate the data
223 | fn_data_agg = preprocess.consume_chunks(fn_filter5, progress=(not self.plusplus))
224 | # 2) Obtain data through the object-oriented API
225 | oop_data_agg = (PapyrusDataset(is3d=self.stereo, version=self.version, plusplus=self.plusplus,
226 | chunksize=CHUNKSIZE, source_path=SOURCE_PATH)
227 | .keep_accession('P00533')
228 | .isin('target_id', ['P00533_L858R', 'P00533_L861Q'])
229 | .contains('InChIKey', 'UHFFFAOYSA')
230 | .keep_source(['sharma', 'klaeger', 'christman'])
231 | .not_contains('InChIKey', '-O$', regex=True)
232 | .aggregate(progress=(not self.plusplus)))
233 | # 3) Ensure datasets are equal
234 | self.assertDataFrameEqual(fn_data_agg, oop_data_agg)
235 | del fn_data_agg
236 | # 4) Test values
237 | self.assertEqual(oop_data_agg.accession.unique().item(), 'P00533')
238 | self.assertEqual(np.sort(oop_data_agg.target_id.unique()).tolist(), ['P00533_L858R', 'P00533_L861Q'])
239 | self.assertEqual(oop_data_agg.InChIKey.str.split('-').str[1].unique(), 'UHFFFAOYSA')
240 | self.assertNotEqual(oop_data_agg.InChIKey.str.split('-').str[2].unique(), 'O')
241 |
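242 | # The block below is an illustrative sketch, not part of the test suite: it shows
243 | # the chained object-oriented query pattern that the tests above compare against
244 | # the functional API. It assumes a local Papyrus download reachable through the
245 | # module-level SOURCE_PATH and CHUNKSIZE settings; the version string is only an example.
246 | if __name__ == '__main__':
247 |     example = (PapyrusDataset(is3d=False, version='05.7', plusplus=True,
248 |                               chunksize=CHUNKSIZE, source_path=SOURCE_PATH)
249 |                .keep_quality('medium')
250 |                .keep_organism('Homo sapiens (Human)')
251 |                .aggregate(progress=True))
252 |     print(example.shape)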
--------------------------------------------------------------------------------
/figures/logo/Papyrus_trnsp-bg-white.svg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OlivierBeq/Papyrus-scripts/HEAD/figures/logo/Papyrus_trnsp-bg-white.svg
--------------------------------------------------------------------------------
/src/papyrus_scripts/utils/mol_reader.py:
--------------------------------------------------------------------------------
1 | import bz2
2 | import gzip
3 | import io
4 | import lzma
5 | import re
6 | import warnings
7 | from typing import Iterable, Optional, Tuple, Callable, Union
8 |
9 | from rdkit import Chem
10 | from rdkit import RDLogger
11 | from rdkit.Chem import ForwardSDMolSupplier, MaeMolSupplier, MolFromMol2Block, SmilesMolSupplierFromText, \
12 | SmilesMolSupplier
13 | from tqdm.auto import tqdm
14 |
15 |
16 | class ForwardMol2MolSupplier:
17 | def __init__(self, fileobj: Union[str, io.TextIOBase],
18 | sanitize: bool = True,
19 | removeHs: bool = True,
20 | cleanupSubstructures: bool = True):
21 | self.sanitize = sanitize
22 | self.removeHs = removeHs
23 | self.cleanupSubstructures = cleanupSubstructures
24 | self._buffer_size = 32768 # 32kB
25 | self._buffer = b''
26 | self._mol_delimiter = '@MOLECULE'
27 | if isinstance(fileobj, str):
28 | self._open_supplier = True
29 | self.supplier = open(fileobj)
30 | else:
31 | self._open_supplier = False
32 | self.supplier = fileobj
33 |
34 | def __enter__(self):
35 | return self
36 |
37 | def __exit__(self, exc_type, exc_value, traceback):
38 | self.close()
39 |
40 | def _iterate(self):
41 | self._buffer = self.supplier.read(self._buffer_size)
42 | while True:
43 | i_seps = [x.start() for x in re.finditer(self._mol_delimiter, self._buffer) if x.start() != 0]
44 | if not i_seps:
45 | new_buffer = self.supplier.read(self._buffer_size)
46 | if len(new_buffer):
47 | self._buffer += new_buffer
48 | else:
49 | mol = MolFromMol2Block(self._buffer,
50 | self.sanitize,
51 | self.removeHs,
52 | self.cleanupSubstructures)
53 | yield mol
54 | break
55 | else:
56 | mol = MolFromMol2Block(self._buffer[:i_seps[0]], self.sanitize, self.removeHs, self.cleanupSubstructures)
57 | yield mol
58 | self._buffer = self._buffer[i_seps[0]:]
59 | del i_seps[0]
60 |
61 | def __iter__(self):
62 | if not hasattr(self, '_iterator'):
63 | self._iterator = self._iterate()
64 | for values in self._iterator:
65 | yield values
66 |
67 | def __next__(self):
68 | if not hasattr(self, '_iterator'):
69 | self._iterator = self._iterate()
70 | return next(self._iterator)
71 |
72 | def close(self):
73 | if self._open_supplier:
74 | self.supplier.close()
75 |
76 |
77 | class ForwardSmilesMolSupplier:
78 | def __init__(self, fileobj: Union[str, io.TextIOBase],
79 | delimiter: str = '\t',
80 | smilesColumn: int = 0,
81 | nameColumn: int = 1,
82 | titleLine: bool = True,
83 | sanitize: bool = True):
84 | self.delimiter = delimiter
85 | self.smilesColumn = smilesColumn
86 | self.nameColumn = nameColumn
87 | self.titleLine = titleLine
88 | self.sanitize = sanitize
89 | self._buffer_size = 32768 # 32kB
90 | self._buffer = b''
91 | self._mol_delimiter = '\n'
92 | if isinstance(fileobj, str):
93 | self._open_supplier = True
94 | self.supplier = None
95 | self._iterator = SmilesMolSupplier(fileobj, self.delimiter, self.smilesColumn, self.nameColumn,
96 | self.titleLine, self.sanitize)
97 | else:
98 | self._open_supplier = False
99 | self.supplier = fileobj
100 |
101 | def __enter__(self):
102 | return self
103 |
104 | def __exit__(self, exc_type, exc_value, traceback):
105 | self.close()
106 |
107 | def _iterate(self):
108 | if self.titleLine:
109 | self.supplier.readline()
110 | self._buffer = self.supplier.read(self._buffer_size)
111 | while True:
112 | i_seps = [x.start() for x in re.finditer(self._mol_delimiter, self._buffer)]
113 | if not i_seps:
114 | new_buffer = self.supplier.read(self._buffer_size)
115 | if len(new_buffer):
116 | self._buffer += new_buffer
117 | else:
118 | if len(self._buffer):
119 | RDLogger.DisableLog('rdApp.*') # Disable logger if no name column
120 | mol = next(SmilesMolSupplierFromText(self._buffer, self._mol_delimiter, self.smilesColumn,
121 | self.nameColumn, False, self.sanitize))
122 | RDLogger.EnableLog('rdApp.*')  # Re-enable logger
123 | yield mol
124 | break
125 | else:
126 | RDLogger.DisableLog('rdApp.*') # Disable logger if no name column
127 | mol = next(
128 | SmilesMolSupplierFromText(self._buffer[:i_seps[0] + len(self._mol_delimiter)], self._mol_delimiter,
129 | self.smilesColumn, self.nameColumn, False, self.sanitize))
130 | RDLogger.EnableLog('rdApp.*')  # Re-enable logger
131 | yield mol
132 | self._buffer = self._buffer[i_seps[0] + len(self._mol_delimiter):]
133 | del i_seps[0]
134 |
135 | def __iter__(self):
136 | if not hasattr(self, '_iterator'):
137 | self._iterator = self._iterate()
138 | for values in self._iterator:
139 | yield values
140 |
141 | def __next__(self):
142 | if not hasattr(self, '_iterator'):
143 | self._iterator = self._iterate()
144 | return next(self._iterator)
145 |
146 | def close(self):
147 | if self._open_supplier:
148 | del self._iterator
149 | self._iterator = None
150 |
151 |
152 | class MolSupplier:
153 | # class properties
154 | valid_formats = ('smi', 'mae', 'sd', 'mol2', 'mol')
155 | valid_compression = ('lzma', 'zlib', 'bz2')
156 |
157 | def __init__(self, source: Union[str, io.TextIOBase, io.BufferedIOBase] = None,
158 | supplier: Iterable[Chem.Mol] = None,
159 | format: str = None,
160 | compression: str = None, **kwargs):
161 | f"""Molecular supplier handling format and compression.
162 |
163 | :param source: filename or file-like object;
164 | when using a context manager, file-like objects
165 | are not closed upon exit
166 | :param supplier: molecular supplier (e.g. rdkit.Chem.ForwardSDMolSupplier)
167 | :param format: data format {self.valid_formats}
168 | can be detected if source is a file name,
169 | must be provided if source is not a file name,
170 | ignored if supplier is not None
171 | :param compression: compression type {self.valid_compression}
172 | can be detected if source is a file name,
173 | ignored otherwise
174 | :param kwargs: keyword arguments passed to the underlying supplier,
175 | ignored if a supplier is provided;
176 | may also hold values for 'start_id', 'total' and 'show_progress'
177 | that are used when iterating over the supplier
178 | """
179 | # source is None
180 | if source is None and supplier is None:
181 | raise ValueError('source or supplier must be supplied')
182 | # Default attributes
183 | self._open_substream = False # should a file be opened
184 | self.filename = None # name of file to be opened
185 | self.open_fn = None # function opening file and handling compression
186 | self._handle = None # handle to opened file
187 | self._open_supplier = False # should a supplier be opened
188 | self.supplier = None # molecule supplier
189 | self.compression = None
190 | self.format = None
191 | self.kwargs = kwargs # additional parameters for suppliers
192 | self._iter_start = self.kwargs.pop('start_id', 0)
193 | self._iter_total = self.kwargs.pop('total', None)
194 | self._iter_progress = self.kwargs.pop('show_progress', None)
195 | # Handle supplier
196 | if supplier is not None:
197 | self.supplier = supplier
198 | # source is a file name
199 | elif isinstance(source, str):
200 | self.filename = source
201 | self._open_substream = True
202 | self._open_supplier = True
203 | # Handle compressions
204 | if compression is not None:
205 | if compression not in self.valid_compression:
206 | raise ValueError(f'compression must be one of {self.valid_compression}')
207 | self.compression = compression
208 | else:
209 | self.compression, self._trunc_filename = self._get_compression(self.filename)
210 | self.open_fn = self._get_compression_handler(self.compression)
211 | # Handle file types
212 | if format is not None:
213 | if format not in self.valid_formats:
214 | raise ValueError(f'format must be one of {self.valid_formats}')
215 | self.format = format
216 | else:
217 | self.format = self._get_format(self._trunc_filename)
218 | # source is file-like object
219 | elif isinstance(source, (io.TextIOBase, io.BufferedIOBase)):
220 | if format is None:
221 | raise ValueError('format must be specified with text or binary readers')
222 | self._handle = source
223 | self._open_supplier = True
224 | self.format = format
225 | else:
226 | raise ValueError('source must either be filename or file-like object')
227 | # Create rdkit suppliers
228 | if self._open_substream:
229 | self._handle = self.open_fn(self.filename)
230 | # if file name or file-like object
231 | if self._open_supplier:
232 | if self.format == 'smi':
233 | self.supplier = ForwardSmilesMolSupplier(self._handle, **self.kwargs)
234 | elif self.format == 'mae':
235 | self.supplier = MaeMolSupplier(self._handle, **self.kwargs)
236 | elif self.format in ['sd', 'mol']:
237 | self.supplier = ForwardSDMolSupplier(self._handle, **self.kwargs)
238 | elif self.format == 'mol2':
239 | self.supplier = ForwardMol2MolSupplier(self._handle, **self.kwargs)
240 |
241 | def set_start_progress_total(self, start: int = 0, progress: bool = True, total: Optional[int] = None):
242 | """Set the start, progress and total for iterating through the supplier.
243 |
244 | :param start: starting value for generated identifiers while enumerating molecules
245 | :param progress: whether a progress bar should be displayed
246 | :param total: total number of molecules in the supplier
247 | """
248 | self._iter_start = start
249 | self._iter_total = total
250 | self._iter_progress = progress
251 |
252 | def _get_compression(self, filename: str) -> Tuple[Optional[str], str]:
253 | """Get compression type and stripped filename."""
254 | if filename.endswith('.xz'):
255 | return 'lzma', filename[:-len('.xz')]
256 | elif filename.endswith('.gz'):
257 | return 'zlib', filename[:-len('.gz')]
258 | elif filename.endswith('.bz2'):
259 | return 'bz2', filename[:-len('.bz2')]
260 | else:
261 | return None, filename
262 |
263 | def _get_compression_handler(self, compression_type) -> Callable:
264 | """Get function to deal with the compression."""
265 | if compression_type == 'lzma':
266 | return lzma.open
267 | elif compression_type == 'zlib':
268 | return gzip.open
269 | elif compression_type == 'bz2':
270 | return bz2.open
271 | elif compression_type is None:
272 | return open
273 | else:
274 | raise ValueError(f'type compression not handled: {compression_type}')
275 |
276 | def _get_format(self, filename) -> str:
277 | """Get file format from filename."""
278 | if filename.endswith('.smi'):
279 | return 'smi'
280 | elif filename.endswith('.mae'):
281 | return 'mae'
282 | elif filename.endswith(('.sd', '.sdf')):
283 | return 'sd'
284 | elif filename.endswith('.mol2'):
285 | return 'mol2'
286 | elif filename.endswith('.mol'):
287 | return 'mol'
288 |
289 | def _processed_mol_supplier(self) -> Iterable[Tuple[int, Chem.Mol]]:
290 | """Generator function that reads from a rdkit molecule supplier."""
291 | # handle showing progress
292 | if self._iter_progress:
293 | pbar = tqdm(enumerate(self.supplier, self._iter_start), total=self._iter_total, ncols=100)
294 | else:
295 | pbar = enumerate(self.supplier, self._iter_start)
296 | for mol_id, rdmol in pbar:
297 | if rdmol:
298 | yield mol_id, rdmol
299 | else:
300 | warnings.warn(f'molecule {mol_id} could not be processed')
301 | continue
302 |
303 | def __enter__(self):
304 | return self
305 |
306 | def __exit__(self, exc_type, exc_value, traceback):
307 | self.close()
308 |
309 | def __iter__(self):
310 | if not hasattr(self, '_iterator'):
311 | self._iterator = self._processed_mol_supplier()
312 | for values in self._iterator:
313 | yield values
314 |
315 | def __next__(self):
316 | if not hasattr(self, '_iterator'):
317 | self._iterator = self._processed_mol_supplier()
318 | # self._iterator = self.__iter__()
319 | return next(self._iterator)
320 |
321 | def close(self):
322 | if self._open_supplier:
323 | del self.supplier
324 | self.supplier = None
325 | if self._open_substream:
326 | self._handle.close()
327 |
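328 | # Illustrative usage sketch (documentation only, not exercised elsewhere in the
329 | # package): MolSupplier wraps a possibly compressed structure file and yields
330 | # (identifier, rdkit.Chem.Mol) pairs. The file name below is a placeholder.
331 | if __name__ == '__main__':
332 |     with MolSupplier('molecules.sdf.xz', show_progress=True) as supplier:
333 |         for mol_id, mol in supplier:
334 |             print(mol_id, Chem.MolToSmiles(mol))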
--------------------------------------------------------------------------------
/src/papyrus_scripts/download.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """Download utilities of the Papyrus scripts."""
4 |
5 | import os
6 | import zipfile
7 | import shutil
8 | from typing import List, Optional, Union
9 |
10 | import requests
11 | import pystow
12 | from tqdm.auto import tqdm
13 |
14 | from .utils.IO import (get_disk_space, enough_disk_space, assert_sha256sum,
15 | read_jsonfile, write_jsonfile, get_papyrus_links)
16 |
17 |
18 | def download_papyrus(outdir: Optional[str] = None,
19 | version: Union[str, List[str]] = 'latest',
20 | nostereo: bool = True,
21 | stereo: bool = False,
22 | only_pp: bool = True,
23 | structures: bool = False,
24 | descriptors: Optional[Union[str, List[str]]] = 'all',
25 | progress: bool = True,
26 | disk_margin: float = 0.10,
27 | update_links: bool = True) -> None:
28 | """Download the Papyrus data.
29 |
30 | :param outdir: directory where Papyrus data is stored (default: pystow's directory)
31 | :param version: version of the dataset to be downloaded
32 | :param nostereo: should 2D data be downloaded
33 | :param stereo: should 3D data be downloaded
34 | :param only_pp: download only the curated Papyrus++ subset
35 | :param structures: should molecule structures be downloaded
36 | :param descriptors: should molecular and protein descriptors be downloaded
37 | :param progress: should progress be displayed
38 | :param disk_margin: fraction of total disk space to keep free
39 | :param update_links: should links be updated (allows new versions to be fetched)
40 | """
41 |
42 | # Determine download parameters
43 | CHUNKSIZE = 1048576 # 1 MB
44 | RETRIES = 3
45 | # Obtain links to files
46 | files = get_papyrus_links(offline=not update_links)
47 | available_versions = list(files.keys())
48 | if isinstance(version, list):
49 | for _version in version:
50 | if _version not in available_versions + ['latest', 'all']:
51 | raise ValueError(f'version can only be one of [{", ".join(["latest"] + available_versions)}]')
52 | # Identify version
53 | latest_version = sorted(available_versions, key=lambda s: [int(u) for u in s.split('.')])[-1]
54 | if version == 'latest':
55 | version = latest_version
56 | if progress:
57 | print(f'Latest version: {version}')
58 | elif isinstance(version, list) and 'latest' in version:
59 | for i in range(len(version)):
60 | if version[i] == 'latest':
61 | version[i] = latest_version
62 | elif version == 'all' or (isinstance(version, list) and 'all' in version):
63 | version = available_versions
64 | # Transform to list
65 | if not isinstance(version, list):
66 | version = [version]
67 | if not isinstance(descriptors, list):
68 | descriptors = [descriptors]
69 | # Remove duplicates of versions
70 | version = sorted(set(version), key=lambda s: [int(u) for u in s.split('.')])
71 | # Define root dir for downloads
72 | if outdir is not None:
73 | os.environ['PYSTOW_HOME'] = os.path.abspath(outdir)
74 | papyrus_root = pystow.module('papyrus')
75 | for _version in version:
76 | papyrus_version_root = pystow.module('papyrus', _version)
77 | # Prepare files to be downloaded
78 | downloads = set()
79 | downloads.add('readme')
80 | downloads.add('requirements')
81 | downloads.add('proteins')
82 | if nostereo:
83 | downloads.add('papyrus++')
84 | if not only_pp:
85 | downloads.add('2D_papyrus')
86 | elif progress:
87 | # Ensure this warning is printed when downloading the Papyrus++ dataset with progress on
88 | print('########## DISCLAIMER ##########\n'
89 | 'You are downloading the high-quality Papyrus++ dataset.\n'
90 | 'Should you want to access the entire, though of lower quality, Papyrus dataset,\n'
91 | 'look into additional switches of this command.\n'
92 | '################################')
93 | if structures:
94 | downloads.add('2D_structures')
95 | if 'mold2' in descriptors or 'all' in descriptors:
96 | downloads.add('2D_mold2')
97 | if 'cddd' in descriptors or 'all' in descriptors:
98 | downloads.add('2D_cddd')
99 | if 'mordred' in descriptors or 'all' in descriptors:
100 | downloads.add('2D_mordred')
101 | if 'fingerprint' in descriptors or 'all' in descriptors:
102 | downloads.add('2D_fingerprint')
103 | if stereo:
104 | downloads.add('3D_papyrus')
105 | if structures:
106 | downloads.add('3D_structures')
107 | if 'mordred' in descriptors or 'all' in descriptors:
108 | downloads.add('3D_mordred')
109 | if 'fingerprint' in descriptors or 'all' in descriptors:
110 | downloads.add('3D_fingerprint')
111 | if 'unirep' in descriptors or 'all' in descriptors:
112 | downloads.add('proteins_unirep')
113 | if 'prodec' in descriptors or 'all' in descriptors:
114 | downloads.add('proteins_prodec')
115 | # Determine total download size
116 | total = 0
117 | for ftype in downloads:
118 | if ftype == 'proteins_prodec' and ftype not in files[_version] and 'all' in descriptors:
119 | continue
120 | if isinstance(files[_version][ftype], dict):
121 | total += files[_version][ftype]['size']
122 | elif isinstance(files[_version][ftype], list):
123 | for subfile in files[_version][ftype]:
124 | total += subfile['size']
125 | else:
126 | raise ValueError('########## ERROR ##########\n'
127 | f'Papyrus versioning file corrupted: {files[_version][ftype]} '
128 | 'is neither a dict nor a list.\nThis is most likely due to bad formatting '
129 | 'of the underlying parsed JSON files. If you are not the maintainer, please '
130 | 'remove the Papyrus data (enforcing removal of the root folder) and download '
131 | 'it again before retrying.\n'
132 | '################################')
133 | if progress:
134 | print(f'Number of files to be downloaded: {len(downloads)}\n'
135 | f'Total size: {tqdm.format_sizeof(total)}B')
136 | # Verify enough disk space
137 | if not enough_disk_space(papyrus_version_root.base.as_posix(), total, disk_margin):
138 | print('########## ERROR ##########\n'
139 | f'Not enough disk space ({disk_margin:.0%} kept for safety)\n'
140 | f'Available: {tqdm.format_sizeof(get_disk_space(papyrus_version_root.base.as_posix()))}B\n'
141 | f'Required: {tqdm.format_sizeof(total)}B\n'
142 | '################################')
143 | return
144 | # Download files
145 | if progress:
146 | pbar = tqdm(total=total, desc=f'Downloading version {_version}', unit='B', unit_scale=True)
147 | for ftype in downloads:
148 | if ftype == 'proteins_prodec' and 'proteins_prodec' not in files[_version]:
149 | if 'all' in descriptors:
150 | continue
151 | else:
152 | raise ValueError(f'ProDEC descriptors not available for Papyrus version {_version}')
153 | download = files[_version][ftype]
154 | if not isinstance(download, list):
155 | download = [download]
156 | for subfile in download:
157 | dname, durl, dsize, dhash = subfile['name'], subfile['url'], subfile['size'], subfile['sha256']
158 | # Determine path
159 | if ftype in ['papyrus++', '2D_papyrus', '3D_papyrus', 'proteins', 'data_types', 'data_size',
160 | 'readme', 'license', 'requirements']:
161 | fpath = papyrus_version_root.join(name=dname).as_posix()
162 | elif ftype in ['2D_structures', '3D_structures']:
163 | fpath = papyrus_version_root.join('structures', name=dname).as_posix()
164 | else:
165 | fpath = papyrus_version_root.join('descriptors', name=dname).as_posix()
166 | # File already exists
167 | if os.path.isfile(fpath) and assert_sha256sum(fpath, dhash):
168 | if progress:
169 | pbar.update(dsize)
170 | continue # skip
171 | # Download file
172 | correct = False # ensure file is not corrupted
173 | retries = RETRIES
174 | while not correct and retries > 0: # Allow 3 failures
175 | session = requests.session()
176 | res = session.get(durl, headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) "
177 | "AppleWebKit/537.36 (KHTML, like Gecko) "
178 | "Chrome/39.0.2171.95 "
179 | "Safari/537.36"},
180 | stream=True, verify=True)
181 | with open(fpath, 'wb') as fh:
182 | for chunk in res.iter_content(chunk_size=CHUNKSIZE):
183 | fh.write(chunk)
184 | if progress:
185 | pbar.update(len(chunk))
186 | correct = assert_sha256sum(fpath, dhash)
187 | if not correct:
188 | retries -= 1
189 | if progress:
190 | if retries > 0:
191 | message = f'SHA256 hash unexpected for {dname}. Remaining download attempts: {retries}'
192 | else:
193 | message = f'SHA256 hash unexpected for {dname}. All {RETRIES} attempts failed.'
194 | pbar.write(message)
195 | os.remove(fpath)
196 | if retries == 0:
197 | if progress:
198 | pbar.close()
199 | raise IOError(f'Download failed for {dname}')
200 | # Extract if ZIP file
201 | if dname.endswith('.zip'):
202 | with zipfile.ZipFile(fpath) as zip_handle:
203 | for name in zip_handle.namelist():
204 | subpath = os.path.join(fpath, os.path.pardir)
205 | zip_handle.extract(name, subpath)
206 | os.remove(fpath)
207 | if progress:
208 | pbar.close()
209 | # Save version number
210 | json_file = papyrus_root.join(name='versions.json').as_posix()
211 | if os.path.isfile(json_file):
212 | data = read_jsonfile(json_file)
213 | data.append(_version)
214 | data = sorted(set(data))
215 | write_jsonfile(data, json_file)
216 | else:
217 | write_jsonfile([_version], json_file)
218 |
219 |
220 | def remove_papyrus(outdir: Optional[str] = None,
221 | version: Union[str, List[str]] = 'latest',
222 | papyruspp: bool = False,
223 | bioactivities: bool = False,
224 | proteins: bool = False,
225 | nostereo: bool = True,
226 | stereo: bool = False,
227 | structures: bool = False,
228 | descriptors: Union[str, List[str]] = 'all',
229 | other_files: bool = False,
230 | version_root: bool = False,
231 | papyrus_root: bool = False,
232 | force: bool = False,
233 | progress: bool = True) -> None:
234 | """Remove the Papyrus data.
235 |
236 | :param outdir: directory where Papyrus data is stored (default: pystow's directory)
237 | :param version: version of the dataset to be removed
238 | :param papyruspp: should Papyrus++ bioactivities be removed
239 | :param bioactivities: should bioactivity data be removed
240 | :param proteins: should protein data be removed
241 | :param nostereo: should the files related to 2D data be considered
242 | :param stereo: should the files related to 3D data be considered
243 | :param structures: should molecule structures be removed
244 | :param descriptors: should molecular and protein descriptors be removed
245 | :param other_files: should other files (e.g. LICENSE, README, data_types, data_size) be removed
246 | :param version_root: remove the specified version of the papyrus data, requires confirmation
247 | :param papyrus_root: remove all versions of the papyrus data, requires confirmation
248 | :param force: disable confirmation prompt
249 | :param progress: should progress be displayed
250 | """
251 | # Obtain links to files
252 | files = get_papyrus_links()
253 | # Handle exceptions
254 | available_versions = list(files.keys())
255 | if isinstance(version, list):
256 | for _version in version:
257 | if _version not in available_versions + ['latest', 'all']:
258 | raise ValueError(f'version can only be one of [{", ".join(["latest"] + available_versions)}]')
259 | # Identify version
260 | latest_version = sorted(available_versions, key=lambda s: [int(u) for u in s.split('.')])[-1]
261 | if version == 'latest':
262 | version = latest_version
263 | if progress:
264 | print(f'Latest version: {version}')
265 | elif isinstance(version, list) and 'latest' in version:
266 | for i in range(len(version)):
267 | if version[i] == 'latest':
268 | version[i] = latest_version
269 | elif version == 'all' or (isinstance(version, list) and 'all' in version):
270 | version = available_versions
271 | # Transform to list
272 | if not isinstance(version, list):
273 | version = [version]
274 | if not isinstance(descriptors, list):
275 | descriptors = [descriptors]
276 | # Remove duplicates of versions
277 | version = sorted(set(version), key=lambda s: [int(u) for u in s.split('.')])
278 | # Define root dir for removal
279 | if outdir is not None:
280 | os.environ['PYSTOW_HOME'] = os.path.abspath(outdir)
281 | papyrus_root_dir = pystow.module('papyrus')
282 | # Deep cleaning
283 | if papyrus_root:
284 | if not force:
285 | confirmation = input('Confirm the removal of all Papyrus data and versions (Y/N): ')
286 | if confirmation != 'Y':
287 | print('Removal was aborted.')
288 | return
289 | # Either forced or confirmed
290 | shutil.rmtree(papyrus_root_dir.base.as_posix())
291 | if progress:
292 | print('All Papyrus data was successfully removed.')
293 | return
294 | for _version in version:
295 | papyrus_version_root = pystow.module('papyrus', _version)
296 | # If removal of the whole version
297 | if version_root:
298 | if not force:
299 | confirmation = input(f'Confirm the removal of version {_version} of Papyrus data (Y/N): ')
300 | if confirmation != 'Y':
301 | print('Removal was aborted.')
302 | return
303 | # Either forced or confirmed
304 | shutil.rmtree(papyrus_version_root.base.as_posix())
305 | if progress:
306 | print(f'Version {_version} of Papyrus was successfully removed.')
307 | return
308 | # Prepare files to be removed
309 | removal = set()
310 | if bioactivities and papyruspp:
311 | removal.add('papyrus++')
312 | if bioactivities and nostereo:
313 | removal.add('2D_papyrus')
314 | elif bioactivities and stereo:
315 | removal.add('3D_papyrus')
316 | if proteins:
317 | removal.add('proteins')
318 | if structures and nostereo:
319 | removal.add('2D_structures')
320 | elif structures and stereo:
321 | removal.add('3D_structures')
322 | if nostereo and ('mold2' in descriptors or 'all' in descriptors):
323 | removal.add('2D_mold2')
324 | if nostereo and ('cddd' in descriptors or 'all' in descriptors):
325 | removal.add('2D_cddd')
326 | if nostereo and ('mordred' in descriptors or 'all' in descriptors):
327 | removal.add('2D_mordred')
328 | elif stereo and ('mordred' in descriptors or 'all' in descriptors):
329 | removal.add('3D_mordred')
330 | if nostereo and ('fingerprint' in descriptors or 'all' in descriptors):
331 | removal.add('2D_fingerprint')
332 | elif stereo and ('fingerprint' in descriptors or 'all' in descriptors):
333 | removal.add('3D_fingerprint')
334 | if 'unirep' in descriptors or 'all' in descriptors:
335 | removal.add('proteins_unirep')
336 | if 'prodec' in descriptors or 'all' in descriptors:
337 | removal.add('proteins_prodec')
338 | if other_files:
339 | removal.add('data_types')
340 | removal.add('data_size')
341 | removal.add('readme')
342 | removal.add('license')
343 | removal = list(removal)
344 | # Determine total removed size
345 | total = 0
346 | for i in range(len(removal) - 1, -1, -1):
347 | ftype = removal[i]
348 | data = files[_version][ftype]
349 | dname, dsize = data['name'], data['size']
350 | # Determine path
351 | if ftype in ['papyrus++', '2D_papyrus', '3D_papyrus', 'proteins', 'readme']:
352 | fpath = papyrus_version_root.join(name=dname).as_posix()
353 | elif ftype in ['2D_structures', '3D_structures']:
354 | fpath = papyrus_version_root.join('structures', name=dname).as_posix()
355 | else:
356 | fpath = papyrus_version_root.join('descriptors', name=dname).as_posix()
357 | # Handle LICENSE, data_types and data_size separately
358 | if other_files:
359 | fpath = papyrus_version_root.join(name=dname).as_posix()
360 | # These auxiliary files are not listed under 'removal' for older versions,
361 | # so they are removed here directly if present
362 | for aux in ('data_types.json', 'data_size.json', 'LICENSE.txt'):
363 |     aux_path = papyrus_version_root.join(name=aux).as_posix()
364 |     if os.path.isfile(aux_path): os.remove(aux_path)
365 | # Handle other files
366 | if os.path.isfile(fpath): # file exists
367 | total += dsize # add size to be removed
368 | else: # file does not exist
369 | del removal[i]
370 | if progress:
371 | print(f'Number of files to be removed: {len(removal)}\n'
372 | f'Total size: {tqdm.format_sizeof(total)}B')
373 | # Early stop:
374 | if len(removal) == 0:
375 | return
376 | # Remove files
377 | if progress:
378 | pbar = tqdm(total=total, desc=f'Removing files from version {_version}', unit='B', unit_scale=True)
379 | for ftype in removal:
380 | data = files[_version][ftype]
381 | dname, dsize = data['name'], data['size']
382 | # Determine path
383 | if ftype in ['papyrus++', '2D_papyrus', '3D_papyrus', 'proteins', 'data_types', 'data_size', 'readme', 'license']:
384 | fpath = papyrus_version_root.join(name=dname).as_posix()
385 | elif ftype in ['2D_structures', '3D_structures']:
386 | fpath = papyrus_version_root.join('structures', name=dname).as_posix()
387 | else:
388 | fpath = papyrus_version_root.join('descriptors', name=dname).as_posix()
389 | # File does not exist
390 | if not os.path.isfile(fpath):
391 | if progress:
392 | pbar.update(dsize)
393 | continue # skip
394 | # Remove file
395 | os.remove(fpath)
396 | if progress: pbar.update(dsize)
397 | if progress:
398 | pbar.close()
399 | # Remove version number
400 | json_file = papyrus_root_dir.join(name='versions.json').as_posix()
401 | if os.path.isfile(json_file):
402 | data = read_jsonfile(json_file)
403 | data = [v for v in data if v != _version]
404 | data = sorted(set(data))
405 | write_jsonfile(data, json_file)
406 |
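407 | # Illustrative usage sketch (documentation only; the output directory and the
408 | # descriptor choice are examples, not package defaults): download the 2D
409 | # Papyrus++ subset of the latest version together with the ECFP6 fingerprints,
410 | # then optionally remove the fingerprints again.
411 | if __name__ == '__main__':
412 |     download_papyrus(outdir='papyrus_data', version='latest', nostereo=True,
413 |                      stereo=False, only_pp=True, structures=False,
414 |                      descriptors='fingerprint', progress=True)
415 |     # remove_papyrus(outdir='papyrus_data', descriptors='fingerprint', force=True)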
--------------------------------------------------------------------------------
/src/papyrus_scripts/utils/links.json:
--------------------------------------------------------------------------------
1 | {
2 | "05.4": {
3 | "readme": {
4 | "name": "README.txt",
5 | "url": "https://zenodo.org/records/10944245/files/README.txt?download=1",
6 | "size": 8743,
7 | "sha256": "f552ae0b58121b20c9aefcce0737e5f31240d72676dc9ec559f97585aceb33ad"
8 | },
9 | "requirements": [
10 | {
11 | "name": "LICENSE.txt",
12 | "url": "https://zenodo.org/records/10944245/files/LICENSE.txt?download=1",
13 | "size": 20138,
14 | "sha256": "3b2890eacd851373001c4a14623458e3adaf1b1967939aa9c38a318e28d61c00"
15 | },
16 | {
17 | "name": "data_types.json",
18 | "url": "https://zenodo.org/records/10944245/files/data_types.json?download=1",
19 | "size": 450559,
20 | "sha256": "d80a5810d99b62680ee1a214df5d5a30f505ec335a0c221194efb91d1c23913e"
21 | },
22 | {
23 | "name": "data_size.json",
24 | "url": "https://zenodo.org/records/10944245/files/data_size.json?download=1",
25 | "size": 324,
26 | "sha256": "decbe66e14eaeccf5e0f657bb33065600b503e2902503aa59f5ffa81b7126775"
27 | }
28 | ],
29 | "papyrus++": {
30 | "name": "05.4++_combined_set_without_stereochemistry.tsv.xz",
31 | "url": "https://zenodo.org/records/10944245/files/05.4++_combined_set_without_stereochemistry.tsv.xz?download=1",
32 | "size": 40278204,
33 | "sha256": "42dcbe76b33ad541f6c54673eccffa15af64785cf844938c0f73518dfdf4404b"
34 | },
35 | "2D_papyrus": {
36 | "name": "05.4_combined_set_without_stereochemistry.tsv.xz",
37 | "url": "https://zenodo.org/records/10944245/files/05.4_combined_set_without_stereochemistry.tsv.xz?download=1",
38 | "size": 742110788,
39 | "sha256": "1a1c946917f77d9a250a181c8ef19bea4d04871915e9e75a615893a2c514684e"
40 | },
41 | "2D_structures": {
42 | "name": "05.4_combined_2D_set_without_stereochemistry.sd.xz",
43 | "url": "https://zenodo.org/records/10944245/files/05.4_combined_2D_set_without_stereochemistry.sd.xz?download=1",
44 | "size": 416640448,
45 | "sha256": "4595f726daf12a784049f20e9f9464ed0287af3a22a27f2a919399c535f633fc"
46 | },
47 | "3D_papyrus": {
48 | "name": "05.4_combined_set_with_stereochemistry.tsv.xz",
49 | "url": "https://zenodo.org/records/10944245/files/05.4_combined_set_with_stereochemistry.tsv.xz?download=1",
50 | "size": 777395668,
51 | "sha256": "56cf389030246d4525bb31cd3dfc9e5ab3afa9613535d1540c71f0f7426c778f"
52 | },
53 | "3D_structures": {
54 | "name": "05.4_combined_3D_set_with_stereochemistry.sd.xz",
55 | "url": "https://zenodo.org/records/10944245/files/05.4_combined_3D_set_with_stereochemistry.sd.xz?download=1",
56 | "size": 446702556,
57 | "sha256": "b0f04e066b7ac6b1e1f2a868ff0258b13bd8d3433023ff59c3af58317bfeb3e9"
58 | },
59 | "2D_fingerprint": {
60 | "name": "05.4_combined_2D_moldescs_ECFP6.tsv.xz",
61 | "url": "https://zenodo.org/records/10944245/files/05.4_combined_2D_moldescs_ECFP6.tsv.xz?download=1",
62 | "size": 141318356,
63 | "sha256": "4ab781cc238107f7c48f1d866eea0e2114068b6512acf74932a5b21958c9ffe0"
64 | },
65 | "3D_fingerprint": {
66 | "name": "05.4_combined_3D_moldescs_E3FP.tsv.xz",
67 | "url": "https://zenodo.org/records/10944245/files/05.4_combined_3D_moldescs_E3FP.tsv.xz?download=1",
68 | "size": 146751352,
69 | "sha256": "2b89027dad8f4e59f007dd082664a7d2a491f4f79d112fb29f14565acedfe4d0"
70 | },
71 | "2D_mordred": {
72 | "name": "05.4_combined_2D_moldescs_mordred2D.tsv.xz",
73 | "url": "https://zenodo.org/records/10944245/files/05.4_combined_2D_moldescs_mordred2D.tsv.xz?download=1",
74 | "size": 3085232504,
75 | "sha256": "d15bca59f542a6c46528e4f131cb44d8bd6b21440ab139f4175f4327c15c39c6"
76 | },
77 | "3D_mordred": {
78 | "name": "05.4_combined_3D_moldescs_mordred3D.tsv.xz",
79 | "url": "https://zenodo.org/records/10944245/files/05.4_combined_3D_moldescs_mordred3D.tsv.xz?download=1",
80 | "size": 2996851908,
81 | "sha256": "80fc4f9b2d0b89e68c289c44e9f4df78f4c08e5867cd414d6169a4e1344aead8"
82 | },
83 | "2D_cddd": {
84 | "name": "05.4_combined_2D_moldescs_CDDDs.tsv.xz",
85 | "url": "https://zenodo.org/records/10944245/files/05.4_combined_2D_moldescs_CDDDs.tsv.xz?download=1",
86 | "size": 3770082588,
87 | "sha256": "9bb0d9adba1b812aa05b6391ecbc3f0148f6ed37972a004b13772d08790a9bda"
88 | },
89 | "2D_mold2": {
90 | "name": "05.4_combined_2D_moldescs_mold2.tsv.xz",
91 | "url": "https://zenodo.org/records/10944245/files/05.4_combined_2D_moldescs_mold2.tsv.xz?download=1",
92 | "size": 1552425452,
93 | "sha256": "bdfb0cbb6e9a3d1b62065808fa0e6ce238e04760df62e34ce4f15046810efd82"
94 | },
95 | "proteins": {
96 | "name": "05.4_combined_set_protein_targets.tsv.xz",
97 | "url": "https://zenodo.org/records/10944245/files/05.4_combined_set_protein_targets.tsv.xz?download=1",
98 | "size": 1701316,
99 | "sha256": "5f49030509ce188a119910f16054558e1cdd1c70a22d2a1458ec4189f5d1a08e"
100 | },
101 | "proteins_unirep": {
102 | "name": "05.4_combined_prot_embeddings_unirep.tsv.xz",
103 | "url": "https://zenodo.org/records/10944245/files/05.4_combined_prot_embeddings_unirep.tsv.xz?download=1",
104 | "size": 138392528,
105 | "sha256": "19aa0562c3b695883c5aa8c05ad0934c4b9b851a26550345940d92ed17f36b93"
106 | }
107 | },
108 | "05.5": {
109 | "readme": {
110 | "name": "README.txt",
111 | "url": "https://zenodo.org/record/7019874/files/README.txt?download=1",
112 | "size": 11092,
113 | "sha256": "0af036c1d02b150f6402a53960a6e995611e66ee7724b61a21f58d3366ec8eda"
114 | },
115 | "requirements": [{
116 | "name": "LICENSE.txt",
117 | "url": "https://zenodo.org/records/10943207/files/LICENSE.txt?download=1",
118 | "size": 20138,
119 | "sha256": "3b2890eacd851373001c4a14623458e3adaf1b1967939aa9c38a318e28d61c00"
120 | },
121 | {
122 | "name": "data_types.json",
123 | "url": "https://zenodo.org/records/10943207/files/data_types.json?download=1",
124 | "size": 450678,
125 | "sha256": "d38f0b6b53f0450c5530b5bf44d8a7d0bb85417f22b7c818237e3346fe68149c"
126 | },
127 | {
128 | "name": "data_size.json",
129 | "url": "https://zenodo.org/records/10943207/files/data_size.json?download=1",
130 | "size": 324,
131 | "sha256": "513307863c4acc779789340e900821ff8f38c845865aa078edc649caa1559dcc"
132 | }],
133 | "papyrus++": {
134 | "name": "05.5++_combined_set_without_stereochemistry.tsv.xz",
135 | "url": "https://zenodo.org/records/10943207/files/05.5++_combined_set_without_stereochemistry.tsv.xz?download=1",
136 | "size": 41357608,
137 | "sha256": "8ecaea9533f3c475dca6d335f30dd1b4abb259fa77b7441548dd15879e1afa58"
138 | },
139 | "2D_papyrus": {
140 | "name": "05.5_combined_set_without_stereochemistry.tsv.xz",
141 | "url": "https://zenodo.org/record/7019874/files/05.5_combined_set_without_stereochemistry.tsv.xz?download=1",
142 | "size": 718601992,
143 | "sha256": "04ecaea97c09d02dbde809ad99ea2127fc3997a4e3b200b56dee85c30801890a"
144 | },
145 | "2D_structures": {
146 | "name": "05.5_combined_2D_set_without_stereochemistry.sd.xz",
147 | "url": "https://zenodo.org/record/7019874/files/05.5_combined_2D_set_without_stereochemistry.sd.xz?download=1",
148 | "size": 399767580,
149 | "sha256": "2e088ca662c5c33c5fc018c42c9c21e918ec167f1129a0a11fbf9c72888e8be6"
150 | },
151 | "3D_papyrus": {
152 | "name": "05.5_combined_set_with_stereochemistry.tsv.xz",
153 | "url": "https://zenodo.org/record/7019874/files/05.5_combined_set_with_stereochemistry.tsv.xz?download=1",
154 | "size": 690498416,
155 | "sha256": "822aca70ccf4c19879ae45dfa16de5fc29c3ee08b25739e7a087899652af7dd9"
156 | },
157 | "3D_structures": {
158 | "name": "05.5_combined_3D_set_with_stereochemistry.sd.xz",
159 | "url": "https://zenodo.org/record/7019874/files/05.5_combined_3D_set_with_stereochemistry.sd.xz?download=1",
160 | "size": 492426264,
161 | "sha256": "a4a5355ffc56de8d914c2ad281d10c227171c27e4d6c250daad14a16280cf136"
162 | },
163 | "2D_fingerprint": {
164 | "name": "05.5_combined_2D_moldescs_ECFP6.tsv.xz",
165 | "url": "https://zenodo.org/record/7019874/files/05.5_combined_2D_moldescs_ECFP6.tsv.xz?download=1",
166 | "size": 97818228,
167 | "sha256": "3d626b4295cfbe73877157d8eea84b911a3cb60bf9571165d88c00cc0b0880d2"
168 | },
169 | "3D_fingerprint": {
170 | "name": "05.5_combined_3D_moldescs_E3FP.tsv.xz",
171 | "url": "https://zenodo.org/record/7019874/files/05.5_combined_3D_moldescs_E3FP.tsv.xz?download=1",
172 | "size": 114052016,
173 | "sha256": "446fe36d50487f29a2d7402a53cc661097e884dc0df8ffd278646dba6708cb65"
174 | },
175 | "2D_mordred": {
176 | "name": "05.5_combined_2D_moldescs_mordred2D.tsv.xz",
177 | "url": "https://zenodo.org/record/7019874/files/05.5_combined_2D_moldescs_mordred2D.tsv.xz?download=1",
178 | "size": 2936434876,
179 | "sha256": "bcef94b1c04a1e7d8f9da11ad87e598e19932548a8ea4f00029c2f3a89672ff4"
180 | },
181 | "3D_mordred": {
182 | "name": "05.5_combined_3D_moldescs_mordred3D.tsv.xz",
183 | "url": "https://zenodo.org/record/7019874/files/05.5_combined_3D_moldescs_mordred3D.tsv.xz?download=1",
184 | "size": 3206020732,
185 | "sha256": "e6ffd0858f85217b57c4a88619e5f41d7f6bae16a9948612872162e54d3231dc"
186 | },
187 | "2D_cddd": {
188 | "name": "05.5_combined_2D_moldescs_CDDDs.tsv.xz",
189 | "url": "https://zenodo.org/record/7019874/files/05.5_combined_2D_moldescs_CDDDs.tsv.xz?download=1",
190 | "size": 3775676256,
191 | "sha256": "8421d973b4eb119f0739506a0b20ba9508356df97d4673e1c170e871cd134983"
192 | },
193 | "2D_mold2": {
194 | "name": "05.5_combined_2D_moldescs_mold2.tsv.xz",
195 | "url": "https://zenodo.org/record/7019874/files/05.5_combined_2D_moldescs_mold2.tsv.xz?download=1",
196 | "size": 1553510028,
197 | "sha256": "0fd1c2b3869c5fa749c21ddd70c5dff621974eccafb8e04fd6f95f3b37242058"
198 | },
199 | "proteins": {
200 | "name": "05.5_combined_set_protein_targets.tsv.xz",
201 | "url": "https://zenodo.org/record/7019874/files/05.5_combined_set_protein_targets.tsv.xz?download=1",
202 | "size": 1710756,
203 | "sha256": "d8f2cbee8b9849f7c3664fe7e8165c5abf785d374c36a8f151a6ec38fd582d80"
204 | },
205 | "proteins_unirep": {
206 | "name": "05.5_combined_prot_embeddings_unirep.tsv.xz",
207 | "url": "https://zenodo.org/record/7019874/files/05.5_combined_prot_embeddings_unirep.tsv.xz?download=1",
208 | "size": 128869632,
209 | "sha256": "9f1fce00e77563481eafc44405f9dc8188d5669ed93cafaee256c0208ca135b8"
210 | }
211 | },
212 | "05.6": {
213 | "readme": {
214 | "name": "README.txt",
215 | "url": "https://zenodo.org/record/7377161/files/README.txt?download=1",
216 | "size": 12170,
217 | "sha256": "c60b7146a295ddbd7d1cc0d7815ffa9389d5e93deb0e2a577b1065abcb468e03"
218 | },
219 | "requirements": {
220 | "name": "05.6_additional_files.zip",
221 | "url": "https://zenodo.org/record/7377161/files/05.6_additional_files.zip?download=1",
222 | "size": 51310,
223 | "sha256": "c1d8df814ba54e17619f3740ff82577898a85a07acd220822403874159e26d8a"
224 | },
225 | "papyrus++": {
226 | "name": "05.6++_combined_set_without_stereochemistry.tsv.xz",
227 | "url": "https://zenodo.org/records/7821775/files/05.6++_combined_set_without_stereochemistry.tsv.xz?download=1",
228 | "size": 31085780,
229 | "sha256": "7518019c3ba287cd4cd0ff29425fe9da8a4760d891d22ed1abb33da4920cf96a"
230 | },
231 | "2D_papyrus": {
232 | "name": "05.6_combined_set_without_stereochemistry.tsv.xz",
233 | "url": "https://zenodo.org/record/7377161/files/05.6_combined_set_without_stereochemistry.tsv.xz?download=1",
234 | "size": 744449364,
235 | "sha256": "82a36ed7bb2f80846bb46e4c3e38905895bd1a2cfddd471d32091cb59dcf9437"
236 | },
237 | "2D_structures": {
238 | "name": "05.6_combined_2D_set_without_stereochemistry.sd.xz",
239 | "url": "https://zenodo.org/record/7377161/files/05.6_combined_2D_set_without_stereochemistry.sd.xz?download=1",
240 | "size": 439758444,
241 | "sha256": "1ec001964aca301494ea05fc24529120f01bc6952dcf4276dcd03625dfec460d"
242 | },
243 | "3D_papyrus": {
244 | "name": "05.6_combined_set_with_stereochemistry.tsv.xz",
245 | "url": "https://zenodo.org/record/7377161/files/05.6_combined_set_with_stereochemistry.tsv.xz?download=1",
246 | "size": 711529352,
247 | "sha256": "62068d500986b78fc90fe82b9e224555f8ca85319cd19f9df8bc73549e8a3e31"
248 | },
249 | "3D_structures": {
250 | "name": "05.6_combined_3D_set_with_stereochemistry.sd.xz",
251 | "url": "https://zenodo.org/record/7377161/files/05.6_combined_3D_set_with_stereochemistry.sd.xz?download=1",
252 | "size": 500108592,
253 | "sha256": "38e39963cd79845b4adca9dea871ffba18576ea742677471fc46a73a7dabbf38"
254 | },
255 | "2D_fingerprint": {
256 | "name": "05.6_combined_2D_moldescs_ECFP6.tsv.xz",
257 | "url": "https://zenodo.org/record/7377161/files/05.6_combined_2D_moldescs_ECFP6.tsv.xz?download=1",
258 | "size": 96612972,
259 | "sha256": "01c7366ee2ca7353d3a9f76601702b6d2eb312e71f02ea8ef48e2f90870c266c"
260 | },
261 | "3D_fingerprint": {
262 | "name": "05.6_combined_3D_moldescs_E3FP.tsv.xz",
263 | "url": "https://zenodo.org/record/7377161/files/05.6_combined_3D_moldescs_E3FP.tsv.xz?download=1",
264 | "size": 117065432,
265 | "sha256": "0d15baa4a9425daf63a0066511e9e96cbd5d7dab223bdaf48803536ab2484dc2"
266 | },
267 | "2D_mordred": {
268 | "name": "05.6_combined_2D_moldescs_mordred2D.tsv.xz",
269 | "url": "https://zenodo.org/record/7377161/files/05.6_combined_2D_moldescs_mordred2D.tsv.xz?download=1",
270 | "size": 3055443236,
271 | "sha256": "c497db85e97542f59b5252e2b1d3bdd93604e5c4d2ea131088a87d79ea6954c3"
272 | },
273 | "3D_mordred": {
274 | "name": "05.6_combined_3D_moldescs_mordred3D.tsv.xz",
275 | "url": "https://zenodo.org/record/7377161/files/05.6_combined_3D_moldescs_mordred3D.tsv.xz?download=1",
276 | "size": 3324119256,
277 | "sha256": "6b022acb6a0bec8bfc1ae7585014ae0b812a12ddcbed7be4ac7ec073c662192f"
278 | },
279 | "2D_cddd": {
280 | "name": "05.6_combined_2D_moldescs_CDDDs.tsv.xz",
281 | "url": "https://zenodo.org/record/7377161/files/05.6_combined_2D_moldescs_CDDDs.tsv.xz?download=1",
282 | "size": 2103289016,
283 | "sha256": "fbb54e5ca9a28ff022dc5baddf87cb6601169a2d86f3b55db4d183fd3885642a"
284 | },
285 | "2D_mold2": {
286 | "name": "05.6_combined_2D_moldescs_mold2.tsv.xz",
287 | "url": "https://zenodo.org/record/7377161/files/05.6_combined_2D_moldescs_mold2.tsv.xz?download=1",
288 | "size": 1487710808,
289 | "sha256": "cd46ce9841a1f956840b4fe7c56310eaa32c5e957a6ffaca62fbc55f820aad99"
290 | },
291 | "proteins": {
292 | "name": "05.6_combined_set_protein_targets.tsv.xz",
293 | "url": "https://zenodo.org/record/7377161/files/05.6_combined_set_protein_targets.tsv.xz?download=1",
294 | "size": 1850764,
295 | "sha256": "f443a2f8c74b8eb3f2c9d1bd7bfbddc86cbcc3fd5e8e505b7057b78a4ad17ee1"
296 | },
297 | "proteins_unirep": {
298 | "name": "05.6_combined_protdescs_unirep.tsv.xz",
299 | "url": "https://zenodo.org/record/7377161/files/05.6_combined_protdescs_unirep.tsv.xz?download=1",
300 | "size": 207122632,
301 | "sha256": "47f37c1f1efcb9d6f002d5a096853975c27ddcc767d7903a8af12bac0439181c"
302 | },
303 | "proteins_prodec": {
304 | "name": "05.6_combined_protdescs_ProDEC.tsv.xz",
305 | "url": "https://zenodo.org/record/7377161/files/05.6_combined_protdescs_ProDEC.tsv.xz?download=1",
306 | "size": 447818916,
307 | "sha256": "3211a62f18ccb7ccc13f885374c1462efeb83ab0e98ed62d2645723f7dc9f1a1"
308 | }
309 | },
310 | "05.7": {
311 | "readme": {
312 | "name": "README.txt",
313 | "url": "https://zenodo.org/records/13987985/files/README.txt?download=1",
314 | "size": 12813,
315 | "sha256": "fab159a88e302cad3e5b84ccea72a7c9fb2c212f656324d6191865460511f50d"
316 | },
317 | "requirements": {
318 | "name": "05.7_additional_files.zip",
319 | "url": "https://zenodo.org/records/13987985/files/05.7_additional_files.zip?download=1",
320 | "size": 113945,
321 | "sha256": "0621cd63111286777769e6ea1c59e7adc1d05833bb1f61e50ba9e5be189d60da"
322 | },
323 | "papyrus++": {
324 | "name": "05.7++_combined_set_without_stereochemistry.tsv.xz",
325 | "url": "https://zenodo.org/records/13987985/files/05.7++_combined_set_without_stereochemistry.tsv.xz?download=1",
326 | "size": 56759540 ,
327 | "sha256": "8004e0d1027a760f205b45264386f792e7d49658da39f77f52e660a6f19760dd"
328 | },
329 | "2D_papyrus": {
330 | "name": "05.7_combined_set_without_stereochemistry.tsv.xz",
331 | "url": "https://zenodo.org/records/13987985/files/05.7_combined_set_without_stereochemistry.tsv.xz?download=1",
332 | "size": 751521856,
333 | "sha256": "9a7657f2636473ea1f4b033c3abbc9709608517f262f97e8adcc8f59d4f1189b"
334 | },
335 | "2D_structures": {
336 | "name": "05.7_combined_2D_set_without_stereochemistry.sd.xz",
337 | "url": "https://zenodo.org/records/13987985/files/05.7_combined_2D_set_without_stereochemistry.sd.xz?download=1",
338 | "size": 453721500 ,
339 | "sha256": "56be7c058130e9e861d884dc6094cf0ac4c3f37a75c7d2c4302685c4720f69ae"
340 | },
341 | "3D_papyrus": {
342 | "name": "05.7_combined_set_with_stereochemistry.tsv.xz",
343 | "url": "https://zenodo.org/records/13987985/files/05.7_combined_set_with_stereochemistry.tsv.xz?download=1",
344 | "size": 715716852,
345 | "sha256": "88a965ef8827692b1489bc947249e9fc00287ab6b63cbd2767862080a98e9a4c"
346 | },
347 | "3D_structures": {
348 | "name": "05.7_combined_3D_set_with_stereochemistry.sd.xz",
349 | "url": "https://zenodo.org/records/13987985/files/05.7_combined_3D_set_with_stereochemistry.sd.xz?download=1",
350 | "size": 507933052,
351 | "sha256": "8f1490a701e918e013770ea589651825ca2a459b214f50d6ff9ce892af398def"
352 | },
353 | "2D_fingerprint": {
354 | "name": "05.7_combined_2D_moldescs_ECFP6.tsv.xz",
355 | "url": "https://zenodo.org/records/13987985/files/05.7_combined_2D_moldescs_ECFP6.tsv.xz?download=1",
356 | "size": 99889764,
357 | "sha256": "6689cd5d3841abc350cb2dba719a2af02e119af2a595f15790ad14e5c4ace108"
358 | },
359 | "3D_fingerprint": {
360 | "name": "05.7_combined_3D_moldescs_E3FP.tsv.xz",
361 | "url": "https://zenodo.org/records/13987985/files/05.7_combined_3D_moldescs_E3FP.tsv.xz?download=1",
362 | "size": 119952372,
363 | "sha256": "6c247e785e5885e08ecebc2b452e05dcbb24395adabdef71b903d6491e9ae096"
364 | },
365 | "2D_mordred": {
366 | "name": "05.7_combined_2D_moldescs_mordred2D.tsv.xz",
367 | "url": "https://zenodo.org/records/13987985/files/05.7_combined_2D_moldescs_mordred2D.tsv.xz?download=1",
368 | "size": 3149391660,
369 | "sha256": "26781e0879af798a6b7df4e6c515efd79599335a04706d5335fdc8e5c5565fc3"
370 | },
371 | "3D_mordred": {
372 | "name": "05.7_combined_3D_moldescs_mordred3D.tsv.xz",
373 | "url": "https://zenodo.org/records/13987985/files/05.7_combined_3D_moldescs_mordred3D.tsv.xz?download=1",
374 | "size": 3421107392,
375 | "sha256": "c03f3d4c702815d5bfa2ddf80e20717d0bd13a542b0ca30e62534126eef03b0d"
376 | },
377 | "2D_cddd": {
378 | "name": "05.7_combined_2D_moldescs_CDDDs.tsv.xz",
379 | "url": "https://zenodo.org/records/13987985/files/05.7_combined_2D_moldescs_CDDDs.tsv.xz?download=1",
380 | "size": 2167302956,
381 | "sha256": "d86539cc76a537878725c4ef8a6703d316add737cb51915ad203e346fe92f6c9"
382 | },
383 | "2D_mold2": {
384 | "name": "05.7_combined_2D_moldescs_mold2.tsv.xz",
385 | "url": "https://zenodo.org/records/13987985/files/05.7_combined_2D_moldescs_mold2.tsv.xz?download=1",
386 | "size": 1539905628,
387 | "sha256": "493436f96d30680568c2a70ed08d76a159b916c57e9df03f639ae7b414fb87cb"
388 | },
389 | "proteins": {
390 | "name": "05.7_combined_set_protein_targets.tsv.xz",
391 | "url": "https://zenodo.org/records/13987985/files/05.7_combined_set_protein_targets.tsv.xz?download=1",
392 | "size": 1780032,
393 | "sha256": "832e564fb82daea0e4da79abcb44834d10104229382874e79915a1288d80783c"
394 | },
395 | "proteins_unirep": {
396 | "name": "05.7_combined_protdescs_unirep.tsv.xz",
397 | "url": "https://zenodo.org/records/13987985/files/05.7_combined_protdescs_unirep.tsv.xz?download=1",
398 | "size": 209957000,
399 | "sha256": "a39f21db7ed5ba72ef881a9f05f5362f7aaaa0f2709c023a0060417678f30dec"
400 | },
401 | "proteins_prodec": {
402 | "name": "05.7_combined_protdescs_ProDEC.tsv.xz",
403 | "url": "https://zenodo.org/records/13987985/files/05.7_combined_protdescs_ProDEC.tsv.xz?download=1",
404 | "size": 435015384,
405 | "sha256": "1ec2d7b0cd95c93aaabacf4153e58e464e4327f0ebb3bad0077fd740b7334cb1"
406 | }
407 | }
408 | }
409 |
--------------------------------------------------------------------------------
/src/papyrus_scripts/utils/IO.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """IO functions."""
4 |
5 | from __future__ import annotations
6 |
7 | import glob
8 | import hashlib
9 | import importlib
10 | import inspect
11 | import json
12 | import os
13 | import re
14 | from collections import namedtuple
15 |
16 | import requests
17 | import shutil
18 | import lzma
19 | import gzip
20 | from typing import List, Optional
21 |
22 | import pystow
23 | import pandas as pd
24 | from tqdm.auto import tqdm
25 |
26 |
27 | def sha256sum(filename, blocksize=None):
28 | """Compute the SHA256 hex digest of a file, reading it block by block."""
29 | blocksize = blocksize or 65536
30 | hash = hashlib.sha256()
31 | with open(filename, "rb") as fh:
32 | for block in iter(lambda: fh.read(blocksize), b""):
33 | hash.update(block)
34 | return hash.hexdigest()
35 |
36 |
37 | def assert_sha256sum(filename, sha256, blocksize=None):
38 | if not (isinstance(sha256, str) and len(sha256) == 64):
39 | raise ValueError("SHA256 must be 64 chars: {}".format(sha256))
40 | sha256_actual = sha256sum(filename, blocksize)
41 | return sha256_actual == sha256
42 |
43 |
44 | def write_jsonfile(data: object, json_outfile: str) -> None:
45 | """Write a json object to a file with lazy formatting."""
46 | with open(json_outfile, 'w') as outfile:
47 | json.dump(data, outfile, indent=4)
48 |
49 |
50 | def read_jsonfile(json_infile: str) -> dict:
51 | """Read in a json file and return the json object."""
52 | if not os.path.isfile(json_infile):
53 | return {}
54 | with open(json_infile) as infile:
55 | data = json.load(infile)
56 | return data
57 |
58 |
59 | class TypeEncoder(json.JSONEncoder):
60 | """Custom json encoder to support types as values."""
61 |
62 | def default(self, obj):
63 | """Add support if value is a type."""
64 | if isinstance(obj, type):
65 | return {'__type__': {'module': inspect.getmodule(obj).__name__,
66 | 'type': obj.__name__}
67 | }
68 | # Let the base class default method raise the TypeError
69 | return json.JSONEncoder.default(self, obj)
70 |
71 |
72 | class TypeDecoder(json.JSONDecoder):
73 | """Custom json decoder to support types as values."""
74 |
75 | def __init__(self, *args, **kwargs):
76 | """Simple json decoder handling types as values."""
77 | json.JSONDecoder.__init__(self, object_hook=self.object_hook, *args, **kwargs)
78 |
79 | def object_hook(self, obj):
80 | """Handle types."""
81 | if '__type__' not in obj:
82 | return obj
83 | module = obj['__type__']['module']
84 | type_ = obj['__type__']['type']
85 | if module == 'builtins':
86 | return getattr(importlib.import_module('builtins'), type_)
87 | loaded_module = importlib.import_module(module)
88 | return getattr(loaded_module, type_)
89 |
90 |
91 | def enough_disk_space(destination: str,
92 | required: int,
93 | margin: float = 0.10):
94 | """Check disk has enough space.
95 |
96 | :param destination: folder to check
97 | :param required: space required in bytes
98 | :param margin: percent of free disk space once file is written
99 | """
100 | total, _, free = shutil.disk_usage(destination)
101 | return free - required > margin * total
102 |
103 |
104 | def get_disk_space(destination: str):
105 | """Obtain size of free disk space.
106 |
107 | :param destination: folder to check
108 | """
109 | _, _, free = shutil.disk_usage(destination)
110 | return free
111 |
112 |
113 | def get_downloaded_versions(root_folder: str = None) -> dict:
114 | """Identify versions of the downloaded Papyrus data
115 |
116 | :param root_folder: folder containing the bioactivity dataset (default: pystow's home folder)
117 | """
118 | if root_folder is not None:
119 | os.environ['PYSTOW_HOME'] = os.path.abspath(root_folder)
120 | version_json = pystow.join('papyrus', name='versions.json').as_posix()
121 | return read_jsonfile(version_json)
122 |
123 |
124 | def get_downloaded_papyrus_files(root_folder: str = None) -> pd.DataFrame:
125 | """Identify downloaded files for each version of the Papyrus data
126 |
127 | :param root_folder: folder containing the bioactivity dataset (default: pystow's home folder)
128 | """
129 | # Obtain versions downloaded
130 | downloaded_versions = get_downloaded_versions(root_folder)
131 | # Obtain filenames that could have been downloaded
132 | files = get_papyrus_links(offline=True)
133 | # Keep only file names
134 | file_info = namedtuple('file_info', ('version', 'short_name', 'file_name'))
135 | files = [file_info(version, file, file_data['name'])
136 | for version in downloaded_versions
137 | for file, file_data in files[version].items()
138 | if file in ['papyrus++', '2D_papyrus', '3D_papyrus', '2D_structures', '3D_structures',
139 | '2D_fingerprint', '3D_fingerprint', '2D_mordred', '3D_mordred',
140 | '2D_cddd', '2D_mold2', 'proteins', 'proteins_unirep', 'proteins_prodec']]
141 | # Try to locate files
142 | # Uses glob to prevent maintaining a mapping of subfolders and file names
143 | # This does not check files have been downloaded in the right subfolders
144 | data = pd.DataFrame([{'version': file.version,
145 | 'short_name': file.short_name,
146 | 'downloaded': len(glob.glob(
147 | os.path.join(pystow.module('papyrus', file.version).base.as_posix(), '**',
148 | file.file_name), recursive=True)) > 0}
149 | for file in files])
150 | return data
151 |
152 |
153 | def get_latest_downloaded_version(root_folder: str = None) -> str:
154 | """Identify the latest version of the downloaded Papyrus data
155 |
156 | :param root_folder: folder containing the bioactivity dataset (default: pystow's home folder)
157 | """
158 | if root_folder is not None:
159 | os.environ['PYSTOW_HOME'] = os.path.abspath(root_folder)
160 | version_json = pystow.join('papyrus', name='versions.json').as_posix()
161 | versions = read_jsonfile(version_json)
162 | return sorted(versions, key=lambda s: [int(u) for u in s.split('.')])[-1]
163 |
164 |
165 | def get_online_versions() -> List[str]:
166 | """Identify the versions of the Papyrus data available online
167 |
168 | :return: a list of the versions available
169 | """
170 | papyrus_links = get_papyrus_links()
171 | return sorted(papyrus_links.keys(), key=lambda s: [int(u) for u in s.split('.')]) + ['latest']
172 |
173 |
174 | def process_data_version(version: str | PapyrusVersion, root_folder: str = None):
175 | """Confirm the version is available, downloaded and convert synonyms.
176 |
177 | :param version: version to be confirmed and/or converted.
178 | :param root_folder: folder containing the bioactivity dataset (default: pystow's home folder)
179 | :return: version number
180 |     :raises: IOError if the version is not available
181 | """
182 | # Check if aliases
183 | if not isinstance(version, PapyrusVersion):
184 | version = PapyrusVersion(version=version)
185 | # Handle exceptions
186 | available_versions = get_downloaded_versions(root_folder)
187 | if len(available_versions) == 0:
188 | raise IOError('Papyrus data not available (did you download it first?)')
189 | else:
190 | available_versions += ['latest']
191 | if version.version_old_fmt not in available_versions:
192 | raise ValueError(f'version can only be one of [{", ".join(available_versions)}] not {version.version_old_fmt}')
193 | elif version == 'latest':
194 | version = get_latest_downloaded_version(root_folder)
195 | return version
196 |
197 |
198 | def is_local_version_available(version: str, root_folder: str = None):
199 | """Confirm the version is available and downloaded
200 |
201 | :param version: version to check the local availability.
202 | :param root_folder: folder containing the bioactivity dataset (default: pystow's home folder)
203 | :return: True if the version is available locally, False otherwise
204 | """
205 | try:
206 | _ = process_data_version(version=version, root_folder=root_folder)
207 | return True
208 | except (IOError, ValueError):
209 | return False
210 |
211 | def locate_file(dirpath: str, regex_pattern: str):
212 | """Find file(s) matching the given pattern in the given directory
213 |
214 | :param dirpath: Path to the directory to obtain the file from
215 | :param regex_pattern: Pattern used to locate the file(s)
216 | :return: a list of files matching the pattern and in the given directory
217 | """
218 | # Handle exceptions
219 | if not os.path.isdir(dirpath):
220 | raise NotADirectoryError(f'Directory does not exist: {dirpath}')
221 | # Find the file
222 | filenames = [os.path.join(dirpath, fname) for fname in os.listdir(dirpath) if re.search(regex_pattern, fname)]
223 | # Handle WSL ZoneIdentifier files
224 | filenames = [fname for fname in filenames if not fname.endswith(':ZoneIdentifier')]
225 | if len(filenames) == 0:
226 | raise FileNotFoundError(f'Could not locate a file in {dirpath} matching {regex_pattern}')
227 | return filenames
228 |
229 |
230 | def get_num_rows_in_file(filetype: str, is3D: bool, descriptor_name: Optional[str] = None,
231 | version: str | PapyrusVersion = 'latest',
232 | plusplus: bool = True, root_folder: Optional[str] = None) -> int:
233 | """Get the number of rows a Papyrus file has.
234 |
235 |
236 | :param filetype: Type of file, one of {'bioactivities', 'structures', 'descriptors'}
237 | :param is3D: Whether to consider the standardised (2D) or non-standardised (3D) data
238 |     :param descriptor_name: Name of the descriptor, one of {'cddd', 'mold2', 'mordred', 'fingerprint'},
239 |                             only considered if filetype='descriptors'.
240 |     :param version: Version of Papyrus to be considered
241 |     :param plusplus: Whether bioactivities come from the Papyrus++ very high quality curated set,
242 |                      only considered if filetype='bioactivities'.
243 | :param root_folder: folder containing the bioactivity dataset (default: pystow's home folder)
244 | :return: The number of lines in the corresponding file
245 | """
246 | if filetype not in ['bioactivities', 'structures', 'descriptors']:
247 | raise ValueError('filetype must be one of [\'bioactivities\', \'structures\', \'descriptors\']')
248 | if filetype == 'descriptors' and (
249 | descriptor_name is None or descriptor_name not in ['cddd', 'mold2', 'mordred', 'fingerprint']):
250 |         raise ValueError('descriptor_name must be one of [\'cddd\', \'mold2\', \'mordred\', \'fingerprint\']')
251 | # Process version shortcuts
252 | version = process_data_version(version=version, root_folder=root_folder)
253 | if root_folder is not None:
254 | os.environ['PYSTOW_HOME'] = os.path.abspath(root_folder)
255 | json_file = pystow.join('papyrus', version.version_old_fmt, name='data_size.json').as_posix()
256 | # Obtain file sizes (number of lines)
257 | sizes = read_jsonfile(json_file)
258 | if filetype == 'bioactivities':
259 | if plusplus:
260 | if 'papyrus_++' in sizes.keys():
261 | return sizes['papyrus_++']
262 | else:
263 | return sizes['papyrus++']
264 | return sizes['papyrus_3D'] if is3D else sizes['papyrus_2D']
265 | elif filetype == 'structures':
266 | return sizes['structures_3D'] if is3D else sizes['structures_2D']
267 | elif filetype == 'descriptors':
268 | if descriptor_name == 'cddd':
269 | return sizes['cddd']
270 | elif descriptor_name == 'mold2':
271 | return sizes['mold2']
272 | elif descriptor_name == 'fingerprint':
273 | return sizes['E3FP'] if is3D else sizes['ECFP6']
274 | elif descriptor_name == 'mordred':
275 | return sizes['mordred_3D'] if is3D else sizes['mordred_2D']
276 |
277 |
278 | def get_papyrus_links(offline: bool = False):
279 | """Obtain the latest links to Papyrus data files from GitHub.
280 |
281 |     If the connection to the GitHub server succeeds, the
282 |     local copy of the file is updated.
283 |     Otherwise, it falls back to the local copy of the file.
284 |
285 | :param offline: do not attempt to download the latest file from GitHub
286 | """
287 | local_file = os.path.join(os.path.dirname(__file__), 'links.json')
288 | if not offline:
289 | url = "https://raw.githubusercontent.com/OlivierBeq/Papyrus-scripts/db-links/links.json"
290 | session = requests.session()
291 | try:
292 | res = session.get(url, verify=True)
293 | with open(local_file, 'w') as oh:
294 | oh.write(res.text)
295 | except requests.exceptions.ConnectionError as e:
296 | pass
297 | with open(local_file) as fh:
298 | data = json.load(fh)
299 | return data
300 |
301 |
302 | def get_papyrus_aliases(offline: bool = False):
303 | """Obtain the latest aliases of the Papyrus versions from GitHub.
304 |
305 |     If the connection to the GitHub server succeeds, the
306 |     local copy of the file is updated.
307 |     Otherwise, it falls back to the local copy of the file.
308 |
309 | :param offline: do not attempt to download the latest file from GitHub
310 | """
311 | local_file = os.path.join(os.path.dirname(__file__), 'aliases.json')
312 | if not offline:
313 | url = "https://raw.githubusercontent.com/OlivierBeq/Papyrus-scripts/db-links/aliases.json"
314 | session = requests.session()
315 | try:
316 | res = session.get(url, verify=True)
317 | with open(local_file, 'w') as oh:
318 | oh.write(res.text)
319 | except requests.exceptions.ConnectionError as e:
320 | pass
321 | data = pd.read_json(local_file, orient='split', dtype={'version': 'str', 'alias': 'str',
322 | 'revision': 'str', 'chembl_version': 'str'})
323 | return data
324 |
325 |
326 | def convert_xz_to_gz(input_file: str, output_file: str,
327 | compression_level: int = 9,
328 | progress: bool = False) -> None:
329 | """Convert a LZMA-compressed xz file to a GZIP-compressed file.
330 |
331 | :param input_file: Path of the input file
332 | :param output_file: Path of the output file
333 | :param compression_level: Compression level of the output file (if None, defaults to 9)
334 | :param progress: Show conversion progress.
335 | """
336 | if compression_level is None:
337 | compression_level = 9
338 | # Transform per chunk
339 | chunksize = 10 * 1048576 # 10 MB
340 | with lzma.open(input_file, 'rb') as fh, gzip.open(output_file, 'wb', compresslevel=compression_level) as oh:
341 | if progress:
342 | pbar = tqdm(desc='Determining size', unit='B', unit_scale=True)
343 | size = fh.seek(0, 2) # Determine original size
344 | _ = fh.seek(0, 0) # Go back to the beginning
345 | pbar.set_description('Converting')
346 | pbar.total = size
347 | # pbar = tqdm(total=size, desc='Converting', unit='B', unit_scale=True)
348 | while True:
349 | chunk = fh.read(chunksize)
350 | if not chunk:
351 | if progress:
352 | pbar.close()
353 | break
354 | written = oh.write(chunk)
355 | if progress:
356 | pbar.update(written)
357 |
358 |
359 | def convert_gz_to_xz(input_file: str, output_file: str,
360 | compression_level: int = lzma.PRESET_DEFAULT,
361 | extreme: bool = False,
362 | progress: bool = False) -> None:
363 | """Convert a GZIP- compressed file to a LZMA-compressed xz file.
364 |
365 | :param input_file: Path of the input file
366 | :param output_file: Path of the output file
367 | :param compression_level: Compression level of the output file (if None, defaults to 6)
368 | :param extreme: Should extreme compression be toggled on top of the compression level
369 | :param progress: Show conversion progress.
370 | """
371 | if compression_level is None:
372 | compression_level = lzma.PRESET_DEFAULT
373 | preset = compression_level | lzma.PRESET_EXTREME if extreme else compression_level
374 | # Transform per chunk
375 | chunksize = 10 * 1048576 # 10 MB
376 | with gzip.open(input_file, 'rb') as fh, lzma.open(output_file, 'wb', preset=preset) as oh:
377 | if progress:
378 | pbar = tqdm(desc='Determining size', unit='B', unit_scale=True)
379 | size = fh.seek(0, 2) # Determine original size
380 | _ = fh.seek(0, 0) # Go back to the beginning
381 | pbar.set_description('Converting')
382 | pbar.total = size
383 | # pbar = tqdm(total=size, desc='Converting', unit='B', unit_scale=True)
384 | while True:
385 | chunk = fh.read(chunksize)
386 | if not chunk:
387 | if progress:
388 | pbar.close()
389 | break
390 | written = oh.write(chunk)
391 | if progress:
392 | pbar.update(written)
393 |
394 |
395 | class PapyrusVersion:
396 |
397 | aliases = get_papyrus_aliases(offline=True)
398 |
399 | def __init__(self, version: Optional[str] = None, chembl_version: Optional[int] = None,
400 | chembl: Optional[bool] = None, excape: Optional[bool] = None,
401 | sharma: Optional[bool] = None, christmann: Optional[bool] = None,
402 | klaeger: Optional[bool] = None, merget: Optional[bool] = None,
403 | pickett: Optional[bool] = None):
404 | """Determine the Papyrus version based on provided information.
405 |
406 | :param version: Version number (either older '05.4', or new format '2022.04')
407 | :param chembl_version: Version of ChEMBL to select the Papyrus version from
408 | :param chembl: Whether ChEMBL is included in the Papyrus version to select
409 |         :param excape: Whether ExCAPE-DB is included in the Papyrus version to select
410 | :param sharma: Whether the Sharma et al. dataset is included in the Papyrus version to select
411 | :param christmann: Whether the Christmann-Franck et al. dataset is included in the Papyrus version to select
412 | :param klaeger: Whether the Klaeger et al. dataset is included in the Papyrus version to select
413 | :param merget: Whether the Merget et al. dataset is included in the Papyrus version to select
414 | :param pickett: Whether the Pickett et al. dataset is included in the Papyrus version to select
415 | """
416 | # Determine version from the given version name
417 | if version is not None:
418 | if version.lower() == 'latest':
419 | query = 'alias == alias.max()'
420 | else:
421 | query = f'version == "{version}" or alias == "{version.strip()}"'
422 | else:
423 | # Determine version from sources
424 | query = []
425 | if chembl:
426 | query.append('chembl')
427 | if excape:
428 | query.append('excape')
429 | if sharma:
430 | query.append('sharma')
431 | if christmann:
432 | query.append('christmann')
433 | if klaeger:
434 | query.append('klaeger')
435 | if merget:
436 | query.append('merget')
437 | if pickett:
438 | query.append('pickett')
439 | if chembl_version:
440 | query.append(f'chembl_version == "{chembl_version}"')
441 | query = " and ".join(query)
442 | # Identify the aliases matching the query
443 | if len(query):
444 | subset = self.aliases.query(query)
445 | else:
446 | subset = self.aliases
447 | if subset.empty:
448 | raise ValueError('None of the Papyrus versions match the provided information.')
449 | elif len(subset) > 1:
450 |             raise ValueError('The provided information matches multiple versions:\n\n' +
451 | str(subset.set_index('version')) +
452 | '\n\nChoose the version that matches your requirements.')
453 | else:
454 | params = subset.squeeze().to_dict()
455 | for key, value in params.items():
456 | if key == 'version':
457 | setattr(self, 'version_old_fmt', value)
458 | elif key == 'alias':
459 | setattr(self, 'version', value)
460 | else:
461 | setattr(self, key, value)
462 |
463 | def __repr__(self):
464 |         return f'<PapyrusVersion {self.version}>'
465 |
--------------------------------------------------------------------------------
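A short, hedged usage sketch for the version and checksum helpers defined in this module. It assumes a Papyrus release has already been downloaded into pystow's default folder, that `links.json` is keyed by the old-format version number (as `get_downloaded_papyrus_files` above assumes), and that the archive being checked sits in the current working directory:

```python
from papyrus_scripts.utils.IO import (
    PapyrusVersion, assert_sha256sum, get_papyrus_links, process_data_version,
)

# Resolve the most recent release known to the bundled aliases file
version = PapyrusVersion(version='latest')
print(version.version, version.version_old_fmt)

# Raises IOError/ValueError if that release has not been downloaded locally
version = process_data_version(version=version)

# Cross-check a manually downloaded archive against its recorded checksum
meta = get_papyrus_links(offline=True)[version.version_old_fmt]['proteins']
print(assert_sha256sum(meta['name'], meta['sha256']))
```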
/src/papyrus_scripts/neuralnet.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import os
4 | import time
5 | import random
6 | import itertools
7 | from typing import Iterator, List, Optional, Union
8 |
9 | import numpy as np
10 | import pandas as pd
11 | from pandas.io.parsers import TextFileReader as PandasTextFileReader
12 |
13 | try:
14 | import torch as T
15 | from torch import nn, optim
16 | from torch.nn import functional as F
17 | from torch.utils.data import DataLoader, TensorDataset, IterableDataset as PandasIterableDataset
18 | except ImportError as e:
19 | T = e
20 | nn = e
21 | # Placeholders
22 | T.Tensor = int
23 | nn.Module = list
24 | PandasIterableDataset = int
25 |
26 |
27 | def cuda(var: nn.Module):
28 | """Move model parameters and buffers to GPU if a GPU is available.
29 |
30 | Originates from Xuhan Liu's DrugEx version 1 (https://github.com/XuhanLiu/DrugEx/tree/1.0)
31 |
32 | :param var: torch.nn.Module derived class to be trained on GPU (or CPU if not GPU available)
33 | """
34 | if T.cuda.is_available():
35 | return var.cuda()
36 | return var
37 |
38 |
39 | def Variable(tensor: Union[T.Tensor, np.ndarray, List]):
40 | """Transform a list or numpy array into a pytorch tensor on GPU (if available).
41 |
42 | Originates from Xuhan Liu's DrugEx version 1 (https://github.com/XuhanLiu/DrugEx/tree/1.0)
43 | Original documentation: Wrapper for torch.autograd.Variable that also accepts
44 | numpy arrays directly and automatically assigns it to
45 | the GPU. Be aware in some cases operations are better
46 | left to the CPU.
47 | :param tensor: the list, numpy array or pytorch tensor to be sent to GPU (if available)
48 | """
49 | if isinstance(tensor, np.ndarray):
50 | tensor = T.from_numpy(tensor)
51 | if isinstance(tensor, list):
52 | tensor = T.Tensor(tensor)
53 | return cuda(T.autograd.Variable(tensor))
54 |
55 |
56 | def set_seed(seed: Optional[int] = None) -> Optional[np.random.Generator]:
57 | """Set the internal seed of rnadom number generators for reproducibility."""
58 | if seed is None:
59 | return
60 | T.manual_seed(seed)
61 | T.cuda.manual_seed_all(seed)
62 | T.cuda.manual_seed(seed)
63 | rng = np.random.default_rng(seed)
64 | random.seed(seed)
65 | T.backends.cudnn.deterministic = True
66 | T.backends.cudnn.benchmark = False
67 | return rng
68 |
69 |
70 | class BaseNN(nn.Module):
71 | def __init__(self, out: str, epochs: int = 100, lr: float = 1e-3,
72 | early_stop: int = 100, batch_size: int = 1024, dropout: float = 0.25,
73 | random_seed: Optional[int] = None):
74 | """Base class for neural networks.
75 |
76 | Architecture is derived from https://doi.org/10.1186/s13321-017-0232-0
77 |
78 | :param out: output folder
79 | :param epochs: number of epochs
80 | :param lr: learning rate
81 | :param early_stop: stop after these many epochs without any decrease of loss
82 | :param batch_size: size of data batches
83 | :param dropout: fraction of randomly disabled neurons at each epoch during training
84 | :param random_seed: seed of random number generators
85 | """
86 | if isinstance(T, ImportError):
87 | raise ImportError('Some required dependencies are missing:\n\tpytorch')
88 | if not os.path.isdir(out):
89 | os.makedirs(out, exist_ok=True)
90 | super().__init__()
91 | self.fcl = nn.ModuleList() # fully connected layers
92 | self.out = out
93 | self.batch_size = batch_size
94 | self.epochs = epochs
95 | self.lr = lr
96 | self.early_stop = early_stop
97 | self.dropout = dropout
98 | self.rng = set_seed(random_seed)
99 |
100 | def set_validation(self, X: Union[Iterator, pd.DataFrame], y: Union[Iterator, pd.Series]):
101 | """Set the validation set to be used during fitting.
102 |
103 | :param X: features to predict y from
104 | :param y: feature to be predicted (dependent variable)
105 | """
106 | if not isinstance(X, (pd.DataFrame, np.ndarray)) and type(X) != type(y):
107 | raise ValueError('X and y must have the same type (i.e. either Iterator or pandas dataframe)')
108 | # Get data loaders
109 | if isinstance(X, (pd.DataFrame, np.ndarray)):
110 | self.loader_valid = loader_from_dataframe(X, y, batch_size=self.batch_size)
111 | else:
112 | self.loader_valid = loader_from_iterator(X, y, batch_size=self.batch_size)
113 |
114 | def set_architecture(self, dimensions: List[int]):
115 | """Define the size of each fully connected linear hidden layer
116 |
117 | :param dimensions: dimensions of the layers
118 | """
119 | for i in range(len(dimensions) - 1):
120 | self.fcl.append(nn.Linear(dimensions[i], dimensions[i + 1]))
121 | T.save(self.state_dict(), os.path.join(self.out, 'empty_model.pkg'))
122 |
123 | def reset(self):
124 | """Reset weights and reload the initial state of the model"""
125 | self.load_state_dict(T.load(os.path.join(self.out, 'empty_model.pkg')))
126 |
127 | def fit(self, X: Union[Iterator, pd.DataFrame], y: Union[Iterator, pd.Series]):
128 | """Fit neural network with training set and optimize for loss on validation set.
129 |
130 | :param X: features to predict y from
131 | :param y: feature to be predicted (dependent variable)
132 | """
133 | if not self.fcl:
134 | raise ValueError('set architecture before fitting')
135 | if not isinstance(X, (pd.DataFrame, np.ndarray)) and type(X) != type(y):
136 | raise ValueError('X and y must have the same type (i.e. either Iterator or pandas dataframe)')
137 | # Set number of classes
138 | self.classes_ = sorted(set(y))
139 | # Get data loaders
140 | if isinstance(X, (pd.DataFrame, np.ndarray)):
141 | loader_train = loader_from_dataframe(X, y, batch_size=self.batch_size)
142 | else:
143 | loader_train = loader_from_iterator(X, y, batch_size=self.batch_size)
144 | # Set optimizer
145 | if 'optim' in self.__dict__:
146 | optimizer = self.optim
147 | else:
148 | optimizer = optim.Adam(self.parameters(), lr=self.lr)
149 | best_loss = np.inf
150 | last_save = 0
151 | # Set up output folder
152 | if not (os.path.exists(self.out) and os.path.isdir(self.out)):
153 | os.mkdir(self.out)
154 | # Log file
155 | log = open(os.path.join(self.out, 'training_log.txt'), 'w')
156 | for epoch in range(self.epochs):
157 | t0 = time.perf_counter()
158 | # Change learning rate according to epoch
159 | for param_group in optimizer.param_groups:
160 | param_group['lr'] = self.lr * (1 - 1 / self.epochs) ** (epoch * 10)
161 | # Train epoch over all batches
162 | for i, (Xb, yb) in enumerate(loader_train):
163 | Xb, yb = Variable(Xb), Variable(yb)
164 | optimizer.zero_grad()
165 | y_ = self.forward(Xb, istrain=True)
166 | ix = yb == yb
167 | yb, y_ = yb[ix], y_[ix]
168 | loss = self.criterion(y_, yb)
169 | loss.backward()
170 | optimizer.step()
171 | # Calculate loss and log
172 | loss_valid = self.evaluate(self.loader_valid)
173 | print(f'[Epoch: {epoch + 1}/{self.epochs}] {time.perf_counter() - t0:.1f}s '
174 | f'loss_train: {loss.item():f} loss_valid: {loss_valid:f}', file=log, flush=True)
175 | if loss_valid < best_loss:
176 | T.save(self.state_dict(), os.path.join(self.out, 'model.pkg'))
177 | print(f'[Performance] loss_valid improved from {best_loss:f} to {loss_valid:f}, '
178 | 'Saved model to model.pkg', file=log, flush=True)
179 | best_loss = loss_valid
180 | last_save = epoch
181 | else:
182 | print('[Performance] loss_valid did not improve.', file=log, flush=True)
183 | # Early stop if no improvement for some time
184 | if epoch - last_save > self.early_stop:
185 | break
186 | log.close()
187 | self.load_state_dict(T.load(os.path.join(self.out, 'model.pkg')))
188 |
189 | def evaluate(self, loader):
190 | """Calculate loss according to criterion function
191 |
192 | :param loader: data loader of the validation set
193 | """
194 | loss = 0
195 | for Xb, yb in loader:
196 | Xb, yb = Variable(Xb), Variable(yb)
197 | y_ = self.forward(Xb)
198 | ix = yb == yb
199 | yb, y_ = yb[ix], y_[ix]
200 | loss += self.criterion(y_, yb).item()
201 | return loss / len(loader)
202 |
203 | def predict(self, X: Union[pd.DataFrame, np.ndarray]):
204 | """Predict outcome for the incoming data
205 |
206 | :param X: features to predict the endpoint(s) from
207 | """
208 | if not isinstance(X, (pd.DataFrame, np.ndarray)):
209 | raise ValueError('X must be either a numpy array or a pandas dataframe')
210 | if isinstance(X, pd.DataFrame):
211 | y = X.iloc[:, 0]
212 | else:
213 | y = X[:, 0]
214 | loader = loader_from_dataframe(X, y, self.batch_size)
215 | score = []
216 | for Xb, _ in loader:
217 | Xb = Variable(Xb)
218 | y_ = self.forward(Xb)
219 | score.append(y_.cpu().data)
220 | return T.cat(score, dim=0).numpy()
221 |
222 |
223 | class SingleTaskNNClassifier(BaseNN):
224 | def __init__(self, out: str, epochs: int = 100, lr: float = 1e-3,
225 | early_stop: int = 100, batch_size: int = 1024, dropout: float = 0.25,
226 | random_seed: Optional[int] = None):
227 | """Neural Network classifier to predict a unique endpoint.
228 |
229 | Architecture is derived from https://doi.org/10.1186/s13321-017-0232-0
230 |
231 | :param out: output folder
232 | :param epochs: number of epochs
233 | :param lr: learning rate
234 | :param early_stop: stop after these many epochs without any decrease of loss
235 | :param batch_size: size of data batches
236 | :param dropout: fraction of randomly disabled neurons at each epoch during training
237 | :param random_seed: seed of random number generators
238 | """
239 | super(SingleTaskNNClassifier, self).__init__(out, epochs, lr, early_stop, batch_size, dropout, random_seed)
240 | self.dropoutl = nn.Dropout(self.dropout)
241 | # Consider binary classification as default
242 | self.criterion = nn.BCELoss()
243 | self.activation = nn.Sigmoid()
244 |
245 | def set_architecture(self, n_dim: int, n_class: int):
246 | """Set dimension of input and number of classes to be predicted.
247 |
248 | :param n_dim: number of input parameters
249 | :param n_class: number of one-hot encoded classes (i.e. 1 for binary endpoint not one-hot encoded)
250 | """
251 | if n_class < 1:
252 | raise ValueError('can only perform binary (n_class=1 or n_class=2)'
253 |                              ' or multi-class predictions (n_class>2)')
254 | super().set_architecture([n_dim, 8000, 4000, 2000, n_class])
255 | self._n_classes_ = n_class
256 | self._n_features_in_ = n_dim
257 | if n_class == 1:
258 | self.criterion = nn.BCELoss()
259 | self.activation = nn.Sigmoid()
260 | else:
261 | self.criterion = nn.CrossEntropyLoss()
262 | self.activation = nn.Softmax()
263 | cuda(self)
264 |
265 | def forward(self, X, istrain=False):
266 | """Calculate model output from input data.
267 |
268 | :param X: input data
269 | :param istrain: whether called during training, to activate dropout
270 | """
271 | input = X
272 | for layer in self.fcl[:-1]:
273 | input = F.relu(layer(input))
274 | if istrain:
275 | input = self.dropoutl(input)
276 | return self.activation(self.fcl[-1](input))
277 |
278 | def predict_proba(self, X):
279 | """Predict class probabilities for the incoming data
280 |
281 | :param X: features to predict the endpoint probabilities from
282 | """
283 | y = super().predict(X)
284 | return y
285 |
286 | def predict(self, X):
287 | """Predict classes for the incoming data
288 |
289 | :param X: features to predict the endpoint(s) from
290 | """
291 | probas = self.predict_proba(X)
292 | return np.round(probas)
293 |
294 |
295 | class SingleTaskNNRegressor(BaseNN):
296 | def __init__(self, out: str, epochs: int = 100, lr: float = 1e-3,
297 | early_stop: int = 100, batch_size: int = 1024, dropout: float = 0.25,
298 | random_seed: Optional[int] = None):
299 | """Neural Network regressor to predict a unique endpoint.
300 |
301 | Architecture is adapted from https://doi.org/10.1186/s13321-017-0232-0 for regression
302 |
303 | :param out: output folder
304 | :param epochs: number of epochs
305 | :param lr: learning rate
306 | :param early_stop: stop after these many epochs without any decrease of loss
307 | :param batch_size: size of data batches
308 | :param dropout: fraction of randomly disabled neurons at each epoch during training
309 | :param random_seed: seed of random number generators
310 | """
311 | super(SingleTaskNNRegressor, self).__init__(out, epochs, lr, early_stop, batch_size, dropout, random_seed)
312 | self.dropoutl = nn.Dropout(self.dropout)
313 | self.criterion = nn.MSELoss()
314 |
315 | def set_architecture(self, n_dim: int):
316 | """Set dimension of input.
317 |
318 | :param n_dim: number of input parameters
319 | """
320 | super().set_architecture([n_dim, 8000, 4000, 2000, 1])
321 | cuda(self)
322 |
323 | def forward(self, X, istrain=False):
324 | """Calculate model output from input data.
325 |
326 | :param X: input data
327 | :param istrain: whether called during training, to activate dropout
328 | """
329 | input = X
330 | for layer in self.fcl[:-1]:
331 | input = F.relu(layer(input))
332 | if istrain:
333 | input = self.dropoutl(input)
334 | return self.fcl[-1](input)
335 |
336 |
337 | class MultiTaskNNClassifier(BaseNN):
338 | def __init__(self, out: str, epochs: int = 100, lr: float = 1e-3,
339 | early_stop: int = 100, batch_size: int = 1024, dropout: float = 0.25,
340 | random_seed: Optional[int] = None):
341 | """Neural Network classifier to predict multiple endpoints.
342 |
343 | Architecture is derived from https://doi.org/10.1186/s13321-017-0232-0
344 |
345 | :param out: output folder
346 | :param epochs: number of epochs
347 | :param lr: learning rate
348 | :param early_stop: stop after these many epochs without any decrease of loss
349 | :param batch_size: size of data batches
350 | :param dropout: fraction of randomly disabled neurons at each epoch during training
351 | :param random_seed: seed of random number generators
352 | """
353 | super(MultiTaskNNClassifier, self).__init__(out, epochs, lr, early_stop, batch_size, dropout, random_seed)
354 | self.criterion = nn.BCELoss()
355 | self.activation = nn.Sigmoid()
356 | self.dropoutl = nn.Dropout(self.dropout)
357 |
358 | def set_architecture(self, n_dim: int, n_task: int):
359 | """Set dimension of input and number of classes to be predicted.
360 |
361 | :param n_dim: number of input parameters
362 | :param n_task: number of tasks to be predicted at the same time
363 | """
364 | if n_task < 2:
365 | raise ValueError('use SingleTaskNNClassifier for a single task')
366 | super().set_architecture([n_dim, 8000, 4000, 2000, n_task])
367 | cuda(self)
368 |
369 | def forward(self, X, istrain=False):
370 | """Calculate model output from input data.
371 |
372 | :param X: input data
373 | :param istrain: whether called during training, to activate dropout
374 | """
375 | input = X
376 | for layer in self.fcl[:-1]:
377 | input = F.relu(layer(input))
378 | if istrain:
379 | input = self.dropoutl(input)
380 | return self.activation(self.fcl[-1](input))
381 |
382 | def predict_proba(self, X):
383 | """Predict class probabilities for the incoming data
384 |
385 | :param X: features to predict the endpoint probabilities from
386 | """
387 | y = super().predict(X)
388 | return y
389 |
390 | def predict(self, X):
391 | """Predict classes for the incoming data
392 |
393 | :param X: features to predict the endpoint(s) from
394 | """
395 | probas = self.predict_proba(X)
396 | return np.round(probas)
397 |
398 |
399 | class MultiTaskNNRegressor(BaseNN):
400 | def __init__(self, out: str, epochs: int = 100, lr: float = 1e-3,
401 | early_stop: int = 100, batch_size: int = 1024, dropout: float = 0.25,
402 | random_seed: Optional[int] = None):
403 | """Neural Network regressor to predict multiple endpoints.
404 |
405 | Architecture is adapted from https://doi.org/10.1186/s13321-017-0232-0 for multi-task regression
406 |
407 | :param out: output folder
408 | :param epochs: number of epochs
409 | :param lr: learning rate
410 | :param early_stop: stop after these many epochs without any decrease of loss
411 | :param batch_size: size of data batches
412 | :param dropout: fraction of randomly disabled neurons at each epoch during training
413 | :param random_seed: seed of random number generators
414 | """
415 | super(MultiTaskNNRegressor, self).__init__(out, epochs, lr, early_stop, batch_size, dropout, random_seed)
416 | self.dropoutl = nn.Dropout(self.dropout)
417 | self.criterion = nn.MSELoss()
418 |
419 | def set_architecture(self, n_dim: int, n_task: int):
420 | """Set dimension of input.
421 |
422 | :param n_dim: number of input parameters
423 | :param n_task: number of tasks to be predicted at the same time
424 | """
425 | if n_task < 2:
426 | raise ValueError('use SingleTaskNNRegressor for a single task')
427 | super().set_architecture([n_dim, 8000, 4000, 2000, n_task])
428 | cuda(self)
429 |
430 | def forward(self, X, istrain=False):
431 | """Calculate model output from input data.
432 |
433 | :param X: input data
434 | :param istrain: whether called during training, to activate dropout
435 | """
436 |         # Use the fully connected layers registered in set_architecture (fc0/fc1/output were never defined)
437 |         input = X
438 |         for layer in self.fcl[:-1]:
439 |             input = F.relu(layer(input))
440 |             if istrain:
441 |                 input = self.dropoutl(input)
442 |         # No activation on the output layer for regression
443 |         return self.fcl[-1](input)
444 |
445 |
446 | def loader_from_dataframe(X: pd.DataFrame,
447 | Y: Union[pd.Series, pd.DataFrame],
448 | batch_size: int = 1024):
449 | """Get PyTorch data loaders from pandas dataframes
450 |
451 | :param X: features to predict Y from
452 | :param Y: feature(s) to be predicted (dependent variable(s))
453 | :param batch_size: batch size of the data loader
454 | """
455 | if Y is None:
456 | raise ValueError('Y must be specified')
457 | if isinstance(X, pd.DataFrame):
458 | X = X.values
459 | if isinstance(Y, (pd.Series, pd.DataFrame)):
460 | Y = Y.values
461 | if len(Y.shape) == 1:
462 | Y = Y.reshape(Y.shape[0], 1)
463 | dataset = TensorDataset(T.Tensor(X), T.Tensor(Y))
464 | loader = DataLoader(dataset, batch_size=batch_size)
465 | return loader
466 |
467 |
468 | def loader_from_iterator(X: Union[PandasTextFileReader, Iterator],
469 | Y: Union[PandasTextFileReader, Iterator] = None,
470 | y_col: Optional[str] = None,
471 | batch_size: int = 1024):
472 | """Get PyTorch data loaders from iterators
473 |
474 | :param X: features to predict Y from
475 | :param Y: features to be predicted (dependent variables)
476 | :param y_col: name of the columns in X containing the dependent variables to be predicted
477 | :param batch_size: batch size of the data loader
478 | """
479 | if Y is None and y_col is None:
480 | raise ValueError('either Y or y_col must be specified')
481 | if Y is None:
482 | X, Y = split_into_x_and_y(X, y_col)
483 | dataset = IterableDataset(X, Y)
484 | return DataLoader(dataset, batch_size=batch_size)
485 |
486 |
487 | class IterableDataset(PandasIterableDataset):
488 | def __init__(self, x_iterator: Iterator, y_iterator: Iterator):
489 | self.iterator = zip(x_iterator, y_iterator)
490 |
491 | def __iter__(self):
492 | for chunk_x, chunk_y in self.iterator:
493 | for row in zip(chunk_x, chunk_y):
494 | yield row
495 |
496 |
497 | def split_into_x_and_y(data: Union[PandasTextFileReader, Iterator],
498 | y_col: Union[str, List[str]]):
499 | """Extract the columns for the data iterator into another iterator.
500 |
501 | :param data: the input iterator to extract columns from
502 | :param y_col: name of the columns to be extracted
503 | :return: first iterator
504 | """
505 | if isinstance(y_col, list) and not len(y_col):
506 | raise ValueError('at least one column must be extracted')
507 | if not isinstance(y_col, list):
508 | y_col = [y_col]
509 | gen_x, gen_y = itertools.tee(data, 2)
510 | return extract_x(gen_x, y_col), extract_y(gen_y, y_col)
511 |
512 |
513 | def extract_y(data: Union[PandasTextFileReader, Iterator], y_col: List[str]):
514 |     """Yield the dependent-variable columns of each chunk as tensors."""
515 |     for chunk in data:
516 |         if not np.all(np.isin(y_col, chunk.columns)):
517 |             raise ValueError(f'columns {y_col} not found in data')
518 |         yield T.Tensor(chunk[y_col].values)
519 |
520 |
521 | def extract_x(data: Union[PandasTextFileReader, Iterator], y_col: List[str]):
522 |     """Yield each chunk as tensors with the dependent-variable columns dropped."""
523 |     for chunk in data:
524 |         if not np.all(np.isin(y_col, chunk.columns)):
525 |             raise ValueError(f'columns {y_col} not found in data')
526 |         yield T.Tensor(chunk.drop(columns=y_col).values)
527 |
--------------------------------------------------------------------------------
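A hedged sketch of training the single-task classifier above on toy data (feature count, layer widths of the fixed architecture, output folder and epoch count are illustrative; the hidden layers are several thousand units wide, so even this toy run is slow on CPU, and PyTorch must be installed):

```python
import numpy as np
import pandas as pd
from papyrus_scripts.neuralnet import SingleTaskNNClassifier

# Toy data: 200 random 2048-bit fingerprints with a binary label
rng = np.random.default_rng(42)
X = pd.DataFrame(rng.integers(0, 2, size=(200, 2048)).astype(np.float32))
y = pd.Series(rng.integers(0, 2, size=200).astype(np.float32))

clf = SingleTaskNNClassifier(out='nn_output', epochs=3, batch_size=64, random_seed=42)
clf.set_architecture(n_dim=2048, n_class=1)    # single sigmoid output for a binary endpoint
clf.set_validation(X.iloc[:50], y.iloc[:50])   # validation loader consumed inside fit()
clf.fit(X.iloc[50:], y.iloc[50:])
print(clf.predict_proba(X.head()))             # class probabilities for the first molecules
```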
/src/papyrus_scripts/reader.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """Reading capacities of the Papyrus-scripts."""
4 | from __future__ import annotations
5 |
6 | import json
7 | import os
8 | from typing import Optional, Union, Iterator, List
9 | from functools import partial
10 |
11 | import pystow
12 | import pandas as pd
13 | from tqdm.auto import tqdm
14 | from prodec import Descriptor, Transform
15 |
16 | from .utils.mol_reader import MolSupplier
17 | from .utils.IO import locate_file, process_data_version, TypeDecoder, PapyrusVersion
18 |
19 |
20 | def read_papyrus(is3d: bool = False, version: str | PapyrusVersion = 'latest', plusplus: bool = True,
21 | chunksize: Optional[int] = None, source_path: Optional[str] = None
22 | ) -> Union[Iterator[pd.DataFrame], pd.DataFrame]:
23 |
24 | """Read the Papyrus dataset.
25 |
26 | :param is3d: whether to consider stereochemistry or not (default: False)
27 | :param version: version of the dataset to be read
28 | :param plusplus: read the Papyrus++ curated subset of very high quality
29 | :param chunksize: number of lines per chunk. To read without chunks, set to None
30 | :param source_path: folder containing the bioactivity dataset (default: pystow's home folder)
31 | :return: the Papyrus activity dataset
32 | """
33 | # Papyrus++ with stereo does not exist
34 | if is3d and plusplus:
35 | raise ValueError('Papyrus++ is only available without stereochemistry.')
36 | # Determine default paths
37 | if source_path is not None:
38 | os.environ['PYSTOW_HOME'] = os.path.abspath(source_path)
39 | version = process_data_version(version=version, root_folder=source_path)
40 | source_path = pystow.module('papyrus', version.version_old_fmt)
41 | # Load data types
42 | dtype_file = source_path.join(name='data_types.json').as_posix()
43 | with open(dtype_file, 'r') as jsonfile:
44 | dtypes = json.load(jsonfile, cls=TypeDecoder)['papyrus']
45 | # Find the file
46 | filenames = locate_file(source_path.base.as_posix(),
47 | r'\d+\.\d+' + (r'\+\+' if plusplus else '') + '_combined_set_'
48 | f'with{"out" if not is3d else ""}' + r'_stereochemistry\.tsv.*')
49 | return pd.read_csv(filenames[0], sep='\t', chunksize=chunksize, dtype=dtypes, low_memory=True)
50 |
51 |
52 | def read_protein_set(source_path: Optional[str] = None, version: str | PapyrusVersion = 'latest') -> pd.DataFrame:
53 | """Read the protein targets of the Papyrus dataset.
54 |
55 | :param source_path: folder containing the molecular descriptor datasets
56 | :param version: version of the dataset to be read
57 | :return: the set of protein targets in the Papyrus dataset
58 | """
59 | version = process_data_version(version=version, root_folder=source_path)
60 | # Determine default paths
61 | if source_path is not None:
62 | os.environ['PYSTOW_HOME'] = os.path.abspath(source_path)
63 | source_path = pystow.module('papyrus', version.version_old_fmt)
64 | # Find the file
65 | filenames = locate_file(source_path.base.as_posix(), r'\d+\.\d+_combined_set_protein_targets\.tsv.*')
66 | return pd.read_csv(filenames[0], sep='\t', keep_default_na=False)
67 |
68 |
69 | def read_molecular_descriptors(desc_type: str = 'mold2', is3d: bool = False,
70 | version: str | PapyrusVersion = 'latest', chunksize: Optional[int] = None,
71 | source_path: Optional[str] = None,
72 | ids: Optional[List[str]] = None, verbose: bool = True):
73 | """Get molecular descriptors
74 |
75 | :param desc_type: type of descriptor {'mold2', 'mordred', 'cddd', 'fingerprint', 'moe', 'all'}
76 | :param is3d: whether to load descriptors of the dataset containing stereochemistry
77 | :param version: version of the dataset to be read
78 | :param chunksize: number of lines per chunk. To read without chunks, set to None
79 | :param source_path: folder containing the bioactivity dataset (default: pystow's home folder)
80 | :param ids: identifiers of the molecules which descriptors should be loaded
81 | if is3d=True, then identifiers are InChIKeys, otherwise connectivities
82 | :param verbose: whether to show progress
83 | :return: the dataframe of molecular descriptors
84 | """
85 | if desc_type not in ['mold2', 'mordred', 'cddd', 'fingerprint', 'moe', 'all']:
86 | raise ValueError("descriptor type not supported")
87 | # Determine default paths
88 | if source_path is not None:
89 | os.environ['PYSTOW_HOME'] = os.path.abspath(source_path)
90 | version = process_data_version(version=version, root_folder=source_path)
91 | source_path = pystow.module('papyrus', version.version_old_fmt)
92 | # Load data types
93 | dtype_file = source_path.join(name='data_types.json').as_posix()
94 | with open(dtype_file, 'r') as jsonfile:
95 | dtypes = json.load(jsonfile, cls=TypeDecoder)
96 | # Find the files
97 | if desc_type in ['mold2', 'all']:
98 | mold2_files = locate_file(source_path.join('descriptors').as_posix(),
99 | rf'\d+\.\d+_combined_{3 if is3d else 2}D_moldescs_mold2\.tsv.*')
100 |     if desc_type in ['mordred', 'all']:
101 | mordd_files = locate_file(source_path.join('descriptors').as_posix(),
102 | rf'\d+\.\d+_combined_{3 if is3d else 2}D_moldescs_mordred{3 if is3d else 2}D\.tsv.*')
103 |     if desc_type in ['cddd', 'all']:
104 | cddds_files = locate_file(source_path.join('descriptors').as_posix(),
105 | rf'\d+\.\d+_combined_{3 if is3d else 2}D_moldescs_CDDDs.tsv.*')
106 |     if desc_type in ['fingerprint', 'all']:
107 | molfp_files = locate_file(source_path.join('descriptors').as_posix(),
108 | rf'\d+\.\d+_combined_{3 if is3d else 2}D_moldescs_{"E3FP" if is3d else "ECFP6"}\.tsv.*')
109 |     if desc_type in ['moe', 'all']:
110 | moe_files = locate_file(source_path.join('descriptors').as_posix(),
111 | rf'\d+\.\d+_combined_{3 if is3d else 2}D_moldescs_MOE\.tsv.*')
112 | if verbose:
113 | pbar = partial(tqdm, desc='Loading molecular descriptors')
114 | else:
115 | pbar = partial(iter)
116 | if desc_type == 'mold2':
117 | return _filter_molecular_descriptors(pbar(pd.read_csv(mold2_files[0], sep='\t',
118 | dtype=dtypes['mold2'], low_memory=True, chunksize=chunksize)),
119 | ids, 'InChIKey' if is3d else 'connectivity')
120 | elif desc_type == 'mordred':
121 | return _filter_molecular_descriptors(pbar(pd.read_csv(mordd_files[0], sep='\t',
122 | dtype=dtypes[f'mordred_{3 if is3d else 2}D'], low_memory=True,
123 | chunksize=chunksize)),
124 | ids, 'InChIKey' if is3d else 'connectivity')
125 | elif desc_type == 'cddd':
126 | return _filter_molecular_descriptors(pbar(pd.read_csv(cddds_files[0], sep='\t',
127 | dtype=dtypes['CDDD'], low_memory=True, chunksize=chunksize)),
128 | ids, 'InChIKey' if is3d else 'connectivity')
129 | elif desc_type == 'fingerprint':
130 | return _filter_molecular_descriptors(pbar(pd.read_csv(molfp_files[0], sep='\t',
131 | dtype=dtypes[f'{"E3FP" if is3d else "ECFP6"}'],
132 | low_memory=True, chunksize=chunksize)),
133 | ids, 'InChIKey' if is3d else 'connectivity')
134 | elif desc_type == 'moe':
135 | return _filter_molecular_descriptors(pbar(pd.read_csv(moe_files[0], sep='\t',
136 | low_memory=True, chunksize=chunksize)),
137 | ids, 'InChIKey' if is3d else 'connectivity')
138 | elif desc_type == 'all':
139 | mold2 = _filter_molecular_descriptors(pd.read_csv(mold2_files[0], sep='\t',
140 | dtype=dtypes['mold2'], low_memory=True, chunksize=chunksize),
141 | ids, 'InChIKey' if is3d else 'connectivity')
142 | mordd = _filter_molecular_descriptors(pd.read_csv(mordd_files[0], sep='\t',
143 | dtype=dtypes[f'mordred_{3 if is3d else 2}D'],
144 | low_memory=True, chunksize=chunksize),
145 | ids, 'InChIKey' if is3d else 'connectivity')
146 | cddds = _filter_molecular_descriptors(pd.read_csv(cddds_files[0], sep='\t', dtype=dtypes['CDDD'],
147 | low_memory=True, chunksize=chunksize),
148 | ids, 'InChIKey' if is3d else 'connectivity')
149 | molfp = _filter_molecular_descriptors(pd.read_csv(molfp_files[0], sep='\t',
150 | dtype=dtypes[f'{"E3FP" if is3d else "ECFP6"}'],
151 | low_memory=True, chunksize=chunksize),
152 | ids, 'InChIKey' if is3d else 'connectivity')
153 | moe = _filter_molecular_descriptors(pd.read_csv(moe_files[0], sep='\t', low_memory=True, chunksize=chunksize),
154 | ids, 'InChIKey' if is3d else 'connectivity')
155 | if chunksize is None:
156 | mold2.set_index('InChIKey' if is3d else 'connectivity', inplace=True)
157 | mordd.set_index('InChIKey' if is3d else 'connectivity', inplace=True)
158 | molfp.set_index('InChIKey' if is3d else 'connectivity', inplace=True)
159 | cddds.set_index('InChIKey' if is3d else 'connectivity', inplace=True)
160 | moe.set_index('InChIKey' if is3d else 'connectivity', inplace=True)
161 | data = pd.concat([mold2, mordd, cddds, molfp, moe], axis=1)
162 | del mold2, mordd, cddds, molfp, moe
163 | data.reset_index(inplace=True)
164 | return data
165 | return _filter_molecular_descriptors(pbar(_join_molecular_descriptors(mold2, mordd, molfp, cddds, moe,
166 | on='InChIKey' if is3d else 'connectivity')),
167 | ids, 'InChIKey' if is3d else 'connectivity')
168 |
169 |
170 | def _join_molecular_descriptors(*descriptors: Iterator, on: str = 'connectivity') -> Iterator:
171 | """Concatenate multiple types of molecular descriptors on the same identifier.
172 |
173 | :param descriptors: the different iterators of descriptors to be joined
174 | :param on: identifier to join the descriptors on
175 | """
176 | try:
177 | while True:
178 | values = [next(descriptor).set_index(on) for descriptor in descriptors]
179 | data = pd.concat(values, axis=1)
180 | data.reset_index(inplace=True)
181 | yield data
182 | except StopIteration:
183 |         return  # PEP 479: end the generator instead of re-raising StopIteration
184 |
185 |
186 | def _filter_molecular_descriptors(data: Union[pd.DataFrame, Iterator],
187 | ids: Optional[List[str]], id_name: str):
188 | if isinstance(data, pd.DataFrame):
189 | if ids is None:
190 |             return data  # nothing to filter; iterating a DataFrame here would only yield column names
191 | return data[data[id_name].isin(ids)]
192 | else:
193 | return _iterate_filter_descriptors(data, ids, id_name)
194 |
195 |
196 | def _iterate_filter_descriptors(data: Iterator, ids: Optional[List[str]], id_name: Optional[str]):
197 | for chunk in data:
198 | if ids is None:
199 | yield chunk
200 | else:
201 | yield chunk[chunk[id_name].isin(ids)]
202 |
203 |
204 | def read_protein_descriptors(desc_type: Union[str, Descriptor, Transform] = 'unirep',
205 | version: str | PapyrusVersion = 'latest', chunksize: Optional[int] = None,
206 | source_path: Optional[str] = None,
207 | ids: Optional[List[str]] = None, verbose: bool = True,
208 | **kwargs):
209 | """Get protein descriptors
210 |
211 | :param desc_type: type of descriptor {'unirep'} or a prodec instance of a Descriptor or Transform
212 | :param version: version of the dataset to be read
213 | :param chunksize: number of lines per chunk. To read without chunks, set to None
214 | :param source_path: If desc_type is 'unirep', folder containing the protein descriptor datasets.
215 | If desc_type is 'custom', the file path to a tab-separated dataframe containing target_id
216 | as its first column and custom descriptors in the following ones.
217 | If desc_type is a ProDEC Descriptor or Transform instance, folder containing the bioactivity dataset
218 | (default: pystow's home folder)
219 | :param ids: identifiers of the sequences which descriptors should be loaded (e.g. P30542_WT)
220 | :param verbose: whether to show progress
221 | :param kwargs: keyword arguments passed to the `pandas` method of the ProDEC Descriptor or Transform instance
222 | (is ignored if `desc_type` is not a ProDEC Descriptor or Transform instance)
223 | :return: the dataframe of protein descriptors
224 | """
225 | if desc_type not in ['unirep', 'custom'] and not isinstance(desc_type, (Descriptor, Transform)):
226 | raise ValueError("descriptor type not supported")
227 | if desc_type != 'custom':
228 | # Determine default paths
229 | if source_path is not None:
230 | os.environ['PYSTOW_HOME'] = os.path.abspath(source_path)
231 | version = process_data_version(version=version, root_folder=source_path)
232 | source_path = pystow.module('papyrus', version.version_old_fmt)
233 | if not isinstance(desc_type, (Descriptor, Transform)):
234 | # Load data types
235 | dtype_file = source_path.join(name='data_types.json').as_posix()
236 | with open(dtype_file, 'r') as jsonfile:
237 | dtypes = json.load(jsonfile, cls=TypeDecoder)
238 | # Set verbose level
239 | if verbose:
240 | pbar = partial(tqdm, desc='Loading protein descriptors')
241 | else:
242 | pbar = partial(iter)
243 | if desc_type == 'unirep':
244 | unirep_files = locate_file(source_path.join('descriptors').as_posix(), r'(?:\d+\.\d+_combined_prot_embeddings_unirep\.tsv.*)|(?:\d+\.\d+_combined_protdescs_unirep\.tsv.*)')
245 | if len(unirep_files) == 0:
246 | raise ValueError('Could not find unirep descriptor file')
247 | if desc_type == 'unirep':
248 | if chunksize is None and ids is None:
249 | return pd.read_csv(unirep_files[0], sep='\t', dtype=dtypes['unirep'], low_memory=True)
250 | elif chunksize is None and ids is not None:
251 | descriptors = pd.read_csv(unirep_files[0], sep='\t', dtype=dtypes['unirep'], low_memory=True)
252 | if 'target_id' in descriptors.columns:
253 | return descriptors[descriptors['target_id'].isin(ids)]
254 | return descriptors[descriptors['TARGET_NAME'].isin(ids)].rename(columns={'TARGET_NAME': 'target_id'})
255 | elif chunksize is not None and ids is None:
256 | return pd.concat([chunk
257 | for chunk in pbar(pd.read_csv(unirep_files[0], sep='\t', dtype=dtypes['unirep'],
258 | low_memory=True, chunksize=chunksize))
259 | ]).rename(columns={'TARGET_NAME': 'target_id'})
260 | return pd.concat([chunk[chunk['target_id'].isin(ids)]
261 | if 'target_id' in chunk.columns
262 | else chunk[chunk['TARGET_NAME'].isin(ids)]
263 | for chunk in pbar(pd.read_csv(unirep_files[0], sep='\t', dtype=dtypes['unirep'],
264 | low_memory=True, chunksize=chunksize))
265 | ]).rename(columns={'TARGET_NAME': 'target_id'})
266 | else:
267 | # Calculate protein descriptors
268 | protein_data = read_protein_set(pystow.module('').base.as_posix(), version=version)
269 | protein_data.rename(columns={'TARGET_NAME': 'target_id'}, inplace=True)
270 | # Keep only selected proteins
271 | if ids is not None:
272 | protein_data = protein_data[protein_data['target_id'].isin(ids)]
273 | # Filter out non-natural amino-acids
274 | protein_data = protein_data.loc[protein_data['Sequence'].map(desc_type.Descriptor.is_sequence_valid), :]
275 | # Obtain descriptors
276 | descriptors = desc_type.pandas_get(protein_data['Sequence'].tolist(), protein_data['target_id'].tolist(),
277 | **kwargs)
278 | descriptors.rename(columns={'ID': 'target_id'}, inplace=True)
279 | return descriptors
280 | elif desc_type == 'custom':
281 | # Check path exists
282 | if not os.path.isfile(source_path):
283 | raise ValueError('source_path must point to an existing file if using a custom descriptor type')
284 |         # No chunksize, no filtering
285 | if chunksize is None and ids is None:
286 | return pd.read_csv(source_path, sep='\t', low_memory=True).rename(columns={'TARGET_NAME': 'target_id'})
287 | # No chunksize but filtering
288 | elif chunksize is None and ids is not None:
289 | descriptors = pd.read_csv(source_path, sep='\t', low_memory=True)
290 | descriptors.rename(columns={'TARGET_NAME': 'target_id'}, inplace=True)
291 | return descriptors[descriptors['target_id'].isin(ids)]
292 | else:
293 | # Set verbose level
294 | if verbose:
295 | pbar = partial(tqdm, desc='Loading custom protein descriptors')
296 | else:
297 | pbar = partial(iter)
298 | # Chunksize but no filtering
299 | if chunksize is not None and ids is None:
300 | return pd.concat([chunk
301 | for chunk in pbar(pd.read_csv(source_path, sep='\t',
302 | low_memory=True, chunksize=chunksize))
303 | ]).rename(columns={'TARGET_NAME': 'target_id'})
304 | # Both chunksize and filtering
305 | return pd.concat([chunk[chunk['target_id'].isin(ids)]
306 | if 'target_id' in chunk.columns
307 | else chunk[chunk['TARGET_NAME'].isin(ids)]
308 | for chunk in pbar(pd.read_csv(source_path,
309 | sep='\t', low_memory=True, chunksize=chunksize))
310 | ]).rename(columns={'TARGET_NAME': 'target_id'})
311 |
312 |
313 | def read_molecular_structures(is3d: bool = False, version: str | PapyrusVersion = 'latest',
314 | chunksize: Optional[int] = None, source_path: Optional[str] = None,
315 | ids: Optional[List[str]] = None, verbose: bool = True):
316 | """Get molecular structures
317 |
318 | :param is3d: whether to load descriptors of the dataset containing stereochemistry
319 | :param version: version of the dataset to be read
320 | :param chunksize: number of lines per chunk. To read without chunks, set to None
321 | :param source_path: folder containing the bioactivity dataset (default: pystow's home folder)
322 | :param ids: identifiers of the molecules which descriptors should be loaded
323 | if is3d=True, then identifiers are InChIKeys, otherwise connectivities
324 | :param verbose: whether to show progress
325 | :return: the dataframe of molecular structures
326 | """
327 | # Determine default paths
328 | if source_path is not None:
329 | os.environ['PYSTOW_HOME'] = os.path.abspath(source_path)
330 | version = process_data_version(version=version, root_folder=source_path)
331 | source_path = pystow.module('papyrus', version.version_old_fmt)
332 | # Find the files
333 | sd_files = locate_file(source_path.join('structures').as_posix(),
334 | rf'\d+\.\d+_combined_{3 if is3d else 2}D_set_with{"" if is3d else "out"}_stereochemistry.sd.*')
335 | if chunksize is None:
336 | data = []
337 | # Iterate through the file
338 | with MolSupplier(sd_files[0], show_progress=True) as f_handle:
339 | for _, mol in f_handle:
340 | # Obtain SD molecular properties
341 | props = mol.GetPropsAsDict()
342 | # If IDs given and not in the list, skip
343 | if ids is not None and props['InChIKey' if is3d else 'connectivity'] not in ids:
344 | continue
345 | # Else add structure to the dict
346 | # and add the dict to data
347 | props['mol'] = mol
348 | data.append(props)
349 | # Return the list of dicts as a pandas DataFrame
350 | return pd.DataFrame(data)
351 | else:
352 | # Process the data through an iterator
353 | structure_iterator = _structures_iterator(sd_files[0], chunksize, ids, is3d, verbose)
354 | return structure_iterator
355 |
356 |
357 | def _structures_iterator(sd_file: str, chunksize: int,
358 | ids: Optional[List[str]] = None,
359 | is3d: bool = False, verbose: bool = True) -> Iterator[pd.DataFrame]:
360 | if not isinstance(chunksize, int) or chunksize < 1:
361 | raise ValueError('Chunksize must be a non-null positive integer.')
362 | if verbose:
363 | pbar = tqdm(desc='Loading molecular structures')
364 | data = []
365 | # Iterate through the file
366 | with MolSupplier(sd_file) as f_handle:
367 | for _, mol in f_handle:
368 | # Obtain SD molecular properties
369 | props = mol.GetPropsAsDict()
370 | # If IDs given and not in the list, skip
371 | id_ = props['InChIKey' if is3d else 'connectivity']
372 | if (ids is not None) and (id_ not in ids):
373 | continue
374 | props['mol'] = mol
375 | data.append(props)
376 | # Chunk is complete
377 | if len(data) == chunksize:
378 | if verbose:
379 | pbar.update()
380 | yield pd.DataFrame(data)
381 | data = []
382 | if verbose:
383 | pbar.update()
384 | yield pd.DataFrame(data)
385 |
--------------------------------------------------------------------------------
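To close the section, a hedged end-to-end sketch combining the readers above: stream the Papyrus++ bioactivities for one example target (P30542, the accession used as an example in the docstrings above) and attach Mold2 descriptors. Column names such as `accession`, `pchembl_value_Mean` and `connectivity`, as well as the pChEMBL cut-off and chunk size, are assumptions about the Papyrus schema, and a release must already be downloaded:

```python
import pandas as pd
from papyrus_scripts.reader import read_papyrus, read_molecular_descriptors

# Stream the 2D Papyrus++ bioactivities and keep one target above a pChEMBL cut-off
activities = pd.concat(
    chunk[(chunk['accession'] == 'P30542') & (chunk['pchembl_value_Mean'] >= 6.5)]
    for chunk in read_papyrus(is3d=False, plusplus=True, version='latest', chunksize=100_000)
)

# Load only the Mold2 descriptors of the retained molecules, then join them on connectivity
descriptor_chunks = read_molecular_descriptors(
    desc_type='mold2', is3d=False, version='latest', chunksize=100_000,
    ids=activities['connectivity'].unique().tolist())
dataset = activities.merge(pd.concat(descriptor_chunks), on='connectivity')
print(dataset.shape)
```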