├── tests ├── __init__.py └── test_oop.py ├── src └── papyrus_scripts │ ├── utils │ ├── __init__.py │ ├── aliases.json │ ├── UniprotMatch.py │ ├── mol_reader.py │ ├── links.json │ └── IO.py │ ├── __main__.py │ ├── __init__.py │ ├── matchRCSB.py │ ├── fingerprint.py │ ├── download.py │ ├── neuralnet.py │ └── reader.py ├── figures └── logo │ ├── Papyrus_trnsp-bg.png │ ├── Papyrus_trnsp-bg.svg │ └── Papyrus_trnsp-bg-white.svg ├── setup.py ├── CONTRIBUTING.md ├── notebook_examples └── advanced_querying.ipynb ├── .flake8 ├── LICENSE ├── tox.ini ├── setup.cfg ├── .gitignore └── README.md /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/papyrus_scripts/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """Utility functions.""" 4 | -------------------------------------------------------------------------------- /figures/logo/Papyrus_trnsp-bg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OlivierBeq/Papyrus-scripts/HEAD/figures/logo/Papyrus_trnsp-bg.png -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """Setup module.""" 4 | 5 | import setuptools 6 | 7 | if __name__ == '__main__': 8 | setuptools.setup() 9 | -------------------------------------------------------------------------------- /src/papyrus_scripts/__main__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """Entrypoint module, in case you use `python -m papyrus`. 4 | 5 | Why does this file exist, and why `__main__`? For more info, read: 6 | 7 | - https://www.python.org/dev/peps/pep-0338/ 8 | - https://docs.python.org/3/using/cmdline.html#cmdoption-m 9 | """ 10 | 11 | 12 | from .cli import main 13 | 14 | if __name__ == '__main__': 15 | main() 16 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to this repository 2 | 3 | ## Getting started 4 | - Before contributing, make sure you have a working developping environment set up. 5 | ```bash 6 | pip install tox 7 | ``` 8 | Few *tox* environments are defined for easier linting, testing and documentation generation. 9 | 10 | We enforce strict coding rules. 
: 11 | - To make sure you comply with coding rules use the following command: 12 | ```bash 13 | tox -e isort 14 | tox -e flake8 15 | ``` 16 | - Pyroma checks if the installation information is sufficient 17 | ```bash 18 | tox -e pyroma 19 | ``` 20 | 21 | **DOES NOT WORK AT THE MOMENT:** 22 | Automatic documentation can be generated like so: 23 | ``` 24 | tox -e docs 25 | ``` 26 | 27 | For the entire workflow of linting, testing and documentation 28 | ``` 29 | tox 30 | ``` -------------------------------------------------------------------------------- /notebook_examples/advanced_querying.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "tags": [] 7 | }, 8 | "source": [ 9 | "# Advanced examples: Using Papyrus scripts" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "Coming soon." 17 | ] 18 | } 19 | ], 20 | "metadata": { 21 | "kernelspec": { 22 | "display_name": "Python 3", 23 | "language": "python", 24 | "name": "python3" 25 | }, 26 | "language_info": { 27 | "codemirror_mode": { 28 | "name": "ipython", 29 | "version": 3 30 | }, 31 | "file_extension": ".py", 32 | "mimetype": "text/x-python", 33 | "name": "python", 34 | "nbconvert_exporter": "python", 35 | "pygments_lexer": "ipython3", 36 | "version": "3.6.13" 37 | } 38 | }, 39 | "nbformat": 4, 40 | "nbformat_minor": 4 41 | } 42 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | ############################################### 2 | # Flake8 Configuration # 3 | # (.flake8) # 4 | # Adapted from https://github.com/pybel/pybel # 5 | ############################################### 6 | 7 | # This config can't go in setup.cfg because Python's ConfigParser 8 | # used by setup.pg will interpolate on all of Scott's beautiful % signs 9 | # that make the pretty colored output 10 | 11 | [flake8] 12 | ignore = 13 | # Complains about URLs 14 | S310 15 | exclude = 16 | .tox, 17 | .git, 18 | __pycache__, 19 | docs/source/conf.py, 20 | build, 21 | dist, 22 | tests/fixtures/*, 23 | *.pyc, 24 | *.egg-info, 25 | .cache, 26 | .eggs 27 | max-line-length = 120 28 | # import-order-style = pycharm 29 | application-import-names = 30 | papyrus_scripts 31 | tests 32 | format = ${cyan}%(path)s${reset}:${yellow_bold}%(row)d${reset}:${green_bold}%(col)d${reset}: ${red_bold}%(code)s${reset} %(text)s -------------------------------------------------------------------------------- /src/papyrus_scripts/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """A collection of scripts to handle the Papyrus bioactivity dataset.""" 4 | 5 | from .download import download_papyrus, remove_papyrus 6 | from .reader import (read_papyrus, read_protein_set, read_protein_descriptors, 7 | read_molecular_descriptors, read_molecular_structures) 8 | 9 | from .matchRCSB import update_rcsb_data, get_matches 10 | from .preprocess import (keep_organism, keep_accession, keep_type, keep_source, 11 | keep_protein_class, keep_quality, keep_contains, keep_match, 12 | keep_similar, keep_substructure, keep_not_contains, keep_not_match, 13 | keep_dissimilar, keep_not_substructure, consume_chunks, yscrambling) 14 | 15 | from .modelling import qsar, pcm 16 | 17 | from .utils.mol_reader import MolSupplier 18 | from .utils import IO, UniprotMatch 19 | 
from .utils.IO import PapyrusVersion 20 | 21 | from .oop import PapyrusDataset 22 | 23 | __version__ = '2.1.2' 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 OlivierBeq 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = 3 | pyroma 4 | isort 5 | flake8 6 | tests 7 | requires = tox 8 | 9 | [testenv:tests] 10 | commands = 11 | pytest tests/ 12 | conda_deps= 13 | rdkit 14 | openbabel 15 | pandas 16 | deps = 17 | pytest 18 | conda_channels= 19 | conda-forge 20 | rdkit 21 | description = Run tests to check code validity. 22 | 23 | [testenv:pyroma] 24 | commands = 25 | pyroma . 26 | deps = 27 | pyroma 28 | skip_install = true 29 | description = Make sure setup.cfg is properly written out. 30 | 31 | [testenv:isort] 32 | extras = tests 33 | # Needs a full install so isort can determine own/foreign imports. 34 | deps = 35 | isort 36 | commands = 37 | isort setup.py src tests 38 | 39 | [testenv:flake8] 40 | skip_install = true 41 | deps = 42 | flake8 43 | flake8-assertive 44 | flake8-bandit 45 | flake8-bugbear 46 | flake8-builtins 47 | flake8-colors 48 | flake8-commas 49 | flake8-comprehensions 50 | flake8-docstrings 51 | # flake8-import-order 52 | flake8-isort 53 | flake8-print 54 | flake8-use-fstring 55 | pep8-naming 56 | pydocstyle 57 | commands = 58 | flake8 src/ setup.py tests/ 59 | description = Run the flake8 tool with several plugins (e.g. bandit, docstrings, isort import order) to check code quality. 
60 | -------------------------------------------------------------------------------- /src/papyrus_scripts/utils/aliases.json: -------------------------------------------------------------------------------- 1 | { 2 | "columns":[ 3 | "version", 4 | "alias", 5 | "revision", 6 | "chembl", 7 | "chembl_version", 8 | "excape", 9 | "sharma", 10 | "christmann", 11 | "klaeger", 12 | "merget", 13 | "pickett" 14 | ], 15 | "index":[ 16 | 0, 17 | 1, 18 | 2, 19 | 3 20 | ], 21 | "data":[ 22 | [ 23 | "05.4", 24 | 2022.04, 25 | 2, 26 | true, 27 | 29, 28 | true, 29 | true, 30 | true, 31 | true, 32 | true, 33 | false 34 | ], 35 | [ 36 | "05.5", 37 | 2022.08, 38 | 3, 39 | true, 40 | 30, 41 | true, 42 | true, 43 | true, 44 | true, 45 | true, 46 | false 47 | ], 48 | [ 49 | "05.6", 50 | 2022.11, 51 | 4, 52 | true, 53 | 31, 54 | true, 55 | true, 56 | true, 57 | true, 58 | true, 59 | false 60 | ], 61 | [ 62 | "05.7", 63 | 2024.09, 64 | 2, 65 | true, 66 | 34, 67 | true, 68 | true, 69 | true, 70 | true, 71 | true, 72 | true 73 | ] 74 | ] 75 | } 76 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = papyrus_scripts 3 | version = attr: papyrus_scripts.__version__ 4 | description = A collection of scripts to handle the Papyrus bioactivity dataset 5 | long_description = file: README.md 6 | long_description_content_type = text/markdown 7 | url = https://github.com/OlivierBeq/Papyrus-scripts 8 | author = Olivier J. M. Béquignon - Brandon J. Bongers - Willem Jespers 9 | author_email = "olivier.bequignon.maintainer@gmail.com" 10 | maintainer = Olivier J. M. Béquignon 11 | maintainer_email = "olivier.bequignon.maintainer@gmail.com" 12 | license_file = LICENSE 13 | classifiers = 14 | Development Status :: 2 - Pre-Alpha 15 | Programming Language :: Python 16 | Programming Language :: Python :: 3.10 17 | Programming Language :: Python :: 3.9 18 | Programming Language :: Python :: 3.8 19 | Programming Language :: Python :: 3.7 20 | Programming Language :: Python :: 3.6 21 | keywords = 22 | bioactivity data 23 | QSAR 24 | proteochemometrics 25 | cheminformatics 26 | modelling 27 | machine learning 28 | 29 | 30 | [options] 31 | include_package_data = True 32 | packages = find: 33 | package_dir = 34 | = src 35 | install_requires = 36 | numpy>=2.0.0 37 | pandas 38 | rdkit 39 | requests 40 | joblib 41 | tqdm 42 | mordred 43 | swifter 44 | scikit-learn 45 | xgboost 46 | pystow 47 | prodec 48 | 49 | 50 | [options.packages.find] 51 | where = src 52 | 53 | 54 | [options.package_data] 55 | * = *.json 56 | 57 | 58 | [options.entry_points] 59 | console_scripts = 60 | papyrus = papyrus_scripts.cli:main 61 | 62 | 63 | [options.extras_require] 64 | docs = 65 | sphinx 66 | sphinx-rtd-theme 67 | sphinx-autodoc-typehints 68 | 69 | 70 | testing = 71 | pytest 72 | parameterized 73 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # 
PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # PyCharm stuff: 85 | .idea/ 86 | 87 | # VSCode stuff: 88 | .vscode/ 89 | 90 | # pyenv 91 | .python-version 92 | 93 | # pipenv 94 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 97 | # install all needed dependencies. 98 | #Pipfile.lock 99 | 100 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 101 | __pypackages__/ 102 | 103 | # Celery stuff 104 | celerybeat-schedule 105 | celerybeat.pid 106 | 107 | # SageMath parsed files 108 | *.sage.py 109 | 110 | # Environments 111 | .env 112 | .venv 113 | env/ 114 | venv/ 115 | ENV/ 116 | env.bak/ 117 | venv.bak/ 118 | 119 | # Spyder project settings 120 | .spyderproject 121 | .spyproject 122 | 123 | # Rope project settings 124 | .ropeproject 125 | 126 | # mkdocs documentation 127 | /site 128 | 129 | # mypy 130 | .mypy_cache/ 131 | .dmypy.json 132 | dmypy.json 133 | 134 | # Pyre type checker 135 | .pyre/ -------------------------------------------------------------------------------- /src/papyrus_scripts/matchRCSB.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """Match data of the Papyrus dataset with that of the Protein Data Bank.""" 4 | 5 | import os 6 | import time 7 | from typing import Iterator, Generator, Optional, Union 8 | 9 | import pystow 10 | from rdkit import Chem 11 | from rdkit import RDLogger 12 | from tqdm.auto import tqdm 13 | import pandas as pd 14 | from pandas.io.parsers import TextFileReader as PandasTextFileReader 15 | import requests 16 | 17 | from .utils import UniprotMatch 18 | 19 | 20 | def update_rcsb_data(root_folder: Optional[str] = None, 21 | overwrite: bool = False, 22 | verbose: bool = True 23 | ) -> pd.DataFrame: 24 | """Update the local data of the RCSB. 25 | 26 | :param root_folder: Directory where Papyrus bioactivity data is stored (default: pystow's home folder) 27 | :param overwrite: Whether to overwrite the local file if already present 28 | (default: False if the local file was downloaded today. 29 | :param verbose: Should logging information be printed. 
30 | :return: The mapping between PDB and UniProt identifiers 31 | """ 32 | # Define output path 33 | if root_folder is not None: 34 | os.environ['PYSTOW_HOME'] = os.path.abspath(root_folder) 35 | root_folder = pystow.module('papyrus') 36 | output_path = root_folder.join('rcsb', name='RCSB_data.tsv.xz') 37 | # Check if file is too recent 38 | if (os.path.isfile(output_path) and (time.time() - os.path.getmtime(output_path)) < 86400) and not overwrite: 39 | if verbose: 40 | print(f'RCSB data was obtained less than 24 hours ago: {output_path}\n' 41 | f'Set overwrite=True to force the fetching of data again.') 42 | return pd.read_csv(output_path, sep='\t') 43 | # Obtain the mapping InChI to PDB ligand code 44 | if verbose: 45 | print(f'Obtaining RCSB compound mappings from InChI to PDB ID') 46 | base_url = 'http://ligand-expo.rcsb.org/dictionaries/{}' 47 | request = requests.get(base_url.format('Components-inchi.ich')) 48 | if request.status_code != 200: 49 | raise IOError(f'resource could not be accessed: {request.url}') 50 | inchi_data = pd.DataFrame([line.split('\t')[:2] for line in request.text.splitlines()], 51 | columns=['InChI', 'PDBID']) 52 | # Process InChI for 2D data 53 | if verbose: 54 | pbar = tqdm(enumerate(inchi_data.InChI), total=inchi_data.shape[0], desc='Converting InChIs', ncols=100) 55 | else: 56 | pbar = enumerate(inchi_data.InChI) 57 | RDLogger.DisableLog('rdApp.*') 58 | for i, inchi in pbar: 59 | mol = Chem.MolFromInchi(inchi) 60 | if mol is not None: 61 | Chem.RemoveStereochemistry(mol) 62 | inchi_data.loc[i, 'InChI_2D'] = Chem.MolToInchi(mol) 63 | RDLogger.EnableLog('rdApp.*') 64 | # Obtain the mapping of PDB ids ligand to proteins structures 65 | if verbose: 66 | print(f'Obtaining RCSB compound mappings from ligand PDB ID to protein PDB ID') 67 | request = requests.get(base_url.format('cc-to-pdb.tdd')) 68 | if request.status_code != 200: 69 | raise IOError(f'resource could not be accessed: {request.url}') 70 | pdbid_data = pd.DataFrame([line.split('\t')[:2] for line in request.text.splitlines()], 71 | columns=['PDBIDlig', 'PDBIDprot']) 72 | # Merge both dataframe 73 | if verbose: 74 | print(f'Combining the data') 75 | pdb_data = inchi_data.merge(pdbid_data, left_on='PDBID', right_on='PDBIDlig') 76 | # Unmerge the data per protein PDB ID 77 | pdb_data.PDBIDprot = pdb_data.PDBIDprot.str.split() 78 | pdb_data = pdb_data.explode('PDBIDprot') 79 | # Map PDBID prot to UniProt acessions 80 | if verbose: 81 | print(f'Obtaining mappings from protein PDB ID to UniProt accessions') 82 | uniprot_mapping = UniprotMatch.uniprot_mappings(pdb_data.PDBIDprot.tolist(), 83 | map_from='PDB', 84 | map_to='UniProtKB_AC-ID') # Forces the use of SIFTS 85 | # Join on the RCSB data 86 | if verbose: 87 | print(f'Combining the data') 88 | pdb_data = pdb_data.merge(uniprot_mapping, left_on='PDBIDprot', right_on='PDB') 89 | # Rename columns 90 | pdb_data = pdb_data.rename(columns={'InChI': 'InChI_3D', 91 | 'PDBIDlig': 'PDBID_ligand', 92 | 'PDBIDprot': 'PDBID_protein', 93 | 'UniProtKB_AC-ID': 'UniProt_accession'}) 94 | # Drop duplicate information 95 | pdb_data = pdb_data.drop(columns=['PDBID', 'PDB']) 96 | # Reorder columns 97 | pdb_data = pdb_data[['InChI_3D', 'InChI_2D', 'PDBID_ligand', 'PDBID_protein', 'UniProt_accession']] 98 | # Write to disk and return 99 | if verbose: 100 | print(f'Writing results to disk') 101 | pdb_data.to_csv(output_path, sep='\t', index=False) 102 | return pdb_data 103 | 104 | 105 | def get_matches(data: Union[pd.DataFrame, PandasTextFileReader, Iterator], 106 | 
root_folder: Optional[str] = None, 107 | verbose: bool = True, 108 | total: Optional[int] = None, 109 | update: bool = True) -> Union[pd.DataFrame, Generator]: 110 | """ 111 | 112 | :param data: Papyrus data to be mapped with PDB identifiers 113 | :param root_folder: Directory where Papyrus bioactivity data is stored (default: pystow's home folder) 114 | :param verbose: show progress if data is and Iterator or a PandasTextFileReader 115 | :param total: Total number of chunks for progress display 116 | :param update: should the local cache of PDB identifiers be updated 117 | :return: The subset of Papyrus data with matching RCSB PDB identifiers 118 | """ 119 | if isinstance(data, (PandasTextFileReader, Iterator)): 120 | return _chunked_get_matches(data, root_folder, verbose, total) 121 | if isinstance(data, pd.DataFrame): 122 | if 'connectivity' in data.columns: 123 | identifier = 'InChI_2D' 124 | elif 'InChIKey' in data.columns: 125 | identifier = 'InChI_3D' 126 | elif 'accession' in data.columns: 127 | raise ValueError('data does not contain either connectivity or InChIKey data.') 128 | else: 129 | raise ValueError('data does not contain either connectivity, InChIKey or protein accession data.') 130 | # Update the data if possible 131 | if update: 132 | _ = update_rcsb_data(root_folder, verbose=verbose) 133 | # Set pystow root folder 134 | if root_folder is not None: 135 | os.environ['PYSTOW_HOME'] = os.path.abspath(root_folder) 136 | root_folder = pystow.module('papyrus') 137 | rcsb_data_path = root_folder.join('rcsb', name='RCSB_data.tsv.xz') 138 | # Read the data mapping 139 | rcsb_data = pd.read_csv(rcsb_data_path, sep='\t') 140 | # Process InChI 141 | data = data[data['InChI'].isin(rcsb_data[identifier])] 142 | data = data.merge(rcsb_data, left_on=['InChI', 'accession'], right_on=[identifier, 'UniProt_accession']) 143 | data = data.drop(columns=['InChI_2D', 'InChI_3D', 'UniProt_accession']) 144 | data = data.groupby('Activity_ID').aggregate({column: ';'.join 145 | if column == 'PDBID_protein' 146 | else 'first' 147 | for column in data.columns}) 148 | return data 149 | else: 150 | raise TypeError('data can only be a pandas DataFrame, TextFileReader or an Iterator') 151 | 152 | 153 | def _chunked_get_matches(chunks: Union[PandasTextFileReader, Iterator], root_folder: Optional[str], verbose: bool, 154 | total: int): 155 | if verbose: 156 | pbar = tqdm(chunks, total=total, ncols=100) 157 | else: 158 | pbar = chunks 159 | for chunk in pbar: 160 | processed_chunk = get_matches(chunk, root_folder, update=False) 161 | yield processed_chunk 162 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Papyrus-scripts 2 | 3 | Collection of scripts to interact with the Papyrus bioactivity dataset. 4 | 5 | ![alt text](https://raw.githubusercontent.com/OlivierBeq/Papyrus-scripts/refs/heads/master/figures/papyrus_workflow.svg) 6 | 7 |
8 | 
9 | **Associated Article:** 10.1186/s13321-022-00672-x 
10 | ``` 
11 | Béquignon OJM, Bongers BJ, Jespers W, IJzerman AP, van de Water B, van Westen GJP. 
12 | Papyrus - A large scale curated dataset aimed at bioactivity predictions. 
13 | J Cheminform 15, 3 (2023). https://doi.org/10.1186/s13321-022-00672-x 
14 | ``` 
15 | 
16 | **Associated Preprint:** 10.33774/chemrxiv-2021-1rxhk 
17 | ``` 
18 | Béquignon OJM, Bongers BJ, Jespers W, IJzerman AP, van de Water B, van Westen GJP. 
19 | Papyrus - A large scale curated dataset aimed at bioactivity predictions. 
20 | ChemRxiv. Cambridge: Cambridge Open Engage; 2021; 
21 | This content is a preprint and has not been peer-reviewed. 
22 | ``` 
23 | 
24 | ## Installation 
25 | 
26 | ```bash 
27 | pip install papyrus-scripts 
28 | ``` 
29 | 
30 | :warning: If pip gives the following error, which later results in import errors: 
31 | ```bash 
32 | Defaulting to user installation because normal site-packages is not writeable 
33 | ``` 
34 | Then uninstall and reinstall the library with the following commands: 
35 | ```bash 
36 | pip uninstall -y papyrus-scripts 
37 | python -m pip install papyrus-scripts 
38 | ``` 
39 | 
40 | Additional dependencies can be installed to allow: 
41 | - similarity and substructure searches 
42 | ```bash 
43 | conda install FPSim2 openbabel h5py cupy -c conda-forge 
44 | ``` 
45 | 
46 | - training DNN models: 
47 | ```bash 
48 | conda install pytorch torchvision torchaudio cudatoolkit=11.3 -c pytorch 
49 | ``` 
50 | 
51 | ## Getting started 
52 | 
53 | ### The new application programming interface (API) 
54 | This new object-oriented API is available since version 2.0.0. 
55 | 
56 | It allows for easier filtering of the Papyrus data and ensures that any data being queried is downloaded. 
57 | 
58 | ```python 
59 | from papyrus_scripts import PapyrusDataset 
60 | 
61 | data = (PapyrusDataset(version='05.7', plusplus=True) # Downloads the data if needed 
62 |             .keep_source(['chembl', 'sharma']) # Keep specific sources 
63 |             .keep_quality('high') 
64 |             .proteins() # Get the corresponding protein targets 
65 |         ) 
66 | ``` 
67 | 
68 | ### Functional API (legacy) 
69 | 
70 | The functional API requires the data to be downloaded beforehand.
71 | One can download the dataset either with the functional API itself or the command line interface (CLI). 
72 | 
73 | #### Downloading with the command line interface (CLI) 
74 | The following command will download the Papyrus++ bioactivities and protein targets (high-quality Ki and KD data as well as IC50 and EC50 of reproducible assays) for the latest version. 
75 | ```bash 
76 | papyrus download -V latest 
77 | ``` 
78 | The following command will download the entire set of high-, medium-, and low-quality bioactivities and protein targets along with all precomputed molecular and protein descriptors for version 05.5. 
79 | ```bash 
80 | papyrus download -V 05.5 --more --d all 
81 | ``` 
82 | The following command will download Papyrus++ bioactivities, protein targets and compound structures for both version 05.4 and 05.5. 
83 | ```bash 
84 | papyrus download -V 05.5 -V 05.4 -S 
85 | ``` 
86 | 
87 | More options can be found using 
88 | ```bash 
89 | papyrus download --help 
90 | ``` 
91 | 
92 | By default, the data is downloaded to [pystow](https://github.com/cthoyt/pystow)'s default directory. 
93 | One can override the folder path by specifying the `-o` switch in the above commands.
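For example, to place the data in a custom folder instead (the destination path below is purely illustrative):
```bash
papyrus download -V latest -o /path/to/papyrus_data
```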
94 | 
95 | #### Downloading with the functional API 
96 | 
97 | ```python 
98 | 
99 | from papyrus_scripts import download_papyrus 
100 | 
101 | # Download the latest version of the entire dataset with all precomputed descriptors 
102 | download_papyrus(version='latest', only_pp=False, structures=True, descriptors='all') 
103 | ``` 
104 | 
105 | #### Querying with the functional API 
106 | 
107 | The query detailed above using the object-oriented API is reproduced below using the functional API. 
108 | 
109 | ```python 
110 | from papyrus_scripts import (read_papyrus, read_protein_set, 
111 |                              keep_quality, keep_source, keep_type, 
112 |                              keep_organism, keep_accession, keep_protein_class, 
113 |                              keep_match, keep_contains, 
114 |                              consume_chunks) 
115 | 
116 | chunk_reader = read_papyrus(version='05.7', plusplus=True, is3d=False, chunksize=1_000_000) 
117 | protein_data = read_protein_set(version='05.7') 
118 | filter1 = keep_source(data=chunk_reader, source=['chembl', 'sharma']) 
119 | filter2 = keep_quality(data=filter1, min_quality='high') 
120 | data = consume_chunks(filter2, progress=False) 
121 | 
122 | protein_data = protein_data.set_index('target_id').loc[data.target_id.unique()].reset_index() 
123 | ``` 
124 | 
125 | ## Versions of the Papyrus dataset 
126 | 
127 | Different online servers host the Papyrus data based on release and ChEMBL version (table below). 
128 | 
129 | 
130 | | Papyrus version | ChEMBL version | Zenodo | 4TU | 
131 | |:---------------:|:--------------:|:---------------------------------------------------------:|:---------------------------------------------------------:| 
132 | | 05.4 | 29 | [:heavy_check_mark:](https://zenodo.org/records/10943992) | [:heavy_check_mark:](https://doi.org/10.4121/16896406.v2) | 
133 | | 05.5 | 30 | [:heavy_check_mark:](https://zenodo.org/records/7019873) | :x: | 
134 | | 05.6 | 31 | [:heavy_check_mark:](https://zenodo.org/records/7373213) | :x: | 
135 | | 05.7 | 34 | [:heavy_check_mark:](https://zenodo.org/records/13787633) | :x: | 
136 | 
137 | Precomputed molecular and protein descriptors along with molecular structures (2D for the default set and 3D for the low-quality set with stereochemistry) are not available for version 05.4 from 4TU but are from Google Drive. 
138 | 
139 | As stated in the pre-print, **we strongly encourage** the use of the dataset in which stereochemistry was not considered. 
140 | This corresponds to files containing the mention "2D" and/or "without_stereochemistry". 
141 | 
142 | ## Interconversion of the compressed files 
143 | 
144 | The available LZMA-compressed files (*.xz*) may not be supported by some software (e.g. Pipeline Pilot). 
145 |
**Decompressing the data is strongly discouraged!**
146 | Though Gzip files were made available at 4TU for version 05.4, we now provide a CLI option to locally interconvert from LZMA to Gzip and vice-versa. 
147 | 
148 | To convert from LZMA to Gzip (or vice-versa), use the following command: 
149 | ```bash 
150 | papyrus convert -v latest 
151 | ``` 
152 | 
153 | ## Removal of the data 
154 | 
155 | One can remove the Papyrus data using either the CLI or the API. 
156 | 
157 | The following excerpts exemplify the removal of all Papyrus data files, including all versions' utility files. 
158 | ```bash 
159 | papyrus clean --remove_root 
160 | ``` 
161 | 
162 | ```python 
163 | from papyrus_scripts import remove_papyrus 
164 | 
165 | remove_papyrus(papyrus_root=True) 
166 | ``` 
167 | 
168 | 
169 | ## Easy handling of the dataset 
170 | 
171 | Once installed, the Papyrus-scripts allow for the easy filtering of the data. 
172 | - Simple examples can be found in the simple_examples.ipynb notebook. [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/OlivierBeq/Papyrus-scripts/blob/master/notebook_examples/simple_examples.ipynb) 
173 | - An example on matching data with the Protein Data Bank can be found in the matchRCSB.ipynb notebook (a minimal sketch is also shown below). [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/OlivierBeq/Papyrus-scripts/blob/master/notebook_examples/matchRCSB.ipynb) 
174 | - More advanced examples will be added to the advanced_querying.ipynb notebook.
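As referenced above, here is a minimal sketch of matching filtered bioactivity data with RCSB Protein Data Bank entries, assuming the corresponding Papyrus version has already been downloaded (the accession used is purely illustrative):
```python
from papyrus_scripts import read_papyrus, keep_accession, consume_chunks, get_matches

# Read the standardised (2D) bioactivity data in chunks and keep a single target
chunk_reader = read_papyrus(is3d=False, version='05.7', chunksize=1_000_000)
filtered = keep_accession(chunk_reader, 'P00533')  # illustrative UniProt accession
data = consume_chunks(filtered, progress=False)

# Keep only the activities whose compound and protein both match an RCSB PDB entry
pdb_matches = get_matches(data, verbose=True)
```
The resulting dataframe should only contain activities for which both the compound and its protein target could be linked to the RCSB PDB, with the matching PDB identifiers added as extra columns.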
175 | ## Reproducing results of the pre-print 
176 | 
177 | The scripts used to extract subsets, generate models and obtain visualizations can be found here. 
178 | 
179 | ## Features to come 
180 | 
181 | - [x] Substructure and similarity molecular searches 
182 | - [x] ability to use DNN models 
183 | - [x] ability to repeat model training over multiple seeds 
184 | - [x] y-scrambling 
185 | - [ ] adapt models to QSPRpred 
186 | 
187 | ## Examples to come 
188 | 
189 | - Use of custom grouping schemes for training/test set splitting and cross-validation 
190 | - Use custom molecular and protein descriptors (either Python function or file on disk) 
191 | 
192 | 
193 | ## Logos 
194 | 
195 | Logos can be found under **figures/logo**. 
196 | Two versions exist depending on the background used. 
197 | 
198 | :warning: GitHub does not render the white logo properly in the table below, but that should not deter you from using it! 
199 | 
200 |
201 | 202 | | On white background | On colored background | 203 | |:--------------------------------------------------------------------------------------------------------------------------------------:|:--------------------------------------------------------------------------------------------------------------------------------------------:| 204 | | | | 205 | 206 |
207 | -------------------------------------------------------------------------------- /src/papyrus_scripts/fingerprint.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Callable, List 2 | import json 3 | import hashlib 4 | from abc import ABC, abstractmethod 5 | 6 | import numpy as np 7 | from rdkit import Chem 8 | from rdkit import DataStructs 9 | from rdkit.Chem import rdMolDescriptors 10 | from rdkit.Avalon import pyAvalonTools 11 | try: 12 | from openbabel import pybel 13 | except ImportError as e: 14 | pybel = e 15 | try: 16 | import FPSim2 17 | from FPSim2.FPSim2lib.utils import BitStrToIntList, PyPopcount 18 | except ImportError as e: 19 | FPSim2 = e 20 | 21 | 22 | class Fingerprint(ABC): 23 | def __init__(self, name: str, params: Dict, call_func: Callable): 24 | self.name = name 25 | self.params = params 26 | self.func = call_func 27 | # determine length 28 | self.length = None 29 | if "nBits" in params.keys(): 30 | self.length = params["nBits"] 31 | elif "fpSize" in params.keys(): 32 | self.length = params["fpSize"] 33 | elif self.name == "MACCSKeys": 34 | self.length = 166 35 | elif self.name == "FP2": 36 | self.length = 1024 37 | elif self.name == "FP3": 38 | self.length = 55 39 | elif self.name == "FP4": 40 | self.length = 307 41 | if not self.length: 42 | raise Exception("fingerprint size is not specified") 43 | self._hash = self.name + json.dumps(self.params, sort_keys=True) 44 | self._hash = hashlib.sha256((self._hash).encode()).digest() 45 | self._hash = np.frombuffer(self._hash, dtype=np.int64) 46 | self._hash = abs(np.sum(self._hash)) % 65537 47 | self._hash = f'{hex(self._hash)}' 48 | 49 | def __repr__(self): 50 | return f'{self.name}_{self.length}bits_{self._hash}' 51 | 52 | @classmethod 53 | def derived(cls): 54 | if not cls.__subclasses__(): 55 | return cls 56 | subclasses = [] 57 | for subclass in cls.__subclasses__(): 58 | subclass_derived = subclass.derived() 59 | if isinstance(subclass_derived, list): 60 | subclasses.extend(subclass_derived) 61 | else: 62 | subclasses.append(subclass_derived) 63 | return subclasses 64 | 65 | @abstractmethod 66 | def get(self, mol: Chem.Mol) -> List[int]: 67 | """Get the bistring fingerprint of the molecule""" 68 | 69 | 70 | class RDKitFingerprint(Fingerprint): 71 | def get(self, mol: Chem.Mol) -> List[int]: 72 | """Get the bistring fingerprint of the molecule and popcounts""" 73 | if isinstance(FPSim2, ImportError): 74 | raise ImportError('Some required dependencies are missing:\n\ttables, FPSim2') 75 | fp = BitStrToIntList(self.func(mol, **self.params).ToBitString()) 76 | popcnt = PyPopcount(np.array(fp, dtype=np.uint64)) 77 | return (*fp, popcnt) 78 | 79 | 80 | class MACCSKeysFingerprint(RDKitFingerprint): 81 | def __init__(self): 82 | super(MACCSKeysFingerprint, self).__init__('MACCSKeys', {}, rdMolDescriptors.GetMACCSKeysFingerprint) 83 | 84 | 85 | class AvalonFingerprint(RDKitFingerprint): 86 | def __init__(self, nBits: int = 512, isQuery: bool = False, resetVect: bool = False, bitFlags: int = 15761407): 87 | super(AvalonFingerprint, self).__init__('Avalon', 88 | {'nBits': nBits, 89 | 'isQuery': isQuery, 90 | 'resetVect': resetVect, 91 | 'bitFlags': bitFlags}, 92 | pyAvalonTools.GetAvalonFP) 93 | 94 | 95 | class MorganFingerprint(RDKitFingerprint): 96 | def __init__(self, radius: int = 2, nBits: int = 2048, invariants: list = [], fromAtoms: list = [], 97 | useChirality: bool = False, useBondTypes: bool = True, useFeatures: bool = False): 98 | 
super(MorganFingerprint, self).__init__('Morgan', 99 | {'radius': radius, 100 | 'nBits': nBits, 101 | 'invariants': invariants, 102 | 'fromAtoms': fromAtoms, 103 | 'useChirality': useChirality, 104 | 'useBondTypes': useBondTypes, 105 | 'useFeatures': useFeatures}, 106 | rdMolDescriptors.GetMorganFingerprintAsBitVect) 107 | 108 | 109 | class TopologicalTorsionFingerprint(RDKitFingerprint): 110 | def __init__(self, nBits: int = 2048, targetSize: int = 4, fromAtoms: List = 0, 111 | ignoreAtoms: List = 0, atomInvariants: List = 0, includeChirality: bool = False): 112 | super(TopologicalTorsionFingerprint, self 113 | ).__init__('TopologicalTorsion', 114 | {"nBits": nBits, 115 | "targetSize": targetSize, 116 | "fromAtoms": fromAtoms, 117 | "ignoreAtoms": ignoreAtoms, 118 | "atomInvariants": atomInvariants, 119 | "includeChirality": includeChirality, }, 120 | rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect) 121 | 122 | 123 | class AtomPairFingerprint(RDKitFingerprint): 124 | def __init__(self, nBits: int = 2048, minLength: int = 1, maxLength: int = 30, 125 | fromAtoms: List = 0, ignoreAtoms: List = 0, atomInvariants: List = 0, 126 | nBitsPerEntry: int = 4, includeChirality: bool = False, 127 | use2D: bool = True, confId: int = -1): 128 | super(AtomPairFingerprint, self).__init__('AtomPair', 129 | {"nBits": nBits, 130 | "minLength": minLength, 131 | "maxLength": maxLength, 132 | "fromAtoms": fromAtoms, 133 | "ignoreAtoms": ignoreAtoms, 134 | "atomInvariants": atomInvariants, 135 | "nBitsPerEntry": nBitsPerEntry, 136 | "includeChirality": includeChirality, 137 | "use2D": use2D, 138 | "confId": confId}, 139 | rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect) 140 | 141 | 142 | class RDKitTopologicalFingerprint(RDKitFingerprint): 143 | def __init__(self, fpSize: int = 2048, minPath: int = 1, maxPath: int = 7, nBitsPerHash: int = 2, 144 | useHs: bool = True, tgtDensity: float = 0.0, minSize: int = 128, 145 | branchedPaths: bool = True, useBondOrder: bool = True, atomInvariants: List = 0, 146 | fromAtoms: List = 0, atomBits: List = None, bitInfo: List = None): 147 | super(RDKitTopologicalFingerprint, self).__init__('RDKFingerprint', 148 | {"minPath": minPath, 149 | "maxPath": maxPath, 150 | "fpSize": fpSize, 151 | "nBitsPerHash": nBitsPerHash, 152 | "useHs": useHs, 153 | "tgtDensity": tgtDensity, 154 | "minSize": minSize, 155 | "branchedPaths": branchedPaths, 156 | "useBondOrder": useBondOrder, 157 | "atomInvariants": atomInvariants, 158 | "fromAtoms": fromAtoms, 159 | "atomBits": atomBits, 160 | "bitInfo": bitInfo}, 161 | Chem.RDKFingerprint) 162 | 163 | 164 | class RDKPatternFingerprint(RDKitFingerprint): 165 | def __init__(self, fpSize: int = 2048, atomCounts: list = [], setOnlyBits: list = None): 166 | super(RDKPatternFingerprint, self).__init__('RDKPatternFingerprint', 167 | {'fpSize': fpSize, 168 | 'atomCounts': atomCounts, 169 | 'setOnlyBits': setOnlyBits}, 170 | Chem.PatternFingerprint) 171 | 172 | 173 | class OBFingerprint(Fingerprint): 174 | def __init__(self, name: str, params: Dict, call_func: Callable): 175 | if isinstance(pybel, ImportError) and isinstance(FPSim2, ImportError): 176 | raise ImportError('Some required dependencies are missing:\n\topenbabel, FPSim2') 177 | elif isinstance(pybel, ImportError): 178 | raise ImportError('Some required dependencies are missing:\n\topenbabel') 179 | elif isinstance(FPSim2, ImportError): 180 | raise ImportError('Some required dependencies are missing:\n\tFPSim2') 181 | super(OBFingerprint, self).__init__(name, params, 
call_func) 182 | 183 | def get(self, mol: Chem.Mol) -> List[int]: 184 | """Get the bistring fingerprint of the molecule and popcounts""" 185 | binvec = DataStructs.ExplicitBitVect(self.length) 186 | obmol = pybel.readstring('smi', Chem.MolToSmiles(mol)) 187 | binvec.SetBitsFromList([x - 1 for x in obmol.calcfp(self.func).bits]) 188 | fp = BitStrToIntList(binvec.ToBitString()) 189 | popcnt = PyPopcount(np.array(fp, dtype=np.uint64)) 190 | return (*fp, popcnt) 191 | 192 | 193 | class FP2Fingerprint(OBFingerprint): 194 | def __init__(self): 195 | super(FP2Fingerprint, self).__init__('FP2', 196 | {}, 197 | 'FP2') 198 | 199 | 200 | class FP3Fingerprint(OBFingerprint): 201 | def __init__(self): 202 | super(FP3Fingerprint, self).__init__('FP3', 203 | {}, 204 | 'FP3') 205 | 206 | 207 | class FP4Fingerprint(OBFingerprint): 208 | def __init__(self): 209 | super(FP4Fingerprint, self).__init__('FP4', 210 | {}, 211 | 'FP4') 212 | 213 | 214 | def get_fp_from_name(fp_name, **kwargs): 215 | """Get the fingerprint TYPE corresponding to a name 216 | :param fp_name: Name of the fingerprint 217 | :param kwargs: parameters specific to the desired fingerprint 218 | :return: fingerprint instance 219 | """ 220 | fps = {fp().name: fp for fp in Fingerprint.derived()} 221 | if fp_name not in fps.keys(): 222 | raise ValueError(r'Fingerprint {fp_name} not available') 223 | return fps[fp_name](**kwargs) 224 | -------------------------------------------------------------------------------- /figures/logo/Papyrus_trnsp-bg.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | -------------------------------------------------------------------------------- /src/papyrus_scripts/utils/UniprotMatch.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """Functions to interact with UniProt.""" 4 | 5 | import re 6 | import json 7 | import time 8 | import zlib 9 | from typing import List, Union 10 | from xml.etree import ElementTree 11 | from urllib.parse import urlparse, parse_qs, urlencode 12 | 13 | import pandas as pd 14 | import requests 15 | from requests.adapters import HTTPAdapter, Retry 16 | 17 | 18 | def uniprot_mappings(query: Union[str, List[str]], 19 | map_from: str = 'ID', 20 | map_to: str = 'PDB_ID', 21 | taxon: str = None 22 | ) -> pd.DataFrame: 23 | """Map identifiers using the UniProt identifier mapping tool. 24 | 25 | :param query: list or space delimited string of identifiers 26 | :param map_from: type of input identifiers (default: accession) 27 | :param map_to: type of desired output identifiers 28 | (default: PDB identifiers) 29 | :param taxon: taxon to be mapped to if 'map_from' is 'Gene_Name' 30 | 31 | If mapping from {'PDB', 'PDB_ID'} to {'UniProtKB_AC-ID', 'ACC'} 32 | and query is None, then returns all SIFTS mappings. 
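    For example (illustrative identifier), ``uniprot_mappings(['1M17'], map_from='PDB', map_to='UniProtKB_AC-ID')``
    maps a PDB code to its UniProt accession(s) through the SIFTS flat files.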
33 | 34 | See: https://www.uniprot.org/help/api_idmapping 35 | """ 36 | if isinstance(query, str): 37 | query = [query] 38 | # If mapping PDB to UniProt, use SIFTS flat files 39 | if map_from in ['PDB', 'PDB_ID'] and map_to in ['UniProtKB_AC-ID', 'ACC']: 40 | # Obtain mappings from SIFTS 41 | data = pd.read_csv('ftp://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/uniprot_pdb.tsv.gz', 42 | sep='\t', skiprows=[0] 43 | ).rename(columns={'SP_PRIMARY': map_to, 'PDB': map_from}) 44 | # Reorganize columns 45 | data = data[[map_from, map_to]] 46 | # Split by PDB 47 | data[map_from] = data[map_from].str.split(';') 48 | # Unmerge rows according to PDB 49 | data = data.explode(column=map_from).reset_index(drop=True) 50 | if query is not None: 51 | query = [x.lower() for x in query] 52 | data = data[data[map_from].str.lower().isin(query)] 53 | return data 54 | else: 55 | # Use UniProt API 56 | matching = UniprotMatch() 57 | matches = matching.uniprot_id_mapping(query, map_from, map_to, taxon, verbose=False) 58 | df = pd.DataFrame.from_dict(matches, orient='index') 59 | df = df.reset_index().rename(columns={'index': map_from, 0: map_to}) 60 | return df 61 | 62 | 63 | class UniprotMatch: 64 | def __init__(self, 65 | polling_interval: int = 3, 66 | api_url: str = 'https://rest.uniprot.org', 67 | retry: Retry = None): 68 | """Instantiate a class to match UniProt identifiers. 69 | 70 | Based on: https://www.uniprot.org/help/id_mapping#submitting-an-id-mapping-job 71 | """ 72 | self._api_url = api_url 73 | self._polling_interval = polling_interval 74 | if retry is None: 75 | self._retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504]) 76 | else: 77 | self._retries = retry 78 | self._session = requests.Session() 79 | self._session.mount("https://", HTTPAdapter(max_retries=self._retries)) 80 | 81 | 82 | def _submit_id_mapping(self, from_db, to_db, ids, taxon=None): 83 | if from_db == 'Gene_Name' and taxon is None: 84 | raise ValueError('Taxon must be provided when mapping from gene names.') 85 | if taxon is None: 86 | request = requests.post( 87 | f"{self._api_url}/idmapping/run", 88 | data={"from": from_db, "to": to_db, "ids": ",".join(ids)}, 89 | ) 90 | else: 91 | request = requests.post( 92 | f"{self._api_url}/idmapping/run", 93 | data={"from": from_db, "to": to_db, "ids": ",".join(ids), "taxId": taxon} 94 | ) 95 | request.raise_for_status() 96 | return request.json()["jobId"] 97 | 98 | def _get_next_link(self, headers): 99 | re_next_link = re.compile(r'<(.+)>; rel="next"') 100 | if "Link" in headers: 101 | match = re_next_link.match(headers["Link"]) 102 | if match: 103 | return match.group(1) 104 | 105 | def _check_id_mapping_results_ready(self, job_id, verbose): 106 | while True: 107 | request = self._session.get(f"{self._api_url}/idmapping/status/{job_id}") 108 | request.raise_for_status() 109 | j = request.json() 110 | if "jobStatus" in j: 111 | if j["jobStatus"] == "RUNNING": 112 | if verbose: 113 | print(f"Retrying in {self._polling_interval}s") 114 | time.sleep(self._polling_interval) 115 | else: 116 | raise Exception(request["jobStatus"]) 117 | else: 118 | return bool(j["results"] or j["failedIds"]) 119 | 120 | def _get_batch(self, batch_response, file_format, compressed): 121 | batch_url = self._get_next_link(batch_response.headers) 122 | while batch_url: 123 | batch_response = self._session.get(batch_url) 124 | batch_response.raise_for_status() 125 | yield self._decode_results(batch_response, file_format, compressed) 126 | batch_url = 
self._get_next_link(batch_response.headers) 127 | 128 | def _combine_batches(self, all_results, batch_results, file_format): 129 | if file_format == "json": 130 | for key in ("results", "failedIds"): 131 | if key in batch_results and batch_results[key]: 132 | all_results[key] += batch_results[key] 133 | elif file_format == "tsv": 134 | return all_results + batch_results[1:] 135 | else: 136 | return all_results + batch_results 137 | return all_results 138 | 139 | def _get_id_mapping_results_link(self, job_id): 140 | url = f"{self._api_url}/idmapping/details/{job_id}" 141 | request = self._session.get(url) 142 | request.raise_for_status() 143 | return request.json()["redirectURL"] 144 | 145 | def _decode_results(self, response, file_format, compressed): 146 | if compressed: 147 | decompressed = zlib.decompress(response.content, 16 + zlib.MAX_WBITS) 148 | if file_format == "json": 149 | j = json.loads(decompressed.decode("utf-8")) 150 | return j 151 | elif file_format == "tsv": 152 | return [line for line in decompressed.decode("utf-8").split("\n") if line] 153 | elif file_format == "xlsx": 154 | return [decompressed] 155 | elif file_format == "xml": 156 | return [decompressed.decode("utf-8")] 157 | else: 158 | return decompressed.decode("utf-8") 159 | elif file_format == "json": 160 | return response.json() 161 | elif file_format == "tsv": 162 | return [line for line in response.text.split("\n") if line] 163 | elif file_format == "xlsx": 164 | return [response.content] 165 | elif file_format == "xml": 166 | return [response.text] 167 | return response.text 168 | 169 | def _get_xml_namespace(self, element): 170 | m = re.match(r"\{(.*)\}", element.tag) 171 | return m.groups()[0] if m else "" 172 | 173 | def _merge_xml_results(self, xml_results): 174 | merged_root = ElementTree.fromstring(xml_results[0]) 175 | for result in xml_results[1:]: 176 | root = ElementTree.fromstring(result) 177 | for child in root.findall("{http://uniprot.org/uniprot}entry"): 178 | merged_root.insert(-1, child) 179 | ElementTree.register_namespace("", self._get_xml_namespace(merged_root[0])) 180 | return ElementTree.tostring(merged_root, encoding="utf-8", xml_declaration=True) 181 | 182 | def _print_progress_batches(self, batch_index, size, total): 183 | n_fetched = min((batch_index + 1) * size, total) 184 | print(f"Fetched: {n_fetched} / {total}") 185 | 186 | def _get_id_mapping_results_search(self, url, verbose: bool = False): 187 | parsed = urlparse(url) 188 | query = parse_qs(parsed.query) 189 | file_format = query["format"][0] if "format" in query else "json" 190 | if "size" in query: 191 | size = int(query["size"][0]) 192 | else: 193 | size = 500 194 | query["size"] = size 195 | compressed = ( 196 | query["compressed"][0].lower() == "true" if "compressed" in query else False 197 | ) 198 | parsed = parsed._replace(query=urlencode(query, doseq=True)) 199 | url = parsed.geturl() 200 | request = self._session.get(url) 201 | request.raise_for_status() 202 | results = self._decode_results(request, file_format, compressed) 203 | total = int(request.headers["x-total-results"]) 204 | if verbose: 205 | self._print_progress_batches(0, size, total) 206 | for i, batch in enumerate(self._get_batch(request, file_format, compressed), 1): 207 | results = self._combine_batches(results, batch, file_format) 208 | if verbose: 209 | self._print_progress_batches(i, size, total) 210 | if file_format == "xml": 211 | return self._merge_xml_results(results) 212 | return results 213 | 214 | def _get_id_mapping_results_stream(self, url): 
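        """Fetch all id-mapping results in a single request, rewriting a '/results/' URL to its '/stream/' equivalent."""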
215 | if "/stream/" not in url: 216 | url = url.replace("/results/", "/stream/") 217 | request = self._session.get(url) 218 | request.raise_for_status() 219 | parsed = urlparse(url) 220 | query = parse_qs(parsed.query) 221 | file_format = query["format"][0] if "format" in query else "json" 222 | compressed = ( 223 | query["compressed"][0].lower() == "true" if "compressed" in query else False 224 | ) 225 | return self._decode_results(request, file_format, compressed) 226 | 227 | def uniprot_id_mapping(self, 228 | ids: list, from_db: str = "UniProtKB_AC-ID", to_db: str = None, 229 | taxon: str = None, verbose: bool = True 230 | ) -> dict: 231 | """ 232 | Map Uniprot identifiers into other databases. 233 | 234 | For a list of the available identifiers, check the 235 | `To database` list on https://www.uniprot.org/id-mapping 236 | 237 | :param ids: IDs to be mapped from 238 | :param from_db: Type of identifier supplied through 'ids' 239 | :param to_db: Type of identifier to be obtained 240 | :param taxon: Taxon ID of the species if 'from_db' is 'Gene_Name' 241 | :param verbose: Increase verbosity 242 | :return: A dictionary with query ids as keys and the respective mapped results 243 | 244 | Adapted from David Araripe's (@DavidAraripe) original code 245 | """ 246 | job_id = self._submit_id_mapping(from_db=from_db, to_db=to_db, ids=ids, taxon=taxon) 247 | if self._check_id_mapping_results_ready(job_id, verbose): 248 | link = self._get_id_mapping_results_link(job_id) 249 | r = self._get_id_mapping_results_search(link) 250 | r_dict = {idx: r["results"][idx] for idx in range(len(r["results"]))} 251 | r_df = pd.DataFrame.from_dict(r_dict, orient="index") 252 | query_to_newIDs = dict() 253 | for id in r_df["from"].unique(): 254 | subset_df = r_df[r_df["from"] == id] 255 | if isinstance(subset_df["to"].tolist()[0], str): 256 | query_to_newIDs[id] = " ".join(list(subset_df["to"].unique())) 257 | elif isinstance(subset_df["to"].tolist()[0], dict): 258 | query_to_newIDs[id] = " ".join(set(subset_df["to"].apply(lambda row: row['primaryAccession']))) 259 | return query_to_newIDs 260 | -------------------------------------------------------------------------------- /tests/test_oop.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import unittest 4 | from itertools import product 5 | 6 | import numpy as np 7 | import pandas as pd 8 | from parameterized import parameterized, parameterized_class 9 | 10 | from src.papyrus_scripts import PapyrusDataset 11 | from src.papyrus_scripts import reader, preprocess 12 | 13 | 14 | # Size of chunks of raw file to read 15 | CHUNKSIZE = int(1e6) 16 | # Path root directory of raw files (None = pystow's default) 17 | SOURCE_PATH = None 18 | 19 | 20 | def parametrized_test_name_func(testcase_func, _, param): 21 | return "%s_%s" %( 22 | testcase_func.__name__, 23 | parameterized.to_safe_name("_".join(str(x) for x in param.args)), 24 | ) 25 | 26 | 27 | def parametrized_testclass_name_func(cls, _, params_dict): 28 | return "{}_{}".format( 29 | cls.__name__, 30 | parameterized.to_safe_name("_".join(f'{k}_{v}' for k, v in params_dict.items())), 31 | ) 32 | 33 | 34 | @parameterized_class( 35 | ('stereo', 'version', 'plusplus'), 36 | list(product( 37 | [True, False], 38 | ['05.4', '05.5', '05.6'], 39 | [True, False] 40 | )), class_name_func=parametrized_testclass_name_func) 41 | class TestPapyrusDataset(unittest.TestCase): 42 | 43 | def setUp(self): 44 | pass 45 | 46 | def assertDataFrameEqual(self, df1: 
pd.DataFrame, df2: pd.DataFrame): 47 | # Ensure NaN values can be compared 48 | df1.fillna('NaN', inplace=True) 49 | df2.fillna('NaN', inplace=True) 50 | # Ensure dataframes are not empty 51 | self.assertFalse(df1.empty) 52 | self.assertFalse(df2.empty) 53 | # Check number of lines 54 | self.assertEqual(len(df1), len(df2)) 55 | # Check number of columns 56 | self.assertEqual(df1.shape[1], df2.shape[1]) 57 | # Check column names 58 | self.assertTrue((df1.columns == df2.columns).all()) 59 | # Check content column by columns 60 | for j_col in range(df1.shape[1]): 61 | # First check dtype 62 | self.assertEqual(df1.iloc[:, j_col].dtype, df2.iloc[:, j_col].dtype) 63 | # Check content 64 | self.assertEqual(df1.iloc[:, j_col].tolist(), 65 | df2.iloc[:, j_col].tolist()) 66 | 67 | def test_medium_quality_kinase(self): 68 | if self.plusplus and self.stereo: 69 | # No chiral data in the Papyrus++ 70 | with self.assertRaises(ValueError): 71 | reader.read_papyrus(is3d=self.stereo, version=self.version, plusplus=self.plusplus, 72 | chunksize=CHUNKSIZE, source_path=SOURCE_PATH) 73 | return 74 | # 1) Obtain data through the functional API 75 | fn_data = reader.read_papyrus(is3d=self.stereo, version=self.version, plusplus=self.plusplus, 76 | chunksize=CHUNKSIZE, source_path=SOURCE_PATH) 77 | # Read protein targets 78 | fn_protein_data = reader.read_protein_set(version=self.version, source_path=SOURCE_PATH) 79 | # Keep up to medium quality data (Papyrus++ only contains high quality) 80 | fn_filter1 = preprocess.keep_quality(fn_data, 'medium') 81 | # Keep kinases 82 | fn_filter2 = preprocess.keep_protein_class(fn_filter1, fn_protein_data, 83 | classes={'l2': 'Kinase'}) 84 | # Aggregate the data 85 | fn_data_agg = preprocess.consume_chunks(fn_filter2, progress=(not self.plusplus)) 86 | # 2) Obtain data through the object-oriented API 87 | oop_data_agg = (PapyrusDataset(is3d=self.stereo, version=self.version, plusplus=self.plusplus, 88 | chunksize=CHUNKSIZE, source_path=SOURCE_PATH) 89 | .keep_quality('medium') 90 | .keep_protein_class({'l2': 'Kinase'}) 91 | .aggregate(progress=(not self.plusplus))) 92 | # 3) Ensure datasets are equal 93 | self.assertDataFrameEqual(fn_data_agg, oop_data_agg) 94 | del fn_protein_data 95 | # 4) Test values 96 | for quality in oop_data_agg.Quality.unique(): 97 | self.assertIn(quality.lower(), ['high', 'medium']) 98 | self.assertEqual(oop_data_agg.Classification.str.split('->').str[1].unique(), ['Kinase']) 99 | 100 | def test_all_quality_human_adenosine_receptors_ic50(self): 101 | if self.plusplus and self.stereo: 102 | # No chiral data in the Papyrus++ 103 | with self.assertRaises(ValueError): 104 | reader.read_papyrus(is3d=self.stereo, version=self.version, plusplus=self.plusplus, 105 | chunksize=CHUNKSIZE, source_path=SOURCE_PATH) 106 | return 107 | # 1) Obtain data through the functional API 108 | fn_data = reader.read_papyrus(is3d=self.stereo, version=self.version, plusplus=self.plusplus, 109 | chunksize=CHUNKSIZE, source_path=SOURCE_PATH) 110 | # Read protein targets 111 | fn_protein_data = reader.read_protein_set(version=self.version, source_path=SOURCE_PATH) 112 | # Keep human targets 113 | fn_filter1 = preprocess.keep_organism(fn_data, fn_protein_data, 114 | organism='Homo sapiens (Human)') 115 | # Keep adenosine receptors 116 | fn_filter2 = preprocess.keep_protein_class(fn_filter1, fn_protein_data, 117 | classes={'l5': 'Adenosine receptor'}) 118 | # Keep IC50 119 | fn_filter3 = preprocess.keep_type(fn_filter2, activity_types='ic50') 120 | # Aggregate the data 121 | 
fn_data_agg = preprocess.consume_chunks(fn_filter3, progress=(not self.plusplus)) 122 | # 2) Obtain data through the object-oriented API 123 | oop_data_agg = (PapyrusDataset(is3d=self.stereo, version=self.version, plusplus=self.plusplus, 124 | chunksize=CHUNKSIZE, source_path=SOURCE_PATH) 125 | .keep_organism('Homo sapiens (Human)') 126 | .keep_protein_class({'l5': 'Adenosine receptor'}) 127 | .keep_activity_type('ic50') 128 | .aggregate(progress=(not self.plusplus))) 129 | # 3) Ensure datasets are equal 130 | self.assertDataFrameEqual(fn_data_agg, oop_data_agg) 131 | del fn_data_agg 132 | # 4) Test values 133 | self.assertEqual(oop_data_agg.Classification.str.split('->').str[4].unique(), ['Adenosine receptor']) 134 | self.assertEqual(oop_data_agg.type_IC50.astype(int).unique().tolist(), [1]) 135 | oop_data_proteins = (PapyrusDataset.from_dataframe(oop_data_agg, self.stereo, self.version, self.plusplus) 136 | .proteins(progress=True) 137 | .to_dataframe(False)) 138 | self.assertEqual(len(oop_data_agg.accession.unique()), len(oop_data_proteins)) 139 | self.assertEqual(oop_data_proteins.Organism.unique().tolist(), ['Homo sapiens (Human)']) 140 | 141 | def test_chembl_mouse_cc_chemokine_receptors_ki_and_kd(self): 142 | if self.plusplus and self.stereo: 143 | with self.assertRaises(ValueError): 144 | # No chiral data in the Papyrus++ 145 | reader.read_papyrus(is3d=self.stereo, version=self.version, plusplus=self.plusplus, 146 | chunksize=CHUNKSIZE, source_path=SOURCE_PATH) 147 | return 148 | # 1) Obtain data through the functional API 149 | fn_data = reader.read_papyrus(is3d=self.stereo, version=self.version, plusplus=self.plusplus, 150 | chunksize=CHUNKSIZE, source_path=SOURCE_PATH) 151 | # Read protein targets 152 | fn_protein_data = reader.read_protein_set(version=self.version, source_path=SOURCE_PATH) 153 | # Keep ChEMBL data 154 | fn_filter1 = preprocess.keep_source(fn_data, 'chembl') 155 | # Keep human targets 156 | fn_filter2 = preprocess.keep_organism(fn_filter1, fn_protein_data, 157 | organism='Mus musculus (Mouse)') 158 | # Keep C-C chemokine receptors 159 | fn_filter3 = preprocess.keep_protein_class(fn_filter2, fn_protein_data, 160 | classes={'l5': 'CC chemokine receptor'}) 161 | # Drop CCL2 and CCL5 162 | fn_filter4 = preprocess.keep_not_match(fn_filter3, 'accession', ['P13500', 'P13501']) 163 | # Keep IC50 164 | fn_filter5 = preprocess.keep_type(fn_filter4, activity_types=['ki', 'kd']) 165 | # Aggregate the data 166 | fn_data_agg = preprocess.consume_chunks(fn_filter5, progress=(not self.plusplus)) 167 | # 2) Obtain data through the object-oriented API 168 | oop_data_agg = (PapyrusDataset(is3d=self.stereo, version=self.version, plusplus=self.plusplus, 169 | chunksize=CHUNKSIZE, source_path=SOURCE_PATH) 170 | .keep_source('chembl') 171 | .keep_organism('Mus musculus (Mouse)') 172 | .keep_protein_class({'l5': 'CC chemokine receptor'}) 173 | .not_isin('accession', ['P13500', 'P13501']) 174 | .keep_activity_type(['ki', 'kd']) 175 | .aggregate(progress=(not self.plusplus))) 176 | # 3) Ensure datasets are equal 177 | self.assertDataFrameEqual(fn_data_agg, oop_data_agg) 178 | del fn_data_agg 179 | # 4) Test values 180 | self.assertEqual(len(oop_data_agg.source.unique()), 1) 181 | self.assertTrue(oop_data_agg.source.unique().item().lower().startswith('chembl')) 182 | self.assertTrue(oop_data_agg.type_IC50.dropna().astype(int).unique().item() == 0) 183 | self.assertTrue(oop_data_agg.type_EC50.dropna().astype(int).unique().item() == 0) 184 | 
self.assertTrue(oop_data_agg.type_other.replace({'NA': np.nan, 'NaN': np.nan, 'nan': np.nan}) 185 | .dropna().empty or (oop_data_agg.type_other.replace({'NA': np.nan, 'NaN': np.nan, 'nan': np.nan}) 186 | .dropna().astype(int).unique().item() == 0)) 187 | self.assertEqual((oop_data_agg[['type_KD', 'type_Ki']] 188 | .astype(int). 189 | drop_duplicates() 190 | .apply(lambda x: sorted(x), axis=1) 191 | .tolist()), 192 | [[0, 1], [0, 1]] 193 | ) 194 | self.assertEqual(oop_data_agg.Classification.str.split('->').str[4].unique(), ['CC chemokine receptor']) 195 | for accession in oop_data_agg.accession.unique(): 196 | self.assertNotIn(accession, ['P13500', 'P13501']) 197 | oop_data_proteins = (PapyrusDataset.from_dataframe(oop_data_agg, self.stereo, self.version, self.plusplus) 198 | .proteins(progress=True) 199 | .to_dataframe(False)) 200 | self.assertEqual(oop_data_proteins.Organism.unique().tolist(), ['Mus musculus (Mouse)']) 201 | 202 | def test_sharma_klaeger_christman_egfr_specific_mutants_no_chirality(self): 203 | if self.plusplus and self.stereo: 204 | # No chiral data in the Papyrus++ 205 | with self.assertRaises(ValueError): 206 | reader.read_papyrus(is3d=self.stereo, version=self.version, plusplus=self.plusplus, 207 | chunksize=CHUNKSIZE, source_path=SOURCE_PATH) 208 | return 209 | # 1) Obtain data through the functional API 210 | fn_data = reader.read_papyrus(is3d=self.stereo, version=self.version, plusplus=self.plusplus, 211 | chunksize=CHUNKSIZE, source_path=SOURCE_PATH) 212 | # Keep data related to the human EGFR from its accession 213 | fn_filter1 = preprocess.keep_accession(fn_data, 'P00533') 214 | # Keep specific mutants 215 | fn_filter2 = preprocess.keep_match(fn_filter1, 'target_id', ['P00533_L858R', 'P00533_L861Q']) 216 | # Keep only molecules without chiral centers 217 | fn_filter3 = preprocess.keep_contains(fn_filter2, 'InChIKey', 'UHFFFAOYSA') 218 | # Keep data from the Sharma, Klaeger and Christmann-Franck datasets 219 | fn_filter4 = preprocess.keep_source(fn_filter3, ['sharma', 'klaeger', 'christman']) 220 | # Keep only molecules without chiral centers 221 | fn_filter5 = preprocess.keep_not_contains(fn_filter4, 'InChIKey', '-O$', regex=True) 222 | # Aggregate the data 223 | fn_data_agg = preprocess.consume_chunks(fn_filter5, progress=(not self.plusplus)) 224 | # 2) Obtain data through the object-oriented API 225 | oop_data_agg = (PapyrusDataset(is3d=self.stereo, version=self.version, plusplus=self.plusplus, 226 | chunksize=CHUNKSIZE, source_path=SOURCE_PATH) 227 | .keep_accession('P00533') 228 | .isin('target_id', ['P00533_L858R', 'P00533_L861Q']) 229 | .contains('InChIKey', 'UHFFFAOYSA') 230 | .keep_source(['sharma', 'klaeger', 'christman']) 231 | .not_contains('InChIKey', '-O$', regex=True) 232 | .aggregate(progress=(not self.plusplus))) 233 | # 3) Ensure datasets are equal 234 | self.assertDataFrameEqual(fn_data_agg, oop_data_agg) 235 | del fn_data_agg 236 | # 4) Test values 237 | self.assertEqual(oop_data_agg.accession.unique().item(), 'P00533') 238 | self.assertEqual(np.sort(oop_data_agg.target_id.unique()).tolist(), ['P00533_L858R', 'P00533_L861Q']) 239 | self.assertEqual(oop_data_agg.InChIKey.str.split('-').str[1].unique(), 'UHFFFAOYSA') 240 | self.assertNotEqual(oop_data_agg.InChIKey.str.split('-').str[2].unique(), 'O') 241 | -------------------------------------------------------------------------------- /figures/logo/Papyrus_trnsp-bg-white.svg: -------------------------------------------------------------------------------- 1 | 2 | 17 | 19 | 20 | 22 | 
image/svg+xml 23 | 25 | 26 | 27 | 28 | 29 | 49 | 51 | 53 | 58 | 63 | 64 | 66 | 71 | 76 | 77 | 79 | 85 | 91 | 92 | 94 | 100 | 106 | 107 | 108 | 111 | 114 | 117 | 121 | 125 | 129 | 133 | 137 | 141 | 147 | 151 | 157 | 158 | 162 | 163 | 170 | 176 | 179 | 180 | 186 | 189 | 195 | 198 | 202 | 205 | 206 | 208 | 211 | 214 | 217 | 220 | 221 | 222 | 227 | 230 | 234 | 237 | 238 | 240 | 244 | 248 | 252 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | -------------------------------------------------------------------------------- /src/papyrus_scripts/utils/mol_reader.py: -------------------------------------------------------------------------------- 1 | import bz2 2 | import gzip 3 | import io 4 | import lzma 5 | import re 6 | import warnings 7 | from typing import Iterable, Optional, Tuple, Callable, Union 8 | 9 | from rdkit import Chem 10 | from rdkit import RDLogger 11 | from rdkit.Chem import ForwardSDMolSupplier, MaeMolSupplier, MolFromMol2Block, SmilesMolSupplierFromText, \ 12 | SmilesMolSupplier 13 | from tqdm.auto import tqdm 14 | 15 | 16 | class ForwardMol2MolSupplier: 17 | def __init__(self, fileobj: Union[str, io.TextIOBase], 18 | sanitize: bool = True, 19 | removeHs: bool = True, 20 | cleanupSubstructures: bool = True): 21 | self.sanitize = sanitize 22 | self.removeHs = removeHs 23 | self.cleanupSubstructures = cleanupSubstructures 24 | self._buffer_size = 32768 # 32kB 25 | self._buffer = b'' 26 | self._mol_delimiter = '@MOLECULE' 27 | if isinstance(fileobj, str): 28 | self._open_supplier = True 29 | self.supplier = open(fileobj) 30 | else: 31 | self._open_supplier = False 32 | self.supplier = fileobj 33 | 34 | def __enter__(self): 35 | return self 36 | 37 | def __exit__(self, exc_type, exc_value, traceback): 38 | self.close() 39 | 40 | def _iterate(self): 41 | self._buffer = self.supplier.read(self._buffer_size) 42 | while True: 43 | i_seps = [x.start() for x in re.finditer(self._mol_delimiter, self._buffer) if x.start() != 0] 44 | if not i_seps: 45 | new_buffer = self.supplier.read(self._buffer_size) 46 | if len(new_buffer): 47 | self._buffer += new_buffer 48 | else: 49 | mol = MolFromMol2Block(self._buffer, 50 | self.sanitize, 51 | self.removeHs, 52 | self.cleanupSubstructures) 53 | yield mol 54 | break 55 | else: 56 | mol = MolFromMol2Block(self._buffer[:i_seps[0]]) 57 | yield mol 58 | self._buffer = self._buffer[i_seps[0]:] 59 | del i_seps[0] 60 | 61 | def __iter__(self): 62 | if not hasattr(self, '_iterator'): 63 | self._iterator = self._iterate() 64 | for values in self._iterator: 65 | yield values 66 | 67 | def __next__(self): 68 | if not hasattr(self, '_iterator'): 69 | self._iterator = self._iterate() 70 | return next(self._iterator) 71 | 72 | def close(self): 73 | if self._open_supplier: 74 | self.supplier.close() 75 | 76 | 77 | class ForwardSmilesMolSupplier: 78 | def __init__(self, fileobj: Union[str, io.TextIOBase], 79 | delimiter: str = '\t', 80 | smilesColumn: int = 0, 81 | nameColumn: int = 1, 82 | titleLine: bool = True, 83 | sanitize: bool = True): 84 | self.delimiter = delimiter 85 | self.smilesColumn = smilesColumn 86 | self.nameColumn = nameColumn 87 | self.titleLine = titleLine 88 | self.sanitize = sanitize 89 | self._buffer_size = 32768 # 32kB 90 | self._buffer = b'' 91 | self._mol_delimiter = '\n' 92 | if isinstance(fileobj, str): 93 | self._open_supplier = True 94 | self.supplier = None 95 | self._iterator = SmilesMolSupplier(fileobj, self.delimiter, self.smilesColumn, self.nameColumn, 96 | self.titleLine, self.sanitize) 97 | else: 98 | self._open_supplier = False 99 | 
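# ForwardSmilesMolSupplier accepts either a path or an open text handle:
# a path (branch above) is delegated directly to RDKit's SmilesMolSupplier,
# while a handle (this branch) is kept and parsed in 32 kB chunks by _iterate()
# using SmilesMolSupplierFromText, one newline-delimited record at a time.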
self.supplier = fileobj 100 | 101 | def __enter__(self): 102 | return self 103 | 104 | def __exit__(self, exc_type, exc_value, traceback): 105 | self.close() 106 | 107 | def _iterate(self): 108 | if self.titleLine: 109 | self.supplier.readline() 110 | self._buffer = self.supplier.read(self._buffer_size) 111 | while True: 112 | i_seps = [x.start() for x in re.finditer(self._mol_delimiter, self._buffer)] 113 | if not i_seps: 114 | new_buffer = self.supplier.read(self._buffer_size) 115 | if len(new_buffer): 116 | self._buffer += new_buffer 117 | else: 118 | if len(self._buffer): 119 | RDLogger.DisableLog('rdApp.*') # Disable logger if no name column 120 | mol = next(SmilesMolSupplierFromText(self._buffer, self._mol_delimiter, self.smilesColumn, 121 | self.nameColumn, False, self.sanitize)) 122 | RDLogger.EnableLog('rdApp.*') # Disable logger if no name column 123 | yield mol 124 | break 125 | else: 126 | RDLogger.DisableLog('rdApp.*') # Disable logger if no name column 127 | mol = next( 128 | SmilesMolSupplierFromText(self._buffer[:i_seps[0] + len(self._mol_delimiter)], self._mol_delimiter, 129 | self.smilesColumn, self.nameColumn, False, self.sanitize)) 130 | RDLogger.EnableLog('rdApp.*') # Disable logger if no name column 131 | yield mol 132 | self._buffer = self._buffer[i_seps[0] + len(self._mol_delimiter):] 133 | del i_seps[0] 134 | 135 | def __iter__(self): 136 | if not hasattr(self, '_iterator'): 137 | self._iterator = self._iterate() 138 | for values in self._iterator: 139 | yield values 140 | 141 | def __next__(self): 142 | if not hasattr(self, '_iterator'): 143 | self._iterator = self._iterate() 144 | return next(self._iterator) 145 | 146 | def close(self): 147 | if self._open_supplier: 148 | del self._iterator 149 | self._iterator = None 150 | 151 | 152 | class MolSupplier: 153 | # class properties 154 | valid_formats = ('smi', 'mae', 'sd', 'mol2', 'mol') 155 | valid_compression = ('lzma', 'zlib', 'bz2') 156 | 157 | def __init__(self, source: Union[str, io.TextIOBase, io.BufferedIOBase] = None, 158 | supplier: Iterable[Chem.Mol] = None, 159 | format: str = None, 160 | compression: str = None, **kwargs): 161 | f"""Molecular supplier handling format and compression. 162 | 163 | :param source: filename or file-like object; 164 | when using a context manager, file-like objects 165 | are not closed upon exit 166 | :param supplier: molecular supplier (e.g. 
rdkit.Chem.ForwardSDMolSupplier) 167 | :param format: data format {self.valid_formats} 168 | can be detected if source is a file name, 169 | must be provided if source is a not file name, 170 | ignored if supplier is not None 171 | :param compression: compression type {self.valid_compression} 172 | can be detected if source is a file name, 173 | ignored otherwise 174 | :param kwargs: keyworded arguments to be passed to the underlying supplier, 175 | ignored if source is supplier 176 | can also hold values for 'start_id', 'total' and 'show_progress' 177 | to be considered when used as an iterable 178 | """ 179 | # source is None 180 | if source is None and supplier is None: 181 | raise ValueError('source or supplier must be supplied') 182 | # Default attributes 183 | self._open_substream = False # should a file be opened 184 | self.filename = None # name of file to be opened 185 | self.open_fn = None # function opening file and handling compression 186 | self._handle = None # handle to opened file 187 | self._open_supplier = False # should a supplier be opened 188 | self.supplier = None # molecule supplier 189 | self.compression = None 190 | self.format = None 191 | self.kwargs = kwargs # additional parameters for suppliers 192 | self._iter_start = self.kwargs.pop('start_id', 0) 193 | self._iter_total = self.kwargs.pop('total', None) 194 | self._iter_progress = self.kwargs.pop('show_progress', None) 195 | # Handle supplier 196 | if supplier is not None: 197 | self.supplier = supplier 198 | # source is a file name 199 | elif isinstance(source, str): 200 | self.filename = source 201 | self._open_substream = True 202 | self._open_supplier = True 203 | # Handle compressions 204 | if compression is not None: 205 | if compression not in self.valid_compression: 206 | raise ValueError(f'compression must be one of {self.valid_compression}') 207 | self.compression = compression 208 | else: 209 | self.compression, self._trunc_filename = self._get_compression(self.filename) 210 | self.open_fn = self._get_compression_handler(self.compression) 211 | # Handle file types 212 | if format is not None: 213 | if format not in self.valid_formats: 214 | raise ValueError(f'format must be one of {self.valid_formats}') 215 | self.format = format 216 | else: 217 | self.format = self._get_format(self._trunc_filename) 218 | # source is file-like object 219 | elif isinstance(source, (io.TextIOBase, io.BufferedIOBase)): 220 | if format is None: 221 | raise ValueError('format must be specified with text or binary readers') 222 | self._handle = source 223 | self._open_supplier = True 224 | self.format = format 225 | else: 226 | raise ValueError('source must either be filename or file-like object') 227 | # Create rdkit suppliers 228 | if self._open_substream: 229 | self._handle = self.open_fn(self.filename) 230 | # if file name or file-like object 231 | if self._open_supplier: 232 | if self.format == 'smi': 233 | self.supplier = ForwardSmilesMolSupplier(self._handle, **self.kwargs) 234 | elif self.format == 'mae': 235 | self.supplier = MaeMolSupplier(self._handle, **self.kwargs) 236 | elif self.format in ['sd', 'mol']: 237 | self.supplier = ForwardSDMolSupplier(self._handle, **self.kwargs) 238 | elif self.format == 'mol2': 239 | self.supplier = ForwardMol2MolSupplier(self._handle, **self.kwargs) 240 | 241 | def set_start_progress_total(self, start: int = 0, progress: bool = True, total: Optional[int] = None): 242 | """Set the start, progress and total for iterating through the supplier. 
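
        Example (a minimal sketch; the file name and total are placeholders)::

            supplier = MolSupplier('compounds.sdf.gz')
            supplier.set_start_progress_total(start=1, progress=True, total=10000)
            for mol_id, mol in supplier:
                ...  # mol_id starts at 1, mol is an rdkit.Chem.Mol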
243 | 244 | :param start: starting value for generated identifiers while enumerating molecules 245 | :param progress: whether a progress bar should be displayed 246 | :param total: total number of molecules in the supplier 247 | """ 248 | self._iter_start = start 249 | self._iter_total = total 250 | self._iter_progress = progress 251 | 252 | def _get_compression(self, filename: str) -> Tuple[Optional[str], str]: 253 | """Get compression type and stripped filename.""" 254 | if filename.endswith('.xz'): 255 | return 'lzma', filename.rstrip('.xz') 256 | elif filename.endswith('.gz'): 257 | return 'zlib', filename.rstrip('.gz') 258 | elif filename.endswith('.bz2'): 259 | return 'bz2', filename.rstrip('.bz2') 260 | else: 261 | return None, filename 262 | 263 | def _get_compression_handler(self, compression_type) -> Callable: 264 | """Get function to deal with the compression.""" 265 | if compression_type == 'lzma': 266 | return lzma.open 267 | elif compression_type == 'zlib': 268 | return gzip.open 269 | elif compression_type == 'bz2': 270 | return bz2.open 271 | elif compression_type is None: 272 | return open 273 | else: 274 | raise ValueError(f'type compression not handled: {compression_type}') 275 | 276 | def _get_format(self, filename) -> str: 277 | """Get file format from filename.""" 278 | if filename.endswith('.smi'): 279 | return 'smi' 280 | elif filename.endswith('.mae'): 281 | return 'mae' 282 | elif filename.endswith(('.sd', '.sdf')): 283 | return 'sd' 284 | elif filename.endswith('.mol2'): 285 | return 'mol2' 286 | elif filename.endswith('.mol'): 287 | return 'mol' 288 | 289 | def _processed_mol_supplier(self) -> Iterable[Tuple[int, Chem.Mol]]: 290 | """Generator function that reads from a rdkit molecule supplier.""" 291 | # handle showing progress 292 | if self._iter_progress: 293 | pbar = tqdm(enumerate(self.supplier, self._iter_start), total=self._iter_total, ncols=100) 294 | else: 295 | pbar = enumerate(self.supplier, self._iter_start) 296 | for mol_id, rdmol in pbar: 297 | if rdmol: 298 | yield mol_id, rdmol 299 | else: 300 | warnings.warn(f'molecule {mol_id} could not be processed') 301 | continue 302 | 303 | def __enter__(self): 304 | return self 305 | 306 | def __exit__(self, exc_type, exc_value, traceback): 307 | self.close() 308 | 309 | def __iter__(self): 310 | if not hasattr(self, '_iterator'): 311 | self._iterator = self._processed_mol_supplier() 312 | for values in self._iterator: 313 | yield values 314 | 315 | def __next__(self): 316 | if not hasattr(self, '_iterator'): 317 | self._iterator = self._processed_mol_supplier() 318 | # self._iterator = self.__iter__() 319 | return next(self._iterator) 320 | 321 | def close(self): 322 | if self._open_supplier: 323 | del self.supplier 324 | self.supplier = None 325 | if self._open_substream: 326 | self._handle.close() 327 | -------------------------------------------------------------------------------- /src/papyrus_scripts/download.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """Download utilities of the Papyrus scripts.""" 4 | 5 | import os 6 | import zipfile 7 | import shutil 8 | from typing import List, Optional, Union 9 | 10 | import requests 11 | import pystow 12 | from tqdm.auto import tqdm 13 | 14 | from .utils.IO import (get_disk_space, enough_disk_space, assert_sha256sum, 15 | read_jsonfile, write_jsonfile, get_papyrus_links) 16 | 17 | 18 | def download_papyrus(outdir: Optional[str] = None, 19 | version: Union[str, List[str]] = 'latest', 
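                     # Recognised `descriptors` keywords (resolved in the body below):
                     # 'mold2', 'cddd', 'mordred', 'fingerprint', 'unirep', 'prodec', or 'all'.
                     # Typical call, shown as a hedged sketch (the output directory is a placeholder):
                     #   download_papyrus(outdir='/data/papyrus', version='latest', only_pp=True,
                     #                    structures=False, descriptors=['fingerprint'])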
20 | nostereo: bool = True, 21 | stereo: bool = False, 22 | only_pp: bool = True, 23 | structures: bool = False, 24 | descriptors: Optional[Union[str, List[str]]] = 'all', 25 | progress: bool = True, 26 | disk_margin: float = 0.10, 27 | update_links: bool = True) -> None: 28 | """Download the Papyrus data. 29 | 30 | :param outdir: directory where Papyrus data is stored (default: pystow's directory) 31 | :param version: version of the dataset to be downloaded 32 | :param nostereo: should 2D data be downloaded 33 | :param only_pp: download only the curated Papyrus++ subset 34 | :param stereo: should 3D data be downloaded 35 | :param structures: should molecule structures be downloaded 36 | :param descriptors: should molecular and protein descriptors be downloaded 37 | :param progress: should progress be displayed 38 | :param disk_margin: percent of free disk space to keep 39 | :param update_links: Should links be updated (allows new versions to be fetched) 40 | """ 41 | 42 | # Determine download parameters 43 | CHUNKSIZE = 1048576 # 1 MB 44 | RETRIES = 3 45 | # Obtain links to files 46 | files = get_papyrus_links(offline=not update_links) 47 | available_versions = list(files.keys()) 48 | if isinstance(version, list): 49 | for _version in version: 50 | if _version not in available_versions + ['latest', 'all']: 51 | raise ValueError(f'version can only be one of [{", ".join(["latest"] + available_versions)}]') 52 | # Identify version 53 | latest_version = sorted(available_versions, key=lambda s: [int(u) for u in s.split('.')])[-1] 54 | if version == 'latest': 55 | version = latest_version 56 | if progress: 57 | print(f'Latest version: {version}') 58 | elif isinstance(version, list) and 'latest' in version: 59 | for i in range(len(version)): 60 | if version[i] == 'latest': 61 | version[i] = latest_version 62 | elif version == 'all' or (isinstance(version, list) and 'all' in version): 63 | version = available_versions 64 | # Transform to list 65 | if not isinstance(version, list): 66 | version = [version] 67 | if not isinstance(descriptors, list): 68 | descriptors = [descriptors] 69 | # Remove duplicates of versions 70 | version = sorted(set(version), key=lambda s: [int(u) for u in s.split('.')]) 71 | # Define root dir for downloads 72 | if outdir is not None: 73 | os.environ['PYSTOW_HOME'] = os.path.abspath(outdir) 74 | papyrus_root = pystow.module('papyrus') 75 | for _version in version: 76 | papyrus_version_root = pystow.module('papyrus', _version) 77 | # Prepare files to be downloaded 78 | downloads = set() 79 | downloads.add('readme') 80 | downloads.add('requirements') 81 | downloads.add('proteins') 82 | if nostereo: 83 | downloads.add('papyrus++') 84 | if not only_pp: 85 | downloads.add('2D_papyrus') 86 | elif progress: 87 | # Ensure this warning is printed when donwloading the Papyrus++ dataset with progress on 88 | print('########## DISCLAIMER ##########\n' 89 | 'You are downloading the high-quality Papyrus++ dataset.\n' 90 | 'Should you want to access the entire, though of lower quality, Papyrus dataset,\n' 91 | 'look into additional switches of this command.\n' 92 | '################################') 93 | if structures: 94 | downloads.add('2D_structures') 95 | if 'mold2' in descriptors or 'all' in descriptors: 96 | downloads.add('2D_mold2') 97 | if 'cddd' in descriptors or 'all' in descriptors: 98 | downloads.add('2D_cddd') 99 | if 'mordred' in descriptors or 'all' in descriptors: 100 | downloads.add('2D_mordred') 101 | if 'fingerprint' in descriptors or 'all' in descriptors: 102 | 
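# Every key added to `downloads` names an entry of utils/links.json for this version;
# its file name, URL, size and SHA256 checksum are looked up from there when the
# files are fetched below.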
downloads.add('2D_fingerprint') 103 | if stereo: 104 | downloads.add('3D_papyrus') 105 | if structures: 106 | downloads.add('3D_structures') 107 | if 'mordred' in descriptors or 'all' in descriptors: 108 | downloads.add('3D_mordred') 109 | if 'fingerprint' in descriptors or 'all' in descriptors: 110 | downloads.add('3D_fingerprint') 111 | if 'unirep' in descriptors or 'all' in descriptors: 112 | downloads.add('proteins_unirep') 113 | if 'prodec' in descriptors or 'all' in descriptors: 114 | downloads.add('proteins_prodec') 115 | # Determine total download size 116 | total = 0 117 | for ftype in downloads: 118 | if ftype == 'proteins_prodec' and ftype not in files[_version] and 'all' in descriptors: 119 | continue 120 | if isinstance(files[_version][ftype], dict): 121 | total += files[_version][ftype]['size'] 122 | elif isinstance(files[_version][ftype], list): 123 | for subfile in files[_version][ftype]: 124 | total += subfile['size'] 125 | else: 126 | raise ValueError('########## ERROR ##########\n' 127 | f'Papyrus versioning file corrupted: {files[_version][ftype]} ' 128 | 'is neither a dict or a list.\nThis is most likely due to bad formatting ' 129 | 'of the underlying parsed JSON files. If you are not the maintainer, please ' 130 | 'remove the Papyrus data and enforce root folder removal and download ' 131 | 'the data before trying again.\n' 132 | '################################') 133 | if progress: 134 | print(f'Number of files to be downloaded: {len(downloads)}\n' 135 | f'Total size: {tqdm.format_sizeof(total)}B') 136 | # Verify enough disk space 137 | if not enough_disk_space(papyrus_version_root.base.as_posix(), total, disk_margin): 138 | print('########## ERROR ##########\n' 139 | f'Not enough disk space ({disk_margin:.0%} kept for safety)\n' 140 | f'Available: {tqdm.format_sizeof(get_disk_space(papyrus_version_root.base.as_posix()))}B\n' 141 | f'Required: {tqdm.format_sizeof(total)}B\n' 142 | '################################') 143 | return 144 | # Download files 145 | if progress: 146 | pbar = tqdm(total=total, desc=f'Downloading version {_version}', unit='B', unit_scale=True) 147 | for ftype in downloads: 148 | if ftype == 'proteins_prodec' and 'proteins_prodec' not in files[_version]: 149 | if 'all' in descriptors: 150 | continue 151 | else: 152 | raise ValueError(f'ProDEC descriptors not available for Papyrus version {_version}') 153 | download = files[_version][ftype] 154 | if not isinstance(download, list): 155 | download = [download] 156 | for subfile in download: 157 | dname, durl, dsize, dhash = subfile['name'], subfile['url'], subfile['size'], subfile['sha256'] 158 | # Determine path 159 | if ftype in ['papyrus++', '2D_papyrus', '3D_papyrus', 'proteins', 'data_types', 'data_size', 160 | 'readme', 'license', 'requirements']: 161 | fpath = papyrus_version_root.join(name=dname).as_posix() 162 | elif ftype in ['2D_structures', '3D_structures']: 163 | fpath = papyrus_version_root.join('structures', name=dname).as_posix() 164 | else: 165 | fpath = papyrus_version_root.join('descriptors', name=dname).as_posix() 166 | # File already exists 167 | if os.path.isfile(fpath) and assert_sha256sum(fpath, dhash): 168 | if progress: 169 | pbar.update(dsize) 170 | continue # skip 171 | # Download file 172 | correct = False # ensure file is not corrupted 173 | retries = RETRIES 174 | while not correct and retries > 0: # Allow 3 failures 175 | session = requests.session() 176 | res = session.get(durl, headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) " 177 | 
"AppleWebKit/537.36 (KHTML, like Gecko) " 178 | "Chrome/39.0.2171.95 " 179 | "Safari/537.36"}, 180 | stream=True, verify=True) 181 | with open(fpath, 'wb') as fh: 182 | for chunk in res.iter_content(chunk_size=CHUNKSIZE): 183 | fh.write(chunk) 184 | if progress: 185 | pbar.update(len(chunk)) 186 | correct = assert_sha256sum(fpath, dhash) 187 | if not correct: 188 | retries -= 1 189 | if progress: 190 | if retries > 0: 191 | message = f'SHA256 hash unexpected for {dname}. Remaining download attempts: {retries}' 192 | else: 193 | message = f'SHA256 hash unexpected for {dname}. All {RETRIES} attempts failed.' 194 | pbar.write(message) 195 | os.remove(fpath) 196 | if retries == 0: 197 | if progress: 198 | pbar.close() 199 | raise IOError(f'Download failed for {dname}') 200 | # Extract if ZIP file 201 | if dname.endswith('.zip'): 202 | with zipfile.ZipFile(fpath) as zip_handle: 203 | for name in zip_handle.namelist(): 204 | subpath = os.path.join(fpath, os.path.pardir) 205 | zip_handle.extract(name, subpath) 206 | os.remove(fpath) 207 | if progress: 208 | pbar.close() 209 | # Save version number 210 | json_file = papyrus_root.join(name='versions.json').as_posix() 211 | if os.path.isfile(json_file): 212 | data = read_jsonfile(json_file) 213 | data.append(_version) 214 | data = sorted(set(data)) 215 | write_jsonfile(data, json_file) 216 | else: 217 | write_jsonfile([_version], json_file) 218 | 219 | 220 | def remove_papyrus(outdir: Optional[str] = None, 221 | version: Union[str, List[str]] = 'latest', 222 | papyruspp: bool = False, 223 | bioactivities: bool = False, 224 | proteins: bool = False, 225 | nostereo: bool = True, 226 | stereo: bool = False, 227 | structures: bool = False, 228 | descriptors: Union[str, List[str]] = 'all', 229 | other_files: bool = False, 230 | version_root: bool = False, 231 | papyrus_root: bool = False, 232 | force: bool = False, 233 | progress: bool = True) -> None: 234 | """Remove the Papyrus data. 235 | 236 | :param outdir: directory where Papyrus data is stored (default: pystow's directory) 237 | :param version: version of the dataset to be removed 238 | :param papyruspp: should Papyrus++ bioactivities be removed 239 | :param bioactivities: should bioactivity data be removed 240 | :param proteins: should protein data be removed 241 | :param nostereo: should the files related to 2D data be considered 242 | :param stereo: should the files related to 3D data be considered 243 | :param structures: should molecule structures be removed 244 | :param descriptors: should molecular and protein descriptors be removed 245 | :param other_files: should other files (e.g. 
LICENSE, README, data_types, data_size) be removed 246 | :param version_root: remove the specified version of the papyrus data, requires confirmation 247 | :param papyrus_root: remove all versions of the papyrus data, requires confirmation 248 | :param force: disable confirmation prompt 249 | :param progress: should progress be displayed 250 | """ 251 | # Obtain links to files 252 | files = get_papyrus_links() 253 | # Handle exceptions 254 | available_versions = list(files.keys()) 255 | if isinstance(version, list): 256 | for _version in version: 257 | if _version not in available_versions + ['latest', 'all']: 258 | raise ValueError(f'version can only be one of [{", ".join(["latest"] + available_versions)}]') 259 | # Identify version 260 | latest_version = sorted(available_versions, key=lambda s: [int(u) for u in s.split('.')])[-1] 261 | if version == 'latest': 262 | version = latest_version 263 | if progress: 264 | print(f'Latest version: {version}') 265 | elif isinstance(version, list) and 'latest' in version: 266 | for i in range(len(version)): 267 | if version[i] == 'latest': 268 | version[i] = latest_version 269 | elif version == 'all' or (isinstance(version, list) and 'all' in version): 270 | version = available_versions 271 | # Transform to list 272 | if not isinstance(version, list): 273 | version = [version] 274 | if not isinstance(descriptors, list): 275 | descriptors = [descriptors] 276 | # Remove duplicates of versions 277 | version = sorted(set(version), key=lambda s: [int(u) for u in s.split('.')]) 278 | # Define root dir for removal 279 | if outdir is not None: 280 | os.environ['PYSTOW_HOME'] = os.path.abspath(outdir) 281 | papyrus_root_dir = pystow.module('papyrus') 282 | # Deep cleaning 283 | if papyrus_root: 284 | if not force: 285 | confirmation = input('Confirm the removal of all Papyrus data and versions (Y/N): ') 286 | if confirmation != 'Y': 287 | print('Removal was aborted.') 288 | return 289 | # Either forced or confirmed 290 | shutil.rmtree(papyrus_root_dir.base.as_posix()) 291 | if progress: 292 | print('All Papyrus data was successfully removed.') 293 | return 294 | for _version in version: 295 | papyrus_version_root = pystow.module('papyrus', _version) 296 | # If removal of the whole version 297 | if version_root: 298 | if not force: 299 | confirmation = input(f'Confirm the removal of version {_version} of Papyrus data (Y/N): ') 300 | if confirmation != 'Y': 301 | print('Removal was aborted.') 302 | return 303 | # Either forced or confirmed 304 | shutil.rmtree(papyrus_version_root.base.as_posix()) 305 | if progress: 306 | print(f'Version {_version} of Papyrus was successfully removed.') 307 | return 308 | # Prepare files to be removed 309 | removal = set() 310 | if bioactivities and papyruspp: 311 | removal.add('papyrus++') 312 | if bioactivities and nostereo: 313 | removal.add('2D_papyrus') 314 | elif bioactivities and stereo: 315 | removal.add('3D_papyrus') 316 | if proteins: 317 | removal.add('proteins') 318 | if structures and nostereo: 319 | removal.add('2D_structures') 320 | elif structures and stereo: 321 | removal.add('3D_structures') 322 | if nostereo and ('mold2' in descriptors or 'all' in descriptors): 323 | removal.add('2D_mold2') 324 | if nostereo and ('cddd' in descriptors or 'all' in descriptors): 325 | removal.add('2D_cddd') 326 | if nostereo and ('mordred' in descriptors or 'all' in descriptors): 327 | removal.add('2D_mordred') 328 | elif stereo and ('mordred' in descriptors or 'all' in descriptors): 329 | removal.add('3D_mordred') 330 | if 
nostereo and ('fingerprint' in descriptors or 'all' in descriptors): 331 | removal.add('2D_fingerprint') 332 | elif stereo and 'fingerprint' in descriptors or 'all' in descriptors: 333 | removal.add('3D_fingerprint') 334 | if 'unirep' in descriptors or 'all' in descriptors: 335 | removal.add('proteins_unirep') 336 | if 'prodec' in descriptors or 'all' in descriptors: 337 | removal.add('proteins_prodec') 338 | if other_files: 339 | removal.add('data_types') 340 | removal.add('data_size') 341 | removal.add('readme') 342 | removal.add('license') 343 | removal = list(removal) 344 | # Determine total removed size 345 | total = 0 346 | for i in range(len(removal) - 1, -1, -1): 347 | ftype = removal[i] 348 | data = files[_version][ftype] 349 | dname, dsize = data['name'], data['size'] 350 | # Determine path 351 | if ftype in ['papyrus++', '2D_papyrus', '3D_papyrus', 'proteins', 'readme']: 352 | fpath = papyrus_version_root.join(name=dname).as_posix() 353 | elif ftype in ['2D_structures', '3D_structures']: 354 | fpath = papyrus_version_root.join('structures', name=dname).as_posix() 355 | else: 356 | fpath = papyrus_version_root.join('descriptors', name=dname).as_posix() 357 | # Handle LICENSE, data_types and data_size separately 358 | if other_files: 359 | fpath = papyrus_version_root.join(name=dname).as_posix() 360 | # Will throw an error if these files do not exist 361 | # Nevertheless they should always exist 362 | os.remove('data_types.json') 363 | os.remove('data_size.json') 364 | os.remove('LICENSE.txt') 365 | # Handle other files 366 | if os.path.isfile(fpath): # file exists 367 | total += dsize # add size to be removed 368 | else: # file does not exist 369 | del removal[i] 370 | if progress: 371 | print(f'Number of files to be removed: {len(removal)}\n' 372 | f'Total size: {tqdm.format_sizeof(total)}B') 373 | # Early stop: 374 | if len(removal) == 0: 375 | return 376 | # Remove files 377 | if progress: 378 | pbar = tqdm(total=total, desc=f'Removing files from version {_version}', unit='B', unit_scale=True) 379 | for ftype in removal: 380 | data = files[_version][ftype] 381 | dname, dsize = data['name'], data['size'] 382 | # Determine path 383 | if ftype in ['papyrus++', '2D_papyrus', '3D_papyrus', 'proteins', 'data_types', 'data_size', 'readme', 'license']: 384 | fpath = papyrus_version_root.join(name=dname).as_posix() 385 | elif ftype in ['2D_structures', '3D_structures']: 386 | fpath = papyrus_version_root.join('structures', name=dname).as_posix() 387 | else: 388 | fpath = papyrus_version_root.join('descriptors', name=dname).as_posix() 389 | # File does not exist 390 | if not os.path.isfile(fpath): 391 | if progress: 392 | pbar.update(dsize) 393 | continue # skip 394 | # Remove file 395 | os.remove(fpath) 396 | pbar.update(dsize) 397 | if progress: 398 | pbar.close() 399 | # Remove version number 400 | json_file = papyrus_root_dir.join(name='versions.json').as_posix() 401 | if os.path.isfile(json_file): 402 | data = read_jsonfile(json_file) 403 | data = [v for v in data if v != _version] 404 | data = sorted(set(data)) 405 | write_jsonfile(data, json_file) 406 | -------------------------------------------------------------------------------- /src/papyrus_scripts/utils/links.json: -------------------------------------------------------------------------------- 1 | { 2 | "05.4": { 3 | "readme": { 4 | "name": "README.txt", 5 | "url": "https://zenodo.org/records/10944245/files/README.txt?download=1", 6 | "size": 8743, 7 | "sha256": 
"f552ae0b58121b20c9aefcce0737e5f31240d72676dc9ec559f97585aceb33ad" 8 | }, 9 | "requirements": [ 10 | { 11 | "name": "LICENSE.txt", 12 | "url": "https://zenodo.org/records/10944245/files/LICENSE.txt?download=1", 13 | "size": 20138, 14 | "sha256": "3b2890eacd851373001c4a14623458e3adaf1b1967939aa9c38a318e28d61c00" 15 | }, 16 | { 17 | "name": "data_types.json", 18 | "url": "https://zenodo.org/records/10944245/files/data_types.json?download=1", 19 | "size": 450559, 20 | "sha256": "d80a5810d99b62680ee1a214df5d5a30f505ec335a0c221194efb91d1c23913e" 21 | }, 22 | { 23 | "name": "data_size.json", 24 | "url": "https://zenodo.org/records/10944245/files/data_size.json?download=1", 25 | "size": 324, 26 | "sha256": "decbe66e14eaeccf5e0f657bb33065600b503e2902503aa59f5ffa81b7126775" 27 | } 28 | ], 29 | "papyrus++": { 30 | "name": "05.4++_combined_set_without_stereochemistry.tsv.xz", 31 | "url": "https://zenodo.org/records/10944245/files/05.4++_combined_set_without_stereochemistry.tsv.xz?download=1", 32 | "size": 40278204, 33 | "sha256": "42dcbe76b33ad541f6c54673eccffa15af64785cf844938c0f73518dfdf4404b" 34 | }, 35 | "2D_papyrus": { 36 | "name": "05.4_combined_set_without_stereochemistry.tsv.xz", 37 | "url": "https://zenodo.org/records/10944245/files/05.4_combined_set_without_stereochemistry.tsv.xz?download=1", 38 | "size": 742110788, 39 | "sha256": "1a1c946917f77d9a250a181c8ef19bea4d04871915e9e75a615893a2c514684e" 40 | }, 41 | "2D_structures": { 42 | "name": "05.4_combined_2D_set_without_stereochemistry.sd.xz", 43 | "url": "https://zenodo.org/records/10944245/files/05.4_combined_2D_set_without_stereochemistry.sd.xz?download=1", 44 | "size": 416640448, 45 | "sha256": "4595f726daf12a784049f20e9f9464ed0287af3a22a27f2a919399c535f633fc" 46 | }, 47 | "3D_papyrus": { 48 | "name": "05.4_combined_set_with_stereochemistry.tsv.xz", 49 | "url": "https://zenodo.org/records/10944245/files/05.4_combined_set_with_stereochemistry.tsv.xz?download=1", 50 | "size": 777395668, 51 | "sha256": "56cf389030246d4525bb31cd3dfc9e5ab3afa9613535d1540c71f0f7426c778f" 52 | }, 53 | "3D_structures": { 54 | "name": "05.4_combined_3D_set_with_stereochemistry.sd.xz", 55 | "url": "https://zenodo.org/records/10944245/files/05.4_combined_3D_set_with_stereochemistry.sd.xz?download=1", 56 | "size": 446702556, 57 | "sha256": "b0f04e066b7ac6b1e1f2a868ff0258b13bd8d3433023ff59c3af58317bfeb3e9" 58 | }, 59 | "2D_fingerprint": { 60 | "name": "05.4_combined_2D_moldescs_ECFP6.tsv.xz", 61 | "url": "https://zenodo.org/records/10944245/files/05.4_combined_2D_moldescs_ECFP6.tsv.xz?download=1", 62 | "size": 141318356, 63 | "sha256": "4ab781cc238107f7c48f1d866eea0e2114068b6512acf74932a5b21958c9ffe0" 64 | }, 65 | "3D_fingerprint": { 66 | "name": "05.4_combined_3D_moldescs_E3FP.tsv.xz", 67 | "url": "https://zenodo.org/records/10944245/files/05.4_combined_3D_moldescs_E3FP.tsv.xz?download=1", 68 | "size": 146751352, 69 | "sha256": "2b89027dad8f4e59f007dd082664a7d2a491f4f79d112fb29f14565acedfe4d0" 70 | }, 71 | "2D_mordred": { 72 | "name": "05.4_combined_2D_moldescs_mordred2D.tsv.xz", 73 | "url": "https://zenodo.org/records/10944245/files/05.4_combined_2D_moldescs_mordred2D.tsv.xz?download=1", 74 | "size": 3085232504, 75 | "sha256": "d15bca59f542a6c46528e4f131cb44d8bd6b21440ab139f4175f4327c15c39c6" 76 | }, 77 | "3D_mordred": { 78 | "name": "05.4_combined_3D_moldescs_mordred3D.tsv.xz", 79 | "url": "https://zenodo.org/records/10944245/files/05.4_combined_3D_moldescs_mordred3D.tsv.xz?download=1", 80 | "size": 2996851908, 81 | "sha256": 
"80fc4f9b2d0b89e68c289c44e9f4df78f4c08e5867cd414d6169a4e1344aead8" 82 | }, 83 | "2D_cddd": { 84 | "name": "05.4_combined_2D_moldescs_CDDDs.tsv.xz", 85 | "url": "https://zenodo.org/records/10944245/files/05.4_combined_2D_moldescs_CDDDs.tsv.xz?download=1", 86 | "size": 3770082588, 87 | "sha256": "9bb0d9adba1b812aa05b6391ecbc3f0148f6ed37972a004b13772d08790a9bda" 88 | }, 89 | "2D_mold2": { 90 | "name": "05.4_combined_2D_moldescs_mold2.tsv.xz", 91 | "url": "https://zenodo.org/records/10944245/files/05.4_combined_2D_moldescs_mold2.tsv.xz?download=1", 92 | "size": 1552425452, 93 | "sha256": "bdfb0cbb6e9a3d1b62065808fa0e6ce238e04760df62e34ce4f15046810efd82" 94 | }, 95 | "proteins": { 96 | "name": "05.4_combined_set_protein_targets.tsv.xz", 97 | "url": "https://zenodo.org/records/10944245/files/05.4_combined_set_protein_targets.tsv.xz?download=1", 98 | "size": 1701316, 99 | "sha256": "5f49030509ce188a119910f16054558e1cdd1c70a22d2a1458ec4189f5d1a08e" 100 | }, 101 | "proteins_unirep": { 102 | "name": "05.4_combined_prot_embeddings_unirep.tsv.xz", 103 | "url": "https://zenodo.org/records/10944245/files/05.4_combined_prot_embeddings_unirep.tsv.xz?download=1", 104 | "size": 138392528, 105 | "sha256": "19aa0562c3b695883c5aa8c05ad0934c4b9b851a26550345940d92ed17f36b93" 106 | } 107 | }, 108 | "05.5": { 109 | "readme": { 110 | "name": "README.txt", 111 | "url": "https://zenodo.org/record/7019874/files/README.txt?download=1", 112 | "size": 11092, 113 | "sha256": "0af036c1d02b150f6402a53960a6e995611e66ee7724b61a21f58d3366ec8eda" 114 | }, 115 | "requirements": [{ 116 | "name": "LICENSE.txt", 117 | "url": "https://zenodo.org/records/10943207/files/LICENSE.txt?download=1", 118 | "size": 20138, 119 | "sha256": "3b2890eacd851373001c4a14623458e3adaf1b1967939aa9c38a318e28d61c00" 120 | }, 121 | { 122 | "name": "data_types.json", 123 | "url": "https://zenodo.org/records/10943207/files/data_types.json?download=1", 124 | "size": 450678, 125 | "sha256": "d38f0b6b53f0450c5530b5bf44d8a7d0bb85417f22b7c818237e3346fe68149c" 126 | }, 127 | { 128 | "name": "data_size.json", 129 | "url": "https://zenodo.org/records/10943207/files/data_size.json?download=1", 130 | "size": 324, 131 | "sha256": "513307863c4acc779789340e900821ff8f38c845865aa078edc649caa1559dcc" 132 | }], 133 | "papyrus++": { 134 | "name": "05.5++_combined_set_without_stereochemistry.tsv.xz", 135 | "url": "https://zenodo.org/records/10943207/files/05.5++_combined_set_without_stereochemistry.tsv.xz?download=1", 136 | "size": 41357608, 137 | "sha256": "8ecaea9533f3c475dca6d335f30dd1b4abb259fa77b7441548dd15879e1afa58" 138 | }, 139 | "2D_papyrus": { 140 | "name": "05.5_combined_set_without_stereochemistry.tsv.xz", 141 | "url": "https://zenodo.org/record/7019874/files/05.5_combined_set_without_stereochemistry.tsv.xz?download=1", 142 | "size": 718601992, 143 | "sha256": "04ecaea97c09d02dbde809ad99ea2127fc3997a4e3b200b56dee85c30801890a" 144 | }, 145 | "2D_structures": { 146 | "name": "05.5_combined_2D_set_without_stereochemistry.sd.xz", 147 | "url": "https://zenodo.org/record/7019874/files/05.5_combined_2D_set_without_stereochemistry.sd.xz?download=1", 148 | "size": 399767580, 149 | "sha256": "2e088ca662c5c33c5fc018c42c9c21e918ec167f1129a0a11fbf9c72888e8be6" 150 | }, 151 | "3D_papyrus": { 152 | "name": "05.5_combined_set_with_stereochemistry.tsv.xz", 153 | "url": "https://zenodo.org/record/7019874/files/05.5_combined_set_with_stereochemistry.tsv.xz?download=1", 154 | "size": 690498416, 155 | "sha256": "822aca70ccf4c19879ae45dfa16de5fc29c3ee08b25739e7a087899652af7dd9" 156 | 
}, 157 | "3D_structures": { 158 | "name": "05.5_combined_3D_set_with_stereochemistry.sd.xz", 159 | "url": "https://zenodo.org/record/7019874/files/05.5_combined_3D_set_with_stereochemistry.sd.xz?download=1", 160 | "size": 492426264, 161 | "sha256": "a4a5355ffc56de8d914c2ad281d10c227171c27e4d6c250daad14a16280cf136" 162 | }, 163 | "2D_fingerprint": { 164 | "name": "05.5_combined_2D_moldescs_ECFP6.tsv.xz", 165 | "url": "https://zenodo.org/record/7019874/files/05.5_combined_2D_moldescs_ECFP6.tsv.xz?download=1", 166 | "size": 97818228, 167 | "sha256": "3d626b4295cfbe73877157d8eea84b911a3cb60bf9571165d88c00cc0b0880d2" 168 | }, 169 | "3D_fingerprint": { 170 | "name": "05.5_combined_3D_moldescs_E3FP.tsv.xz", 171 | "url": "https://zenodo.org/record/7019874/files/05.5_combined_3D_moldescs_E3FP.tsv.xz?download=1", 172 | "size": 114052016, 173 | "sha256": "446fe36d50487f29a2d7402a53cc661097e884dc0df8ffd278646dba6708cb65" 174 | }, 175 | "2D_mordred": { 176 | "name": "05.5_combined_2D_moldescs_mordred2D.tsv.xz", 177 | "url": "https://zenodo.org/record/7019874/files/05.5_combined_2D_moldescs_mordred2D.tsv.xz?download=1", 178 | "size": 2936434876, 179 | "sha256": "bcef94b1c04a1e7d8f9da11ad87e598e19932548a8ea4f00029c2f3a89672ff4" 180 | }, 181 | "3D_mordred": { 182 | "name": "05.5_combined_3D_moldescs_mordred3D.tsv.xz", 183 | "url": "https://zenodo.org/record/7019874/files/05.5_combined_3D_moldescs_mordred3D.tsv.xz?download=1", 184 | "size": 3206020732, 185 | "sha256": "e6ffd0858f85217b57c4a88619e5f41d7f6bae16a9948612872162e54d3231dc" 186 | }, 187 | "2D_cddd": { 188 | "name": "05.5_combined_2D_moldescs_CDDDs.tsv.xz", 189 | "url": "https://zenodo.org/record/7019874/files/05.5_combined_2D_moldescs_CDDDs.tsv.xz?download=1", 190 | "size": 3775676256, 191 | "sha256": "8421d973b4eb119f0739506a0b20ba9508356df97d4673e1c170e871cd134983" 192 | }, 193 | "2D_mold2": { 194 | "name": "05.5_combined_2D_moldescs_mold2.tsv.xz", 195 | "url": "https://zenodo.org/record/7019874/files/05.5_combined_2D_moldescs_mold2.tsv.xz?download=1", 196 | "size": 1553510028, 197 | "sha256": "0fd1c2b3869c5fa749c21ddd70c5dff621974eccafb8e04fd6f95f3b37242058" 198 | }, 199 | "proteins": { 200 | "name": "05.5_combined_set_protein_targets.tsv.xz", 201 | "url": "https://zenodo.org/record/7019874/files/05.5_combined_set_protein_targets.tsv.xz?download=1", 202 | "size": 1710756, 203 | "sha256": "d8f2cbee8b9849f7c3664fe7e8165c5abf785d374c36a8f151a6ec38fd582d80" 204 | }, 205 | "proteins_unirep": { 206 | "name": "05.5_combined_prot_embeddings_unirep.tsv.xz", 207 | "url": "https://zenodo.org/record/7019874/files/05.5_combined_prot_embeddings_unirep.tsv.xz?download=1", 208 | "size": 128869632, 209 | "sha256": "9f1fce00e77563481eafc44405f9dc8188d5669ed93cafaee256c0208ca135b8" 210 | } 211 | }, 212 | "05.6": { 213 | "readme": { 214 | "name": "README.txt", 215 | "url": "https://zenodo.org/record/7377161/files/README.txt?download=1", 216 | "size": 12170, 217 | "sha256": "c60b7146a295ddbd7d1cc0d7815ffa9389d5e93deb0e2a577b1065abcb468e03" 218 | }, 219 | "requirements": { 220 | "name": "05.6_additional_files.zip", 221 | "url": "https://zenodo.org/record/7377161/files/05.6_additional_files.zip?download=1", 222 | "size": 51310, 223 | "sha256": "c1d8df814ba54e17619f3740ff82577898a85a07acd220822403874159e26d8a" 224 | }, 225 | "papyrus++": { 226 | "name": "05.6++_combined_set_without_stereochemistry.tsv.xz", 227 | "url": "https://zenodo.org/records/7821775/files/05.6++_combined_set_without_stereochemistry.tsv.xz?download=1", 228 | "size": 31085780, 229 | "sha256": 
"7518019c3ba287cd4cd0ff29425fe9da8a4760d891d22ed1abb33da4920cf96a" 230 | }, 231 | "2D_papyrus": { 232 | "name": "05.6_combined_set_without_stereochemistry.tsv.xz", 233 | "url": "https://zenodo.org/record/7377161/files/05.6_combined_set_without_stereochemistry.tsv.xz?download=1", 234 | "size": 744449364, 235 | "sha256": "82a36ed7bb2f80846bb46e4c3e38905895bd1a2cfddd471d32091cb59dcf9437" 236 | }, 237 | "2D_structures": { 238 | "name": "05.6_combined_2D_set_without_stereochemistry.sd.xz", 239 | "url": "https://zenodo.org/record/7377161/files/05.6_combined_2D_set_without_stereochemistry.sd.xz?download=1", 240 | "size": 439758444, 241 | "sha256": "1ec001964aca301494ea05fc24529120f01bc6952dcf4276dcd03625dfec460d" 242 | }, 243 | "3D_papyrus": { 244 | "name": "05.6_combined_set_with_stereochemistry.tsv.xz", 245 | "url": "https://zenodo.org/record/7377161/files/05.6_combined_set_with_stereochemistry.tsv.xz?download=1", 246 | "size": 711529352, 247 | "sha256": "62068d500986b78fc90fe82b9e224555f8ca85319cd19f9df8bc73549e8a3e31" 248 | }, 249 | "3D_structures": { 250 | "name": "05.6_combined_3D_set_with_stereochemistry.sd.xz", 251 | "url": "https://zenodo.org/record/7377161/files/05.6_combined_3D_set_with_stereochemistry.sd.xz?download=1", 252 | "size": 500108592, 253 | "sha256": "38e39963cd79845b4adca9dea871ffba18576ea742677471fc46a73a7dabbf38" 254 | }, 255 | "2D_fingerprint": { 256 | "name": "05.6_combined_2D_moldescs_ECFP6.tsv.xz", 257 | "url": "https://zenodo.org/record/7377161/files/05.6_combined_2D_moldescs_ECFP6.tsv.xz?download=1", 258 | "size": 96612972, 259 | "sha256": "01c7366ee2ca7353d3a9f76601702b6d2eb312e71f02ea8ef48e2f90870c266c" 260 | }, 261 | "3D_fingerprint": { 262 | "name": "05.6_combined_3D_moldescs_E3FP.tsv.xz", 263 | "url": "https://zenodo.org/record/7377161/files/05.6_combined_3D_moldescs_E3FP.tsv.xz?download=1", 264 | "size": 117065432, 265 | "sha256": "0d15baa4a9425daf63a0066511e9e96cbd5d7dab223bdaf48803536ab2484dc2" 266 | }, 267 | "2D_mordred": { 268 | "name": "05.6_combined_2D_moldescs_mordred2D.tsv.xz", 269 | "url": "https://zenodo.org/record/7377161/files/05.6_combined_2D_moldescs_mordred2D.tsv.xz?download=1", 270 | "size": 3055443236, 271 | "sha256": "c497db85e97542f59b5252e2b1d3bdd93604e5c4d2ea131088a87d79ea6954c3" 272 | }, 273 | "3D_mordred": { 274 | "name": "05.6_combined_3D_moldescs_mordred3D.tsv.xz", 275 | "url": "https://zenodo.org/record/7377161/files/05.6_combined_3D_moldescs_mordred3D.tsv.xz?download=1", 276 | "size": 3324119256, 277 | "sha256": "6b022acb6a0bec8bfc1ae7585014ae0b812a12ddcbed7be4ac7ec073c662192f" 278 | }, 279 | "2D_cddd": { 280 | "name": "05.6_combined_2D_moldescs_CDDDs.tsv.xz", 281 | "url": "https://zenodo.org/record/7377161/files/05.6_combined_2D_moldescs_CDDDs.tsv.xz?download=1", 282 | "size": 2103289016, 283 | "sha256": "fbb54e5ca9a28ff022dc5baddf87cb6601169a2d86f3b55db4d183fd3885642a" 284 | }, 285 | "2D_mold2": { 286 | "name": "05.6_combined_2D_moldescs_mold2.tsv.xz", 287 | "url": "https://zenodo.org/record/7377161/files/05.6_combined_2D_moldescs_mold2.tsv.xz?download=1", 288 | "size": 1487710808, 289 | "sha256": "cd46ce9841a1f956840b4fe7c56310eaa32c5e957a6ffaca62fbc55f820aad99" 290 | }, 291 | "proteins": { 292 | "name": "05.6_combined_set_protein_targets.tsv.xz", 293 | "url": "https://zenodo.org/record/7377161/files/05.6_combined_set_protein_targets.tsv.xz?download=1", 294 | "size": 1850764, 295 | "sha256": "f443a2f8c74b8eb3f2c9d1bd7bfbddc86cbcc3fd5e8e505b7057b78a4ad17ee1" 296 | }, 297 | "proteins_unirep": { 298 | "name": 
"05.6_combined_protdescs_unirep.tsv.xz", 299 | "url": "https://zenodo.org/record/7377161/files/05.6_combined_protdescs_unirep.tsv.xz?download=1", 300 | "size": 207122632, 301 | "sha256": "47f37c1f1efcb9d6f002d5a096853975c27ddcc767d7903a8af12bac0439181c" 302 | }, 303 | "proteins_prodec": { 304 | "name": "05.6_combined_protdescs_ProDEC.tsv.xz", 305 | "url": "https://zenodo.org/record/7377161/files/05.6_combined_protdescs_ProDEC.tsv.xz?download=1", 306 | "size": 447818916, 307 | "sha256": "3211a62f18ccb7ccc13f885374c1462efeb83ab0e98ed62d2645723f7dc9f1a1" 308 | } 309 | }, 310 | "05.7": { 311 | "readme": { 312 | "name": "README.txt", 313 | "url": "https://zenodo.org/records/13987985/files/README.txt?download=1", 314 | "size": 12813, 315 | "sha256": "fab159a88e302cad3e5b84ccea72a7c9fb2c212f656324d6191865460511f50d" 316 | }, 317 | "requirements": { 318 | "name": "05.7_additional_files.zip", 319 | "url": "https://zenodo.org/records/13987985/files/05.7_additional_files.zip?download=1", 320 | "size": 113945, 321 | "sha256": "0621cd63111286777769e6ea1c59e7adc1d05833bb1f61e50ba9e5be189d60da" 322 | }, 323 | "papyrus++": { 324 | "name": "05.7++_combined_set_without_stereochemistry.tsv.xz", 325 | "url": "https://zenodo.org/records/13987985/files/05.7++_combined_set_without_stereochemistry.tsv.xz?download=1", 326 | "size": 56759540 , 327 | "sha256": "8004e0d1027a760f205b45264386f792e7d49658da39f77f52e660a6f19760dd" 328 | }, 329 | "2D_papyrus": { 330 | "name": "05.7_combined_set_without_stereochemistry.tsv.xz", 331 | "url": "https://zenodo.org/records/13987985/files/05.7_combined_set_without_stereochemistry.tsv.xz?download=1", 332 | "size": 751521856, 333 | "sha256": "9a7657f2636473ea1f4b033c3abbc9709608517f262f97e8adcc8f59d4f1189b" 334 | }, 335 | "2D_structures": { 336 | "name": "05.7_combined_2D_set_without_stereochemistry.sd.xz", 337 | "url": "https://zenodo.org/records/13987985/files/05.7_combined_2D_set_without_stereochemistry.sd.xz?download=1", 338 | "size": 453721500 , 339 | "sha256": "56be7c058130e9e861d884dc6094cf0ac4c3f37a75c7d2c4302685c4720f69ae" 340 | }, 341 | "3D_papyrus": { 342 | "name": "05.7_combined_set_with_stereochemistry.tsv.xz", 343 | "url": "https://zenodo.org/records/13987985/files/05.7_combined_set_with_stereochemistry.tsv.xz?download=1", 344 | "size": 715716852, 345 | "sha256": "88a965ef8827692b1489bc947249e9fc00287ab6b63cbd2767862080a98e9a4c" 346 | }, 347 | "3D_structures": { 348 | "name": "05.7_combined_3D_set_with_stereochemistry.sd.xz", 349 | "url": "https://zenodo.org/records/13987985/files/05.7_combined_3D_set_with_stereochemistry.sd.xz?download=1", 350 | "size": 507933052, 351 | "sha256": "8f1490a701e918e013770ea589651825ca2a459b214f50d6ff9ce892af398def" 352 | }, 353 | "2D_fingerprint": { 354 | "name": "05.7_combined_2D_moldescs_ECFP6.tsv.xz", 355 | "url": "https://zenodo.org/records/13987985/files/05.7_combined_2D_moldescs_ECFP6.tsv.xz?download=1", 356 | "size": 99889764, 357 | "sha256": "6689cd5d3841abc350cb2dba719a2af02e119af2a595f15790ad14e5c4ace108" 358 | }, 359 | "3D_fingerprint": { 360 | "name": "05.7_combined_3D_moldescs_E3FP.tsv.xz", 361 | "url": "https://zenodo.org/records/13987985/files/05.7_combined_3D_moldescs_E3FP.tsv.xz?download=1", 362 | "size": 119952372, 363 | "sha256": "6c247e785e5885e08ecebc2b452e05dcbb24395adabdef71b903d6491e9ae096" 364 | }, 365 | "2D_mordred": { 366 | "name": "05.7_combined_2D_moldescs_mordred2D.tsv.xz", 367 | "url": "https://zenodo.org/records/13987985/files/05.7_combined_2D_moldescs_mordred2D.tsv.xz?download=1", 368 | "size": 
3149391660, 369 | "sha256": "26781e0879af798a6b7df4e6c515efd79599335a04706d5335fdc8e5c5565fc3" 370 | }, 371 | "3D_mordred": { 372 | "name": "05.7_combined_3D_moldescs_mordred3D.tsv.xz", 373 | "url": "https://zenodo.org/records/13987985/files/05.7_combined_3D_moldescs_mordred3D.tsv.xz?download=1", 374 | "size": 3421107392, 375 | "sha256": "c03f3d4c702815d5bfa2ddf80e20717d0bd13a542b0ca30e62534126eef03b0d" 376 | }, 377 | "2D_cddd": { 378 | "name": "05.7_combined_2D_moldescs_CDDDs.tsv.xz", 379 | "url": "https://zenodo.org/records/13987985/files/05.7_combined_2D_moldescs_CDDDs.tsv.xz?download=1", 380 | "size": 2167302956, 381 | "sha256": "d86539cc76a537878725c4ef8a6703d316add737cb51915ad203e346fe92f6c9" 382 | }, 383 | "2D_mold2": { 384 | "name": "05.7_combined_2D_moldescs_mold2.tsv.xz", 385 | "url": "https://zenodo.org/records/13987985/files/05.7_combined_2D_moldescs_mold2.tsv.xz?download=1", 386 | "size": 1539905628, 387 | "sha256": "493436f96d30680568c2a70ed08d76a159b916c57e9df03f639ae7b414fb87cb" 388 | }, 389 | "proteins": { 390 | "name": "05.7_combined_set_protein_targets.tsv.xz", 391 | "url": "https://zenodo.org/records/13987985/files/05.7_combined_set_protein_targets.tsv.xz?download=1", 392 | "size": 1780032, 393 | "sha256": "832e564fb82daea0e4da79abcb44834d10104229382874e79915a1288d80783c" 394 | }, 395 | "proteins_unirep": { 396 | "name": "05.7_combined_protdescs_unirep.tsv.xz", 397 | "url": "https://zenodo.org/records/13987985/files/05.7_combined_protdescs_unirep.tsv.xz?download=1", 398 | "size": 209957000, 399 | "sha256": "a39f21db7ed5ba72ef881a9f05f5362f7aaaa0f2709c023a0060417678f30dec" 400 | }, 401 | "proteins_prodec": { 402 | "name": "05.7_combined_protdescs_ProDEC.tsv.xz", 403 | "url": "https://zenodo.org/records/13987985/files/05.7_combined_protdescs_ProDEC.tsv.xz?download=1", 404 | "size": 435015384, 405 | "sha256": "1ec2d7b0cd95c93aaabacf4153e58e464e4327f0ebb3bad0077fd740b7334cb1" 406 | } 407 | } 408 | } 409 | -------------------------------------------------------------------------------- /src/papyrus_scripts/utils/IO.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """IO functions.""" 4 | 5 | from __future__ import annotations 6 | 7 | import glob 8 | import hashlib 9 | import importlib 10 | import inspect 11 | import json 12 | import os 13 | import re 14 | from collections import namedtuple 15 | 16 | import requests 17 | import shutil 18 | import lzma 19 | import gzip 20 | from typing import List, Optional 21 | 22 | import pystow 23 | import pandas as pd 24 | from tqdm.auto import tqdm 25 | 26 | 27 | def sha256sum(filename, blocksize=None): 28 | if blocksize is None: 29 | blocksize = 65536 30 | hash = hashlib.sha256() 31 | with open(filename, "rb") as fh: 32 | for block in iter(lambda: fh.read(blocksize), b""): 33 | hash.update(block) 34 | return hash.hexdigest() 35 | 36 | 37 | def assert_sha256sum(filename, sha256, blocksize=None): 38 | if not (isinstance(sha256, str) and len(sha256) == 64): 39 | raise ValueError("SHA256 must be 64 chars: {}".format(sha256)) 40 | sha256_actual = sha256sum(filename, blocksize) 41 | return sha256_actual == sha256 42 | 43 | 44 | def write_jsonfile(data: object, json_outfile: str) -> None: 45 | """Write a json object to a file with lazy formatting.""" 46 | with open(json_outfile, 'w') as outfile: 47 | json.dump(data, outfile, indent=4) 48 | 49 | 50 | def read_jsonfile(json_infile: str) -> dict: 51 | """Read in a json file and return the json object.""" 52 | if not 
os.path.isfile(json_infile): 53 | return {} 54 | with open(json_infile) as infile: 55 | data = json.load(infile) 56 | return data 57 | 58 | 59 | class TypeEncoder(json.JSONEncoder): 60 | """Custom json encoder to support types as values.""" 61 | 62 | def default(self, obj): 63 | """Add support if value is a type.""" 64 | if isinstance(obj, type): 65 | return {'__type__': {'module': inspect.getmodule(obj).__name__, 66 | 'type': obj.__name__} 67 | } 68 | # Let the base class default method raise the TypeError 69 | return json.JSONEncoder.default(self, obj) 70 | 71 | 72 | class TypeDecoder(json.JSONDecoder): 73 | """Custom json decoder to support types as values.""" 74 | 75 | def __init__(self, *args, **kwargs): 76 | """Simple json decoder handling types as values.""" 77 | json.JSONDecoder.__init__(self, object_hook=self.object_hook, *args, **kwargs) 78 | 79 | def object_hook(self, obj): 80 | """Handle types.""" 81 | if '__type__' not in obj: 82 | return obj 83 | module = obj['__type__']['module'] 84 | type_ = obj['__type__']['type'] 85 | if module == 'builtins': 86 | return getattr(__builtins__, type_) 87 | loaded_module = importlib.import_module(module) 88 | return getattr(loaded_module, type_) 89 | 90 | 91 | def enough_disk_space(destination: str, 92 | required: int, 93 | margin: float = 0.10): 94 | """Check disk has enough space. 95 | 96 | :param destination: folder to check 97 | :param required: space required in bytes 98 | :param margin: percent of free disk space once file is written 99 | """ 100 | total, _, free = shutil.disk_usage(destination) 101 | return free - required > margin * total 102 | 103 | 104 | def get_disk_space(destination: str): 105 | """Obtain size of free disk space. 106 | 107 | :param destination: folder to check 108 | """ 109 | _, _, free = shutil.disk_usage(destination) 110 | return free 111 | 112 | 113 | def get_downloaded_versions(root_folder: str = None) -> dict: 114 | """Identify versions of the downloaded Papyrus data 115 | 116 | :param root_folder: folder containing the bioactivity dataset (default: pystow's home folder) 117 | """ 118 | if root_folder is not None: 119 | os.environ['PYSTOW_HOME'] = os.path.abspath(root_folder) 120 | version_json = pystow.join('papyrus', name='versions.json').as_posix() 121 | return read_jsonfile(version_json) 122 | 123 | 124 | def get_downloaded_papyrus_files(root_folder: str = None) -> pd.DataFrame: 125 | """Identify downloaded files for each version of the Papyrus data 126 | 127 | :param root_folder: folder containing the bioactivity dataset (default: pystow's home folder) 128 | """ 129 | # Obtain versions downloaded 130 | downloaded_versions = get_downloaded_versions(root_folder) 131 | # Obtain filenames that could have been downloaded 132 | files = get_papyrus_links(offline=True) 133 | # Keep only file names 134 | file_info = namedtuple('file_info', ('version', 'short_name', 'file_name')) 135 | files = [file_info(version, file, file_data['name']) 136 | for version in downloaded_versions 137 | for file, file_data in files[version].items() 138 | if file in ['papyrus++', '2D_papyrus', '3D_papyrus', '2D_structures', '3D_structures', 139 | '2D_fingerprint', '3D_fingerprint', '2D_mordred', '3D_mordred', 140 | '2D_cddd', '2D_mold2', 'proteins', 'proteins_unirep', 'proteins_prodec']] 141 | # Try to locate files 142 | # Uses glob to prevent maintaining a mapping of subfolders and file names 143 | # This does not check files have been downloaded in the right subfolders 144 | data = pd.DataFrame([{'version': file.version, 145 | 
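# 'downloaded' (just below) is True when at least one file with the expected name is
# found anywhere under this version's folder via the recursive glob mentioned above.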
'short_name': file.short_name, 146 | 'downloaded': len(glob.glob( 147 | os.path.join(pystow.module('papyrus', file.version).base.as_posix(), '**', 148 | file.file_name), recursive=True)) > 0} 149 | for file in files]) 150 | return data 151 | 152 | 153 | def get_latest_downloaded_version(root_folder: str = None) -> List[str]: 154 | """Identify the latest version of the downloaded Papyrus data 155 | 156 | :param root_folder: folder containing the bioactivity dataset (default: pystow's home folder) 157 | """ 158 | if root_folder is not None: 159 | os.environ['PYSTOW_HOME'] = os.path.abspath(root_folder) 160 | version_json = pystow.join('papyrus', name='versions.json').as_posix() 161 | versions = read_jsonfile(version_json) 162 | return sorted(versions, key=lambda s: [int(u) for u in s.split('.')])[-1] 163 | 164 | 165 | def get_online_versions() -> List[str]: 166 | """Identify the versions of the Papyrus data available online 167 | 168 | :return: a list of the versions available 169 | """ 170 | papyrus_links = get_papyrus_links() 171 | return sorted(papyrus_links.keys(), key=lambda s: [int(u) for u in s.split('.')]) + ['latest'] 172 | 173 | 174 | def process_data_version(version: str | PapyrusVersion, root_folder: str = None): 175 | """Confirm the version is available, downloaded and convert synonyms. 176 | 177 | :param version: version to be confirmed and/or converted. 178 | :param root_folder: folder containing the bioactivity dataset (default: pystow's home folder) 179 | :return: version number 180 | :raises: IOError is the version is not available 181 | """ 182 | # Check if aliases 183 | if not isinstance(version, PapyrusVersion): 184 | version = PapyrusVersion(version=version) 185 | # Handle exceptions 186 | available_versions = get_downloaded_versions(root_folder) 187 | if len(available_versions) == 0: 188 | raise IOError('Papyrus data not available (did you download it first?)') 189 | else: 190 | available_versions += ['latest'] 191 | if version.version_old_fmt not in available_versions: 192 | raise ValueError(f'version can only be one of [{", ".join(available_versions)}] not {version.version_old_fmt}') 193 | elif version == 'latest': 194 | version = get_latest_downloaded_version(root_folder) 195 | return version 196 | 197 | 198 | def is_local_version_available(version: str, root_folder: str = None): 199 | """Confirm the version is available and downloaded 200 | 201 | :param version: version to check the local availability. 
202 | :param root_folder: folder containing the bioactivity dataset (default: pystow's home folder) 203 | :return: True if the version is available locally, False otherwise 204 | """ 205 | try: 206 | _ = process_data_version(version=version, root_folder=root_folder) 207 | return True 208 | except (IOError, ValueError): 209 | return False 210 | 211 | def locate_file(dirpath: str, regex_pattern: str): 212 | """Find file(s) matching the given pattern in the given directory 213 | 214 | :param dirpath: Path to the directory to obtain the file from 215 | :param regex_pattern: Pattern used to locate the file(s) 216 | :return: a list of files matching the pattern and in the given directory 217 | """ 218 | # Handle exceptions 219 | if not os.path.isdir(dirpath): 220 | raise NotADirectoryError(f'Directory does not exist: {dirpath}') 221 | # Find the file 222 | filenames = [os.path.join(dirpath, fname) for fname in os.listdir(dirpath) if re.search(regex_pattern, fname)] 223 | # Handle WSL ZoneIdentifier files 224 | filenames = [fname for fname in filenames if not fname.endswith(':ZoneIdentifier')] 225 | if len(filenames) == 0: 226 | raise FileNotFoundError(f'Could not locate a file in {dirpath} matching {regex_pattern}') 227 | return filenames 228 | 229 | 230 | def get_num_rows_in_file(filetype: str, is3D: bool, descriptor_name: Optional[str] = None, 231 | version: str | PapyrusVersion = 'latest', 232 | plusplus: bool = True, root_folder: Optional[str] = None) -> int: 233 | """Get the number of rows a Papyrus file has. 234 | 235 | 236 | :param filetype: Type of file, one of {'bioactivities', 'structures', 'descriptors'} 237 | :param is3D: Whether to consider the standardised (2D) or non-standardised (3D) data 238 | :param descriptor_name: Name of the descriptor, one of {'cddd', 'mold2', 'mordred', 'fingerprint'}, 239 | only considered if type='descriptors'. 240 | :param version: Version of Papyrus to be considered 241 | :param plusplus: If bioactivities come from the Papyrus++ very high quality curated set, 242 | only considered if type='bioactivitities'. 
243 |     :param root_folder: folder containing the bioactivity dataset (default: pystow's home folder)
244 |     :return: The number of lines in the corresponding file
245 |     """
246 |     if filetype not in ['bioactivities', 'structures', 'descriptors']:
247 |         raise ValueError('filetype must be one of [\'bioactivities\', \'structures\', \'descriptors\']')
248 |     if filetype == 'descriptors' and (
249 |             descriptor_name is None or descriptor_name not in ['cddd', 'mold2', 'mordred', 'fingerprint']):
250 |         raise ValueError('descriptor_name must be one of [\'cddd\', \'mold2\', \'mordred\', \'fingerprint\']')
251 |     # Process version shortcuts
252 |     version = process_data_version(version=version, root_folder=root_folder)
253 |     if root_folder is not None:
254 |         os.environ['PYSTOW_HOME'] = os.path.abspath(root_folder)
255 |     json_file = pystow.join('papyrus', version.version_old_fmt, name='data_size.json').as_posix()
256 |     # Obtain file sizes (number of lines)
257 |     sizes = read_jsonfile(json_file)
258 |     if filetype == 'bioactivities':
259 |         if plusplus:
260 |             if 'papyrus_++' in sizes.keys():
261 |                 return sizes['papyrus_++']
262 |             else:
263 |                 return sizes['papyrus++']
264 |         return sizes['papyrus_3D'] if is3D else sizes['papyrus_2D']
265 |     elif filetype == 'structures':
266 |         return sizes['structures_3D'] if is3D else sizes['structures_2D']
267 |     elif filetype == 'descriptors':
268 |         if descriptor_name == 'cddd':
269 |             return sizes['cddd']
270 |         elif descriptor_name == 'mold2':
271 |             return sizes['mold2']
272 |         elif descriptor_name == 'fingerprint':
273 |             return sizes['E3FP'] if is3D else sizes['ECFP6']
274 |         elif descriptor_name == 'mordred':
275 |             return sizes['mordred_3D'] if is3D else sizes['mordred_2D']
276 | 
277 | 
278 | def get_papyrus_links(offline: bool = False):
279 |     """Obtain the latest links to Papyrus data files from GitHub.
280 | 
281 |     If the connection to the GitHub server is made, the
282 |     local version of the file is updated.
283 |     Otherwise, defaults to the local version of the file.
284 | 
285 |     :param offline: do not attempt to download the latest file from GitHub
286 |     """
287 |     local_file = os.path.join(os.path.dirname(__file__), 'links.json')
288 |     if not offline:
289 |         url = "https://raw.githubusercontent.com/OlivierBeq/Papyrus-scripts/db-links/links.json"
290 |         session = requests.session()
291 |         try:
292 |             res = session.get(url, verify=True)
293 |             with open(local_file, 'w') as oh:
294 |                 oh.write(res.text)
295 |         except requests.exceptions.ConnectionError as e:
296 |             pass
297 |     with open(local_file) as fh:
298 |         data = json.load(fh)
299 |     return data
300 | 
301 | 
302 | def get_papyrus_aliases(offline: bool = False):
303 |     """Obtain the latest aliases of the Papyrus versions from GitHub.
304 | 
305 |     If the connection to the GitHub server is made, the
306 |     local version of the file is updated.
307 |     Otherwise, defaults to the local version of the file.
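The link metadata shown at the top of this section (name, url, size, sha256) is what get_papyrus_links above returns; it pairs with the checksum helpers defined earlier in this module. Below is a minimal, illustrative sketch of verifying a downloaded archive; the version key '05.7' and the local file path are assumptions, not values taken from the code.

```python
# Illustrative sketch (not part of the library): verify a downloaded Papyrus
# archive against the sha256 recorded in links.json.
from papyrus_scripts.utils.IO import assert_sha256sum, get_papyrus_links

links = get_papyrus_links(offline=True)   # fall back to the bundled links.json
entry = links['05.7']['proteins']         # assumed version key; entry holds name, url, size, sha256
local_path = '/path/to/05.7_combined_set_protein_targets.tsv.xz'  # hypothetical path

if not assert_sha256sum(local_path, entry['sha256']):
    print('Checksum mismatch: the file is corrupt or incomplete, re-download it.')
```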
308 | 309 | :param offline: do not attempt to download the latest file from GitHub 310 | """ 311 | local_file = os.path.join(os.path.dirname(__file__), 'aliases.json') 312 | if not offline: 313 | url = "https://raw.githubusercontent.com/OlivierBeq/Papyrus-scripts/db-links/aliases.json" 314 | session = requests.session() 315 | try: 316 | res = session.get(url, verify=True) 317 | with open(local_file, 'w') as oh: 318 | oh.write(res.text) 319 | except requests.exceptions.ConnectionError as e: 320 | pass 321 | data = pd.read_json(local_file, orient='split', dtype={'version': 'str', 'alias': 'str', 322 | 'revision': 'str', 'chembl_version': 'str'}) 323 | return data 324 | 325 | 326 | def convert_xz_to_gz(input_file: str, output_file: str, 327 | compression_level: int = 9, 328 | progress: bool = False) -> None: 329 | """Convert a LZMA-compressed xz file to a GZIP-compressed file. 330 | 331 | :param input_file: Path of the input file 332 | :param output_file: Path of the output file 333 | :param compression_level: Compression level of the output file (if None, defaults to 9) 334 | :param progress: Show conversion progress. 335 | """ 336 | if compression_level is None: 337 | compression_level = 9 338 | # Transform per chunk 339 | chunksize = 10 * 1048576 # 10 MB 340 | with lzma.open(input_file, 'rb') as fh, gzip.open(output_file, 'wb', compresslevel=compression_level) as oh: 341 | if progress: 342 | pbar = tqdm(desc='Determining size', unit='B', unit_scale=True) 343 | size = fh.seek(0, 2) # Determine original size 344 | _ = fh.seek(0, 0) # Go back to the beginning 345 | pbar.set_description('Converting') 346 | pbar.total = size 347 | # pbar = tqdm(total=size, desc='Converting', unit='B', unit_scale=True) 348 | while True: 349 | chunk = fh.read(chunksize) 350 | if not chunk: 351 | if progress: 352 | pbar.close() 353 | break 354 | written = oh.write(chunk) 355 | if progress: 356 | pbar.update(written) 357 | 358 | 359 | def convert_gz_to_xz(input_file: str, output_file: str, 360 | compression_level: int = lzma.PRESET_DEFAULT, 361 | extreme: bool = False, 362 | progress: bool = False) -> None: 363 | """Convert a GZIP- compressed file to a LZMA-compressed xz file. 364 | 365 | :param input_file: Path of the input file 366 | :param output_file: Path of the output file 367 | :param compression_level: Compression level of the output file (if None, defaults to 6) 368 | :param extreme: Should extreme compression be toggled on top of the compression level 369 | :param progress: Show conversion progress. 
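A minimal round-trip sketch of these two conversion helpers follows; both archive names are hypothetical placeholders.

```python
# Illustrative sketch: re-compress a Papyrus .xz archive as .gz and back.
# Both file names are hypothetical placeholders.
from papyrus_scripts.utils.IO import convert_gz_to_xz, convert_xz_to_gz

convert_xz_to_gz('bioactivities.tsv.xz', 'bioactivities.tsv.gz',
                 compression_level=6, progress=True)
convert_gz_to_xz('bioactivities.tsv.gz', 'bioactivities_roundtrip.tsv.xz',
                 compression_level=None,  # None falls back to lzma.PRESET_DEFAULT
                 extreme=False, progress=False)
```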
370 | """ 371 | if compression_level is None: 372 | compression_level = lzma.PRESET_DEFAULT 373 | preset = compression_level | lzma.PRESET_EXTREME if extreme else compression_level 374 | # Transform per chunk 375 | chunksize = 10 * 1048576 # 10 MB 376 | with gzip.open(input_file, 'rb') as fh, lzma.open(output_file, 'wb', preset=preset) as oh: 377 | if progress: 378 | pbar = tqdm(desc='Determining size', unit='B', unit_scale=True) 379 | size = fh.seek(0, 2) # Determine original size 380 | _ = fh.seek(0, 0) # Go back to the beginning 381 | pbar.set_description('Converting') 382 | pbar.total = size 383 | # pbar = tqdm(total=size, desc='Converting', unit='B', unit_scale=True) 384 | while True: 385 | chunk = fh.read(chunksize) 386 | if not chunk: 387 | if progress: 388 | pbar.close() 389 | break 390 | written = oh.write(chunk) 391 | if progress: 392 | pbar.update(written) 393 | 394 | 395 | class PapyrusVersion: 396 | 397 | aliases = get_papyrus_aliases(offline=True) 398 | 399 | def __init__(self, version: Optional[str] = None, chembl_version: Optional[int] = None, 400 | chembl: Optional[bool] = None, excape: Optional[bool] = None, 401 | sharma: Optional[bool] = None, christmann: Optional[bool] = None, 402 | klaeger: Optional[bool] = None, merget: Optional[bool] = None, 403 | pickett: Optional[bool] = None): 404 | """Determine the Papyrus version based on provided information. 405 | 406 | :param version: Version number (either older '05.4', or new format '2022.04') 407 | :param chembl_version: Version of ChEMBL to select the Papyrus version from 408 | :param chembl: Whether ChEMBL is included in the Papyrus version to select 409 | :param excape: Whether ExCAPED-DB is included in the Papyrus version to select 410 | :param sharma: Whether the Sharma et al. dataset is included in the Papyrus version to select 411 | :param christmann: Whether the Christmann-Franck et al. dataset is included in the Papyrus version to select 412 | :param klaeger: Whether the Klaeger et al. dataset is included in the Papyrus version to select 413 | :param merget: Whether the Merget et al. dataset is included in the Papyrus version to select 414 | :param pickett: Whether the Pickett et al. 
dataset is included in the Papyrus version to select 415 | """ 416 | # Determine version from the given version name 417 | if version is not None: 418 | if version.lower() == 'latest': 419 | query = 'alias == alias.max()' 420 | else: 421 | query = f'version == "{version}" or alias == "{version.strip()}"' 422 | else: 423 | # Determine version from sources 424 | query = [] 425 | if chembl: 426 | query.append('chembl') 427 | if excape: 428 | query.append('excape') 429 | if sharma: 430 | query.append('sharma') 431 | if christmann: 432 | query.append('christmann') 433 | if klaeger: 434 | query.append('klaeger') 435 | if merget: 436 | query.append('merget') 437 | if pickett: 438 | query.append('pickett') 439 | if chembl_version: 440 | query.append(f'chembl_version == "{chembl_version}"') 441 | query = " and ".join(query) 442 | # Identify the aliases matching the query 443 | if len(query): 444 | subset = self.aliases.query(query) 445 | else: 446 | subset = self.aliases 447 | if subset.empty: 448 | raise ValueError('None of the Papyrus versions match the provided information.') 449 | elif len(subset) > 1: 450 | raise ValueError(f'The provided information match multiple versions:\n\n' + 451 | str(subset.set_index('version')) + 452 | '\n\nChoose the version that matches your requirements.') 453 | else: 454 | params = subset.squeeze().to_dict() 455 | for key, value in params.items(): 456 | if key == 'version': 457 | setattr(self, 'version_old_fmt', value) 458 | elif key == 'alias': 459 | setattr(self, 'version', value) 460 | else: 461 | setattr(self, key, value) 462 | 463 | def __repr__(self): 464 | return f'' 465 | -------------------------------------------------------------------------------- /src/papyrus_scripts/neuralnet.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import time 5 | import random 6 | import itertools 7 | from typing import Iterator, List, Optional, Union 8 | 9 | import numpy as np 10 | import pandas as pd 11 | from pandas.io.parsers import TextFileReader as PandasTextFileReader 12 | 13 | try: 14 | import torch as T 15 | from torch import nn, optim 16 | from torch.nn import functional as F 17 | from torch.utils.data import DataLoader, TensorDataset, IterableDataset as PandasIterableDataset 18 | except ImportError as e: 19 | T = e 20 | nn = e 21 | # Placeholders 22 | T.Tensor = int 23 | nn.Module = list 24 | PandasIterableDataset = int 25 | 26 | 27 | def cuda(var: nn.Module): 28 | """Move model parameters and buffers to GPU if a GPU is available. 29 | 30 | Originates from Xuhan Liu's DrugEx version 1 (https://github.com/XuhanLiu/DrugEx/tree/1.0) 31 | 32 | :param var: torch.nn.Module derived class to be trained on GPU (or CPU if not GPU available) 33 | """ 34 | if T.cuda.is_available(): 35 | return var.cuda() 36 | return var 37 | 38 | 39 | def Variable(tensor: Union[T.Tensor, np.ndarray, List]): 40 | """Transform a list or numpy array into a pytorch tensor on GPU (if available). 41 | 42 | Originates from Xuhan Liu's DrugEx version 1 (https://github.com/XuhanLiu/DrugEx/tree/1.0) 43 | Original documentation: Wrapper for torch.autograd.Variable that also accepts 44 | numpy arrays directly and automatically assigns it to 45 | the GPU. Be aware in some cases operations are better 46 | left to the CPU. 
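A short sketch of how cuda and Variable are typically combined; the toy layer and random batch below are arbitrary examples, not library defaults.

```python
# Illustrative sketch: wrap a NumPy batch and a small layer, moving both to the
# GPU only when one is available (CPU otherwise).
import numpy as np
import torch as T
from papyrus_scripts.neuralnet import Variable, cuda

layer = cuda(T.nn.Linear(16, 1))                            # arbitrary toy layer
batch = Variable(np.random.rand(8, 16).astype(np.float32))  # random batch of 8 samples
out = layer(batch)                                          # tensor of shape (8, 1)
```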
47 | :param tensor: the list, numpy array or pytorch tensor to be sent to GPU (if available) 48 | """ 49 | if isinstance(tensor, np.ndarray): 50 | tensor = T.from_numpy(tensor) 51 | if isinstance(tensor, list): 52 | tensor = T.Tensor(tensor) 53 | return cuda(T.autograd.Variable(tensor)) 54 | 55 | 56 | def set_seed(seed: Optional[int] = None) -> Optional[np.random.Generator]: 57 | """Set the internal seed of rnadom number generators for reproducibility.""" 58 | if seed is None: 59 | return 60 | T.manual_seed(seed) 61 | T.cuda.manual_seed_all(seed) 62 | T.cuda.manual_seed(seed) 63 | rng = np.random.default_rng(seed) 64 | random.seed(seed) 65 | T.backends.cudnn.deterministic = True 66 | T.backends.cudnn.benchmark = False 67 | return rng 68 | 69 | 70 | class BaseNN(nn.Module): 71 | def __init__(self, out: str, epochs: int = 100, lr: float = 1e-3, 72 | early_stop: int = 100, batch_size: int = 1024, dropout: float = 0.25, 73 | random_seed: Optional[int] = None): 74 | """Base class for neural networks. 75 | 76 | Architecture is derived from https://doi.org/10.1186/s13321-017-0232-0 77 | 78 | :param out: output folder 79 | :param epochs: number of epochs 80 | :param lr: learning rate 81 | :param early_stop: stop after these many epochs without any decrease of loss 82 | :param batch_size: size of data batches 83 | :param dropout: fraction of randomly disabled neurons at each epoch during training 84 | :param random_seed: seed of random number generators 85 | """ 86 | if isinstance(T, ImportError): 87 | raise ImportError('Some required dependencies are missing:\n\tpytorch') 88 | if not os.path.isdir(out): 89 | os.makedirs(out, exist_ok=True) 90 | super().__init__() 91 | self.fcl = nn.ModuleList() # fully connected layers 92 | self.out = out 93 | self.batch_size = batch_size 94 | self.epochs = epochs 95 | self.lr = lr 96 | self.early_stop = early_stop 97 | self.dropout = dropout 98 | self.rng = set_seed(random_seed) 99 | 100 | def set_validation(self, X: Union[Iterator, pd.DataFrame], y: Union[Iterator, pd.Series]): 101 | """Set the validation set to be used during fitting. 102 | 103 | :param X: features to predict y from 104 | :param y: feature to be predicted (dependent variable) 105 | """ 106 | if not isinstance(X, (pd.DataFrame, np.ndarray)) and type(X) != type(y): 107 | raise ValueError('X and y must have the same type (i.e. either Iterator or pandas dataframe)') 108 | # Get data loaders 109 | if isinstance(X, (pd.DataFrame, np.ndarray)): 110 | self.loader_valid = loader_from_dataframe(X, y, batch_size=self.batch_size) 111 | else: 112 | self.loader_valid = loader_from_iterator(X, y, batch_size=self.batch_size) 113 | 114 | def set_architecture(self, dimensions: List[int]): 115 | """Define the size of each fully connected linear hidden layer 116 | 117 | :param dimensions: dimensions of the layers 118 | """ 119 | for i in range(len(dimensions) - 1): 120 | self.fcl.append(nn.Linear(dimensions[i], dimensions[i + 1])) 121 | T.save(self.state_dict(), os.path.join(self.out, 'empty_model.pkg')) 122 | 123 | def reset(self): 124 | """Reset weights and reload the initial state of the model""" 125 | self.load_state_dict(T.load(os.path.join(self.out, 'empty_model.pkg'))) 126 | 127 | def fit(self, X: Union[Iterator, pd.DataFrame], y: Union[Iterator, pd.Series]): 128 | """Fit neural network with training set and optimize for loss on validation set. 
129 | 130 | :param X: features to predict y from 131 | :param y: feature to be predicted (dependent variable) 132 | """ 133 | if not self.fcl: 134 | raise ValueError('set architecture before fitting') 135 | if not isinstance(X, (pd.DataFrame, np.ndarray)) and type(X) != type(y): 136 | raise ValueError('X and y must have the same type (i.e. either Iterator or pandas dataframe)') 137 | # Set number of classes 138 | self.classes_ = sorted(set(y)) 139 | # Get data loaders 140 | if isinstance(X, (pd.DataFrame, np.ndarray)): 141 | loader_train = loader_from_dataframe(X, y, batch_size=self.batch_size) 142 | else: 143 | loader_train = loader_from_iterator(X, y, batch_size=self.batch_size) 144 | # Set optimizer 145 | if 'optim' in self.__dict__: 146 | optimizer = self.optim 147 | else: 148 | optimizer = optim.Adam(self.parameters(), lr=self.lr) 149 | best_loss = np.inf 150 | last_save = 0 151 | # Set up output folder 152 | if not (os.path.exists(self.out) and os.path.isdir(self.out)): 153 | os.mkdir(self.out) 154 | # Log file 155 | log = open(os.path.join(self.out, 'training_log.txt'), 'w') 156 | for epoch in range(self.epochs): 157 | t0 = time.perf_counter() 158 | # Change learning rate according to epoch 159 | for param_group in optimizer.param_groups: 160 | param_group['lr'] = self.lr * (1 - 1 / self.epochs) ** (epoch * 10) 161 | # Train epoch over all batches 162 | for i, (Xb, yb) in enumerate(loader_train): 163 | Xb, yb = Variable(Xb), Variable(yb) 164 | optimizer.zero_grad() 165 | y_ = self.forward(Xb, istrain=True) 166 | ix = yb == yb 167 | yb, y_ = yb[ix], y_[ix] 168 | loss = self.criterion(y_, yb) 169 | loss.backward() 170 | optimizer.step() 171 | # Calculate loss and log 172 | loss_valid = self.evaluate(self.loader_valid) 173 | print(f'[Epoch: {epoch + 1}/{self.epochs}] {time.perf_counter() - t0:.1f}s ' 174 | f'loss_train: {loss.item():f} loss_valid: {loss_valid:f}', file=log, flush=True) 175 | if loss_valid < best_loss: 176 | T.save(self.state_dict(), os.path.join(self.out, 'model.pkg')) 177 | print(f'[Performance] loss_valid improved from {best_loss:f} to {loss_valid:f}, ' 178 | 'Saved model to model.pkg', file=log, flush=True) 179 | best_loss = loss_valid 180 | last_save = epoch 181 | else: 182 | print('[Performance] loss_valid did not improve.', file=log, flush=True) 183 | # Early stop if no improvement for some time 184 | if epoch - last_save > self.early_stop: 185 | break 186 | log.close() 187 | self.load_state_dict(T.load(os.path.join(self.out, 'model.pkg'))) 188 | 189 | def evaluate(self, loader): 190 | """Calculate loss according to criterion function 191 | 192 | :param loader: data loader of the validation set 193 | """ 194 | loss = 0 195 | for Xb, yb in loader: 196 | Xb, yb = Variable(Xb), Variable(yb) 197 | y_ = self.forward(Xb) 198 | ix = yb == yb 199 | yb, y_ = yb[ix], y_[ix] 200 | loss += self.criterion(y_, yb).item() 201 | return loss / len(loader) 202 | 203 | def predict(self, X: Union[pd.DataFrame, np.ndarray]): 204 | """Predict outcome for the incoming data 205 | 206 | :param X: features to predict the endpoint(s) from 207 | """ 208 | if not isinstance(X, (pd.DataFrame, np.ndarray)): 209 | raise ValueError('X must be either a numpy array or a pandas dataframe') 210 | if isinstance(X, pd.DataFrame): 211 | y = X.iloc[:, 0] 212 | else: 213 | y = X[:, 0] 214 | loader = loader_from_dataframe(X, y, self.batch_size) 215 | score = [] 216 | for Xb, _ in loader: 217 | Xb = Variable(Xb) 218 | y_ = self.forward(Xb) 219 | score.append(y_.cpu().data) 220 | return T.cat(score, 
dim=0).numpy() 221 | 222 | 223 | class SingleTaskNNClassifier(BaseNN): 224 | def __init__(self, out: str, epochs: int = 100, lr: float = 1e-3, 225 | early_stop: int = 100, batch_size: int = 1024, dropout: float = 0.25, 226 | random_seed: Optional[int] = None): 227 | """Neural Network classifier to predict a unique endpoint. 228 | 229 | Architecture is derived from https://doi.org/10.1186/s13321-017-0232-0 230 | 231 | :param out: output folder 232 | :param epochs: number of epochs 233 | :param lr: learning rate 234 | :param early_stop: stop after these many epochs without any decrease of loss 235 | :param batch_size: size of data batches 236 | :param dropout: fraction of randomly disabled neurons at each epoch during training 237 | :param random_seed: seed of random number generators 238 | """ 239 | super(SingleTaskNNClassifier, self).__init__(out, epochs, lr, early_stop, batch_size, dropout, random_seed) 240 | self.dropoutl = nn.Dropout(self.dropout) 241 | # Consider binary classification as default 242 | self.criterion = nn.BCELoss() 243 | self.activation = nn.Sigmoid() 244 | 245 | def set_architecture(self, n_dim: int, n_class: int): 246 | """Set dimension of input and number of classes to be predicted. 247 | 248 | :param n_dim: number of input parameters 249 | :param n_class: number of one-hot encoded classes (i.e. 1 for binary endpoint not one-hot encoded) 250 | """ 251 | if n_class < 1: 252 | raise ValueError('can only perform binary (n_class=1 or n_class=2)' 253 | ' or multi-classes predictions (n_class>2)') 254 | super().set_architecture([n_dim, 8000, 4000, 2000, n_class]) 255 | self._n_classes_ = n_class 256 | self._n_features_in_ = n_dim 257 | if n_class == 1: 258 | self.criterion = nn.BCELoss() 259 | self.activation = nn.Sigmoid() 260 | else: 261 | self.criterion = nn.CrossEntropyLoss() 262 | self.activation = nn.Softmax() 263 | cuda(self) 264 | 265 | def forward(self, X, istrain=False): 266 | """Calculate model output from input data. 267 | 268 | :param X: input data 269 | :param istrain: whether called during training, to activate dropout 270 | """ 271 | input = X 272 | for layer in self.fcl[:-1]: 273 | input = F.relu(layer(input)) 274 | if istrain: 275 | input = self.dropoutl(input) 276 | return self.activation(self.fcl[-1](input)) 277 | 278 | def predict_proba(self, X): 279 | """Predict class probabilities for the incoming data 280 | 281 | :param X: features to predict the endpoint probabilities from 282 | """ 283 | y = super().predict(X) 284 | return y 285 | 286 | def predict(self, X): 287 | """Predict classes for the incoming data 288 | 289 | :param X: features to predict the endpoint(s) from 290 | """ 291 | probas = self.predict_proba(X) 292 | return np.round(probas) 293 | 294 | 295 | class SingleTaskNNRegressor(BaseNN): 296 | def __init__(self, out: str, epochs: int = 100, lr: float = 1e-3, 297 | early_stop: int = 100, batch_size: int = 1024, dropout: float = 0.25, 298 | random_seed: Optional[int] = None): 299 | """Neural Network regressor to predict a unique endpoint. 
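To make the intended workflow of the classifier above concrete, here is a hedged, self-contained sketch on random data; the output folder name, split sizes and epoch count are arbitrary choices, not library defaults.

```python
# Illustrative sketch: binary classification with SingleTaskNNClassifier on random data.
import numpy as np
import pandas as pd
from papyrus_scripts.neuralnet import SingleTaskNNClassifier

rng = np.random.default_rng(42)
X = pd.DataFrame(rng.random((256, 128), dtype=np.float32))  # 256 samples, 128 features
y = pd.Series(rng.integers(0, 2, 256).astype(np.float32))   # binary endpoint

model = SingleTaskNNClassifier(out='nn_output', epochs=5, batch_size=64, random_seed=42)
model.set_architecture(n_dim=128, n_class=1)    # binary endpoint, not one-hot encoded
model.set_validation(X.iloc[:64], y.iloc[:64])  # held-out validation chunk
model.fit(X.iloc[64:], y.iloc[64:])
probabilities = model.predict_proba(X.iloc[:10])  # values in [0, 1]
labels = model.predict(X.iloc[:10])               # rounded to 0 or 1
```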
300 | 301 | Architecture is adapted from https://doi.org/10.1186/s13321-017-0232-0 for regression 302 | 303 | :param out: output folder 304 | :param epochs: number of epochs 305 | :param lr: learning rate 306 | :param early_stop: stop after these many epochs without any decrease of loss 307 | :param batch_size: size of data batches 308 | :param dropout: fraction of randomly disabled neurons at each epoch during training 309 | :param random_seed: seed of random number generators 310 | """ 311 | super(SingleTaskNNRegressor, self).__init__(out, epochs, lr, early_stop, batch_size, dropout, random_seed) 312 | self.dropoutl = nn.Dropout(self.dropout) 313 | self.criterion = nn.MSELoss() 314 | 315 | def set_architecture(self, n_dim: int): 316 | """Set dimension of input. 317 | 318 | :param n_dim: number of input parameters 319 | """ 320 | super().set_architecture([n_dim, 8000, 4000, 2000, 1]) 321 | cuda(self) 322 | 323 | def forward(self, X, istrain=False): 324 | """Calculate model output from input data. 325 | 326 | :param X: input data 327 | :param istrain: whether called during training, to activate dropout 328 | """ 329 | input = X 330 | for layer in self.fcl[:-1]: 331 | input = F.relu(layer(input)) 332 | if istrain: 333 | input = self.dropoutl(input) 334 | return self.fcl[-1](input) 335 | 336 | 337 | class MultiTaskNNClassifier(BaseNN): 338 | def __init__(self, out: str, epochs: int = 100, lr: float = 1e-3, 339 | early_stop: int = 100, batch_size: int = 1024, dropout: float = 0.25, 340 | random_seed: Optional[int] = None): 341 | """Neural Network classifier to predict multiple endpoints. 342 | 343 | Architecture is derived from https://doi.org/10.1186/s13321-017-0232-0 344 | 345 | :param out: output folder 346 | :param epochs: number of epochs 347 | :param lr: learning rate 348 | :param early_stop: stop after these many epochs without any decrease of loss 349 | :param batch_size: size of data batches 350 | :param dropout: fraction of randomly disabled neurons at each epoch during training 351 | :param random_seed: seed of random number generators 352 | """ 353 | super(MultiTaskNNClassifier, self).__init__(out, epochs, lr, early_stop, batch_size, dropout, random_seed) 354 | self.criterion = nn.BCELoss() 355 | self.activation = nn.Sigmoid() 356 | self.dropoutl = nn.Dropout(self.dropout) 357 | 358 | def set_architecture(self, n_dim: int, n_task: int): 359 | """Set dimension of input and number of classes to be predicted. 360 | 361 | :param n_dim: number of input parameters 362 | :param n_task: number of tasks to be predicted at the same time 363 | """ 364 | if n_task < 2: 365 | raise ValueError('use SingleTaskNNClassifier for a single task') 366 | super().set_architecture([n_dim, 8000, 4000, 2000, n_task]) 367 | cuda(self) 368 | 369 | def forward(self, X, istrain=False): 370 | """Calculate model output from input data. 
371 | 372 | :param X: input data 373 | :param istrain: whether called during training, to activate dropout 374 | """ 375 | input = X 376 | for layer in self.fcl[:-1]: 377 | input = F.relu(layer(input)) 378 | if istrain: 379 | input = self.dropoutl(input) 380 | return self.activation(self.fcl[-1](input)) 381 | 382 | def predict_proba(self, X): 383 | """Predict class probabilities for the incoming data 384 | 385 | :param X: features to predict the endpoint probabilities from 386 | """ 387 | y = super().predict(X) 388 | return y 389 | 390 | def predict(self, X): 391 | """Predict classes for the incoming data 392 | 393 | :param X: features to predict the endpoint(s) from 394 | """ 395 | probas = self.predict_proba(X) 396 | return np.round(probas) 397 | 398 | 399 | class MultiTaskNNRegressor(BaseNN): 400 | def __init__(self, out: str, epochs: int = 100, lr: float = 1e-3, 401 | early_stop: int = 100, batch_size: int = 1024, dropout: float = 0.25, 402 | random_seed: Optional[int] = None): 403 | """Neural Network regressor to predict multiple endpoints. 404 | 405 | Architecture is adapted from https://doi.org/10.1186/s13321-017-0232-0 for multi-task regression 406 | 407 | :param out: output folder 408 | :param epochs: number of epochs 409 | :param lr: learning rate 410 | :param early_stop: stop after these many epochs without any decrease of loss 411 | :param batch_size: size of data batches 412 | :param dropout: fraction of randomly disabled neurons at each epoch during training 413 | :param random_seed: seed of random number generators 414 | """ 415 | super(MultiTaskNNRegressor, self).__init__(out, epochs, lr, early_stop, batch_size, dropout, random_seed) 416 | self.dropoutl = nn.Dropout(self.dropout) 417 | self.criterion = nn.MSELoss() 418 | 419 | def set_architecture(self, n_dim: int, n_task: int): 420 | """Set dimension of input. 421 | 422 | :param n_dim: number of input parameters 423 | :param n_task: number of tasks to be predicted at the same time 424 | """ 425 | if n_task < 2: 426 | raise ValueError('use SingleTaskNNRegressor for a single task') 427 | super().set_architecture([n_dim, 8000, 4000, 2000, n_task]) 428 | cuda(self) 429 | 430 | def forward(self, X, istrain=False): 431 | """Calculate model output from input data. 
432 | 
433 |         :param X: input data
434 |         :param istrain: whether called during training, to activate dropout
435 |         """
436 |         input = X
437 |         for layer in self.fcl[:-1]:
438 |             input = F.relu(layer(input))
439 |             if istrain:
440 |                 input = self.dropoutl(input)
441 |         # Final linear layer without activation, as expected for regression
442 |         y = self.fcl[-1](input)
443 |         return y
444 | 
445 | 
446 | def loader_from_dataframe(X: pd.DataFrame,
447 |                           Y: Union[pd.Series, pd.DataFrame],
448 |                           batch_size: int = 1024):
449 |     """Get PyTorch data loaders from pandas dataframes
450 | 
451 |     :param X: features to predict Y from
452 |     :param Y: feature(s) to be predicted (dependent variable(s))
453 |     :param batch_size: batch size of the data loader
454 |     """
455 |     if Y is None:
456 |         raise ValueError('Y must be specified')
457 |     if isinstance(X, pd.DataFrame):
458 |         X = X.values
459 |     if isinstance(Y, (pd.Series, pd.DataFrame)):
460 |         Y = Y.values
461 |     if len(Y.shape) == 1:
462 |         Y = Y.reshape(Y.shape[0], 1)
463 |     dataset = TensorDataset(T.Tensor(X), T.Tensor(Y))
464 |     loader = DataLoader(dataset, batch_size=batch_size)
465 |     return loader
466 | 
467 | 
468 | def loader_from_iterator(X: Union[PandasTextFileReader, Iterator],
469 |                          Y: Union[PandasTextFileReader, Iterator] = None,
470 |                          y_col: Optional[str] = None,
471 |                          batch_size: int = 1024):
472 |     """Get PyTorch data loaders from iterators
473 | 
474 |     :param X: features to predict Y from
475 |     :param Y: features to be predicted (dependent variables)
476 |     :param y_col: name of the columns in X containing the dependent variables to be predicted
477 |     :param batch_size: batch size of the data loader
478 |     """
479 |     if Y is None and y_col is None:
480 |         raise ValueError('either Y or y_col must be specified')
481 |     if Y is None:
482 |         X, Y = split_into_x_and_y(X, y_col)
483 |     dataset = IterableDataset(X, Y)
484 |     return DataLoader(dataset, batch_size=batch_size)
485 | 
486 | 
487 | class IterableDataset(PandasIterableDataset):
488 |     def __init__(self, x_iterator: Iterator, y_iterator: Iterator):
489 |         self.iterator = zip(x_iterator, y_iterator)
490 | 
491 |     def __iter__(self):
492 |         for chunk_x, chunk_y in self.iterator:
493 |             for row in zip(chunk_x, chunk_y):
494 |                 yield row
495 | 
496 | 
497 | def split_into_x_and_y(data: Union[PandasTextFileReader, Iterator],
498 |                        y_col: Union[str, List[str]]):
499 |     """Extract the columns for the data iterator into another iterator.
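A brief sketch of the two loader helpers above; the dataframe content, TSV file name and column names are placeholders. The chunked variant relies on split_into_x_and_y and the extract helpers that follow.

```python
# Illustrative sketch: build DataLoaders from an in-memory dataframe and from a
# chunked TSV reader. File name and column names are hypothetical.
import pandas as pd
from papyrus_scripts.neuralnet import loader_from_dataframe, loader_from_iterator

frame = pd.DataFrame({'f1': [0.1, 0.2, 0.3], 'f2': [1.0, 0.5, 0.2], 'label': [0., 1., 0.]})
in_memory_loader = loader_from_dataframe(frame[['f1', 'f2']], frame['label'], batch_size=2)

chunks = pd.read_csv('descriptors_with_label.tsv', sep='\t', chunksize=1024)
chunked_loader = loader_from_iterator(chunks, y_col='label', batch_size=256)
```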
500 | 
501 |     :param data: the input iterator to extract columns from
502 |     :param y_col: name of the columns to be extracted
503 |     :return: a tuple of two iterators, one over the feature columns and one over the y_col columns
504 |     """
505 |     if isinstance(y_col, list) and not len(y_col):
506 |         raise ValueError('at least one column must be extracted')
507 |     if not isinstance(y_col, list):
508 |         y_col = [y_col]
509 |     gen_x, gen_y = itertools.tee(data, 2)
510 |     return extract_x(gen_x, y_col), extract_y(gen_y, y_col)
511 | 
512 | 
513 | def extract_y(data: Union[PandasTextFileReader, Iterator], y_col: List[str]):
514 |     """Extract the y_col columns from each chunk of the data."""
515 |     for chunk in data:
516 |         if not np.all(np.isin(y_col, chunk.columns)):
517 |             raise ValueError(f'columns {y_col} not found in data')
518 |         yield T.Tensor(chunk[y_col].values)
519 | 
520 | 
521 | def extract_x(data: Union[PandasTextFileReader, Iterator], y_col: List[str]):
522 |     """Extract all columns but y_col from each chunk of the data."""
523 |     for chunk in data:
524 |         if not np.all(np.isin(y_col, chunk.columns)):
525 |             raise ValueError(f'columns {y_col} not found in data')
526 |         yield T.Tensor(chunk.drop(columns=y_col).values)
527 | 
--------------------------------------------------------------------------------
/src/papyrus_scripts/reader.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | """Reading capacities of the Papyrus-scripts."""
4 | from __future__ import annotations
5 | 
6 | import json
7 | import os
8 | from typing import Optional, Union, Iterator, List
9 | from functools import partial
10 | 
11 | import pystow
12 | import pandas as pd
13 | from tqdm.auto import tqdm
14 | from prodec import Descriptor, Transform
15 | 
16 | from .utils.mol_reader import MolSupplier
17 | from .utils.IO import locate_file, process_data_version, TypeDecoder, PapyrusVersion
18 | 
19 | 
20 | def read_papyrus(is3d: bool = False, version: str | PapyrusVersion = 'latest', plusplus: bool = True,
21 |                  chunksize: Optional[int] = None, source_path: Optional[str] = None
22 |                  ) -> Union[Iterator[pd.DataFrame], pd.DataFrame]:
23 | 
24 |     """Read the Papyrus dataset.
25 | 
26 |     :param is3d: whether to consider stereochemistry or not (default: False)
27 |     :param version: version of the dataset to be read
28 |     :param plusplus: read the Papyrus++ curated subset of very high quality
29 |     :param chunksize: number of lines per chunk.
To read without chunks, set to None 30 | :param source_path: folder containing the bioactivity dataset (default: pystow's home folder) 31 | :return: the Papyrus activity dataset 32 | """ 33 | # Papyrus++ with stereo does not exist 34 | if is3d and plusplus: 35 | raise ValueError('Papyrus++ is only available without stereochemistry.') 36 | # Determine default paths 37 | if source_path is not None: 38 | os.environ['PYSTOW_HOME'] = os.path.abspath(source_path) 39 | version = process_data_version(version=version, root_folder=source_path) 40 | source_path = pystow.module('papyrus', version.version_old_fmt) 41 | # Load data types 42 | dtype_file = source_path.join(name='data_types.json').as_posix() 43 | with open(dtype_file, 'r') as jsonfile: 44 | dtypes = json.load(jsonfile, cls=TypeDecoder)['papyrus'] 45 | # Find the file 46 | filenames = locate_file(source_path.base.as_posix(), 47 | r'\d+\.\d+' + (r'\+\+' if plusplus else '') + '_combined_set_' 48 | f'with{"out" if not is3d else ""}' + r'_stereochemistry\.tsv.*') 49 | return pd.read_csv(filenames[0], sep='\t', chunksize=chunksize, dtype=dtypes, low_memory=True) 50 | 51 | 52 | def read_protein_set(source_path: Optional[str] = None, version: str | PapyrusVersion = 'latest') -> pd.DataFrame: 53 | """Read the protein targets of the Papyrus dataset. 54 | 55 | :param source_path: folder containing the molecular descriptor datasets 56 | :param version: version of the dataset to be read 57 | :return: the set of protein targets in the Papyrus dataset 58 | """ 59 | version = process_data_version(version=version, root_folder=source_path) 60 | # Determine default paths 61 | if source_path is not None: 62 | os.environ['PYSTOW_HOME'] = os.path.abspath(source_path) 63 | source_path = pystow.module('papyrus', version.version_old_fmt) 64 | # Find the file 65 | filenames = locate_file(source_path.base.as_posix(), r'\d+\.\d+_combined_set_protein_targets\.tsv.*') 66 | return pd.read_csv(filenames[0], sep='\t', keep_default_na=False) 67 | 68 | 69 | def read_molecular_descriptors(desc_type: str = 'mold2', is3d: bool = False, 70 | version: str | PapyrusVersion = 'latest', chunksize: Optional[int] = None, 71 | source_path: Optional[str] = None, 72 | ids: Optional[List[str]] = None, verbose: bool = True): 73 | """Get molecular descriptors 74 | 75 | :param desc_type: type of descriptor {'mold2', 'mordred', 'cddd', 'fingerprint', 'moe', 'all'} 76 | :param is3d: whether to load descriptors of the dataset containing stereochemistry 77 | :param version: version of the dataset to be read 78 | :param chunksize: number of lines per chunk. 
To read without chunks, set to None 79 | :param source_path: folder containing the bioactivity dataset (default: pystow's home folder) 80 | :param ids: identifiers of the molecules which descriptors should be loaded 81 | if is3d=True, then identifiers are InChIKeys, otherwise connectivities 82 | :param verbose: whether to show progress 83 | :return: the dataframe of molecular descriptors 84 | """ 85 | if desc_type not in ['mold2', 'mordred', 'cddd', 'fingerprint', 'moe', 'all']: 86 | raise ValueError("descriptor type not supported") 87 | # Determine default paths 88 | if source_path is not None: 89 | os.environ['PYSTOW_HOME'] = os.path.abspath(source_path) 90 | version = process_data_version(version=version, root_folder=source_path) 91 | source_path = pystow.module('papyrus', version.version_old_fmt) 92 | # Load data types 93 | dtype_file = source_path.join(name='data_types.json').as_posix() 94 | with open(dtype_file, 'r') as jsonfile: 95 | dtypes = json.load(jsonfile, cls=TypeDecoder) 96 | # Find the files 97 | if desc_type in ['mold2', 'all']: 98 | mold2_files = locate_file(source_path.join('descriptors').as_posix(), 99 | rf'\d+\.\d+_combined_{3 if is3d else 2}D_moldescs_mold2\.tsv.*') 100 | elif desc_type in ['mordred', 'all']: 101 | mordd_files = locate_file(source_path.join('descriptors').as_posix(), 102 | rf'\d+\.\d+_combined_{3 if is3d else 2}D_moldescs_mordred{3 if is3d else 2}D\.tsv.*') 103 | elif desc_type in ['cddd', 'all']: 104 | cddds_files = locate_file(source_path.join('descriptors').as_posix(), 105 | rf'\d+\.\d+_combined_{3 if is3d else 2}D_moldescs_CDDDs.tsv.*') 106 | elif desc_type in ['fingerprint', 'all']: 107 | molfp_files = locate_file(source_path.join('descriptors').as_posix(), 108 | rf'\d+\.\d+_combined_{3 if is3d else 2}D_moldescs_{"E3FP" if is3d else "ECFP6"}\.tsv.*') 109 | elif desc_type in ['moe', 'all']: 110 | moe_files = locate_file(source_path.join('descriptors').as_posix(), 111 | rf'\d+\.\d+_combined_{3 if is3d else 2}D_moldescs_MOE\.tsv.*') 112 | if verbose: 113 | pbar = partial(tqdm, desc='Loading molecular descriptors') 114 | else: 115 | pbar = partial(iter) 116 | if desc_type == 'mold2': 117 | return _filter_molecular_descriptors(pbar(pd.read_csv(mold2_files[0], sep='\t', 118 | dtype=dtypes['mold2'], low_memory=True, chunksize=chunksize)), 119 | ids, 'InChIKey' if is3d else 'connectivity') 120 | elif desc_type == 'mordred': 121 | return _filter_molecular_descriptors(pbar(pd.read_csv(mordd_files[0], sep='\t', 122 | dtype=dtypes[f'mordred_{3 if is3d else 2}D'], low_memory=True, 123 | chunksize=chunksize)), 124 | ids, 'InChIKey' if is3d else 'connectivity') 125 | elif desc_type == 'cddd': 126 | return _filter_molecular_descriptors(pbar(pd.read_csv(cddds_files[0], sep='\t', 127 | dtype=dtypes['CDDD'], low_memory=True, chunksize=chunksize)), 128 | ids, 'InChIKey' if is3d else 'connectivity') 129 | elif desc_type == 'fingerprint': 130 | return _filter_molecular_descriptors(pbar(pd.read_csv(molfp_files[0], sep='\t', 131 | dtype=dtypes[f'{"E3FP" if is3d else "ECFP6"}'], 132 | low_memory=True, chunksize=chunksize)), 133 | ids, 'InChIKey' if is3d else 'connectivity') 134 | elif desc_type == 'moe': 135 | return _filter_molecular_descriptors(pbar(pd.read_csv(moe_files[0], sep='\t', 136 | low_memory=True, chunksize=chunksize)), 137 | ids, 'InChIKey' if is3d else 'connectivity') 138 | elif desc_type == 'all': 139 | mold2 = _filter_molecular_descriptors(pd.read_csv(mold2_files[0], sep='\t', 140 | dtype=dtypes['mold2'], low_memory=True, chunksize=chunksize), 141 | ids, 
'InChIKey' if is3d else 'connectivity') 142 | mordd = _filter_molecular_descriptors(pd.read_csv(mordd_files[0], sep='\t', 143 | dtype=dtypes[f'mordred_{3 if is3d else 2}D'], 144 | low_memory=True, chunksize=chunksize), 145 | ids, 'InChIKey' if is3d else 'connectivity') 146 | cddds = _filter_molecular_descriptors(pd.read_csv(cddds_files[0], sep='\t', dtype=dtypes['CDDD'], 147 | low_memory=True, chunksize=chunksize), 148 | ids, 'InChIKey' if is3d else 'connectivity') 149 | molfp = _filter_molecular_descriptors(pd.read_csv(molfp_files[0], sep='\t', 150 | dtype=dtypes[f'{"E3FP" if is3d else "ECFP6"}'], 151 | low_memory=True, chunksize=chunksize), 152 | ids, 'InChIKey' if is3d else 'connectivity') 153 | moe = _filter_molecular_descriptors(pd.read_csv(moe_files[0], sep='\t', low_memory=True, chunksize=chunksize), 154 | ids, 'InChIKey' if is3d else 'connectivity') 155 | if chunksize is None: 156 | mold2.set_index('InChIKey' if is3d else 'connectivity', inplace=True) 157 | mordd.set_index('InChIKey' if is3d else 'connectivity', inplace=True) 158 | molfp.set_index('InChIKey' if is3d else 'connectivity', inplace=True) 159 | cddds.set_index('InChIKey' if is3d else 'connectivity', inplace=True) 160 | moe.set_index('InChIKey' if is3d else 'connectivity', inplace=True) 161 | data = pd.concat([mold2, mordd, cddds, molfp, moe], axis=1) 162 | del mold2, mordd, cddds, molfp, moe 163 | data.reset_index(inplace=True) 164 | return data 165 | return _filter_molecular_descriptors(pbar(_join_molecular_descriptors(mold2, mordd, molfp, cddds, moe, 166 | on='InChIKey' if is3d else 'connectivity')), 167 | ids, 'InChIKey' if is3d else 'connectivity') 168 | 169 | 170 | def _join_molecular_descriptors(*descriptors: Iterator, on: str = 'connectivity') -> Iterator: 171 | """Concatenate multiple types of molecular descriptors on the same identifier. 172 | 173 | :param descriptors: the different iterators of descriptors to be joined 174 | :param on: identifier to join the descriptors on 175 | """ 176 | try: 177 | while True: 178 | values = [next(descriptor).set_index(on) for descriptor in descriptors] 179 | data = pd.concat(values, axis=1) 180 | data.reset_index(inplace=True) 181 | yield data 182 | except StopIteration: 183 | raise StopIteration 184 | 185 | 186 | def _filter_molecular_descriptors(data: Union[pd.DataFrame, Iterator], 187 | ids: Optional[List[str]], id_name: str): 188 | if isinstance(data, pd.DataFrame): 189 | if ids is None: 190 | return _iterate_filter_descriptors(data, None, None) 191 | return data[data[id_name].isin(ids)] 192 | else: 193 | return _iterate_filter_descriptors(data, ids, id_name) 194 | 195 | 196 | def _iterate_filter_descriptors(data: Iterator, ids: Optional[List[str]], id_name: Optional[str]): 197 | for chunk in data: 198 | if ids is None: 199 | yield chunk 200 | else: 201 | yield chunk[chunk[id_name].isin(ids)] 202 | 203 | 204 | def read_protein_descriptors(desc_type: Union[str, Descriptor, Transform] = 'unirep', 205 | version: str | PapyrusVersion = 'latest', chunksize: Optional[int] = None, 206 | source_path: Optional[str] = None, 207 | ids: Optional[List[str]] = None, verbose: bool = True, 208 | **kwargs): 209 | """Get protein descriptors 210 | 211 | :param desc_type: type of descriptor {'unirep'} or a prodec instance of a Descriptor or Transform 212 | :param version: version of the dataset to be read 213 | :param chunksize: number of lines per chunk. 
To read without chunks, set to None 214 | :param source_path: If desc_type is 'unirep', folder containing the protein descriptor datasets. 215 | If desc_type is 'custom', the file path to a tab-separated dataframe containing target_id 216 | as its first column and custom descriptors in the following ones. 217 | If desc_type is a ProDEC Descriptor or Transform instance, folder containing the bioactivity dataset 218 | (default: pystow's home folder) 219 | :param ids: identifiers of the sequences which descriptors should be loaded (e.g. P30542_WT) 220 | :param verbose: whether to show progress 221 | :param kwargs: keyword arguments passed to the `pandas` method of the ProDEC Descriptor or Transform instance 222 | (is ignored if `desc_type` is not a ProDEC Descriptor or Transform instance) 223 | :return: the dataframe of protein descriptors 224 | """ 225 | if desc_type not in ['unirep', 'custom'] and not isinstance(desc_type, (Descriptor, Transform)): 226 | raise ValueError("descriptor type not supported") 227 | if desc_type != 'custom': 228 | # Determine default paths 229 | if source_path is not None: 230 | os.environ['PYSTOW_HOME'] = os.path.abspath(source_path) 231 | version = process_data_version(version=version, root_folder=source_path) 232 | source_path = pystow.module('papyrus', version.version_old_fmt) 233 | if not isinstance(desc_type, (Descriptor, Transform)): 234 | # Load data types 235 | dtype_file = source_path.join(name='data_types.json').as_posix() 236 | with open(dtype_file, 'r') as jsonfile: 237 | dtypes = json.load(jsonfile, cls=TypeDecoder) 238 | # Set verbose level 239 | if verbose: 240 | pbar = partial(tqdm, desc='Loading protein descriptors') 241 | else: 242 | pbar = partial(iter) 243 | if desc_type == 'unirep': 244 | unirep_files = locate_file(source_path.join('descriptors').as_posix(), r'(?:\d+\.\d+_combined_prot_embeddings_unirep\.tsv.*)|(?:\d+\.\d+_combined_protdescs_unirep\.tsv.*)') 245 | if len(unirep_files) == 0: 246 | raise ValueError('Could not find unirep descriptor file') 247 | if desc_type == 'unirep': 248 | if chunksize is None and ids is None: 249 | return pd.read_csv(unirep_files[0], sep='\t', dtype=dtypes['unirep'], low_memory=True) 250 | elif chunksize is None and ids is not None: 251 | descriptors = pd.read_csv(unirep_files[0], sep='\t', dtype=dtypes['unirep'], low_memory=True) 252 | if 'target_id' in descriptors.columns: 253 | return descriptors[descriptors['target_id'].isin(ids)] 254 | return descriptors[descriptors['TARGET_NAME'].isin(ids)].rename(columns={'TARGET_NAME': 'target_id'}) 255 | elif chunksize is not None and ids is None: 256 | return pd.concat([chunk 257 | for chunk in pbar(pd.read_csv(unirep_files[0], sep='\t', dtype=dtypes['unirep'], 258 | low_memory=True, chunksize=chunksize)) 259 | ]).rename(columns={'TARGET_NAME': 'target_id'}) 260 | return pd.concat([chunk[chunk['target_id'].isin(ids)] 261 | if 'target_id' in chunk.columns 262 | else chunk[chunk['TARGET_NAME'].isin(ids)] 263 | for chunk in pbar(pd.read_csv(unirep_files[0], sep='\t', dtype=dtypes['unirep'], 264 | low_memory=True, chunksize=chunksize)) 265 | ]).rename(columns={'TARGET_NAME': 'target_id'}) 266 | else: 267 | # Calculate protein descriptors 268 | protein_data = read_protein_set(pystow.module('').base.as_posix(), version=version) 269 | protein_data.rename(columns={'TARGET_NAME': 'target_id'}, inplace=True) 270 | # Keep only selected proteins 271 | if ids is not None: 272 | protein_data = protein_data[protein_data['target_id'].isin(ids)] 273 | # Filter out non-natural 
amino-acids 274 | protein_data = protein_data.loc[protein_data['Sequence'].map(desc_type.Descriptor.is_sequence_valid), :] 275 | # Obtain descriptors 276 | descriptors = desc_type.pandas_get(protein_data['Sequence'].tolist(), protein_data['target_id'].tolist(), 277 | **kwargs) 278 | descriptors.rename(columns={'ID': 'target_id'}, inplace=True) 279 | return descriptors 280 | elif desc_type == 'custom': 281 | # Check path exists 282 | if not os.path.isfile(source_path): 283 | raise ValueError('source_path must point to an existing file if using a custom descriptor type') 284 | # No chunksier, no filtering 285 | if chunksize is None and ids is None: 286 | return pd.read_csv(source_path, sep='\t', low_memory=True).rename(columns={'TARGET_NAME': 'target_id'}) 287 | # No chunksize but filtering 288 | elif chunksize is None and ids is not None: 289 | descriptors = pd.read_csv(source_path, sep='\t', low_memory=True) 290 | descriptors.rename(columns={'TARGET_NAME': 'target_id'}, inplace=True) 291 | return descriptors[descriptors['target_id'].isin(ids)] 292 | else: 293 | # Set verbose level 294 | if verbose: 295 | pbar = partial(tqdm, desc='Loading custom protein descriptors') 296 | else: 297 | pbar = partial(iter) 298 | # Chunksize but no filtering 299 | if chunksize is not None and ids is None: 300 | return pd.concat([chunk 301 | for chunk in pbar(pd.read_csv(source_path, sep='\t', 302 | low_memory=True, chunksize=chunksize)) 303 | ]).rename(columns={'TARGET_NAME': 'target_id'}) 304 | # Both chunksize and filtering 305 | return pd.concat([chunk[chunk['target_id'].isin(ids)] 306 | if 'target_id' in chunk.columns 307 | else chunk[chunk['TARGET_NAME'].isin(ids)] 308 | for chunk in pbar(pd.read_csv(source_path, 309 | sep='\t', low_memory=True, chunksize=chunksize)) 310 | ]).rename(columns={'TARGET_NAME': 'target_id'}) 311 | 312 | 313 | def read_molecular_structures(is3d: bool = False, version: str | PapyrusVersion = 'latest', 314 | chunksize: Optional[int] = None, source_path: Optional[str] = None, 315 | ids: Optional[List[str]] = None, verbose: bool = True): 316 | """Get molecular structures 317 | 318 | :param is3d: whether to load descriptors of the dataset containing stereochemistry 319 | :param version: version of the dataset to be read 320 | :param chunksize: number of lines per chunk. 
To read without chunks, set to None 321 | :param source_path: folder containing the bioactivity dataset (default: pystow's home folder) 322 | :param ids: identifiers of the molecules which descriptors should be loaded 323 | if is3d=True, then identifiers are InChIKeys, otherwise connectivities 324 | :param verbose: whether to show progress 325 | :return: the dataframe of molecular structures 326 | """ 327 | # Determine default paths 328 | if source_path is not None: 329 | os.environ['PYSTOW_HOME'] = os.path.abspath(source_path) 330 | version = process_data_version(version=version, root_folder=source_path) 331 | source_path = pystow.module('papyrus', version.version_old_fmt) 332 | # Find the files 333 | sd_files = locate_file(source_path.join('structures').as_posix(), 334 | rf'\d+\.\d+_combined_{3 if is3d else 2}D_set_with{"" if is3d else "out"}_stereochemistry.sd.*') 335 | if chunksize is None: 336 | data = [] 337 | # Iterate through the file 338 | with MolSupplier(sd_files[0], show_progress=True) as f_handle: 339 | for _, mol in f_handle: 340 | # Obtain SD molecular properties 341 | props = mol.GetPropsAsDict() 342 | # If IDs given and not in the list, skip 343 | if ids is not None and props['InChIKey' if is3d else 'connectivity'] not in ids: 344 | continue 345 | # Else add structure to the dict 346 | # and add the dict to data 347 | props['mol'] = mol 348 | data.append(props) 349 | # Return the list of dicts as a pandas DataFrame 350 | return pd.DataFrame(data) 351 | else: 352 | # Process the data through an iterator 353 | structure_iterator = _structures_iterator(sd_files[0], chunksize, ids, is3d, verbose) 354 | return structure_iterator 355 | 356 | 357 | def _structures_iterator(sd_file: str, chunksize: int, 358 | ids: Optional[List[str]] = None, 359 | is3d: bool = False, verbose: bool = True) -> Iterator[pd.DataFrame]: 360 | if not isinstance(chunksize, int) or chunksize < 1: 361 | raise ValueError('Chunksize must be a non-null positive integer.') 362 | if verbose: 363 | pbar = tqdm(desc='Loading molecular structures') 364 | data = [] 365 | # Iterate through the file 366 | with MolSupplier(sd_file) as f_handle: 367 | for _, mol in f_handle: 368 | # Obtain SD molecular properties 369 | props = mol.GetPropsAsDict() 370 | # If IDs given and not in the list, skip 371 | id_ = props['InChIKey' if is3d else 'connectivity'] 372 | if (ids is not None) and (id_ not in ids): 373 | continue 374 | props['mol'] = mol 375 | data.append(props) 376 | # Chunk is complete 377 | if len(data) == chunksize: 378 | if verbose: 379 | pbar.update() 380 | yield pd.DataFrame(data) 381 | data = [] 382 | if verbose: 383 | pbar.update() 384 | yield pd.DataFrame(data) 385 | --------------------------------------------------------------------------------
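Finally, a hedged end-to-end sketch of the reader API defined above. It assumes the dataset was already fetched with download_papyrus and that the activity file exposes a 'Quality' column, as in the published releases.

```python
# Illustrative sketch: read the latest downloaded Papyrus++ set in chunks and
# tally activities per quality label. The 'Quality' column name is assumed from
# the published dataset and may differ between releases.
from papyrus_scripts import read_papyrus, read_protein_set

proteins = read_protein_set(version='latest')
print(f'{len(proteins)} protein targets available')

chunks = read_papyrus(is3d=False, version='latest', plusplus=True, chunksize=100_000)
totals = {}
for chunk in chunks:
    for label, count in chunk['Quality'].value_counts().items():
        totals[label] = totals.get(label, 0) + int(count)
print(totals)
```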