├── docs ├── _static │ ├── README │ ├── images │ │ └── custom_favicon.png │ └── css │ │ └── custom.css ├── ipython_kernel_config.py ├── notebooks │ ├── getting_started.nblink │ ├── ligand-only-smiles-EGFR.nblink │ ├── OpenEye_structural_featurizer.nblink │ ├── ligand-only-morgan1024-EGFR.nblink │ ├── Schrodinger_structural_featurizer.nblink │ ├── kinoml_object_model.nblink │ ├── kinase-ligand-informed-smiles-sequence-EGFR.nblink │ └── kinase-ligand-informed-morgan-composition-EGFR.nblink ├── Makefile ├── developers │ ├── autodocs.py │ ├── _docstrings_example.py │ └── api_docs.md ├── index.md └── conf.py ├── kinoml ├── core │ ├── __init__.py │ ├── components.py │ ├── conditions.py │ ├── systems.py │ └── ligands.py ├── ml │ ├── __init__.py │ ├── torch_trees.py │ ├── torch_geometric_models.py │ └── tensorflow_models.py ├── analysis │ ├── __init__.py │ ├── plots.py │ └── metrics.py ├── databases │ ├── __init__.py │ ├── uniprot.py │ ├── klifs.py │ └── pdb.py ├── datasets │ ├── __init__.py │ ├── torch_geometric_datasets.py │ ├── groups.py │ ├── pkis2.py │ └── chembl.py ├── docking │ ├── __init__.py │ └── SCHRODINGERDocking.py ├── modeling │ ├── __init__.py │ ├── alignment.py │ └── SCHRODINGERModeling.py ├── optimize │ └── __init__.py ├── workflows │ ├── __init__.py │ └── images │ │ ├── KinoML_Workflow_single.png │ │ └── KinoML_Workflow_multiple.png ├── data │ ├── molecules │ │ ├── __init__.py │ │ ├── chloroform.pdb │ │ ├── chloroform.sdf │ │ ├── chloroform_acetamide.pdb │ │ └── chloroform_acetamide.sdf │ ├── proteins │ │ ├── __init__.py │ │ ├── kinoml_tests_4f8o_spruce.loop_db │ │ └── README.md │ ├── electron_densities │ │ ├── __init__.py │ │ └── 4f8o_phases.mtz │ ├── object_model.png │ ├── fig_1_kinomltechpaper_v2.png │ ├── first_tutorial_scheme_v2.png │ ├── look_and_say.dat │ └── README.md ├── tests │ ├── core │ │ ├── __init__.py │ │ ├── test_conditions.py │ │ ├── test_measurements.py │ │ ├── test_ligands.py │ │ ├── test_systems.py │ │ ├── test_sequences.py │ │ └── 
test_proteins.py │ ├── data │ │ └── __init__.py │ ├── datasets │ │ ├── __init__.py │ │ ├── test_chembl.py │ │ └── test_pkis2.py │ ├── docking │ │ ├── __init__.py │ │ └── test_oedocking.py │ ├── features │ │ ├── __init__.py │ │ ├── test_ligand.py │ │ ├── test_core.py │ │ ├── test_protein.py │ │ └── test_complexes.py │ ├── modeling │ │ ├── __init__.py │ │ └── test_alignment.py │ ├── databases │ │ ├── __init__.py │ │ ├── test_klifs.py │ │ ├── test_uniprot.py │ │ └── test_pdb.py │ ├── __init__.py │ └── test_kinoml.py ├── features │ ├── __init__.py │ ├── protein.py │ └── ligand.py └── __init__.py ├── MANIFEST.in ├── devtools ├── github-actions │ └── initialize_conda.sh ├── conda-envs │ ├── docs_env.yaml │ └── test_env.yaml ├── README.md └── scripts │ └── create_conda_env.py ├── .github ├── PULL_REQUEST_TEMPLATE.md ├── workflows │ ├── cancel.yml │ ├── docs.yml │ ├── lint.yml │ └── ci.yml └── CONTRIBUTING.md ├── .codecov.yml ├── .lgtm.yml ├── CHANGELOG.md ├── setup.cfg ├── LICENSE ├── .gitignore ├── setup.py ├── tutorials └── README.md ├── CITATION.cff ├── CODE_OF_CONDUCT.md └── README.md /docs/_static/README: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /kinoml/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /kinoml/ml/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /kinoml/analysis/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /kinoml/databases/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /kinoml/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /kinoml/docking/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /kinoml/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /kinoml/optimize/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /kinoml/workflows/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /kinoml/data/molecules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /kinoml/data/proteins/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /kinoml/tests/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /kinoml/tests/data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /kinoml/tests/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /kinoml/tests/docking/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /kinoml/tests/features/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /kinoml/tests/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /kinoml/tests/databases/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /kinoml/data/electron_densities/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /kinoml/data/object_model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openkinome/kinoml/HEAD/kinoml/data/object_model.png -------------------------------------------------------------------------------- /docs/ipython_kernel_config.py: -------------------------------------------------------------------------------- 1 | c.InlineBackend.figure_formats = {"svg"} 2 | c.InlineBackend.rc = {"figure.dpi": 96} 3 | -------------------------------------------------------------------------------- /docs/notebooks/getting_started.nblink: -------------------------------------------------------------------------------- 1 | {"path": "../../tutorials/getting_started/getting_started_with_kinoml.ipynb"} 2 | -------------------------------------------------------------------------------- /docs/_static/images/custom_favicon.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/openkinome/kinoml/HEAD/docs/_static/images/custom_favicon.png -------------------------------------------------------------------------------- /kinoml/data/fig_1_kinomltechpaper_v2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openkinome/kinoml/HEAD/kinoml/data/fig_1_kinomltechpaper_v2.png -------------------------------------------------------------------------------- /kinoml/data/first_tutorial_scheme_v2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openkinome/kinoml/HEAD/kinoml/data/first_tutorial_scheme_v2.png -------------------------------------------------------------------------------- /docs/notebooks/ligand-only-smiles-EGFR.nblink: -------------------------------------------------------------------------------- 1 | {"path": "../../tutorials/experiments/ligand-only-smiles-EGFR/experiment_notebook.ipynb"} -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include MANIFEST.in 3 | include versioneer.py 4 | 5 | graft kinoml 6 | global-exclude *.py[cod] __pycache__ *.so -------------------------------------------------------------------------------- /docs/notebooks/OpenEye_structural_featurizer.nblink: -------------------------------------------------------------------------------- 1 | {"path": "../../tutorials/getting_started/OpenEye_structural_featurizer_showcase.ipynb"} 2 | -------------------------------------------------------------------------------- /docs/notebooks/ligand-only-morgan1024-EGFR.nblink: -------------------------------------------------------------------------------- 1 | {"path": 
"../../tutorials/experiments/ligand-only-morgan1024-EGFR/experiment_notebook.ipynb"} 2 | -------------------------------------------------------------------------------- /kinoml/data/electron_densities/4f8o_phases.mtz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openkinome/kinoml/HEAD/kinoml/data/electron_densities/4f8o_phases.mtz -------------------------------------------------------------------------------- /kinoml/tests/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Empty init file in case you choose a package besides PyTest such as Nose which may look for such a file 3 | """ 4 | -------------------------------------------------------------------------------- /docs/notebooks/Schrodinger_structural_featurizer.nblink: -------------------------------------------------------------------------------- 1 | {"path": "../../tutorials/getting_started/Schrodinger_structural_featurizer_showcase.ipynb"} 2 | -------------------------------------------------------------------------------- /kinoml/workflows/images/KinoML_Workflow_single.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openkinome/kinoml/HEAD/kinoml/workflows/images/KinoML_Workflow_single.png -------------------------------------------------------------------------------- /docs/notebooks/kinoml_object_model.nblink: -------------------------------------------------------------------------------- 1 | {"path": "../../tutorials/getting_started/kinoml_object_model.ipynb", "extra-media": ["../../kinoml/data/"]} 2 | -------------------------------------------------------------------------------- /kinoml/workflows/images/KinoML_Workflow_multiple.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/openkinome/kinoml/HEAD/kinoml/workflows/images/KinoML_Workflow_multiple.png -------------------------------------------------------------------------------- /kinoml/data/proteins/kinoml_tests_4f8o_spruce.loop_db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openkinome/kinoml/HEAD/kinoml/data/proteins/kinoml_tests_4f8o_spruce.loop_db -------------------------------------------------------------------------------- /kinoml/features/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Featurizers will always output arrays 3 | but they will use structure-oriented methods 4 | underneath to do it. 5 | """ 6 | -------------------------------------------------------------------------------- /docs/notebooks/kinase-ligand-informed-smiles-sequence-EGFR.nblink: -------------------------------------------------------------------------------- 1 | {"path": "../../tutorials/experiments/kinase-ligand-informed-smiles-sequence-EGFR/experiment_notebook.ipynb"} 2 | -------------------------------------------------------------------------------- /docs/notebooks/kinase-ligand-informed-morgan-composition-EGFR.nblink: -------------------------------------------------------------------------------- 1 | {"path": "../../tutorials/experiments/kinase-ligand-informed-morgan-composition-EGFR/experiments_notebook.ipynb"} 2 | -------------------------------------------------------------------------------- /devtools/github-actions/initialize_conda.sh: -------------------------------------------------------------------------------- 1 | case ${CI_OS} in 2 | windows*) 3 | eval "$(${CONDA}/condabin/conda.bat shell.bash hook)";; 4 | *) 5 | eval "$(${CONDA}/condabin/conda shell.bash hook)";; 6 | esac 7 | -------------------------------------------------------------------------------- /docs/_static/css/custom.css: 
-------------------------------------------------------------------------------- 1 | div.autodoc { 2 | padding-left: 25px; 3 | } 4 | article>div.autodoc { 5 | border-left: 4px solid rgba(230, 230, 230); 6 | } 7 | article>div.autodoc>div.autodoc { 8 | border-left: 1px solid rgba(230, 230, 230); 9 | } -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## Description 2 | Provide a brief description of the PR's purpose here. 3 | 4 | ## Todos 5 | Notable points that this PR has either accomplished or will accomplish. 6 | - [ ] TODO 1 7 | 8 | ## Questions 9 | - [ ] Question1 10 | 11 | ## Status 12 | - [ ] Ready to go -------------------------------------------------------------------------------- /kinoml/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | KinoML is a structure-informed machine learning library 3 | with a focus on kinase modeling 4 | """ 5 | 6 | # Handle versioneer 7 | from ._version import get_versions 8 | 9 | versions = get_versions() 10 | __version__ = versions["version"] 11 | __git_revision__ = versions["full-revisionid"] 12 | del get_versions, versions 13 | -------------------------------------------------------------------------------- /.codecov.yml: -------------------------------------------------------------------------------- 1 | # Codecov configuration to make it a bit less noisy 2 | coverage: 3 | status: 4 | patch: false 5 | project: 6 | default: 7 | threshold: 50% 8 | comment: 9 | layout: "header" 10 | require_changes: false 11 | branches: null 12 | behavior: default 13 | flags: null 14 | paths: null 15 | ignore: 16 | - "**/test_*.py" -------------------------------------------------------------------------------- /.lgtm.yml: -------------------------------------------------------------------------------- 1 | # Configure LGTM for this package 
2 | 3 | extraction: 4 | python: # Configure Python 5 | python_setup: # Configure the setup 6 | version: 3 # Specify Version 3 7 | path_classifiers: 8 | library: 9 | - versioneer.py # Set Versioneer.py to an external "library" (3rd party code) 10 | - devtools/* 11 | generated: 12 | - kinoml/_version.py 13 | -------------------------------------------------------------------------------- /.github/workflows/cancel.yml: -------------------------------------------------------------------------------- 1 | name: Cancel previous 2 | on: [push] 3 | jobs: 4 | cancel: 5 | if: github.repository == 'openkinome/kinoml' 6 | name: 'Cancel Previous Runs' 7 | runs-on: ubuntu-latest 8 | timeout-minutes: 3 9 | steps: 10 | - uses: styfle/cancel-workflow-action@0.4.1 11 | with: 12 | workflow_id: 231683,116359 13 | access_token: ${{ github.token }} -------------------------------------------------------------------------------- /kinoml/data/look_and_say.dat: -------------------------------------------------------------------------------- 1 | 1 2 | 11 3 | 21 4 | 1211 5 | 111221 6 | 312211 7 | 13112221 8 | 1113213211 9 | 31131211131221 10 | 13211311123113112211 11 | 11131221133112132113212221 12 | 3113112221232112111312211312113211 13 | 1321132132111213122112311311222113111221131221 14 | 11131221131211131231121113112221121321132132211331222113112211 15 | 311311222113111231131112132112311321322112111312211312111322212311322113212221 -------------------------------------------------------------------------------- /kinoml/tests/core/test_conditions.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test kinoml.core.conditions 3 | """ 4 | 5 | 6 | def test_assay_conditions(): 7 | from kinoml.core.conditions import AssayConditions 8 | 9 | conditions = AssayConditions() 10 | assert isinstance(conditions.pH, float) 11 | assert conditions.pH == 7.0 12 | 13 | assert conditions == AssayConditions(pH=7.0) 14 | assert conditions != AssayConditions(pH=8.0) 
15 | -------------------------------------------------------------------------------- /kinoml/ml/torch_trees.py: -------------------------------------------------------------------------------- 1 | """ 2 | Expose some useful objects in the sklearn-pytorch library 3 | 4 | These will provide Random Forest and Decision Tree implementations 5 | for PyTorch using the sklearn API. 6 | """ 7 | # pylint: disable=unused-import 8 | from Sklearn_PyTorch import ( 9 | TorchRandomForestClassifier, 10 | TorchRandomForestRegressor, 11 | TorchDecisionTreeClassifier, 12 | TorchDecisionTreeRegressor, 13 | ) 14 | -------------------------------------------------------------------------------- /kinoml/data/molecules/chloroform.pdb: -------------------------------------------------------------------------------- 1 | HETATM 1 C1 UNL 1 0.000 0.000 0.000 1.00 20.00 C 2 | HETATM 2 CL2 UNL 1 0.000 0.000 0.000 1.00 20.00 CL 3 | HETATM 3 CL3 UNL 1 0.000 0.000 0.000 1.00 20.00 CL 4 | HETATM 4 CL4 UNL 1 0.000 0.000 0.000 1.00 20.00 CL 5 | TER 5 UNL 1 6 | CONECT 1 2 3 4 7 | CONECT 2 1 8 | CONECT 3 1 9 | CONECT 4 1 10 | END 11 | -------------------------------------------------------------------------------- /kinoml/data/molecules/chloroform.sdf: -------------------------------------------------------------------------------- 1 | 2 | -OEChem-06022113532D 3 | 4 | 4 3 0 0 0 0 0 0 0999 V2000 5 | 0.0000 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 6 | 1.0000 0.0000 0.0000 Cl 0 0 0 0 0 0 0 0 0 0 0 0 7 | -0.5000 -0.8660 0.0000 Cl 0 0 0 0 0 0 0 0 0 0 0 0 8 | -0.5000 0.8660 0.0000 Cl 0 0 0 0 0 0 0 0 0 0 0 0 9 | 1 2 1 0 0 0 0 10 | 1 3 1 0 0 0 0 11 | 1 4 1 0 0 0 0 12 | M END 13 | $$$$ 14 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | All notable changes to this project will be documented in this file. 
3 | 4 | The format follows [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 5 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 6 | 7 | ## [v1.0.0] - 2025-10-01 8 | ### Added 9 | - First stable, reproducible release accompanying the manuscript. 10 | - Conda installation via `devtools/conda-envs/test_env.yaml`. 11 | - Docker image published on Docker Hub: `openkinome/kinoml:v1`. 12 | - Examples of code usage in tutorials. 13 | 14 | ### Notes 15 | - This release will be the reference for the manuscript submission. 16 | -------------------------------------------------------------------------------- /kinoml/tests/core/test_measurements.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test kinoml.core.measurements 3 | """ 4 | 5 | 6 | def test_measurements(): 7 | from kinoml.core.measurements import BaseMeasurement 8 | from kinoml.core.conditions import AssayConditions 9 | from kinoml.core.components import MolecularComponent 10 | from kinoml.core.systems import System 11 | 12 | conditions = AssayConditions() 13 | system = System([MolecularComponent()]) 14 | measurement = BaseMeasurement(50, conditions=conditions, system=system) 15 | assert isinstance(measurement, BaseMeasurement) 16 | assert measurement == BaseMeasurement(50, conditions=conditions, system=system) 17 | assert measurement != BaseMeasurement(10, conditions=conditions, system=system) 18 | -------------------------------------------------------------------------------- /devtools/conda-envs/docs_env.yaml: -------------------------------------------------------------------------------- 1 | name: docs 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | # Base depends 6 | - python 7 | - pip 8 | - ipython 9 | # core 10 | #- sphinx~=2.4.0 11 | - docutils=0.20 # Support for nbsphinx_link 1.3.0 12 | - sphinx 13 | - jinja2 14 | - nbsphinx 15 | - nbsphinx-link 16 | - sphinx-notfound-page 17 | - sphinx-prompt 18 | - 
sphinx-copybutton 19 | - sphinx-autoapi>=3,<4 20 | - myst-parser 21 | - sphinxcontrib-httpdomain 22 | - linkify-it-py 23 | #- sphinx-panels 24 | # themes 25 | - sphinx-material 26 | # local building 27 | - sphinx-autobuild 28 | #- pip: 29 | # core 30 | #- sphinx-version-warning 31 | #- sphinxemoji 32 | #- sphinx-last-updated-by-git 33 | -------------------------------------------------------------------------------- /kinoml/tests/test_kinoml.py: -------------------------------------------------------------------------------- 1 | """ 2 | Unit and regression test for the kinoml package. 3 | """ 4 | 5 | # Import package, test suite, and other packages as needed 6 | import kinoml # pylint: disable=unused-import 7 | import sys 8 | 9 | 10 | def test_kinoml_imported(): 11 | """ 12 | Sample test, will always pass so long as import statement worked 13 | """ 14 | assert "kinoml" in sys.modules 15 | 16 | 17 | def test_3rdparty_imports(): 18 | """ 19 | Some packages can be tricky to install. Make sure we can import them. 
20 | """ 21 | import torch # pylint: disable=unused-import 22 | 23 | assert "torch" in sys.modules 24 | 25 | import torch_geometric # pylint: disable=unused-import 26 | 27 | assert "torch_geometric" in sys.modules 28 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | # Helper file to handle all configs 2 | 3 | [coverage:run] 4 | # .coveragerc to control coverage.py and pytest-cov 5 | omit = 6 | # Omit the tests 7 | */tests/* 8 | # Omit generated versioneer 9 | kinoml/_version.py 10 | 11 | [yapf] 12 | # YAPF, in .style.yapf files this shows up as "[style]" header 13 | COLUMN_LIMIT = 119 14 | INDENT_WIDTH = 4 15 | USE_TABS = False 16 | 17 | [flake8] 18 | # Flake8, PyFlakes, etc 19 | max-line-length = 119 20 | 21 | [versioneer] 22 | # Automatic version numbering scheme 23 | VCS = git 24 | style = pep440 25 | versionfile_source = kinoml/_version.py 26 | versionfile_build = kinoml/_version.py 27 | tag_prefix = '' 28 | 29 | [aliases] 30 | test = pytest 31 | 32 | [tool:pytest] 33 | markers = 34 | slow: marks tests as slow (deselect with '-m "not slow"') -------------------------------------------------------------------------------- /kinoml/tests/modeling/test_alignment.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test alignment functionalities of `kinoml.modeling` 3 | """ 4 | import pytest 5 | 6 | from kinoml.modeling.alignment import sequence_similarity 7 | 8 | 9 | @pytest.mark.parametrize( 10 | "sequence1, sequence2, similarity", 11 | [ 12 | ( 13 | "NVG", 14 | "NVG", 15 | 16, 16 | ), 17 | ( 18 | "NVG", 19 | "NG", 20 | 1, 21 | ), 22 | ( 23 | "NVG", 24 | "VG", 25 | -1, 26 | ), 27 | ], 28 | ) 29 | def test_sequence_similarity(sequence1, sequence2, similarity): 30 | """Compare results to expected similarity.""" 31 | score = sequence_similarity(sequence1, sequence2) 32 | assert score == 
similarity 33 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | 22 | livehtml: 23 | sphinx-autobuild -b html --ignore "sphinx-notfound-page/*" --ignore "autoapi/*" --ignore ".ipynb_checkpoints/*" --ignore ".#*" $(SPHINXOPTS) $(SOURCEDIR) $(BUILDDIR)/html -------------------------------------------------------------------------------- /kinoml/core/components.py: -------------------------------------------------------------------------------- 1 | """ 2 | Base classes for all MolecularComponents. 3 | 4 | One or more components can form a System. 5 | Proteins, ligands, and other molecular entities are 6 | derived the base class ``MolecularComponent``. 7 | """ 8 | 9 | 10 | class MolecularComponent: 11 | """ 12 | Abstract base molecular entity. 13 | """ 14 | 15 | def __init__(self, name="", metadata=None, *args, **kwargs): 16 | self.name = name 17 | if metadata is None: 18 | metadata = {} 19 | self.metadata = metadata 20 | 21 | def __repr__(self) -> str: 22 | return f"<{self.__class__.__name__} name={self.name}>" 23 | 24 | 25 | class BaseLigand(MolecularComponent): 26 | """ 27 | Base class for all ligand-like entities. 
28 | """ 29 | 30 | 31 | class BaseProtein(MolecularComponent): 32 | """ 33 | Base class for all protein-like entities. 34 | """ 35 | -------------------------------------------------------------------------------- /kinoml/data/proteins/README.md: -------------------------------------------------------------------------------- 1 | ## File description 2 | 3 | ### 4f8o.pdb 4 | 5 | This protein was chosen for writing unit tests, since it contains protein and ligand residues as 6 | well as multiple chains and alternate locations. 7 | 8 | ### 4f8o_edit.pdb 9 | 10 | The 4f8o.pdb structure was altered in the following fashion: 11 | - translated along x axis by 20 A --> superposition 12 | - selected alternate location A 13 | - removed non protein atoms 14 | - deleted ASP82 --> loop modeling 15 | - deleted LYS135 --> detection of short protein segments 16 | - deleted sidechain of ASN2 --> sidechain perception and modeling 17 | - altered Chi1 dihedral of PHE4 to -1 radians --> detection of sidechain clashes 18 | 19 | ### kinoml_tests_4f8o_spruce.loop_db 20 | 21 | This loop template database was created using the loopdb_builder app based on 4f8o.pdb. It is used 22 | for testing loop modeling. 
23 | -------------------------------------------------------------------------------- /docs/developers/autodocs.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pathlib import Path 3 | from collections import defaultdict 4 | 5 | import yaml 6 | 7 | """ 8 | Create stubs for all API reference *.md files 9 | and propose a menu tree (you probably need to edit it to your liking) 10 | 11 | Redefine docs and package below and execute from repository root 12 | """ 13 | 14 | docs = "apidocs" 15 | package = "kinoml" 16 | here = Path(package) 17 | tree = defaultdict(list) 18 | 19 | Path(docs).mkdir() 20 | for py in here.rglob("*.py"): 21 | if len(py.parts) > 2: 22 | directory = Path(docs, *py.parts[1:-1]) 23 | directory.mkdir(parents=True, exist_ok=True) 24 | file = Path(docs, *py.parts[1:-1], py.stem + ".md") 25 | module = ".".join([package, *py.parts[1:-1], py.stem]) 26 | file.touch() 27 | file.write_text(f"::: {module}") 28 | tree[".".join([package, *py.parts[1:-1]])].append({module: str(file)}) 29 | 30 | print(yaml.dump(dict(tree))) 31 | 32 | -------------------------------------------------------------------------------- /kinoml/data/molecules/chloroform_acetamide.pdb: -------------------------------------------------------------------------------- 1 | HETATM 1 C1 UNL 1 0.000 0.000 0.000 1.00 20.00 C 2 | HETATM 2 CL2 UNL 1 0.000 0.000 0.000 1.00 20.00 CL 3 | HETATM 3 CL3 UNL 1 0.000 0.000 0.000 1.00 20.00 CL 4 | HETATM 4 CL4 UNL 1 0.000 0.000 0.000 1.00 20.00 CL 5 | TER 5 UNL 1 6 | CONECT 1 2 3 4 7 | CONECT 2 1 8 | CONECT 3 1 9 | CONECT 4 1 10 | END 11 | HETATM 1 C1 UNL 1 0.000 0.000 0.000 1.00 20.00 C 12 | HETATM 2 C2 UNL 1 0.000 0.000 0.000 1.00 20.00 C 13 | HETATM 3 O3 UNL 1 0.000 0.000 0.000 1.00 20.00 O 14 | HETATM 4 N4 UNL 1 0.000 0.000 0.000 1.00 20.00 N 15 | TER 5 UNL 1 16 | CONECT 1 2 17 | CONECT 2 1 3 3 4 18 | CONECT 3 2 2 19 | CONECT 4 2 20 | END 21 | 
-------------------------------------------------------------------------------- /kinoml/data/molecules/chloroform_acetamide.sdf: -------------------------------------------------------------------------------- 1 | 2 | -OEChem-06022113542D 3 | 4 | 4 3 0 0 0 0 0 0 0999 V2000 5 | 0.0000 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 6 | 1.0000 0.0000 0.0000 Cl 0 0 0 0 0 0 0 0 0 0 0 0 7 | -0.5000 -0.8660 0.0000 Cl 0 0 0 0 0 0 0 0 0 0 0 0 8 | -0.5000 0.8660 0.0000 Cl 0 0 0 0 0 0 0 0 0 0 0 0 9 | 1 2 1 0 0 0 0 10 | 1 3 1 0 0 0 0 11 | 1 4 1 0 0 0 0 12 | M END 13 | $$$$ 14 | 15 | -OEChem-06022113542D 16 | 17 | 4 3 0 0 0 0 0 0 0999 V2000 18 | 0.0000 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 19 | 1.0000 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 20 | 1.5000 0.8660 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0 21 | 1.5000 -0.8660 0.0000 N 0 0 0 0 0 0 0 0 0 0 0 0 22 | 1 2 1 0 0 0 0 23 | 2 3 2 0 0 0 0 24 | 2 4 1 0 0 0 0 25 | M END 26 | $$$$ 27 | -------------------------------------------------------------------------------- /kinoml/tests/databases/test_klifs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test klifs functionalities of `kinoml.databases` 3 | """ 4 | from contextlib import contextmanager 5 | import pytest 6 | 7 | from kinoml.databases.klifs import klifs_kinase_from_uniprot_id 8 | 9 | 10 | @contextmanager 11 | def does_not_raise(): 12 | yield 13 | 14 | 15 | @pytest.mark.parametrize( 16 | "uniprot_id, expectation, klifs_kinase_id", 17 | [ 18 | ( 19 | "P00519", 20 | does_not_raise(), 21 | 392, 22 | ), 23 | ( 24 | "XXXXX", 25 | pytest.raises(ValueError), 26 | 392, 27 | ), 28 | ], 29 | ) 30 | def test_klifs_kinase_from_uniprot_id(uniprot_id, expectation, klifs_kinase_id): 31 | """Compare klifs kinase ID for expected value.""" 32 | with expectation: 33 | kinase = klifs_kinase_from_uniprot_id(uniprot_id) 34 | assert kinase["kinase.klifs_id"] == klifs_kinase_id 35 | -------------------------------------------------------------------------------- 
def download_fasta_file(
    uniprot_id: str, directory: Union[Path, str] = user_cache_dir()
) -> Union[Path, bool]:
    """
    Download a fasta file for a given UniProt identifier.

    Parameters
    ----------
    uniprot_id: str
        The UniProt entry of interest.
    directory: Path or str
        The path to a directory for saving the file.

    Returns
    -------
    : Path or bool
        The path to the downloaded file, False if not successful.
    """
    from ..utils import download_file

    target = Path(directory) / f"{uniprot_id}.fasta"
    # Only hit the network when no cached copy exists on disk.
    if not target.is_file():
        url = f"https://www.uniprot.org/uniprot/{uniprot_id}.fasta"
        if not download_file(url, target):
            return False
    return target
biotite 20 | - bravado 21 | - biopandas 22 | - awkward 23 | - pyarrow >=6.0.1 24 | - mdanalysis >=2.0.0 25 | - opencadd 26 | - matplotlib-base 27 | - ruamel.yaml 28 | 29 | # distributed computing 30 | - dask 31 | - dask-jobqueue 32 | 33 | # reproducible workflows 34 | - papermill 2.2.* 35 | - watermark 36 | 37 | # pytorch 38 | - pytorch >=1.8.0 39 | - pyg 40 | - pytorch-lightning 41 | 42 | # development 43 | - jupyterlab 44 | - nglview 45 | 46 | # testing 47 | - pytest 48 | - pytest-cov 49 | - pytest-xdist 50 | - codecov 51 | - nbval 52 | 53 | - pip: 54 | # PyTorch trees 55 | - https://github.com/ValentinFigue/Sklearn_PyTorch/archive/1b56a43e41de331ecdf73d08418f75bb34c9fa06.tar.gz 56 | -------------------------------------------------------------------------------- /kinoml/tests/core/test_ligands.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test kinoml.core.ligands 3 | """ 4 | from importlib import resources 5 | 6 | 7 | def test_ligand(): 8 | from openeye import oechem 9 | from openff.toolkit.topology import Molecule 10 | import rdkit 11 | 12 | from kinoml.core.ligands import Ligand 13 | from kinoml.core.components import BaseLigand 14 | 15 | smiles = "CCCCC" 16 | ligand = Ligand.from_smiles(smiles=smiles) 17 | assert isinstance(ligand.molecule, Molecule) 18 | with resources.path("kinoml.data.molecules", "chloroform.sdf") as path: 19 | ligand = Ligand.from_file(str(path)) 20 | assert isinstance(ligand.molecule, Molecule) 21 | ligand = Ligand(smiles=smiles) 22 | assert isinstance(ligand, BaseLigand) 23 | assert isinstance(ligand.molecule, Molecule) 24 | assert isinstance(ligand.molecule.to_rdkit(), rdkit.Chem.Mol) 25 | assert isinstance(ligand.molecule.to_openeye(), oechem.OEMol) 26 | assert isinstance(ligand._smiles, str) 27 | assert isinstance(ligand.metadata["smiles"], str) 28 | assert isinstance(ligand.molecule.to_smiles(), str) 29 | 
def klifs_kinase_from_uniprot_id(uniprot_id: str) -> pd.Series:
    """
    Retrieve KLIFS kinase details about the kinase matching the given Uniprot ID.

    Parameters
    ----------
    uniprot_id: str
        Uniprot identifier.

    Returns
    -------
    kinase: pd.Series
        KLIFS kinase details.

    Raises
    ------
    ValueError:
        No KLIFS kinase found for UniProt ID.
    ValueError:
        Multiple KLIFS kinases found for UniProt ID.
    """
    from opencadd.databases.klifs import setup_remote

    remote = setup_remote()
    # Fetch the full kinase table and filter it locally by UniProt accession.
    kinase_ids = remote.kinases.all_kinases()["kinase.klifs_id"]
    kinases = remote.kinases.by_kinase_klifs_id(list(kinase_ids))
    kinases = kinases[kinases["kinase.uniprot"] == uniprot_id]
    if len(kinases) == 0:
        raise ValueError("No KLIFS kinase found for UniProt ID.")
    elif len(kinases) > 1:
        raise ValueError("Multiple KLIFS kinases found for UniProt ID.")
    # A single matching row remains; return it as a Series of kinase details
    # (the previous annotation claimed pd.DataFrame, contradicting both the
    # docstring and the actual .iloc[0] return value).
    kinase = kinases.iloc[0]

    return kinase
def sequence_similarity(
    sequence1: str,
    sequence2: str,
    open_gap_penalty: int = -11,
    extend_gap_penalty: int = -1,
    substitution_matrix: str = "BLOSUM62",
) -> float:
    """
    Calculate the sequence similarity of two amino acid sequences.

    Parameters
    ----------
    sequence1: str
        The first sequence.
    sequence2: str
        The second sequence.
    open_gap_penalty: int
        The penalty to open a gap.
    extend_gap_penalty: int
        The penalty to extend a gap.
    substitution_matrix: str
        The substitution matrix to use during alignment.
        Available matrices can be found via:
        >>> from Bio.Align import substitution_matrices
        >>> substitution_matrices.load()

    Returns
    -------
    score: float
        Similarity of sequences.
    """
    from Bio import pairwise2
    from Bio.Align import substitution_matrices

    # Load into a distinct local instead of reassigning the parameter, so the
    # string argument and the loaded matrix object are never conflated.
    matrix = substitution_matrices.load(substitution_matrix)
    # Replace any characters unknown to the substitution matrix by * so
    # non-standard residues do not make the alignment fail.
    sequence1_clean = "".join([x if x in matrix.alphabet else "*" for x in sequence1])
    sequence2_clean = "".join([x if x in matrix.alphabet else "*" for x in sequence2])
    # Global alignment with distinct open/extend gap penalties; score_only
    # skips traceback and returns just the float score.
    score = pairwise2.align.globalds(
        sequence1_clean,
        sequence2_clean,
        matrix,
        open_gap_penalty,
        extend_gap_penalty,
        score_only=True,
    )
    return score
37 | - name: "Additional info about the build" 38 | shell: bash 39 | run: | 40 | uname -a 41 | df -h 42 | ulimit -a 43 | 44 | - name: "Environment Information" 45 | shell: bash -l {0} 46 | run: | 47 | conda info --all 48 | conda list 49 | 50 | - name: "Build docs" 51 | shell: bash -l {0} 52 | run: | 53 | cd docs 54 | make clean 55 | SPHINXOPTS="-T --keep-going" make html 56 | 57 | - name: "Deploy" 58 | uses: peaceiris/actions-gh-pages@v3 59 | with: 60 | github_token: ${{ secrets.GITHUB_TOKEN }} 61 | publish_dir: ./docs/_build/html 62 | if: github.ref == 'refs/heads/master' 63 | -------------------------------------------------------------------------------- /docs/developers/_docstrings_example.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example module to show how docstrings are written for 3 | mkdocs + mkdocstrings 4 | """ 5 | 6 | import typing 7 | 8 | 9 | def example_function(arg1, kwarg=None) -> object: 10 | """ 11 | Example function to demonstrate how APIs are rendered 12 | 13 | Parameters: 14 | arg1 (dict): Some description for this argument. 15 | This type (in parenthesis) is ignored. 16 | kwarg: Some more descriptions 17 | 18 | Returns: 19 | A description for the returned value 20 | 21 | __Examples__ 22 | 23 | This can be automatically tested with `pytest --doctest-modules`! 24 | Syntax might change subtly in the future. 25 | Check https://github.com/pawamoy/mkdocstrings/issues/52 26 | 27 | ```python 28 | >>> 2 + 2 == 4 29 | True # this passes pytest 30 | >>> 2 + 2 == 5 31 | True # this fails pytest 32 | 33 | ``` 34 | """ 35 | pass 36 | 37 | 38 | def example_function_with_type_hints(arg1: dict, kwarg: typing.Any = None) -> object: 39 | """ 40 | Example function to demonstrate how APIs are rendered 41 | 42 | Parameters: 43 | arg1: Some description for this argument. 
44 | kwarg: Some more descriptions 45 | 46 | Returns: 47 | A description for the returned value 48 | 49 | __Examples__ 50 | 51 | This can be automatically tested with `pytest --doctest-modules`! 52 | Syntax might change subtly in the future. 53 | Check https://github.com/pawamoy/mkdocstrings/issues/52 54 | 55 | ```python 56 | >>> 2 + 2 == 4 57 | True # this passes pytest 58 | >>> 2 + 2 == 5 59 | True # this fails pytest 60 | 61 | ``` 62 | """ 63 | pass 64 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | .pytest_cache 45 | nosetests.xml 46 | coverage.xml 47 | *.cover 48 | .hypothesis/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | 58 | # Flask stuff: 59 | instance/ 60 | .webassets-cache 61 | 62 | # Scrapy stuff: 63 | .scrapy 64 | 65 | # Sphinx documentation 66 | docs/_build/ 67 | 68 | # PyBuilder 69 | target/ 70 | 71 | # Jupyter Notebook 72 | .ipynb_checkpoints 73 | 74 | # pyenv 75 | .python-version 76 | 77 | # celery beat schedule file 78 | celerybeat-schedule 79 | 80 | # SageMath parsed files 81 | *.sage.py 82 | 83 | # dotenv 84 | .env 85 | 86 | # virtualenv 87 | .venv 88 | venv/ 89 | ENV/ 90 | 91 | # Spyder project settings 92 | .spyderproject 93 | .spyproject 94 | 95 | # Rope project settings 96 | .ropeproject 97 | 98 | # mkdocs documentation 99 | /site 100 | 101 | # mypy 102 | .mypy_cache/ 103 | 104 | # VS Code 105 | .vscode/ 106 | 107 | # MacOS 108 | .DS_Store 109 | 110 | # PyCharm 111 | .idea/ 112 | -------------------------------------------------------------------------------- /kinoml/tests/core/test_systems.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test kinoml.core.systems 3 | """ 4 | 5 | import pytest 6 | 7 | 8 | def test_system(): 9 | from kinoml.core.components import MolecularComponent 10 | from kinoml.core.systems import System 11 | 12 | components = [MolecularComponent()] 13 | system = System(components=components) 14 | # This doesn't raise an error 15 | System(components=[], strict=False) 16 | # This does 17 | with pytest.raises(AssertionError): 18 | System(components=[]) 19 | 20 | 21 | def test_ligand_system(): 22 | from kinoml.core.systems import LigandSystem 23 | from kinoml.core.ligands import BaseLigand 24 | 
def test_protein_ligand_complex():
    from kinoml.core.systems import ProteinLigandComplex
    from kinoml.core.proteins import BaseProtein
    from kinoml.core.ligands import BaseLigand

    # A complex built from one protein and one ligand exposes both
    # through the singular accessors.
    complex_ = ProteinLigandComplex(components=[BaseProtein(), BaseLigand()])
    assert complex_.ligand == next(iter(complex_.ligands))
    assert complex_.protein == next(iter(complex_.proteins))

    # Empty component lists are tolerated in non-strict mode only.
    ProteinLigandComplex(components=[], strict=False)
    with pytest.raises(AssertionError):
        ProteinLigandComplex(components=[])
environment-file: devtools/conda-envs/test_env.yaml 31 | auto-activate-base: false 32 | use-mamba: true 33 | 34 | - name: Additional info about the build 35 | shell: bash 36 | run: | 37 | uname -a 38 | df -h 39 | ulimit -a 40 | 41 | - name: Environment Information 42 | shell: bash -l {0} 43 | run: | 44 | conda info --all 45 | conda list 46 | 47 | - name: Install linter / formatter 48 | shell: bash -l {0} 49 | run: | 50 | mamba install 'pylint<2.13.0' black 51 | 52 | - name: Install package 53 | shell: bash -l {0} 54 | run: | 55 | python -m pip install --no-deps . 56 | 57 | - name: Run pylint 58 | shell: bash -l {0} 59 | run: | 60 | pylint --disable=W kinoml/ 61 | 62 | - name: Run black check 63 | shell: bash -l {0} 64 | if: always() 65 | run: | 66 | black --check -l 99 kinoml/ --exclude kinoml/_version.py 67 | -------------------------------------------------------------------------------- /kinoml/tests/datasets/test_pkis2.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test kinoml.datasets.kinomescan 3 | """ 4 | 5 | 6 | def test_pkis2_protein_openeye(): 7 | from kinoml.core.proteins import Protein 8 | from kinoml.datasets.pkis2 import PKIS2DatasetProvider 9 | 10 | provider = PKIS2DatasetProvider.from_source( 11 | protein_type="Protein", 12 | toolkit="OpenEye", 13 | ) 14 | assert len(provider.measurements) == 261_870 15 | assert (provider.measurements[0].values == 14.0).all() 16 | # check order in provider matches order in file 17 | assert ( # matches line 43 in file 18 | provider[17051].system.ligand.name 19 | == "O=C1NC(C2=C(C3=CC=CC=C3)C=C4C(C(C=C(O)C=C5)=C5N4)=C21)=O" 20 | ) 21 | assert ( # matches line 44 in file 22 | provider[17052].system.ligand.name 23 | == "CN(N=C1)C=C1C(C=C2)=NN3C2=NN=C3[C@@H](C)C4=CC=C(N=CC=C5)C5=C4" 24 | ) 25 | assert isinstance(provider.systems[0].protein, Protein) 26 | assert provider.systems[0].protein.toolkit == "OpenEye" 27 | 28 | 29 | def test_pkis2_klifskinase_mdanalysis(): 30 | 
def test_biosequence_mutation():
    """Exercise substitute, delete (with and without insert) and insert."""
    from kinoml.core.sequences import Biosequence

    seq = Biosequence("ATCGTHCTCH")

    # Point substitution: C at position 3 becomes P.
    seq.substitute("C3P")
    assert seq.sequence == "ATPGTHCTCH"

    # Plain deletion of positions 2-5.
    seq.delete(2, 5)
    assert seq.sequence == "AHCTCH"

    # Deletion of positions 2-5 with a replacement insertion.
    seq.delete(2, 5, insert="AA")
    assert seq.sequence == "AAAH"

    # Single-residue insertion at position 5.
    seq.insert(5, "T")
    assert seq.sequence == "AAAHT"
== "T" 34 | 35 | sequence = AminoAcidSequence(uniprot_id="P00519", metadata={"construct_range": "229-512"}) 36 | assert len(sequence.sequence) == 284 37 | 38 | sequence = AminoAcidSequence(uniprot_id="P00519", metadata={"mutations": "T315A"}) 39 | assert sequence.sequence[314] == "A" 40 | 41 | sequence = AminoAcidSequence( 42 | uniprot_id="P00519", 43 | metadata={ 44 | "mutations": "T315A del320-322P ins321AAA", 45 | "construct_range": "229-512", 46 | }, 47 | ) 48 | assert sequence.sequence[86] == "A" 49 | assert sequence.sequence[91] == "P" 50 | assert sequence.sequence[92:95] == "AAA" 51 | assert len(sequence.sequence) == 284 52 | 53 | 54 | def test_aminoacidsequence_ncbi_to_uniprot(): 55 | from kinoml.core.proteins import AminoAcidSequence 56 | 57 | uniprot_id = AminoAcidSequence.ncbi_to_uniprot("NP_005148") 58 | assert uniprot_id == "P00519" 59 | uniprot_id = AminoAcidSequence.ncbi_to_uniprot("BBB") 60 | assert uniprot_id == "" 61 | -------------------------------------------------------------------------------- /kinoml/datasets/torch_geometric_datasets.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helper classes to convert between DatasetProvider objects and 3 | Dataset-like objects native to the PyTorch ecosystem 4 | """ 5 | 6 | import awkward as ak 7 | import torch 8 | from torch.utils.data import Dataset as _NativeTorchDataset 9 | from torch_geometric.data import Data 10 | 11 | 12 | # Disable false positive lint with torch.tensor 13 | # see https://github.com/pytorch/pytorch/issues/24807 14 | # pylint: disable=not-callable 15 | 16 | 17 | class AwkwardArrayGeometricDataset(_NativeTorchDataset): 18 | """ 19 | Loads an Awkward array of Records suitable for PyTorch Geometric. 20 | It assumes the following: 21 | 22 | - The Awkward array contains three fields: 0, 1 and 2 23 | - 0: Conn. 
matrix --> Data's ``edge_index`` 24 | - 1: Node features --> Data's ``x`` 25 | - 2: y labels 26 | 27 | If more attributes are needed, you need to modify ``__getitem__`` logic 28 | """ 29 | 30 | def __init__(self, data): 31 | assert len(data.fields) == 3, ( 32 | f"Graph datasets should only contain three groups: " 33 | "0, 1 and 2 (conn. matrix, node features, y; respectively)" 34 | ) 35 | self.data = data 36 | 37 | def __len__(self): 38 | return len(self.data) 39 | 40 | def __getitem__(self, index): 41 | if isinstance(index, int): 42 | index = [index] 43 | fields = self.data.fields 44 | edge_index = self.data[index, fields[0]] 45 | node_features = self.data[index, fields[1]] 46 | y = torch.tensor(self.data[index, fields[2]]) 47 | X = [ 48 | Data(x=torch.tensor(nf), edge_index=torch.tensor(ei).long()) 49 | for (nf, ei) in zip(node_features, edge_index) 50 | ] 51 | return X, y 52 | 53 | def __iter__(self): 54 | raise NotImplementedError 55 | 56 | def __repr__(self): 57 | return self.data.__repr__() 58 | 59 | def __str__(self): 60 | return self.data.__str__() 61 | 62 | @classmethod 63 | def from_parquet(cls, path, **kwargs): 64 | return cls(ak.from_parquet(path, **kwargs)) 65 | -------------------------------------------------------------------------------- /kinoml/ml/torch_geometric_models.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implementation of some Deep Neural Networks in Pytorch using Pytorch Geometric. 3 | """ 4 | 5 | from torch.nn import Linear 6 | import torch.nn.functional as F 7 | from torch_geometric.nn import GCNConv, global_mean_pool 8 | from .torch_models import _BaseModule 9 | 10 | 11 | class GraphConvolutionNeuralNetwork(_BaseModule): 12 | """ 13 | Builds a Graph Convolutional Network and a feed-forward pass 14 | 15 | Parameters 16 | ---------- 17 | input_shape : int 18 | Number of features per node in the graph. 19 | embedding_shape : int, default=100 20 | Dimension of latent vector. 
    def forward(self, data):
        """
        Forward pass through the network.

        Parameters
        ----------
        data : sequence
            Batch whose first element is a PyTorch Geometric data object
            providing ``x`` (node features), ``edge_index`` (connectivity)
            and ``batch`` (graph membership per node).

        Returns
        -------
        torch.Tensor
            Output of the final linear layer, one row per graph with
            ``output_shape`` columns.
        """
        # NOTE(review): only the first element of ``data`` is consumed —
        # confirm callers always pass a one-element container here.
        data = data[0]  # get the first one only?
        x, edge_index, batch = data.x.float(), data.edge_index.long(), data.batch
        # Two graph convolutions, each followed by the configured activation.
        x = self._activation(self.GraphConvLayer1(x, edge_index))
        x = self._activation(self.GraphConvLayer2(x, edge_index))
        # Mean-pool node embeddings into a single vector per graph.
        x = global_mean_pool(x, batch)
        return self.linear(x)
Use 'exclude' to prevent some specific 36 | # subpackage(s) from being added, if needed 37 | packages=find_packages(), 38 | 39 | # Optional include package data to ship with your package 40 | # Customize MANIFEST.in if the general case does not suit your needs 41 | # Comment out this line to prevent the files from being packaged with your software 42 | include_package_data=True, 43 | 44 | # Allows `setup.py test` to work correctly with pytest 45 | setup_requires=[] + pytest_runner, 46 | 47 | # Additional entries you may want simply uncomment the lines you want and fill in the data 48 | # url='http://www.my_package.com', # Website 49 | # install_requires=[], # Required packages, pulls from pip if needed; do not use for Conda deployment 50 | # platforms=['Linux', 51 | # 'Mac OS-X', 52 | # 'Unix', 53 | # 'Windows'], # Valid platforms your code works on, adjust to your flavor 54 | # python_requires=">=3.5", # Python version restrictions 55 | 56 | # Manual control if final package is compressible or not, set False to prevent the .egg from being made 57 | # zip_safe=False, 58 | 59 | ) 60 | -------------------------------------------------------------------------------- /.github/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to contribute 2 | 3 | We welcome contributions from external contributors, and this document 4 | describes how to merge code changes into this kinoml. 5 | 6 | ## Getting Started 7 | 8 | * Make sure you have a [GitHub account](https://github.com/signup/free). 9 | * [Fork](https://help.github.com/articles/fork-a-repo/) this repository on GitHub. 10 | * On your local machine, 11 | [clone](https://help.github.com/articles/cloning-a-repository/) your fork of 12 | the repository. 13 | 14 | ## Making Changes 15 | 16 | * Add some really awesome code to your local fork. 
It's usually a [good 17 | idea](http://blog.jasonmeridth.com/posts/do-not-issue-pull-requests-from-your-master-branch/) 18 | to make changes on a 19 | [branch](https://help.github.com/articles/creating-and-deleting-branches-within-your-repository/) 20 | with the branch name relating to the feature you are going to add. 21 | * When you are ready for others to examine and comment on your new feature, 22 | navigate to your fork of kinoml on GitHub and open a [pull 23 | request](https://help.github.com/articles/using-pull-requests/) (PR). Note that 24 | after you launch a PR from one of your fork's branches, all 25 | subsequent commits to that branch will be added to the open pull request 26 | automatically. Each commit added to the PR will be validated for 27 | mergability, compilation and test suite compliance; the results of these tests 28 | will be visible on the PR page. 29 | * If you're providing a new feature, you must add test cases and documentation. 30 | * When the code is ready to go, make sure you run the test suite using pytest. 31 | * When you're ready to be considered for merging, check the "Ready to go" 32 | box on the PR page to let the kinoml devs know that the changes are complete. 33 | The code will not be merged until this box is checked, the continuous 34 | integration returns checkmarks, 35 | and multiple core developers give "Approved" reviews. 
36 | 37 | # Additional Resources 38 | 39 | * [General GitHub documentation](https://help.github.com/) 40 | * [PR best practices](http://codeinthehole.com/writing/pull-requests-and-other-good-practices-for-teams-using-github/) 41 | * [A guide to contributing to software packages](http://www.contribution-guide.org) 42 | * [Thinkful PR example](http://www.thinkful.com/learn/github-pull-request-tutorial/#Time-to-Submit-Your-First-PR) 43 | -------------------------------------------------------------------------------- /tutorials/README.md: -------------------------------------------------------------------------------- 1 | How to use the tutorials folder 2 | ============================== 3 | This tutorial folder contains two subfolders: 4 | 5 | 6 | 7 | * **getting_started**: this folder contains four jupyter notebook tutorials that give the user a general overview of KinoML potential usage and capabilities. 8 | 9 | * **getting_started_with_kinoml**: this notebook aims to give a brief overview of KinoML capabilities. This notebook is divided into three parts that show how to use KinoML to: (1) filter and obtain the desired data from an external data source, (2) featurize this data to make it ML readable and (3) train and evaluate a ML model on the featurized data obtain from the previous steps. 10 | 11 | * **kinoml_object_model**: this notebook aims to guide the user through the KinoML object model, showing how to access each object. 12 | 13 | * **OpenEye_structural_featurizer_showcase**: this notebook displays all the OpenEye-based structural modeling featurizers implemented in KinoML and how to use each of them. 14 | 15 | * **Schrodinger_structural_featurizer_showcase**: this notebook introduces the structural modeling featurizers implemented in KinoML that use the molecular modeling capabilities from the Schrodinger Suite to prepare protein structures and to dock small molecules into their binding sites. 
16 | 17 | 18 | 19 | * **experiments**: this folder contains four separate structure-based experiments to predict ligand binding affinity to the EGFR kinase. The aim of these notebooks is to showcase how to use KinoML to conduct experiments end-to-end, from obtaining the data from the database to training and evaluating a ML model to predict ligand binding affinity. Note that if the user wants to run these notebooks with their own data, they can do so by adjusting the necessary parameters within the notebooks. All experiments are divided into two parts: 20 | 21 | 1. **Featurize the data set**: obtaining the data set and featurizing it with the featurization pipeline of choice. 22 | 23 | 2. **Run the experiment**: the ML model of choice, implemented in the `kinoml.ml` class is trained and evaluated. 24 | 25 | 26 | Please note that the order in which the different notebooks are displayed here is the recommended order for running them, providing a more comprehensive understanding of KinoML. 27 | 28 | ⚠️ You will need a valid OpenEye License for the tutorials to work. For the Schrodinger featurizers tutorial (`Schrodinger_structural_featurizer_showcase.ipynb`) you will also need a Schrodinger License! 29 | 30 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | ```{admonition} Warning! 2 | :class: warning 3 | 4 | This module is undergoing heavy development. None of the API calls are final. This software is provided without any guarantees of correctness, you will likely encounter bugs. 5 | 6 | If you are interested in this code, please wait for the official release to use it. In the meantime, to stay informed of development progress you are encouraged to: 7 | 8 | - Subscribe for new releases (use `Watch> Releases only` on GitHub) 9 | - Check out the [Github repository](https://github.com/openkinome/kinoml).
10 | 11 | ``` 12 | 13 | # KinoML 14 | 15 | Welcome to the Documentation of KinoML! The documentation is divided into two parts: 16 | 17 | * **User guide**: in this section you will learn how to use KinoML to filter and download data from a database, featurize your kinase data so that it is ML friendly and train and evaluate a ML model on your featurized kinase data. You will also learn about the KinoML object model, and how to access each of these objects. We also provide detailed examples of how to use every featurizer implemented within KinoML. 18 | 19 | * **Experiment tutorials**: this section shows how to use KinoML to run ML structure-based experiments. All experiments are structure-based and they are all end to end, from data collection to model training and evaluation. 20 | 21 | 22 | 23 | KinoML falls under the [OpenKinome](https://openkinome.org) initiative, which aims to leverage the increasingly available bioactivity data and scalable computational resources to perform kinase-centric drug design in the context of structure-informed machine learning and free energy calculations. `KinoML` is the main library supporting these efforts. 24 | 25 | Do you want to know more about the OpenKinome ecosystem? Check its [website](https://openkinome.org).
26 | 27 | 28 | 29 | ```{toctree} 30 | :caption: User guide 31 | :maxdepth: 3 32 | :hidden: 33 | 34 | notebooks/getting_started.nblink 35 | notebooks/kinoml_object_model.nblink 36 | notebooks/OpenEye_structural_featurizer.nblink 37 | notebooks/Schrodinger_structural_featurizer.nblink 38 | ``` 39 | 40 | ```{toctree} 41 | :caption: Experiment tutorials 42 | :maxdepth: 2 43 | :hidden: 44 | 45 | notebooks/ligand-only-smiles-EGFR.nblink 46 | notebooks/ligand-only-morgan1024-EGFR.nblink 47 | notebooks/kinase-ligand-informed-smiles-sequence-EGFR.nblink 48 | notebooks/kinase-ligand-informed-morgan-composition-EGFR.nblink 49 | ``` 50 | 51 | ```{toctree} 52 | :caption: Developers 53 | :maxdepth: 1 54 | :hidden: 55 | 56 | API Reference 57 | ``` 58 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this software, please cite it as below." 3 | title: "KinoML" 4 | version: "1.0.0" 5 | date-released: "2025-10-01" 6 | repository-code: "https://github.com/openkinome/kinoml" 7 | url: "https://github.com/openkinome/kinoml" 8 | 9 | authors: 10 | - family-names: "López-Ríos de Castro" 11 | given-names: "Raquel" 12 | - family-names: "Rodríguez-Guerra" 13 | given-names: "Jaime" 14 | - family-names: "Schaller" 15 | given-names: "David" 16 | - family-names: "Kimber" 17 | given-names: "Talia B." 18 | - family-names: "Taylor" 19 | given-names: "Corey" 20 | - family-names: "White" 21 | given-names: "Jessica B." 
22 | - family-names: "Backenköhler" 23 | given-names: "Michael" 24 | - family-names: "Groß" 25 | given-names: "Joschka" 26 | - family-names: "Payne" 27 | given-names: "Alexander" 28 | - family-names: "Kaminow" 29 | given-names: "Ben" 30 | - family-names: "Pulido" 31 | given-names: "Iván" 32 | - family-names: "Singh" 33 | given-names: "Sukrit" 34 | - family-names: "Kramer" 35 | given-names: "Paula Linh" 36 | - family-names: "Pérez-Hernández" 37 | given-names: "Guillermo" 38 | - family-names: "Volkamer" 39 | given-names: "Andrea" 40 | - family-names: "Chodera" 41 | given-names: "John D." 42 | 43 | preferred-citation: 44 | type: article 45 | title: "Lessons learned during the journey of data: from experiment to model for predicting kinase affinity, selectivity, polypharmacology, and resistance" 46 | authors: 47 | - family-names: "López-Ríos de Castro" 48 | given-names: "Raquel" 49 | - family-names: "Rodríguez-Guerra" 50 | given-names: "Jaime" 51 | - family-names: "Schaller" 52 | given-names: "David" 53 | - family-names: "Kimber" 54 | given-names: "Talia B." 55 | - family-names: "Taylor" 56 | given-names: "Corey" 57 | - family-names: "White" 58 | given-names: "Jessica B." 59 | - family-names: "Backenköhler" 60 | given-names: "Michael" 61 | - family-names: "Groß" 62 | given-names: "Joschka" 63 | - family-names: "Payne" 64 | given-names: "Alexander" 65 | - family-names: "Kaminow" 66 | given-names: "Ben" 67 | - family-names: "Pulido" 68 | given-names: "Iván" 69 | - family-names: "Singh" 70 | given-names: "Sukrit" 71 | - family-names: "Kramer" 72 | given-names: "Paula Linh" 73 | - family-names: "Pérez-Hernández" 74 | given-names: "Guillermo" 75 | - family-names: "Volkamer" 76 | given-names: "Andrea" 77 | - family-names: "Chodera" 78 | given-names: "John D." 
79 | journal: "bioRxiv" 80 | year: 2024 81 | doi: "10.1101/2024.09.10.612176" 82 | url: "https://doi.org/10.1101/2024.09.10.612176" 83 | -------------------------------------------------------------------------------- /kinoml/tests/core/test_proteins.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test kinoml.core.proteins 3 | """ 4 | from importlib import resources 5 | 6 | from MDAnalysis.core.universe import Universe 7 | from openeye import oechem 8 | import pandas as pd 9 | 10 | 11 | def test_protein_from_file(): 12 | """Check from file reading with MDAnalysis and OpenEye.""" 13 | from kinoml.core.proteins import Protein 14 | 15 | with resources.path("kinoml.data.proteins", "4f8o.pdb") as path: 16 | protein = Protein.from_file(path) 17 | assert isinstance(protein.molecule, oechem.OEGraphMol) 18 | protein = Protein.from_file(path, toolkit="MDAnalysis") 19 | assert isinstance(protein.molecule, Universe) 20 | 21 | 22 | def test_protein_from_pdb(): 23 | """Check instantation from PDB ID.""" 24 | from kinoml.core.proteins import Protein 25 | 26 | protein = Protein.from_pdb("4yne") 27 | assert isinstance(protein.molecule, oechem.OEGraphMol) 28 | protein = Protein.from_pdb("4yne", toolkit="MDAnalysis") 29 | assert isinstance(protein.molecule, Universe) 30 | 31 | 32 | def test_lazy_protein(): 33 | """Check lazy instantiation via PDB ID.""" 34 | from kinoml.core.proteins import Protein 35 | 36 | protein = Protein(pdb_id="4yne") 37 | assert isinstance(protein._molecule, type(None)) 38 | assert isinstance(protein.molecule, oechem.OEGraphMol) 39 | assert isinstance(protein._molecule, oechem.OEGraphMol) 40 | protein = Protein(pdb_id="4yne", toolkit="MDAnalysis") 41 | assert isinstance(protein.molecule, Universe) 42 | 43 | 44 | def test_klifskinase_kinase_klifs_sequence(): 45 | """Check access to kinase_klifs_sequence.""" 46 | from kinoml.core.proteins import KLIFSKinase 47 | 48 | kinase = KLIFSKinase(uniprot_id="P04629") 49 | 
assert len(kinase.kinase_klifs_sequence) == 85 50 | assert isinstance(kinase.sequence, str) 51 | kinase = KLIFSKinase(kinase_klifs_id=480) 52 | assert len(kinase.kinase_klifs_sequence) == 85 53 | assert isinstance(kinase.sequence, str) 54 | kinase = KLIFSKinase(structure_klifs_id=3620) 55 | assert len(kinase.kinase_klifs_sequence) == 85 56 | assert isinstance(kinase.sequence, str) 57 | 58 | 59 | def test_klifskinase_structure_klifs_sequence(): 60 | """Check access to structure_klifs_sequence.""" 61 | from kinoml.core.proteins import KLIFSKinase 62 | 63 | kinase = KLIFSKinase(structure_klifs_id=3620) 64 | assert len(kinase.structure_klifs_sequence) == 85 65 | 66 | 67 | def test_klifskinase_structure_klifs_residues(): 68 | """Check access to structure_klifs_residues.""" 69 | from kinoml.core.proteins import KLIFSKinase 70 | 71 | kinase = KLIFSKinase(structure_klifs_id=3620) 72 | assert isinstance(kinase.structure_klifs_residues, pd.DataFrame) is True 73 | assert len(kinase.structure_klifs_residues) == 85 74 | -------------------------------------------------------------------------------- /kinoml/core/conditions.py: -------------------------------------------------------------------------------- 1 | """ 2 | Each Measurement object can store a ``conditions`` 3 | attribute which should contain one of the classes 4 | here defined. 5 | 6 | For example, experimental measurements can have an 7 | ``AssayConditions`` object specifying the variables 8 | involved in the experiment, like pH. 9 | """ 10 | 11 | from typing import Union 12 | import json 13 | 14 | 15 | class BaseConditions: 16 | 17 | """ 18 | Contains information about the experimental conditions. 19 | We ensure objects are immutable by using read-only properties 20 | for all attributes. Do NOT modify private attributes or 21 | hashing will break. 22 | 23 | Parameters 24 | ---------- 25 | strict : bool, optional=True 26 | Whether to perform safety checks at initialization. 
27 | """ 28 | 29 | def __init__(self, strict: bool = True): 30 | if strict: 31 | self.check() 32 | 33 | def check(self): 34 | """ 35 | Perform some checks for valid values 36 | """ 37 | 38 | def _properties(self, classname: bool = True) -> dict: 39 | """ 40 | Return a dictionary with the classname and all defined properties. 41 | Used for equality comparisons in subclasses. 42 | 43 | Parameters 44 | ---------- 45 | classname : bool, optional=True 46 | Whether to include the name of the instance class 47 | 48 | Returns 49 | ------- 50 | dict 51 | """ 52 | props = {"classname": self.__class__.__name__} if classname else {} 53 | for name in dir(self): 54 | if name.startswith("_"): 55 | continue 56 | clsattr = getattr(self.__class__, name) 57 | if isinstance(clsattr, property): 58 | props[name] = getattr(self, name) 59 | return props 60 | 61 | def __hash__(self): 62 | return hash(json.dumps(self._properties())) 63 | 64 | def __eq__(self, other): 65 | return self._properties() == other._properties() 66 | 67 | def __repr__(self) -> str: 68 | return ( 69 | f"<{self.__class__.__name__} " 70 | f"{' '.join([f'{k}={v}' for k, v in self._properties(classname=False).items()])}>" 71 | ) 72 | 73 | 74 | class AssayConditions(BaseConditions): 75 | """ 76 | Contains information about the experimental conditions 77 | of a given assay. 
78 | 79 | Parameters 80 | ---------- 81 | pH : int or float, optional=7.0 82 | Acidity conditions 83 | """ 84 | 85 | def __init__(self, pH: Union[int, float] = 7.0, *args, **kwargs): 86 | self._pH = pH 87 | 88 | # Finish initialization 89 | super().__init__(*args, **kwargs) 90 | 91 | @property 92 | def pH(self): 93 | return self._pH 94 | 95 | def check(self): 96 | super().check() 97 | assert 0 <= self.pH <= 14, f"pH must be within [0, 14], but {self.pH} was specified" 98 | -------------------------------------------------------------------------------- /kinoml/tests/databases/test_pdb.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test pdb functionalities of `kinoml.databases` 3 | """ 4 | from contextlib import contextmanager 5 | from pathlib import PosixPath 6 | import pytest 7 | 8 | 9 | @contextmanager 10 | def does_not_raise(): 11 | yield 12 | 13 | 14 | @pytest.mark.parametrize( 15 | "pdb_ids, expectation, smiles_list", 16 | [ 17 | ( 18 | ["EDO"], 19 | does_not_raise(), 20 | ["C(CO)O"], 21 | ), 22 | ( 23 | ["---"], 24 | pytest.raises(KeyError), 25 | ["---"], 26 | ), 27 | ( 28 | ["EDO", "GOL"], 29 | does_not_raise(), 30 | ["C(CO)O", "C(C(CO)O)O"], 31 | ), 32 | ], 33 | ) 34 | def test_smiles_from_pdb(pdb_ids, expectation, smiles_list): 35 | """Compare results for expected SMILES.""" 36 | from kinoml.databases.pdb import smiles_from_pdb 37 | 38 | with expectation: 39 | ligands = smiles_from_pdb(pdb_ids) 40 | for pdb_id, smiles in zip(pdb_ids, smiles_list): 41 | assert ligands[pdb_id] == smiles 42 | 43 | 44 | @pytest.mark.parametrize( 45 | "pdb_id, return_type", 46 | [ 47 | ( 48 | "4YNE", 49 | PosixPath, 50 | ), # PDB and CIF format available 51 | ( 52 | "1BOS", 53 | PosixPath, 54 | ), # only CIF format available 55 | ( 56 | "XXXX", 57 | bool, 58 | ), # wrong code 59 | ], 60 | ) 61 | def test_download_pdb_structure(pdb_id, return_type): 62 | """Try to download PDB structures.""" 63 | from tempfile import 
TemporaryDirectory 64 | 65 | from kinoml.databases.pdb import download_pdb_structure 66 | 67 | with TemporaryDirectory() as temporary_directory: 68 | assert isinstance(download_pdb_structure(pdb_id, temporary_directory), return_type) 69 | 70 | 71 | @pytest.mark.parametrize( 72 | "pdb_id, chain_id, expo_id, smiles, return_type", 73 | [ 74 | ( 75 | "4YNE", # PDB and CIF format available 76 | "A", 77 | "4EK", 78 | "c1ccnc(c1)c2cnc3n2nc(cc3)N4CCC[C@@H]4c5cccc(c5)F", 79 | PosixPath, 80 | ), 81 | ( 82 | "1BOS", # only CIF format available 83 | "E", 84 | "GAL", 85 | "C([C@@H]1[C@@H]([C@@H]([C@H]([C@@H](O1)O)O)O)O)O", 86 | PosixPath, 87 | ), 88 | ( 89 | "XXXX", 90 | "X", 91 | "XXX", 92 | "xxxxx", 93 | bool, 94 | ), # wrong code 95 | ], 96 | ) 97 | def test_download_pdb_ligand(pdb_id, chain_id, expo_id, smiles, return_type): 98 | """Try to download PDB ligands.""" 99 | from tempfile import TemporaryDirectory 100 | from kinoml.databases.pdb import download_pdb_ligand 101 | 102 | with TemporaryDirectory() as temporary_directory: 103 | assert isinstance(download_pdb_ligand(pdb_id, chain_id, expo_id, smiles), return_type) 104 | -------------------------------------------------------------------------------- /kinoml/analysis/metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error 3 | 4 | 5 | def root_mean_squared_error(*args, **kwargs): 6 | """ 7 | Returns the square-root of ``scikit-learn``'s ``mean_squared_error`` metric. 8 | All arguments are forwarded to that function. 9 | """ 10 | return np.sqrt(mean_squared_error(*args, **kwargs)) 11 | 12 | 13 | def performance( 14 | predicted, 15 | observed, 16 | verbose=True, 17 | n_boot=100, 18 | confidence=0.95, 19 | sample_ratio=0.8, 20 | _seed=1234, 21 | ): 22 | """ 23 | Measure the predicted vs observed performance with different metrics (R2, MSE, MAE, RMSE).
24 | 25 | Parameters 26 | ---------- 27 | predicted : array-like 28 | Data points predicted by the model. 29 | observed : array-like 30 | Observed data points, as available in the dataset. 31 | verbose : bool, optional=True 32 | Whether to print results to stdout. 33 | n_boot : int, optional=100 34 | Number of bootstrap iterations. Set to ``1`` to disable 35 | bootstrapping. 36 | confidence : float, optional=0.95 37 | Confidence interval, relative to 1. Default is 95%. 38 | sample_ratio : float, optional=0.8 39 | Proportion of data to sample in each iteration. 40 | _seed : int, optional=1234 41 | Random seed. Each bootstrap iteration gets a different seed 42 | based on this initial one. 43 | 44 | Returns 45 | ------- 46 | results : dict of tuple 47 | This dictionary contains one item per metric (see above), 48 | with a 4-element tuple each: mean, standard deviation, and lower and 49 | upper bounds for the confidence interval. 50 | 51 | Note 52 | ---- 53 | **TODO**: Reimplement samples with ``scipy.stats.norm`` or with ``numpy``. 
54 | 55 | """ 56 | assert 0.5 <= confidence < 1, "Confidence must be in [0.5, 1)" 57 | assert 0 < sample_ratio <= 1, "Sample ratio must be in (0, 1]" 58 | 59 | high = predicted.shape[0] 60 | size = int(sample_ratio * high) 61 | metrics = { 62 | "r2": r2_score, 63 | "mse": mean_squared_error, 64 | "mae": mean_absolute_error, 65 | "rmse": root_mean_squared_error, 66 | } 67 | bootstrapped = np.empty((len(metrics), n_boot)) 68 | 69 | for i in range(n_boot): 70 | rng = np.random.RandomState(_seed + i) 71 | indices = rng.randint(low=0, high=high, size=size) 72 | obs, pred = observed[indices], predicted[indices] 73 | for j, (key, fn) in enumerate(sorted(metrics.items())): 74 | bootstrapped[j][i] = fn(obs, pred) 75 | 76 | # FIXME: Sort metrics as suggested here https://stackoverflow.com/a/40491405 77 | bootstrapped.sort(axis=1) 78 | 79 | results = {} 80 | for index, key in enumerate(sorted(metrics)): 81 | arr = bootstrapped[index] 82 | 83 | results[key] = mean, std, low, high = ( 84 | arr.mean(), 85 | arr.std(), 86 | np.quantile(arr, 1 - confidence), 87 | np.quantile(arr, confidence), 88 | ) 89 | if verbose: 90 | print( 91 | f"{key.upper():>4s}: {mean:.4f}±{std:.4f} {100*confidence:.0f}CI=({low:.4f}, {high:.4f})" 92 | ) 93 | 94 | return results 95 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - "master" 7 | - "maintenance/.+" 8 | pull_request: 9 | branches: 10 | - "master" 11 | - "maintenance/.+" 12 | schedule: 13 | # Run a cron job once weekly 14 | - cron: "0 0 * * 0" 15 | workflow_dispatch: 16 | 17 | jobs: 18 | test: 19 | name: ${{ matrix.name }} 20 | runs-on: ${{ matrix.os }} 21 | strategy: 22 | fail-fast: false 23 | matrix: 24 | include: 25 | - name: Linux, Python 3.9 26 | os: ubuntu-latest 27 | python-version: "3.9" 28 | conda-installer: 
https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh 29 | 30 | - name: Linux, Python 3.10 31 | os: ubuntu-latest 32 | python-version: "3.10" 33 | conda-installer: https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh 34 | 35 | - name: MacOS, Python 3.9 36 | os: macOS-latest 37 | python-version: "3.9" 38 | conda-installer: https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-MacOSX-x86_64.sh 39 | 40 | env: 41 | OE_LICENSE: ${{ github.workspace }}/oe_license.txt 42 | MODELLER_LICENSE: ${{ secrets.MODELLER_LICENSE }} 43 | 44 | steps: 45 | - uses: actions/checkout@v2 46 | 47 | - uses: conda-incubator/setup-miniconda@v2 48 | with: 49 | installer-url: ${{ matrix.conda-installer }} 50 | python-version: ${{ matrix.python-version }} 51 | activate-environment: test 52 | channel-priority: true 53 | environment-file: devtools/conda-envs/test_env.yaml 54 | auto-activate-base: false 55 | use-mamba: true 56 | 57 | - name: Additional info about the build 58 | shell: bash 59 | run: | 60 | uname -a 61 | df -h 62 | ulimit -a 63 | 64 | - name: Environment Information 65 | shell: bash -l {0} 66 | run: | 67 | conda info --all 68 | conda list 69 | mamba --version 70 | 71 | - name: Decrypt and check OE license 72 | shell: bash -l {0} 73 | env: 74 | OE_LICENSE_TEXT: ${{ secrets.OE_LICENSE }} 75 | run: | 76 | echo "${OE_LICENSE_TEXT}" > ${OE_LICENSE} 77 | python -c "import openeye; assert openeye.OEChemIsLicensed()" 78 | 79 | - name: Install package 80 | shell: bash -l {0} 81 | run: | 82 | python -m pip install --no-deps .
83 | 84 | - name: Run tests 85 | shell: bash -l {0} 86 | run: | 87 | pytest -v -n auto --dist load --cov=kinoml --cov-report=xml --color=yes -k "not read_electron_density" kinoml/tests/ 88 | 89 | - name: Run notebooks 90 | shell: bash -l {0} 91 | run: | 92 | pytest -v -n auto --dist loadscope --nbval-lax -k "not Schrodinger_structural_featurizer.ipynb" examples/*.ipynb 93 | 94 | - name: CodeCov 95 | uses: codecov/codecov-action@v1 96 | if: always() 97 | with: 98 | token: ${{ secrets.CODECOV_TOKEN }} 99 | file: ./coverage.xml 100 | flags: unittests 101 | yml: ./.codecov.yml 102 | -------------------------------------------------------------------------------- /devtools/README.md: -------------------------------------------------------------------------------- 1 | # Development, testing, and deployment tools 2 | 3 | This directory contains a collection of tools for running Continuous Integration (CI) tests, 4 | conda installation, and other development tools not directly related to the coding process. 5 | 6 | 7 | ## Manifest 8 | 9 | ### Continuous Integration 10 | 11 | You should test your code, but do not feel compelled to use these specific programs. You also may not need Unix and 12 | Windows testing if you only plan to deploy on specific platforms. These are just to help you get started 13 | 14 | * `travis-ci`: Linux and OSX based testing through [Travis-CI](https://about.travis-ci.com/) 15 | * `before_install.sh`: Pip/Miniconda pre-package installation script for Travis 16 | * `appveyor`: Windows based testing through [AppVeyor](https://www.appveyor.com/) (there are no files directly related to this) 17 | 18 | ### Conda Environment: 19 | 20 | This directory contains the files to setup the Conda environment for testing purposes 21 | 22 | * `conda-envs`: directory containing the YAML file(s) which fully describe Conda Environments, their dependencies, and those dependency provenance's 23 | * `test_env.yaml`: Simple test environment file with base dependencies. 
Channels are not specified here and therefore respect global Conda configuration 24 | 25 | ### Additional Scripts: 26 | 27 | This directory contains OS agnostic helper scripts which don't fall in any of the previous categories 28 | * `scripts` 29 | * `create_conda_env.py`: Helper program for spinning up new conda environments based on a starter file with Python Version and Env. Name command-line options 30 | 31 | 32 | ## How to contribute changes 33 | - Clone the repository if you have write access to the main repo, fork the repository if you are a collaborator. 34 | - Make a new branch with `git checkout -b {your branch name}` 35 | - Make changes and test your code 36 | - Ensure that the test environment dependencies (`conda-envs`) line up with the build and deploy dependencies (`conda-recipe/meta.yaml`) 37 | - Push the branch to the repo (either the main or your fork) with `git push -u origin {your branch name}` 38 | * Note that `origin` is the default name assigned to the remote, yours may be different 39 | - Make a PR on GitHub with your changes 40 | - We'll review the changes and get your code into the repo after lively discussion! 41 | 42 | 43 | ## Checklist for updates 44 | - [ ] Make sure there is an/are issue(s) opened for your specific update 45 | - [ ] Create the PR, referencing the issue 46 | - [ ] Debug the PR as needed until tests pass 47 | - [ ] Tag the final, debugged version 48 | * `git tag -a X.Y.Z [latest pushed commit] && git push --follow-tags` 49 | - [ ] Get the PR merged in 50 | 51 | ## Versioneer Auto-version 52 | [Versioneer](https://github.com/warner/python-versioneer) will automatically infer what version 53 | is installed by looking at the `git` tags and how many commits ahead this version is. 
The format follows 54 | [PEP 440](https://www.python.org/dev/peps/pep-0440/) and has the regular expression of: 55 | ```regexp 56 | \d+.\d+.\d+(?\+\d+-[a-z0-9]+) 57 | ``` 58 | If the version of this commit is the same as a `git` tag, the installed version is the same as the tag, 59 | e.g. `kinoml-0.1.2`, otherwise it will be appended with `+X` where `X` is the number of commits 60 | ahead from the last tag, and then `-YYYYYY` where the `Y`'s are replaced with the `git` commit hash. 61 | -------------------------------------------------------------------------------- /kinoml/tests/features/test_ligand.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test ligand featurizers of `kinoml.features` 3 | """ 4 | import pytest 5 | import numpy as np 6 | 7 | from kinoml.core.systems import LigandSystem 8 | from kinoml.core.ligands import Ligand 9 | from kinoml.features.ligand import ( 10 | SingleLigandFeaturizer, 11 | MorganFingerprintFeaturizer, 12 | OneHotSMILESFeaturizer, 13 | GraphLigandFeaturizer, 14 | ) 15 | 16 | 17 | def test_single_ligand_featurizer(): 18 | ligand1 = Ligand(smiles="CCCC") 19 | single_ligand_system = LigandSystem(components=[ligand1]) 20 | featurizer = SingleLigandFeaturizer() 21 | featurizer.supports(single_ligand_system) 22 | 23 | ligand2 = Ligand(smiles="COCC") 24 | double_ligand_system = LigandSystem(components=[ligand1, ligand2]) 25 | with pytest.raises(ValueError): 26 | featurizer.featurize([double_ligand_system]) 27 | 28 | 29 | @pytest.mark.parametrize( 30 | "smiles, solution", 31 | [ 32 | ( 33 | "C", 34 | 
"00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000", 35 | ), 36 | ( 37 | "B", 38 | "00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000", 39 | ), 40 | ], 41 | ) 42 | def test_ligand_MorganFingerprintFeaturizer(smiles, solution): 43 | ligand = Ligand(smiles=smiles) 44 | system = LigandSystem([ligand]) 45 | featurizer = MorganFingerprintFeaturizer(radius=2, nbits=512, use_multiprocessing=False) 46 | featurizer.featurize([system]) 47 | fingerprint = system.featurizations[featurizer.name] 48 | solution_array = np.array(list(map(int, solution)), dtype="uint8") 49 | assert (fingerprint == solution_array).all() 50 | 51 | 52 | @pytest.mark.parametrize( 53 | "smiles, solution", 54 | [ 55 | ("C", np.array([[0, 1] + [0] * 51])), 56 | ("B", np.array([[1] + [0] * 52])), 57 | ("CC", np.array([[0, 1] + [0] * 51, [0, 1] + [0] * 51])), 58 | ], 59 | ) 60 | def test_ligand_OneHotSMILESFeaturizer(smiles, solution): 61 | ligand = Ligand(smiles=smiles) 62 | system = LigandSystem([ligand]) 63 | featurizer = OneHotSMILESFeaturizer(use_multiprocessing=False) 64 | 
featurizer.featurize([system]) 65 | matrix = system.featurizations[featurizer.name] 66 | assert matrix.shape == solution.T.shape 67 | assert (matrix == solution.T).all() 68 | 69 | 70 | @pytest.mark.parametrize( 71 | "smiles, n_edges, n_nodes, n_features", 72 | [("C", 0, 1, 69), ("CC", 2, 2, 69)], 73 | ) 74 | def test_ligand_GraphLigandFeaturizer_RDKit(smiles, n_edges, n_nodes, n_features): 75 | ligand = Ligand(smiles=smiles) 76 | system = LigandSystem([ligand]) 77 | GraphLigandFeaturizer(use_multiprocessing=False).featurize([system]) 78 | connectivity, features = system.featurizations["last"] 79 | assert len(connectivity[0]) == n_edges 80 | assert len(features) == n_nodes 81 | assert len(features[0]) == n_features 82 | -------------------------------------------------------------------------------- /kinoml/ml/tensorflow_models.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example models for TensorFlow 3 | 4 | .. note:: 5 | 6 | This code is not currently in use. 
7 | """ 8 | 9 | import tensorflow as tf # pylint: disable=import-error 10 | 11 | 12 | def DNN(input_dim): 13 | """ 14 | DNN builds and compiles a TF model (a Deep Neural Network) that takes as input 'input_dim' 15 | 16 | Parameters 17 | ---------- 18 | input_dim : tuple of int 19 | Expected shape of the input data 20 | 21 | Returns 22 | ------- 23 | model : tf.keras.models.Sequential 24 | """ 25 | model = tf.keras.models.Sequential( 26 | [ 27 | tf.keras.layers.Dense(350, activation="relu", input_dim=input_dim), 28 | tf.keras.layers.Dropout(0.2), 29 | tf.keras.layers.Dense(200, activation="relu"), 30 | tf.keras.layers.Dropout(0.2), 31 | tf.keras.layers.Dense(100, activation="relu"), 32 | tf.keras.layers.Dense(50, activation="relu"), 33 | tf.keras.layers.Dense(16, activation="relu"), 34 | tf.keras.layers.Dense(1, activation="sigmoid"), 35 | ] 36 | ) 37 | 38 | model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"]) 39 | return model 40 | 41 | 42 | def CNN(input_shape): 43 | """ 44 | CNN builds and compiles a TF model (a Convolutional Neural Network) that takes as input 'input_shape' 45 | Parameters 46 | ========== 47 | input_shape : tuple of int 48 | Expected shape of the input data 49 | Returns 50 | ======= 51 | model : tf.keras.models.Sequential 52 | """ 53 | 54 | model = tf.keras.Sequential( 55 | [ 56 | tf.keras.layers.Conv2D( 57 | filters=16, 58 | kernel_size=3, 59 | activation="relu", 60 | padding="same", 61 | input_shape=input_shape, 62 | ), 63 | tf.keras.layers.MaxPooling2D(), 64 | tf.keras.layers.Flatten(), 65 | tf.keras.layers.Dropout(0.2), 66 | tf.keras.layers.Dense(64, activation="relu"), 67 | tf.keras.layers.BatchNormalization(), 68 | tf.keras.layers.Dense(32, activation="relu"), 69 | tf.keras.layers.BatchNormalization(), 70 | tf.keras.layers.Dense(1, activation="sigmoid"), 71 | ] 72 | ) 73 | 74 | model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"]) 75 | return model 76 | 77 | 78 | def 
MPNN(input_shape): 79 | """ 80 | MPNN builds and compiles a TF model (a Message Passing Neural Network) that takes as input 'input_shape' 81 | Parameters 82 | ========== 83 | input_shape : tuple of int 84 | Expected shape of the input data 85 | Returns 86 | ======= 87 | model : tf.keras.models.Sequential 88 | """ 89 | 90 | model = tf.keras.Sequential( 91 | [ 92 | tf.keras.layers.Conv2D( 93 | filters=8, 94 | kernel_size=3, 95 | activation="relu", 96 | padding="same", 97 | input_shape=input_shape, 98 | ), 99 | tf.keras.layers.MaxPooling2D(), 100 | tf.keras.layers.Flatten(), 101 | tf.keras.layers.Dropout(0.2), 102 | tf.keras.layers.Dense(64, activation="relu"), 103 | tf.keras.layers.BatchNormalization(), 104 | tf.keras.layers.Dense(32, activation="relu"), 105 | tf.keras.layers.BatchNormalization(), 106 | tf.keras.layers.Dense(1, activation="sigmoid"), 107 | ] 108 | ) 109 | model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"]) 110 | return model 111 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, 8 | body size, disability, ethnicity, gender identity and expression, level of 9 | experience, nationality, personal appearance, race, religion, or sexual 10 | identity and orientation. 
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment include: 15 | 16 | * Using welcoming and inclusive language 17 | * Being respectful of differing viewpoints and experiences 18 | * Gracefully accepting constructive criticism 19 | * Focusing on what is best for the community 20 | * Showing empathy towards other community members 21 | 22 | Examples of unacceptable behavior by participants include: 23 | 24 | * The use of sexualized language or imagery and unwelcome sexual attention or advances 25 | * Trolling, insulting/derogatory comments, and personal or political attacks 26 | * Public or private harassment 27 | * Publishing others' private information, such as a physical or electronic address, without explicit permission 28 | * Other conduct which could reasonably be considered inappropriate in a professional setting 29 | 30 | ## Our Responsibilities 31 | 32 | Project maintainers are responsible for clarifying the standards of acceptable 33 | behavior and are expected to take appropriate and fair corrective action in 34 | response to any instances of unacceptable behavior. 35 | 36 | Project maintainers have the right and responsibility to remove, edit, or 37 | reject comments, commits, code, wiki edits, issues, and other contributions 38 | that are not aligned to this Code of Conduct, or to ban temporarily or 39 | permanently any contributor for other behaviors that they deem inappropriate, 40 | threatening, offensive, or harmful. 41 | 42 | Moreover, project maintainers will strive to offer feedback and advice to 43 | ensure quality and consistency of contributions to the code. Contributions 44 | from outside the group of project maintainers are strongly welcomed but the 45 | final decision as to whether commits are merged into the codebase rests with 46 | the team of project maintainers. 
47 | 48 | ## Scope 49 | 50 | This Code of Conduct applies both within project spaces and in public spaces 51 | when an individual is representing the project or its community. Examples of 52 | representing a project or community include using an official project e-mail 53 | address, posting via an official social media account, or acting as an 54 | appointed representative at an online or offline event. Representation of a 55 | project may be further defined and clarified by project maintainers. 56 | 57 | ## Enforcement 58 | 59 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 60 | reported by contacting the project team at 'jaime.rodriguez-guerra@choderalab.org'. The project team will 61 | review and investigate all complaints, and will respond in a way that it deems 62 | appropriate to the circumstances. The project team is obligated to maintain 63 | confidentiality with regard to the reporter of an incident. Further details of 64 | specific enforcement policies may be posted separately. 65 | 66 | Project maintainers who do not follow or enforce the Code of Conduct in good 67 | faith may face temporary or permanent repercussions as determined by other 68 | members of the project's leadership. 69 | 70 | ## Attribution 71 | 72 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 73 | version 1.4, available at 74 | [http://contributor-covenant.org/version/1/4][version] 75 | 76 | [homepage]: http://contributor-covenant.org 77 | [version]: http://contributor-covenant.org/version/1/4/ 78 | -------------------------------------------------------------------------------- /docs/developers/api_docs.md: -------------------------------------------------------------------------------- 1 | # How to write docs with Sphinx, MyST and Material Theme 2 | 3 | We are using Sphinx for our documentation. 
However, instead of using the default RST, 4 | you can also use Markdown syntax thanks to the [MyST parser](https://myst-parser.readthedocs.io/). 5 | The theme is [Material for Sphinx](https://github.com/bashtage/sphinx-material/). 6 | 7 | ## Basics 8 | 9 | - `cd docs && make livebuild` - Start the live-reloading docs server locally 10 | 11 | Project layout: 12 | 13 | docs/ 14 | index.md # The documentation homepage. 15 | conf.py # The configuration file 16 | ... # Other markdown pages, images and other files. 17 | 18 | We prefer using Markdown for the documentation, but the Python docstrings 19 | use RST with [NumpyDoc](https://numpydoc.readthedocs.io/en/latest/format.html#docstring-standard) 20 | conventions. Check the existing docstrings for syntax examples. 21 | 22 | ## Supported extensions 23 | 24 | MyST adds some [extra stuff](https://myst-parser.readthedocs.io/en/latest/using/syntax.html#) 25 | on top of plain Markdown. Some examples: 26 | 27 | ### Admonitions 28 | 29 | ```{note} 30 | This is so cool huh? Check all styles [here](https://docutils.sourceforge.io/docs/ref/rst/directives.html#specific-admonitions). 31 | ``` 32 | 33 | ````md 34 | ```{note} 35 | This is so cool huh? Check all styles [here](https://docutils.sourceforge.io/docs/ref/rst/directives.html#specific-admonitions). 36 | ``` 37 | ```` 38 | 39 | ### Footnotes 40 | 41 | > This is a very important finding.[^1] 42 | 43 | > This is yet another finding.[^jaimergp1990] 44 | 45 | [^1]: Lorem ipsum dolor sit amet, consectetur adipiscing elit. 46 | [^jaimergp1990]: A kid named Jaime. 47 | 48 | These are written with labels like this: 49 | 50 | ```md 51 | > This is a very important finding.[^1] 52 | 53 | > This is yet another finding.[^jaimergp1990] 54 | 55 | [^1]: Lorem ipsum dolor sit amet, consectetur adipiscing elit. 56 | [^jaimergp1990]: A kid named Jaime. 
57 | ``` 58 | 59 | ### LaTeX 60 | 61 | Either in blocks 62 | 63 | $$ 64 | \frac{n!}{k!(n-k)!} = \binom{n}{k} * KinoML 65 | $$ 66 | 67 | ```latex 68 | $$ 69 | \frac{n!}{k!(n-k)!} = \binom{n}{k} * KinoML 70 | $$ 71 | ``` 72 | 73 | or inline: 74 | 75 | This my best equation ever: $p(x|y) = \frac{p(y|x)p(x)}{p(y)}$ 76 | 77 | ```latex 78 | This my best equation ever: $p(x|y) = \frac{p(y|x)p(x)}{p(y)}$ 79 | ``` 80 | 81 | ### Tabbed fences 82 | 83 | :::{tabbed} Step 1 84 | 85 | This is the step 1 86 | ::: 87 | 88 | :::{tabbed} Step 2 89 | 90 | ```python 91 | # This is the step 2 with python code highlighting 92 | he = Element("Helium") 93 | ``` 94 | 95 | ::: 96 | 97 | :::{tabbed} Step 3 98 | 99 | This is the step 3 100 | ::: 101 | 102 | This line interrupts the fences and creates a new block of tabs 103 | 104 | :::{tabbed} Step 4 105 | 106 | ```python 107 | # This is the step 4 with python code highlighting 108 | 109 | be = Element("Beryllium") 110 | ``` 111 | 112 | ::: 113 | 114 | Obtained with: 115 | 116 | ```` 117 | :::{tabbed} Step 1 118 | 119 | This is the step 1 120 | ::: 121 | 122 | ::::{tabbed} Step 2 123 | ```python 124 | # This is the step 2 with python code highlighting 125 | he = Element("Helium") 126 | ``` 127 | :::: 128 | 129 | :::{tabbed} Step 3 130 | 131 | This is the step 3 132 | ::: 133 | 134 | This line interrupts the fences and creates a new block of tabs 135 | 136 | :::{tabbed} Step 4 137 | ```python 138 | # This is the step 4 with python code highlighting 139 | 140 | be = Element("Beryllium") 141 | ``` 142 | ::: 143 | 144 | ```` 145 | 146 | ### Extra inline markup 147 | 148 | | Code | Result | 149 | | --------- | ------- | 150 | | `==hey==` | ==hey== | 151 | | `~~hey~~` | ~~hey~~ | 152 | | `^^hey^^` | ^^hey^^ | 153 | | `a^migo^` | a^migo^ | 154 | | `-->` | --> | 155 | -------------------------------------------------------------------------------- /devtools/scripts/create_conda_env.py: 
-------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import re 4 | import glob 5 | import shutil 6 | import subprocess as sp 7 | from tempfile import TemporaryDirectory 8 | from contextlib import contextmanager 9 | # YAML imports 10 | try: 11 | import yaml # PyYAML 12 | loader = yaml.load 13 | except ImportError: 14 | try: 15 | import ruamel_yaml as yaml # Ruamel YAML 16 | except ImportError: 17 | try: 18 | # Load Ruamel YAML from the base conda environment 19 | from importlib import util as import_util 20 | CONDA_BIN = os.path.dirname(os.environ['CONDA_EXE']) 21 | ruamel_yaml_path = glob.glob(os.path.join(CONDA_BIN, '..', 22 | 'lib', 'python*.*', 'site-packages', 23 | 'ruamel_yaml', '__init__.py'))[0] 24 | # Based on importlib example, but only needs to load_module since its the whole package, not just 25 | # a module 26 | spec = import_util.spec_from_file_location('ruamel_yaml', ruamel_yaml_path) 27 | yaml = spec.loader.load_module() 28 | except (KeyError, ImportError, IndexError): 29 | raise ImportError("No YAML parser could be found in this or the conda environment. " 30 | "Could not find PyYAML or Ruamel YAML in the current environment, " 31 | "AND could not find Ruamel YAML in the base conda environment through CONDA_EXE path. 
" 32 | "Environment not created!") 33 | loader = yaml.YAML(typ="safe").load # typ="safe" avoids odd typing on output 34 | 35 | 36 | @contextmanager 37 | def temp_cd(): 38 | """Temporary CD Helper""" 39 | cwd = os.getcwd() 40 | with TemporaryDirectory() as td: 41 | try: 42 | os.chdir(td) 43 | yield 44 | finally: 45 | os.chdir(cwd) 46 | 47 | 48 | # Args 49 | parser = argparse.ArgumentParser(description='Creates a conda environment from file for a given Python version.') 50 | parser.add_argument('-n', '--name', type=str, 51 | help='The name of the created Python environment') 52 | parser.add_argument('-p', '--python', type=str, 53 | help='The version of the created Python environment') 54 | parser.add_argument('conda_file', 55 | help='The file for the created Python environment') 56 | 57 | args = parser.parse_args() 58 | 59 | # Open the base file 60 | with open(args.conda_file, "r") as handle: 61 | yaml_script = loader(handle.read()) 62 | 63 | python_replacement_string = "python {}*".format(args.python) 64 | 65 | try: 66 | for dep_index, dep_value in enumerate(yaml_script['dependencies']): 67 | if re.match('python([ ><=*]+[0-9.*]*)?$', dep_value): # Match explicitly 'python' and its formats 68 | yaml_script['dependencies'].pop(dep_index) 69 | break # Making the assumption there is only one Python entry, also avoids need to enumerate in reverse 70 | except (KeyError, TypeError): 71 | # Case of no dependencies key, or dependencies: None 72 | yaml_script['dependencies'] = [] 73 | finally: 74 | # Ensure the python version is added in. 
Even if the code does not need it, we assume the env does 75 | yaml_script['dependencies'].insert(0, python_replacement_string) 76 | 77 | # Figure out conda path 78 | if "CONDA_EXE" in os.environ: 79 | conda_path = os.environ["CONDA_EXE"] 80 | else: 81 | conda_path = shutil.which("conda") 82 | if conda_path is None: 83 | raise RuntimeError("Could not find a conda binary in CONDA_EXE variable or in executable search path") 84 | 85 | print("CONDA ENV NAME {}".format(args.name)) 86 | print("PYTHON VERSION {}".format(args.python)) 87 | print("CONDA FILE NAME {}".format(args.conda_file)) 88 | print("CONDA PATH {}".format(conda_path)) 89 | 90 | # Write to a temp directory which will always be cleaned up 91 | with temp_cd(): 92 | temp_file_name = "temp_script.yaml" 93 | with open(temp_file_name, 'w') as f: 94 | f.write(yaml.dump(yaml_script)) 95 | sp.call("{} env create -n {} -f {}".format(conda_path, args.name, temp_file_name), shell=True) 96 | -------------------------------------------------------------------------------- /kinoml/datasets/groups.py: -------------------------------------------------------------------------------- 1 | """ 2 | Splitting strategies for datasets 3 | """ 4 | import random 5 | from collections import defaultdict 6 | 7 | from tqdm.auto import tqdm 8 | 9 | 10 | class BaseGrouper: 11 | """ 12 | Base class to assign groups to measurements in a DatasetProvider 13 | """ 14 | 15 | def __init__(self): 16 | pass 17 | 18 | def assign(self, dataset, overwrite=False, **kwargs): 19 | """ 20 | Given a DatasetProvider, assign a key to the elements 21 | of each group, as provided by ``.indices()`` 22 | 23 | Parameters 24 | ---------- 25 | dataset : DatasetProvider 26 | overwrite : bool, optional=False 27 | If a measurement has been assigned a group already, 28 | do not overwrite unless this option is set to True. 29 | 30 | Returns 31 | ------- 32 | dataset : DatasetProvider 33 | The same dataset passed in the input, with 34 | measurements modified in place. 
35 | """ 36 | groups = self.indices(dataset, **kwargs) 37 | measurements = dataset.measurements 38 | for key, indices in groups.items(): 39 | for index in indices: 40 | ms = measurements[index] 41 | if not overwrite and ms.group is not None: 42 | raise ValueError( 43 | f"Cannot assign group to `{ms}` because a group is " 44 | f"already assigned: {ms.group}. Choose `overwrite=True` " 45 | f"to ignore existing groups." 46 | ) 47 | ms.group = key 48 | return dataset 49 | 50 | def indices(self, dataset, **kwargs): 51 | """ 52 | Given a dataset, create a dictionary that maps keys or labels 53 | to a set of numerical indices. The strategy to follow will 54 | depend on the subclass. 55 | 56 | Parameters 57 | ---------- 58 | dataset : DatasetProvider 59 | 60 | Returns 61 | ------- 62 | dict 63 | Maps ``int` or ``str`` to a list of ``int`` 64 | """ 65 | raise NotImplementedError("Implement in your subclass") 66 | 67 | 68 | class RandomGrouper(BaseGrouper): 69 | 70 | """ 71 | Randomized groups following a split proportional to the provided ratios 72 | 73 | Parameters 74 | ---------- 75 | ratios : tuple or dict 76 | 1-based ratios for the different groups. They must sum 1.0. If a 77 | dict is provided, the keys are used to label the resulting groups. 78 | Otherwise, the groups are 0-enumerated. 
79 | 80 | """ 81 | 82 | def __init__(self, ratios): 83 | if isinstance(ratios, (list, tuple)): 84 | ratios = {i: ratio for i, ratio in enumerate(ratios)} 85 | assert sum(ratios.values()) == 1, f"`ratios` must sum 1, but you provided {ratios}" 86 | self.ratios = ratios 87 | 88 | def indices(self, dataset, **kwargs): 89 | length = len(dataset) 90 | indices = list(range(length)) 91 | random.shuffle(indices) 92 | groups = {} 93 | start = 0 94 | for key, ratio in self.ratios.items(): 95 | end = start + int(round(ratio * length, 0)) 96 | groups[key] = indices[start:end] 97 | start = end 98 | return groups 99 | 100 | 101 | class CallableGrouper(BaseGrouper): 102 | """ 103 | A grouper that applies a user-provided function to each Measurement 104 | in the Dataset. Returned value should be the name of the group. 105 | 106 | Parameters 107 | ---------- 108 | function : callable 109 | This function must be able to take a ``Measurement`` object 110 | and return a ``str`` or ``int``. 111 | """ 112 | 113 | def __init__(self, function): 114 | self.function = function 115 | 116 | def indices(self, dataset, progress=True): 117 | iterator = enumerate(dataset.measurements) 118 | if progress: 119 | iterator = tqdm(iterator) 120 | 121 | groups = defaultdict(list) 122 | for i, measurement in iterator: 123 | key = self.function(measurement) 124 | groups[key].append(i) 125 | return groups 126 | 127 | 128 | class BaseFilter(BaseGrouper): 129 | pass 130 | -------------------------------------------------------------------------------- /kinoml/core/systems.py: -------------------------------------------------------------------------------- 1 | """ 2 | ``System`` objects define a collection of related 3 | ``MolecularComponent`` instances. They are normally 4 | attached to a ``Measurement``, and, in the context 5 | of a machine learning exercise, will be featurized 6 | with different classes found under ``kinoml.features``. 
7 | Featurization turns a ``System`` into a tensor-like 8 | object, like Numpy arrays. 9 | """ 10 | from __future__ import annotations 11 | 12 | from typing import Iterable 13 | 14 | from .components import MolecularComponent 15 | from .ligands import BaseLigand 16 | from .proteins import BaseProtein 17 | 18 | 19 | class System: 20 | 21 | """ 22 | System objects host one or more MolecularComponent. 23 | 24 | Parameters 25 | ---------- 26 | components : list of MolecularComponent 27 | Molecular entities defining this system 28 | strict: bool, optional=True 29 | Whether to perform sanity checks (default) or not. 30 | 31 | Attributes 32 | ---------- 33 | featurizations : dict 34 | This dictionary will store the different featurization 35 | steps a ``System`` is submitted to. The keys for this 36 | dictionary are usually the *name* of the featurizer 37 | class. Additionally, a ``Pipeline`` might define 38 | a ``last`` key, indicating that particular object 39 | was the final result of a chain of featurizers. 
40 | """ 41 | 42 | def __init__( 43 | self, 44 | components: Iterable[MolecularComponent], 45 | strict: bool = True, 46 | *args, 47 | **kwargs, 48 | ): 49 | super().__init__(*args, **kwargs) 50 | self.components = components 51 | self.featurizations = {} 52 | if strict: 53 | self.check() 54 | 55 | def _components_by_type(self, type_): 56 | """ 57 | Yield MolecularComponent objects of a given type only 58 | """ 59 | for component in self.components: 60 | if isinstance(component, type_): 61 | yield component 62 | 63 | def check(self): 64 | assert self.components, "`System` must specify at least one component" 65 | return True 66 | 67 | @property 68 | def name(self) -> str: 69 | """ 70 | Generates a readable name out of the components names 71 | """ 72 | return " & ".join([str(c.name) for c in self.components]) 73 | 74 | @property 75 | def weight(self) -> float: 76 | """ 77 | Calculate the molecular weight of the system 78 | 79 | Note: This is just an example on how/why this level of 80 | abstraction can be useful. 81 | """ 82 | mass = 0 83 | for component in self.components: 84 | if not hasattr(component, "mass"): # It will be unimplemented for some types! 85 | raise TypeError("This system contains at least one component without mass.") 86 | mass += component.mass 87 | return mass 88 | 89 | def __repr__(self) -> str: 90 | return ( 91 | f"<{self.__class__.__name__} with " 92 | f"{len(self.components)} components ({', '.join([repr(c) for c in self.components])})>" 93 | ) 94 | 95 | 96 | class ProteinSystem(System): 97 | """ 98 | A System that contains Protein objects. 
It defines two properties: 99 | 100 | - ``protein``: get the first Protein found in the components 101 | - ``proteins``: get all Protein objects found in the components 102 | """ 103 | 104 | @property 105 | def protein(self): 106 | return next(self._components_by_type(BaseProtein)) 107 | 108 | @property 109 | def proteins(self): 110 | return list(self._components_by_type(BaseProtein)) 111 | 112 | def check(self): # this is a requirement 113 | super().check() 114 | assert ( 115 | len(self.proteins) >= 1 116 | ), f"A ProteinSystem must specify at least one Protein. Current contents: {self}." 117 | return True 118 | 119 | 120 | class LigandSystem(System): 121 | """ 122 | A System that contains Ligand objects. It defines two properties: 123 | 124 | - ``ligand``: get the first Ligand found in the components 125 | - ``ligands``: get all Ligand objects found in the components 126 | """ 127 | 128 | @property 129 | def ligand(self): 130 | return next(self._components_by_type(BaseLigand)) 131 | 132 | @property 133 | def ligands(self): 134 | return list(self._components_by_type(BaseLigand)) 135 | 136 | def check(self): # this is a requirement 137 | super().check() 138 | assert ( 139 | len(self.ligands) >= 1 140 | ), f"A LigandSystem must specify at least one Ligand. Current contents: {self}." 141 | return True 142 | 143 | 144 | class ProteinLigandComplex(ProteinSystem, LigandSystem): 145 | """ 146 | A system with at least one protein and one ligand 147 | """ 148 | 149 | def check(self): 150 | assert ProteinSystem.check(self) and LigandSystem.check(self), ( 151 | "A ProteinLigandComplex must specify at least one Protein and one Ligand. 
" 152 | f"Current contents: {self}" 153 | ) 154 | -------------------------------------------------------------------------------- /kinoml/datasets/pkis2.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | from typing import Union 4 | 5 | import pandas as pd 6 | 7 | from .core import DatasetProvider 8 | from ..core.proteins import Protein, KLIFSKinase 9 | from ..core.ligands import Ligand 10 | from ..core.systems import ProteinLigandComplex 11 | from ..core.measurements import PercentageDisplacementMeasurement 12 | from ..core.conditions import AssayConditions 13 | from ..utils import datapath 14 | 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | class PKIS2DatasetProvider(DatasetProvider): 20 | 21 | """ 22 | Loads the PKIS2 dataset as provided in _Progress towards a public chemogenomic set for protein 23 | kinases and a call for contributions [1]. 24 | 25 | [1]: DOI: 10.1371/journal.pone.0181585 26 | 27 | Examples 28 | -------- 29 | >>> from kinoml.datasets.pkis2 import PKIS2DatasetProvider 30 | >>> provider = PKIS2DatasetProvider.from_source() 31 | >>> provider 32 | """ 33 | 34 | @classmethod 35 | def from_source( 36 | cls, 37 | path_or_url: Union[str, Path] = datapath("kinomescan/journal.pone.0181585.s004.csv"), 38 | path_or_url_constructs: Union[str, Path] = datapath( 39 | "kinomescan/DiscoverX_489_Kinase_Assay_Construct_Information.csv" 40 | ), 41 | protein_type: str = "KLIFSKinase", 42 | toolkit: str = "OpenEye", 43 | ): 44 | """ 45 | Create a PKIS2 DatasetProvider from the raw data. 46 | 47 | Parameters 48 | ---------- 49 | path_or_url: str or pathlib.Path 50 | CSV file with the protein-ligand measurements. 51 | path_or_url_constructs: str or pathlib.Path 52 | CSV file with the construct information. 53 | protein_type: str, default=KLIFSKinase 54 | The protein object type to use ('Protein' or 'KLIFSKinase'). 
55 | toolkit: str, default=OpenEye 56 | The toolkit to use for creating protein objects (e.g. 'OpenEye', 'MDAnalysis'), 57 | allowed values depend on the specified `protein_type`. 58 | 59 | Raises 60 | ------ 61 | ValueError 62 | Given protein_type {protein_type} is not valid, only {protein_type_classes.keys()} are 63 | allowed. 64 | """ 65 | logger.debug("Checking protein type ...") 66 | protein_type_classes = {"Protein": Protein, "KLIFSKinase": KLIFSKinase} 67 | if protein_type not in protein_type_classes.keys(): 68 | raise ValueError( 69 | f"Given protein_type {protein_type} is not valid, " 70 | f"only {protein_type_classes.keys()} are allowed." 71 | ) 72 | 73 | logger.debug("Loading CSV with construct information ...") 74 | constructs_df = pd.read_csv(path_or_url_constructs) 75 | 76 | logger.debug("Creating protein objects ...") 77 | kinases = dict() 78 | for _, construct in constructs_df.iterrows(): 79 | if construct["Construct Description"] != "Wild Type": 80 | # mutants not in measurements 81 | continue 82 | discoverx_id = construct["DiscoverX Gene Symbol"] 83 | ncbi_id = construct["Accession Number"] 84 | if construct["AA Start/Stop"] == "Null": 85 | # ambiguous, will consider full sequence 86 | kinase = protein_type_classes[protein_type]( 87 | name=discoverx_id, 88 | ncbi_id=ncbi_id, 89 | toolkit=toolkit, 90 | ) 91 | else: 92 | first, last = [x[1:] for x in construct["AA Start/Stop"].split("/")] 93 | kinase = protein_type_classes[protein_type]( 94 | name=discoverx_id, 95 | ncbi_id=ncbi_id, 96 | metadata={"construct_range": f"{first}-{last}"}, 97 | toolkit=toolkit, 98 | ) 99 | kinases[discoverx_id] = kinase 100 | 101 | logger.debug("Loading CSV with measurements ...") 102 | # column 0 is name, column 3 is smiles, column 7 - 412 are measurements for each kinase 103 | measurements_df = pd.read_csv(path_or_url, usecols=[0, 3] + list(range(7, 413))) 104 | 105 | logger.debug("Creating systems and measurements ...") 106 | measurements = [] 107 | kinase_names = 
measurements_df.columns[2:] 108 | for _, ligand_measurements in measurements_df.iterrows(): 109 | ligand_name = ligand_measurements["Regno"] 110 | smiles = ligand_measurements["Smiles"] 111 | if ligand_name == "0": 112 | ligand_name = smiles 113 | ligand = Ligand(smiles=smiles, name=ligand_name) 114 | for kinase_name, inhibition_value in zip(kinase_names, ligand_measurements.values[2:]): 115 | measurement = PercentageDisplacementMeasurement( 116 | inhibition_value, 117 | conditions=AssayConditions(pH=7.0), 118 | system=ProteinLigandComplex(components=[ligand, kinases[kinase_name]]), 119 | ) 120 | measurements.append(measurement) 121 | 122 | return cls(measurements=measurements, metadata={"path_or_url": path_or_url}) 123 | -------------------------------------------------------------------------------- /kinoml/tests/docking/test_oedocking.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test OEDocking functionalities of `kinoml.docking` 3 | """ 4 | from contextlib import contextmanager 5 | from importlib import resources 6 | import pytest 7 | 8 | 9 | @contextmanager 10 | def does_not_raise(): 11 | yield 12 | 13 | 14 | @pytest.mark.parametrize( 15 | "package, resource, resids, expectation, min_x", 16 | [ 17 | ( 18 | "kinoml.data.proteins", 19 | "4f8o_edit.pdb", 20 | [50, 51, 52, 62, 63, 64, 70, 77], 21 | does_not_raise(), 22 | 21.225000381469727, 23 | ), 24 | ( 25 | "kinoml.data.proteins", 26 | "4f8o_edit.pdb", 27 | [700, 701, 702], 28 | pytest.raises(ValueError), 29 | 21.225000381469727, 30 | ), 31 | ], 32 | ) 33 | def test_resids_to_box_molecule(package, resource, resids, expectation, min_x): 34 | """Compare results to expected minimal x_coordinate.""" 35 | from kinoml.modeling.OEModeling import read_molecules 36 | from kinoml.docking.OEDocking import resids_to_box_molecule 37 | 38 | with resources.path(package, resource) as path: 39 | with expectation: 40 | protein = read_molecules(str(path))[0] 41 | box_molecule = 
resids_to_box_molecule(protein, resids) 42 | x_coordinates = [coordinates[0] for coordinates in box_molecule.GetCoords().values()] 43 | assert round(min(x_coordinates), 3) == round(min_x, 3) 44 | 45 | 46 | @pytest.mark.parametrize( 47 | "package, resource, smiles_list, n_poses", 48 | [ 49 | ( 50 | "kinoml.data.proteins", 51 | "4f8o.pdb", 52 | ["c1cc(ccc1CCN)S(=O)(=O)F", "c1cc(ccc1CCN)S(=O)(=O)N"], 53 | 3, 54 | ), 55 | ], 56 | ) 57 | def test_hybrid_docking(package, resource, smiles_list, n_poses): 58 | """Compare results to expected number of docked molecules and docking poses""" 59 | from openeye import oedocking 60 | 61 | from kinoml.docking.OEDocking import hybrid_docking 62 | from kinoml.modeling.OEModeling import read_molecules, read_smiles, prepare_complex 63 | 64 | with resources.path(package, resource) as path: 65 | structure = read_molecules(str(path))[0] 66 | design_unit = prepare_complex(structure) 67 | if not design_unit.HasReceptor(): 68 | oedocking.OEMakeReceptor(design_unit) 69 | docking_poses = hybrid_docking( 70 | design_unit, [read_smiles(smiles) for smiles in smiles_list], n_poses 71 | ) 72 | assert len(docking_poses) == len(smiles_list) * n_poses 73 | 74 | 75 | @pytest.mark.parametrize( 76 | "package, resource, resids, smiles_list, n_poses", 77 | [ 78 | ( 79 | "kinoml.data.proteins", 80 | "4f8o_edit.pdb", 81 | [50, 51, 52, 62, 63, 64, 70, 77], 82 | ["c1cc(ccc1CCN)S(=O)(=O)F", "c1cc(ccc1CCN)S(=O)(=O)N"], 83 | 3, 84 | ), 85 | ], 86 | ) 87 | def test_fred_docking(package, resource, resids, smiles_list, n_poses): 88 | """Compare results to expected number of docked molecules and docking poses""" 89 | from openeye import oechem, oedocking 90 | 91 | from kinoml.docking.OEDocking import fred_docking, resids_to_box_molecule 92 | from kinoml.modeling.OEModeling import read_molecules, read_smiles, prepare_protein 93 | 94 | with resources.path(package, resource) as path: 95 | structure = read_molecules(str(path))[0] 96 | design_unit = 
prepare_protein(structure) 97 | protein = oechem.OEGraphMol() 98 | design_unit.GetProtein(protein) 99 | box_molecule = resids_to_box_molecule(protein, resids) 100 | options = oedocking.OEMakeReceptorOptions() 101 | options.SetBoxMol(box_molecule) 102 | oedocking.OEMakeReceptor(design_unit, options) 103 | docking_poses = fred_docking( 104 | design_unit, [read_smiles(smiles) for smiles in smiles_list], n_poses 105 | ) 106 | assert len(docking_poses) == len(smiles_list) * n_poses 107 | 108 | 109 | @pytest.mark.parametrize( 110 | "package, resource, smiles_list", 111 | [ 112 | ( 113 | "kinoml.data.proteins", 114 | "4f8o.pdb", 115 | ["c1cc(ccc1CCN)S(=O)(=O)F", "c1cc(ccc1CCN)S(=O)(=O)N"], 116 | ), 117 | ], 118 | ) 119 | def test_pose_molecules(package, resource, smiles_list): 120 | """Compare results to expected number of docked molecules and docking poses""" 121 | from openeye import oechem, oedocking 122 | 123 | from kinoml.docking.OEDocking import pose_molecules 124 | from kinoml.modeling.OEModeling import read_molecules, read_smiles, prepare_complex 125 | 126 | with resources.path(package, resource) as path: 127 | structure = read_molecules(str(path))[0] 128 | design_unit = prepare_complex(structure) 129 | if not design_unit.HasReceptor(): 130 | oedocking.OEMakeReceptor(design_unit) 131 | docking_poses = pose_molecules( 132 | design_unit, 133 | [read_smiles(smiles) for smiles in smiles_list], 134 | score_pose=True, 135 | ) 136 | assert len(docking_poses) == len(smiles_list) 137 | assert all( 138 | [oechem.OEHasSDData(docking_pose, "Chemgauss4") for docking_pose in docking_poses] 139 | ) 140 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | KinoML 2 | ============================== 3 | [//]: # (Badges) 4 | [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT) 5 | 
[![CI](https://github.com/openkinome/kinoml/actions/workflows/ci.yml/badge.svg?branch=master)](https://github.com/openkinome/kinoml/actions/workflows/ci.yml) 6 | [![DOCS](https://github.com/openkinome/kinoml/actions/workflows/docs.yml/badge.svg?branch=master)](https://github.com/openkinome/kinoml/actions/workflows/docs.yml) 7 | [![codecov](https://codecov.io/gh/openkinome/KinoML/branch/master/graph/badge.svg)](https://codecov.io/gh/openkinome/KinoML/branch/master) 8 | 9 | ![GitHub closed pr](https://img.shields.io/github/issues-pr-closed-raw/openkinome/kinoml) 10 | ![GitHub open pr](https://img.shields.io/github/issues-pr-raw/openkinome/kinoml) 11 | ![GitHub closed issues](https://img.shields.io/github/issues-closed-raw/openkinome/kinoml) 12 | ![GitHub open issues](https://img.shields.io/github/issues/openkinome/kinoml) 13 | 14 | **KinoML** is a modular and extensible framework for machine learning (ML) in small molecule drug discovery with a special focus on kinases. The publication can be found [here](https://www.biorxiv.org/content/10.1101/2024.09.10.612176v1). KinoML enables users to easily: 15 | 1. **Access and download data**: from online data sources, such as ChEMBL or PubChem as well as from their own files, with a focus on data availability and immutability. 16 | 2. **Featurize data**: so that it is ML readable. KinoML offers a wide variety of featurization schemes, from ligand-only to ligand:kinase complexes. 17 | 3. **Run structure-based experiments**: using KinoML's implemented models, with a special focus on reproducibility. 18 | 19 | 20 | 21 | The purpose of KinoML is to help users conduct ML kinase experiments, from data collection to model evaluation. Tutorials on how to use KinoML as well as working examples showcasing how to use KinoML to perform experiments end-to-end can be found [here.](https://github.com/raquellrios/kinoml/tree/master/tutorials) Note that despite KinoML's focus being on kinases, it can be applied to any protein system.
For more detailed instructions, please refer to the [Documentation](https://openkinome.org/kinoml/index.html). 22 | 23 | A KinoML workflow to achieve points **1, 2** and **3** is illustrated in the following image: 24 | 25 | ![KinoML object model](kinoml/data/fig_1_kinomltechpaper_v2.png) 26 | **Fig. 1:** KinoML workflow overview. Colors represent objects of the same class. 27 | 28 | 29 | 30 | ### Notice 31 | 32 | Please be aware that this code is work in progress and is not guaranteed to provide the expected results. The API can change at any time without warning. 33 | 34 | ### Installation 35 | 36 | #### Option 1: Install with Docker 37 | 38 | A prebuilt Docker image of this software is available on Docker Hub: 39 | 40 | **Image:** `openkinome/kinoml:v1` 41 | **Link:** [Docker Hub page](https://hub.docker.com/r/openkinome/kinoml) 42 | 43 | ```bash 44 | # Download the container image 45 | docker pull openkinome/kinoml:v1 46 | 47 | # Run the software 48 | docker run --rm openkinome/kinoml:v1 --help 49 | ``` 50 | 51 | #### Option 2: Install with conda/mamba 52 | 53 | KinoML and its dependencies can be installed via conda/mamba. 54 | 55 | ```bash 56 | git clone https://github.com/openkinome/kinoml.git # clone the repo 57 | cd kinoml # change directory to local copy of repo 58 | mamba env create -n kinoml -f devtools/conda-envs/test_env.yaml 59 | conda activate kinoml 60 | python -m pip install git+https://github.com/openkinome/kinoml.git 61 | ``` 62 | 63 | ### Usage 64 | 65 | The tutorials folder is divided into two parts: 66 | 67 | 1. [**Getting started**](https://github.com/raquellrios/kinoml/tree/master/tutorials/getting_started): the notebooks in this folder aim to give the user an understanding of how to use KinoML to: (1) **access and download** data, (2) **featurize** data, and (3) **run a** (simple) **ML model** on the featurized data obtained with KinoML to predict ligand binding affinity. 
Additionally, this folder contains notebooks that explain the **KinoML object model** and how to access the different objects, as well as notebooks **showcasing all the different featurizers** implemented within KinoML and how to use each of them. 68 | 69 | 2. [**Experiments**](https://github.com/raquellrios/kinoml/tree/master/tutorials/experiments): this folder contains four individual structure-based experiments to predict ligand binding affinity. All experiments use KinoML to obtain the data, featurize it, and train and evaluate an ML model implemented within the `kinoml.ml` module. The purpose of these experiments is to display usage examples of KinoML to conduct end-to-end structure-based kinase experiments. 70 | 71 | 72 | ⚠️ You will need a valid OpenEye License for the tutorials to work. For the Schrodinger featurizers tutorial you will also need a Schrodinger License! 73 | 74 | 75 | For users interested in more KinoML usage examples, they can check out other repositories under the initiative [OpenKinome](https://github.com/openkinome/). In particular, two other repositories that may be of interest are: 76 | 77 | 78 | - [kinodata](https://github.com/openkinome/kinodata): repository with ready-to-use kinase-focused datasets from ChEMBL, as well as tutorials explaining how to process kinase data for ML applications. 79 | - [experiments-binding-affinity](https://github.com/openkinome/experiments-binding-affinity): more advanced and reproducible ML experiments using KinoML. 80 | 81 | 82 | 83 | Copyright (c) 2019, OpenKinome 84 | 85 | 86 | #### Acknowledgements 87 | 88 | Project based on the 89 | [Computational Molecular Science Python Cookiecutter](https://github.com/molssi/cookiecutter-cms) version 1.1.
class Ligand(BaseLigand):
    """
    Create a new Ligand object. An openff representation is accessible via the molecule attribute.

    Examples
    --------

    Create a ligand from file:

    >>> ligand = Ligand.from_file("data/molecules/chloroform.sdf", name="chloroform")

    Create a ligand from an openff molecule:

    >>> from openff.toolkit.topology import Molecule
    >>> molecule = Molecule.from_file("data/molecules/chloroform.sdf")
    >>> ligand = Ligand(molecule=molecule, name="chloroform")

    Create a ligand from SMILES:

    >>> ligand = Ligand.from_smiles("C(Cl)(Cl)Cl", name="chloroform")

    Create a ligand from SMILES with lazy instantiation:

    >>> ligand = Ligand(smiles="C(Cl)(Cl)Cl", name="chloroform")

    """

    def __init__(
        self,
        molecule: Union[Molecule, None] = None,
        smiles: str = "",
        name: str = "",
        metadata: Union[dict, None] = None,
        **kwargs
    ):
        """
        Create a new Ligand object. Lazy instantiation is possible via the smiles parameter.

        Parameters
        ----------
        molecule: openff.toolkit.topology.Molecule or None, default=None
            An openff representation of the ligand.
        smiles: str, default=""
            The SMILES representation of the ligand. Can be used for lazy instantiation, i.e.
            will be interpreted when calling the molecule attribute the first time.
        name: str, default=""
            The name of the ligand.
        metadata: dict or None, default=None
            Additional metadata needed for e.g. featurizers or provenance.
        """
        BaseLigand.__init__(self, name=name, metadata=metadata, **kwargs)
        # _molecule stays None on lazy instantiation until the getter builds it from _smiles.
        self._molecule = molecule
        self._smiles = smiles

    @property
    def molecule(self):
        """Decorate molecule to modify setter and getter."""
        return self._molecule

    @molecule.setter
    def molecule(self, new_value: Union[Molecule, None]):
        """
        Store a new value for molecule in the _molecule attribute.

        Parameters
        ----------
        new_value: openff.toolkit.topology.Molecule or None
            The new openff molecule.
        """
        self._molecule = new_value

    @molecule.getter
    def molecule(self):
        """
        Get the _molecule attribute. If the _smiles attribute is given and _molecule is None, a
        new openff molecule will be created from smiles, e.g. in case of lazy instantiation.

        Returns
        -------
        : openff.toolkit.topology.Molecule or None
            The openff molecular representation of the ligand.
        """
        if not self._molecule and self._smiles:
            self._molecule = Molecule.from_smiles(smiles=self._smiles, allow_undefined_stereo=True)
            # Mirror the behavior of from_smiles: fall back to the SMILES as name
            # and record it in the metadata for provenance.
            if not self.name:
                self.name = self._smiles
            if self.metadata is None:
                self.metadata = {"smiles": self._smiles}
            else:
                self.metadata.update({"smiles": self._smiles})
        return self._molecule

    @classmethod
    def from_smiles(
        cls, smiles: str, name: str = "", allow_undefined_stereo: bool = True, **kwargs
    ):
        """
        Create a Ligand from a SMILES representation.

        Parameters
        ----------
        smiles: str
            The SMILES representation of the ligand.
        name: str, default=""
            The name of the ligand. Falls back to the SMILES if not given.
        allow_undefined_stereo: bool, default=True
            If undefined stereo centers should be allowed.
        kwargs:
            Any keyword arguments allowed for the from_smiles method of the openff molecule class.

        Returns
        -------
        : Ligand
            The ligand with an eagerly created openff molecule.
        """
        molecule = Molecule.from_smiles(
            smiles=smiles, allow_undefined_stereo=allow_undefined_stereo, **kwargs
        )
        if not name:
            name = smiles
        return cls(molecule=molecule, name=name, metadata={"smiles": smiles})

    @classmethod
    def from_file(
        cls,
        file_path: Union[Path, str],
        name: str = "",
        allow_undefined_stereo: bool = True,
        **kwargs
    ):
        """
        Create a Ligand from file.

        Parameters
        ----------
        file_path: pathlib.Path or str
            The path to the molecular file. For supported formats see the openff molecule
            documentation.
        name: str, default=""
            The name of the ligand. Falls back to the canonical SMILES if not given.
        allow_undefined_stereo: bool, default=True
            If undefined stereo centers should be allowed.
        kwargs:
            Any keyword arguments allowed for the from_file method of the openff molecule class.

        Returns
        -------
        : Ligand
            The ligand with an eagerly created openff molecule.
        """
        molecule = Molecule.from_file(
            file_path=file_path, allow_undefined_stereo=allow_undefined_stereo, **kwargs
        )
        if not name:
            name = molecule.to_smiles(explicit_hydrogens=False)
        return cls(molecule=molecule, name=name, metadata={"file_path": file_path})
def test_BaseFeaturizer():
    """The abstract base featurizer must refuse to featurize."""
    ligand = Ligand(smiles="CCCC")
    systems = [
        LigandSystem(components=[ligand]),
        LigandSystem(components=[ligand]),
        LigandSystem(components=[ligand]),
    ]
    featurizer = BaseFeaturizer()
    with pytest.raises(NotImplementedError):
        featurizer(systems)

    with pytest.raises(NotImplementedError):
        featurizer.featurize(systems)


def test_Pipeline():
    """A pipeline of null featurizers must return the systems unchanged."""
    # Use the smiles keyword explicitly; the first positional argument of
    # Ligand is an openff molecule, not a SMILES string.
    ligand = Ligand(smiles="CCCC")
    systems = [
        LigandSystem(components=[ligand]),
        LigandSystem(components=[ligand]),
        LigandSystem(components=[ligand]),
    ]
    featurizers = (NullFeaturizer(), NullFeaturizer())
    pipeline = Pipeline(featurizers)
    pipeline.featurize(systems)
    assert [s.featurizations["last"] for s in systems] == systems


def test_Concatenated():
    """Concatenating two 512-bit fingerprints must yield a 1024-long vector."""
    from kinoml.features.ligand import MorganFingerprintFeaturizer

    ligand = Ligand(smiles="CCCC")
    system = LigandSystem([ligand])
    featurizer1 = MorganFingerprintFeaturizer(radius=2, nbits=512, use_multiprocessing=False)
    featurizer2 = MorganFingerprintFeaturizer(radius=2, nbits=512, use_multiprocessing=False)
    concatenated = Concatenated([featurizer1, featurizer2], axis=1)
    concatenated.featurize([system])
    assert system.featurizations["last"].shape[0] == 1024


def test_TupleOfArrays():
    """Aggregation must keep the individual featurizer outputs separate."""
    from kinoml.features.ligand import MorganFingerprintFeaturizer

    ligand = Ligand(smiles="CCCC")
    system = LigandSystem([ligand])
    featurizer1 = MorganFingerprintFeaturizer(radius=2, nbits=512, use_multiprocessing=False)
    featurizer2 = MorganFingerprintFeaturizer(radius=2, nbits=1024, use_multiprocessing=False)
    aggregated = TupleOfArrays([featurizer1, featurizer2])
    aggregated.featurize([system])
    assert len(system.featurizations["last"]) == 2
    assert system.featurizations["last"][0].shape[0] == 512
    assert system.featurizations["last"][1].shape[0] == 1024


def test_BaseOneHotEncodingFeaturizer():
    """One-hot encoding must accept str, dict and list dictionaries."""
    assert (
        BaseOneHotEncodingFeaturizer.one_hot_encode("AAA", "ABC") == np.array([[1, 0, 0]] * 3).T
    ).all()
    assert (
        BaseOneHotEncodingFeaturizer.one_hot_encode("AAA", {"A": 0, "B": 1, "C": 2})
        == np.array([[1, 0, 0]] * 3).T
    ).all()
    assert (
        BaseOneHotEncodingFeaturizer.one_hot_encode(["A", "A", "A"], ["A", "B", "C"])
        == np.array([[1, 0, 0]] * 3).T
    ).all()


def test_PadFeaturizer():
    """Padding must bring all one-hot matrices to the same shape."""
    systems = (
        LigandSystem([Ligand(smiles="C")]),
        LigandSystem([Ligand(smiles="CC")]),
        LigandSystem([Ligand(smiles="CCC")]),
    )
    OneHotSMILESFeaturizer(use_multiprocessing=False).featurize(systems)
    PadFeaturizer(use_multiprocessing=False).featurize(systems)

    for s in systems:
        assert s.featurizations["last"].shape == (53, 3)
    # NOTE: test functions must not return a value; pytest warns on (and will
    # eventually reject) non-None returns, so the former `return systems` is gone.


def test_HashFeaturizer():
    """Hashing a canonical SMILES must give a stable normalized value."""
    system = LigandSystem([Ligand(smiles="CCC")])
    HashFeaturizer(getter=lambda s: s.ligand.molecule.to_smiles(), normalize=True).featurize(
        [system]
    )
    assert system.featurizations["last"] == pytest.approx(0.62342903)


def test_NullFeaturizer():
    """The null featurizer must return the system itself."""
    system = LigandSystem([Ligand(smiles="CCC")])
    NullFeaturizer().featurize([system])

    assert system == system.featurizations["last"]


def test_CallableFeaturizer():
    """An arbitrary callable must be usable as a featurization step."""
    from sklearn.preprocessing import scale

    systems = (
        LigandSystem([Ligand(smiles="C")]),
        LigandSystem([Ligand(smiles="CC")]),
        LigandSystem([Ligand(smiles="CCC")]),
    )
    HashFeaturizer(getter=lambda s: s.ligand.molecule.to_smiles(), normalize=False).featurize(
        systems
    )
    CallableFeaturizer(lambda s: scale(s.featurizations["last"].reshape((1,)))).featurize(systems)

    for s in systems:
        assert s.featurizations["last"].shape


def test_ClearFeaturizations_keeplast():
    """By default only the 'last' featurization entry survives clearing."""
    systems = (
        LigandSystem([Ligand(smiles="C")]),
        LigandSystem([Ligand(smiles="CC")]),
        LigandSystem([Ligand(smiles="CCC")]),
    )
    OneHotSMILESFeaturizer(use_multiprocessing=False).featurize(systems)
    PadFeaturizer(use_multiprocessing=False).featurize(systems)
    ClearFeaturizations().featurize(systems)

    for s in systems:
        assert len(s.featurizations) == 1
        assert "last" in s.featurizations


def test_ClearFeaturizations_removeall():
    """Keeping an empty key set must wipe all featurizations."""
    systems = (
        LigandSystem([Ligand(smiles="C")]),
        LigandSystem([Ligand(smiles="CC")]),
        LigandSystem([Ligand(smiles="CCC")]),
    )
    OneHotSMILESFeaturizer(use_multiprocessing=False).featurize(systems)
    PadFeaturizer(use_multiprocessing=False).featurize(systems)
    ClearFeaturizations(keys=tuple(), style="keep").featurize(systems)

    for s in systems:
        assert not s.featurizations
# Sphinx extensions: autodoc via AutoAPI, notebooks via nbsphinx(-link),
# markdown sources via MyST; commented entries are kept for easy re-enabling.
extensions = [
    "sphinx.ext.autosectionlabel",
    "sphinx.ext.todo",
    "sphinx.ext.napoleon",
    # "sphinxemoji.sphinxemoji",
    "sphinx-prompt",
    "sphinx_copybutton",
    # "notfound.extension",
    "myst_parser",
    # "sphinxcontrib.httpdomain",
    "autoapi.extension",
    "nbsphinx",
    "nbsphinx_link",
    # "sphinx_last_updated_by_git",
    # "sphinx_panels",
    "IPython.sphinxext.ipython_console_highlighting",
]

autosectionlabel_prefix_document = True

# Execute notebooks at build time unless they carry stored outputs.
nbsphinx_execute = "auto"
nbsphinx_execute_arguments = [
    "--InlineBackend.figure_formats={'svg', 'pdf'}",
    "--InlineBackend.rc={'figure.dpi': 96}",
]

sphinxemoji_style = "twemoji"


# AutoAPI: generate API docs for the whole package, excluding tests and data.
autoapi_dirs = ["../kinoml"]
autoapi_root = "api"
autoapi_add_toctree_entry = False
autoapi_ignore = [
    "*migrations*",
    "*_version*",
    "*tests*",
    "*/data/*",
]
autoapi_options = [
    "members",
    "undoc-members",
    "private-members",
    "show-inheritance",
    # "show-module-summary",
    "special-members",
    "imported-members",
]
autoapi_keep_files = False

# Napoleon settings
napoleon_google_docstring = True
napoleon_numpy_docstring = True
napoleon_include_init_with_doc = False
napoleon_include_private_with_doc = False
napoleon_include_special_with_doc = True
napoleon_use_admonition_for_examples = True
napoleon_use_admonition_for_notes = True
napoleon_use_admonition_for_references = False
napoleon_type_aliases = None
napoleon_attr_annotations = True

# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = [
    "_build",
    "Thumbs.db",
    ".DS_Store",
    "sphinx-notfound-page",
    ".ipynb_checkpoints/*",
    "__pycache__",
    "kinoml/data",
    "developers",
]


# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
import sphinx_material

# Choose the material theme
html_theme = "sphinx_material"
# Get the theme path
html_theme_path = sphinx_material.html_theme_path()
# Register the required helpers for the html context
html_context = sphinx_material.get_html_context()
version_dropdown = False

# Material theme options (see theme.conf for more information)
html_theme_options = {
    "nav_title": "KinoML",
    "repo_url": "https://github.com/openkinome/kinoml/",
    "repo_name": "KinoML",
    "logo_icon": "",
    "base_url": "https://openkinome.org/kinoml/",
    # "google_analytics_account": "UA-XXXXX",
    "html_minify": False,
    "html_prettify": True,
    "css_minify": True,
    "repo_type": "github",
    "globaltoc_depth": 3,
    "color_primary": "#3f51b5",
    "color_accent": "blue",
    "touch_icon": "images/custom_favicon.png",
    "theme_color": "#3f51b5",
    "master_doc": False,
    "nav_links": [
        {"href": "index", "internal": True, "title": "User guide"},
        {"href": "api/kinoml/index", "internal": True, "title": "API Reference"},
        {
            "href": "https://openkinome.org",
            "internal": False,
            "title": "OpenKinome",
        },
    ],
    "heroes": {
        "index": "Structure-informed machine learning for kinase modeling",
    },
    "version_dropdown": False,
    "version_json": "_static/versions.json",
    "version_info": {
        "Release": "",
        "Development": "",
        "Release (rel)": "",
| "Development (rel)": "", 164 | }, 165 | "table_classes": ["plain"], 166 | } 167 | 168 | # globaltoc seems it's not added by default 169 | html_sidebars = { 170 | "**": [ 171 | "globaltoc.html", 172 | "localtoc.html", 173 | "searchbox.html", 174 | ] 175 | } 176 | 177 | 178 | # Add any paths that contain custom static files (such as style sheets) here, 179 | # relative to this directory. They are copied after the builtin static files, 180 | # so a file named "default.css" will overwrite the builtin "default.css". 181 | html_static_path = ["_static"] 182 | html_favicon = "_static/images/custom_favicon.png" 183 | 184 | # ------- 185 | # MyST 186 | # ------- 187 | myst_enable_extensions = [ 188 | "amsmath", 189 | "colon_fence", 190 | "deflist", 191 | "dollarmath", 192 | "html_admonition", 193 | "html_image", 194 | "linkify", 195 | "replacements", 196 | "smartquotes", 197 | "substitution", 198 | ] 199 | 200 | myst_update_mathjax = False 201 | mathjax3_config = { 202 | "tex2jax": { 203 | "inlineMath": [["\\(", "\\)"]], 204 | "displayMath": [["\\[", "\\]"]], 205 | "processRefs": False, 206 | "processEnvironments": False, 207 | } 208 | } 209 | -------------------------------------------------------------------------------- /kinoml/datasets/chembl.py: -------------------------------------------------------------------------------- 1 | """ 2 | Creates DatasetProvider objects from ChEMBL activity data 3 | """ 4 | import logging 5 | import random 6 | 7 | import pandas as pd 8 | from tqdm.auto import tqdm 9 | 10 | from .core import MultiDatasetProvider 11 | from ..core.conditions import AssayConditions 12 | from ..core.proteins import Protein, KLIFSKinase 13 | from ..core.ligands import Ligand 14 | from ..core.systems import ProteinLigandComplex 15 | from ..core.measurements import pIC50Measurement, pKiMeasurement, pKdMeasurement 16 | 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | class ChEMBLDatasetProvider(MultiDatasetProvider): 22 | 23 | """ 24 | This 
provider relies heavily on ``openkinome/kinodata`` data ingestion 25 | pipelines. It will load ChEMBL activities from its releases page. 26 | """ 27 | 28 | @classmethod 29 | def from_source( 30 | cls, 31 | path_or_url="https://github.com/openkinome/datascripts/releases/download/v0.2/activities-chembl28_v0.2.zip", 32 | measurement_types=("pIC50", "pKi", "pKd"), 33 | uniprot_ids=None, 34 | sample=None, 35 | protein_type: str = "KLIFSKinase", 36 | toolkit: str = "OpenEye", 37 | ): 38 | """ 39 | Create a MultiDatasetProvider out of the raw data contained in the zip file. 40 | 41 | Parameters 42 | ---------- 43 | path_or_url: str, optional 44 | path or URL to a (zipped) CSV file containing activities from ChEMBL, 45 | using schema detailed below. 46 | measurement_types: tuple of str, optional 47 | Which measurement types must be imported from the CSV. By default, all 48 | three (pIC50, pKi, pKd) will be loaded, but you can choose a subset ( 49 | e.g. ``("pIC50",)``). 50 | uniprot_ids: None or list of str, default=None 51 | Restrict measurements to the given UniProt IDs. 52 | sample: int, optional=None 53 | If set to larger than zero, load only N data points from the dataset. 54 | protein_type: str, default=KLIFSKinase 55 | The protein object type to use ('Protein' or 'KLIFSKinase'). 56 | toolkit: str, default=OpenEye 57 | The toolkit to use for creating protein objects (e.g. 'OpenEye', 'MDAnalysis'), 58 | allowed values depend on the specified `protein_type`. 59 | 60 | Raises 61 | ------ 62 | ValueError 63 | Given protein_type {protein_type} is not valid, only {allowed_protein_types} are 64 | allowed. 65 | 66 | Note 67 | ---- 68 | ChEMBL aggregates data from lots of sources, so conditions are guaranteed 69 | to be different across experiments. 
70 | """ 71 | logger.debug("Checking protein type ...") 72 | protein_type_classes = {"Protein": Protein, "KLIFSKinase": KLIFSKinase} 73 | if protein_type not in protein_type_classes.keys(): 74 | raise ValueError( 75 | f"Given protein_type {protein_type} is not valid, " 76 | f"only {protein_type_classes.keys()} are allowed." 77 | ) 78 | 79 | logger.debug("Retrieving and reading CSV ...") 80 | cached_path = cls._download_to_cache_or_retrieve(path_or_url) 81 | df = pd.read_csv(cached_path) 82 | df = df.dropna( 83 | subset=[ 84 | "compound_structures.canonical_smiles", 85 | "component_sequences.sequence", 86 | "activities.standard_type", 87 | ] 88 | ) 89 | 90 | if uniprot_ids: 91 | logger.debug(f"Filtering for UniProt IDs {uniprot_ids}...") 92 | df = df[df["UniprotID"].isin(uniprot_ids)] 93 | 94 | logger.debug(f"Filtering for measurement types {measurement_types} ...") 95 | chosen_types_labels = df["activities.standard_type"].isin(set(measurement_types)) 96 | filtered_records = df[chosen_types_labels].to_dict("records") 97 | 98 | if sample is not None: 99 | logger.debug(f"Getting sample of size {sample} ...") 100 | filtered_records = random.sample(filtered_records, sample) 101 | 102 | measurement_type_classes = { 103 | "pIC50": pIC50Measurement, 104 | "pKi": pKiMeasurement, 105 | "pKd": pKdMeasurement, 106 | } 107 | measurements = [] 108 | systems = {} 109 | proteins = {} 110 | ligands = {} 111 | logger.debug(f"Creating systems and measurements ...") 112 | for row in tqdm(filtered_records): 113 | try: 114 | measurement_type_key = row["activities.standard_type"] 115 | protein_key = row["component_sequences.sequence"] 116 | ligand_key = row["compound_structures.canonical_smiles"] 117 | system_key = (protein_key, ligand_key) 118 | if protein_key not in proteins: 119 | metadata = { 120 | "uniprot_id": row["UniprotID"], 121 | "chembl_target_id": row["target_dictionary.chembl_id"], 122 | } 123 | protein = protein_type_classes[protein_type]( 124 | sequence=protein_key, 125 | 
name=row["UniprotID"], 126 | uniprot_id=row["UniprotID"], 127 | metadata=metadata, 128 | toolkit=toolkit, 129 | ) 130 | proteins[protein_key] = protein 131 | if ligand_key not in ligands: 132 | ligands[ligand_key] = Ligand(smiles=ligand_key, name=ligand_key) 133 | if system_key not in systems: 134 | systems[system_key] = ProteinLigandComplex( 135 | [proteins[protein_key], ligands[ligand_key]] 136 | ) 137 | 138 | MeasurementType = measurement_type_classes[measurement_type_key] 139 | conditions = AssayConditions(pH=7) 140 | system = systems[system_key] 141 | metadata = { 142 | "unit": f"-log10({row['activities.standard_units']}E-9)", 143 | "confidence": row["assays.confidence_score"], 144 | "chembl_activity": row["activities.activity_id"], 145 | "chembl_document": row["docs.chembl_id"], 146 | "year": row["docs.year"], 147 | } 148 | measurement = MeasurementType( 149 | values=row["activities.standard_value"], 150 | system=system, 151 | conditions=conditions, 152 | metadata=metadata, 153 | ) 154 | measurements.append(measurement) 155 | except Exception as exc: 156 | print("Couldn't process record", row) 157 | print("Exception:", exc) 158 | 159 | return cls( 160 | measurements, 161 | metadata={ 162 | "path_or_url": path_or_url, 163 | "measurement_types": measurement_types, 164 | "sample": sample, 165 | }, 166 | ) 167 | -------------------------------------------------------------------------------- /kinoml/databases/pdb.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | from typing import Iterable, Union 4 | 5 | from appdirs import user_cache_dir 6 | 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | def smiles_from_pdb(ligand_ids: Iterable[str]) -> dict: 12 | """ 13 | Retrieve SMILES of molecules defined by their PDB chemical identifier. 14 | 15 | Parameters 16 | ---------- 17 | ligand_ids: iterable of str 18 | PDB chemical identifier. 
def smiles_from_pdb(ligand_ids: Iterable[str], batch_size: int = 50) -> dict:
    """
    Retrieve SMILES of molecules defined by their PDB chemical identifier.

    Parameters
    ----------
    ligand_ids: iterable of str
        PDB chemical identifier.
    batch_size: int, default=50
        Maximal number of identifiers to request per query against the RCSB
        GraphQL endpoint.

    Returns
    -------
    ligands: dict
        Dictionary with PDB chemical identifier as keys and SMILES as values.
        Identifiers without a SMILES entry are silently omitted.
    """
    import json
    import math
    import requests
    import urllib

    # Deduplicate to avoid redundant requests; order is irrelevant for a dict result.
    ligand_ids = list(set(ligand_ids))
    ligands = {}
    base_url = "https://data.rcsb.org/graphql?query="
    n_batches = math.ceil(len(ligand_ids) / batch_size)
    for i in range(n_batches):
        ligand_ids_batch = ligand_ids[i * batch_size : (i + 1) * batch_size]
        logger.debug(f"Batch {i}\n{ligand_ids_batch}")
        query = (
            "{chem_comps(comp_ids:["
            + ",".join(['"' + ligand_id + '"' for ligand_id in ligand_ids_batch])
            + "]){chem_comp{id}rcsb_chem_comp_descriptor{SMILES_stereo}}}"
        )
        response = requests.get(base_url + urllib.parse.quote(query))
        for ligand in json.loads(response.text)["data"]["chem_comps"]:
            try:
                ligands[ligand["chem_comp"]["id"]] = ligand["rcsb_chem_comp_descriptor"][
                    "SMILES_stereo"
                ]
            except TypeError:
                # rcsb_chem_comp_descriptor is null when no SMILES is available
                pass

    return ligands
def download_pdb_structure(
    pdb_id: str, directory: Union[str, Path] = user_cache_dir()
) -> Union[Path, bool]:
    """
    Download a PDB structure. If the structure is not available in PDB format, it will be
    downloaded in CIF format.

    Parameters
    ----------
    pdb_id: str
        The PDB ID of interest.
    directory: str or Path, default=user_cache_dir
        The directory for saving the downloaded structure.

    Returns
    -------
    : Path or False
        The path to the downloaded file if successful, else False.
    """
    from pathlib import Path

    from ..utils import LocalFileStorage, FileDownloader

    directory = Path(directory)

    # check for structure in PDB format
    pdb_path = LocalFileStorage.rcsb_structure_pdb(pdb_id, directory)
    if not pdb_path.is_file():
        logger.debug("Downloading PDB entry in PDB format ...")
        if FileDownloader.rcsb_structure_pdb(pdb_id, directory):
            return pdb_path
        # PDB download failed -> fall through and try the CIF format instead
    else:
        # a cached copy counts as success
        return pdb_path

    # check for structure in CIF format
    cif_path = LocalFileStorage.rcsb_structure_cif(pdb_id, directory)
    if not cif_path.is_file():
        logger.debug("Downloading PDB entry in CIF format ...")
        if FileDownloader.rcsb_structure_cif(pdb_id, directory):
            return cif_path
    else:
        return cif_path
    # neither format could be retrieved
    logger.debug(f"Could not download PDB entry {pdb_id}.")
    return False
def download_pdb_ligand(
    pdb_id: str,
    chain_id: str,
    expo_id: str,
    smiles: str = "",
    directory: Union[str, Path] = user_cache_dir(),
) -> Union[Path, bool]:
    """
    Download a ligand co-crystallized to a PDB structure and save in SDF format. If a SMILES is
    provided, the connectivity and protonation will be adjusted accordingly.

    Parameters
    ----------
    pdb_id: str
        The PDB ID of interest.
    chain_id: str
        The chain ID of the ligand.
    expo_id: str
        The residue name of the ligand.
    smiles: str, default=""
        The smiles of the small molecule describing the connectivity and protonation of the
        ligand.
    directory: str or Path, default=user_cache_dir
        The directory for saving the downloaded structure.

    Returns
    -------
    : Path or False
        The path to the processed ligand file in SDF format if successful, else False.
    """
    from rdkit import Chem
    from rdkit.Chem import AllChem
    from ..utils import LocalFileStorage

    directory = Path(directory)
    sdf_path = LocalFileStorage.rcsb_ligand_sdf(
        pdb_id=pdb_id,
        chain_id=chain_id,
        expo_id=expo_id,
        altloc=None,
        directory=directory,
    )
    # short-circuit on a previously extracted ligand
    if sdf_path.is_file():
        logger.debug(
            f"Found cached ligand file for PDB entry {pdb_id}, chain {chain_id}, ligand {expo_id}."
        )
        return sdf_path

    pdb_path = download_pdb_structure(pdb_id=pdb_id, directory=directory)
    if not pdb_path:
        return False

    # RDKit cannot read CIF, so convert the requested chain to PDB first
    suffix = str(pdb_path).split(".")[-1]
    if suffix == "cif":
        cif_path = str(pdb_path)
        pdb_path = LocalFileStorage.rcsb_structure_pdb(
            pdb_id=f"{pdb_id}_chain{chain_id}", directory=directory
        )
        if not pdb_path.is_file():
            from Bio.PDB import MMCIFParser, PDBIO

            logger.debug("Converting CIF to PDB format ...")
            parser = MMCIFParser()
            try:
                structure = parser.get_structure("", cif_path)[0][chain_id]
            except KeyError:
                logger.debug(f"Could not find chain {chain_id} in CIF file!")
                return False
            io = PDBIO()
            io.set_structure(structure)
            io.save(str(pdb_path))

    logger.debug("Extracting ligand with RDKit ...")
    try:
        pdb_mol = Chem.MolFromPDBFile(str(pdb_path), sanitize=False)
        if pdb_mol is None:
            logger.debug(f"Could not read {pdb_path} with RDKit.")
            return False
        pdb_mol_chains = Chem.SplitMolByPDBChainId(pdb_mol)
        chain = pdb_mol_chains[chain_id]
        chain_residues = Chem.SplitMolByPDBResidues(chain)
        ligand = chain_residues[expo_id]
    except KeyError:
        logger.debug(
            f"Could not find ligand {expo_id} for chain {chain_id} in PDB entry {pdb_id}."
        )
        return False

    if smiles:
        logger.debug("Adjusting connectivity and protonation according to given SMILES ...")
        ligand = Chem.RemoveHs(ligand)
        reference_mol = Chem.MolFromSmiles(smiles)
        if reference_mol is None:
            # an unparsable template SMILES would otherwise raise deep inside
            # AssignBondOrdersFromTemplate; fail softly like the other branches
            logger.debug(f"Could not parse given SMILES {smiles}.")
            return False
        ligand = AllChem.AssignBondOrdersFromTemplate(reference_mol, ligand)
        ligand = Chem.AddHs(ligand, addCoords=True)

    logger.debug("Writing extracted ligand to SDF file ...")
    writer = Chem.SDWriter(str(sdf_path))
    writer.write(ligand)
    # SDWriter buffers output; close to flush the record to disk
    writer.close()

    return sdf_path
66 | ] 67 | assert list(featurized_systems[2].featurizations["last"]) == [ 68 | 97, 69 | 14, 70 | 42, 71 | 90, 72 | 29, 73 | 88, 74 | 24, 75 | 33, 76 | 82, 77 | 96, 78 | 18, 79 | 40, 80 | 90, 81 | 33, 82 | 64, 83 | 120, 84 | 65, 85 | 62, 86 | 13, 87 | 30, 88 | ] 89 | 90 | 91 | def test_onehotencodedsequencefeaturizer_full(): 92 | """Check OneHotEncodedSequenceFeaturizer with full sequence.""" 93 | from kinoml.core.proteins import Protein 94 | from kinoml.core.systems import ProteinSystem 95 | from kinoml.features.protein import OneHotEncodedSequenceFeaturizer 96 | 97 | systems = [ 98 | ProteinSystem([Protein(sequence="")]), 99 | ProteinSystem([Protein(sequence="A")]), 100 | ProteinSystem([Protein(uniprot_id="P00519")]), 101 | ProteinSystem([Protein(uniprot_id="xxxxx")]), 102 | ] 103 | featurizer = OneHotEncodedSequenceFeaturizer(use_multiprocessing=False) 104 | featurized_systems = featurizer.featurize(systems) 105 | 106 | assert len(featurized_systems) == 2 # filter protein with wrong UniProt ID and empty string 107 | assert list(featurized_systems[0].featurizations["last"])[0][0] == 1 108 | assert list(featurized_systems[1].featurizations["last"])[3][2] == 1 109 | 110 | 111 | def test_onehotencodedsequencefeaturizer_klifs_kinase(): 112 | """Check OneHotEncodedSequenceFeaturizer with kinase KLIFS sequence.""" 113 | from kinoml.core.proteins import KLIFSKinase 114 | from kinoml.core.systems import ProteinSystem 115 | from kinoml.features.protein import OneHotEncodedSequenceFeaturizer 116 | 117 | systems = [ 118 | ProteinSystem([KLIFSKinase(sequence="")]), 119 | ProteinSystem([KLIFSKinase(kinase_klifs_sequence="A")]), 120 | ProteinSystem([KLIFSKinase(uniprot_id="P00519")]), 121 | ProteinSystem([KLIFSKinase(uniprot_id="xxxxx")]), 122 | ProteinSystem([KLIFSKinase(ncbi_id="NP_005148.2")]), 123 | ProteinSystem([KLIFSKinase(kinase_klifs_id=480)]), 124 | ProteinSystem([KLIFSKinase(structure_klifs_id=3620)]), 125 | ] 126 | featurizer = OneHotEncodedSequenceFeaturizer( 127 
| sequence_type="klifs_kinase", use_multiprocessing=False 128 | ) 129 | featurized_systems = featurizer.featurize(systems) 130 | 131 | assert len(featurized_systems) == 5 # filter protein with wrong UniProt ID and empty string 132 | assert list(featurized_systems[0].featurizations["last"])[0][0] == 1 133 | assert list(featurized_systems[1].featurizations["last"])[0][14] == 1 134 | assert list(featurized_systems[2].featurizations["last"])[0][14] == 1 135 | assert list(featurized_systems[3].featurizations["last"])[0][14] == 1 136 | assert list(featurized_systems[4].featurizations["last"])[0][14] == 1 137 | 138 | 139 | def test_onehotencodedsequencefeaturizer_klifs_structure(): 140 | """Check OneHotEncodedSequenceFeaturizer with structure KLIFS sequence.""" 141 | from kinoml.core.proteins import KLIFSKinase 142 | from kinoml.core.systems import ProteinSystem 143 | from kinoml.features.protein import OneHotEncodedSequenceFeaturizer 144 | 145 | systems = [ 146 | ProteinSystem([KLIFSKinase(sequence="")]), 147 | ProteinSystem([KLIFSKinase(structure_klifs_sequence="A")]), 148 | ProteinSystem([KLIFSKinase(uniprot_id="P00519")]), 149 | ProteinSystem([KLIFSKinase(kinase_klifs_id=480)]), 150 | ProteinSystem([KLIFSKinase(structure_klifs_id=3620)]), 151 | ] 152 | featurizer = OneHotEncodedSequenceFeaturizer( 153 | sequence_type="klifs_structure", use_multiprocessing=False 154 | ) 155 | featurized_systems = featurizer.featurize(systems) 156 | 157 | assert len(featurized_systems) == 2 # needs structure_klifs_sequence or structure_klifs_id 158 | assert list(featurized_systems[0].featurizations["last"])[0][0] == 1 159 | assert list(featurized_systems[1].featurizations["last"])[0][14] == 1 160 | 161 | 162 | def test_oeproteinstructurefeaturizer(): 163 | """Check OEProteinStructureFeaturizer with different inputs.""" 164 | from kinoml.core.proteins import Protein 165 | from kinoml.core.systems import ProteinSystem 166 | from kinoml.features.protein import OEProteinStructureFeaturizer 
167 | 168 | systems = [] 169 | # unspecific definition of the system, only via PDB ID 170 | # modeling will be performed according to the sequence stored in the PDB Header 171 | protein = Protein(pdb_id="4f8o", name="PsaA") 172 | system = ProteinSystem(components=[protein]) 173 | systems.append(system) 174 | # more specific definition of the system, protein of chain A co-crystallized with ligand AES 175 | # and alternate location B, modeling will be performed according to the sequence of the given 176 | # UniProt ID 177 | protein = Protein.from_pdb(pdb_id="4f8o", name="PsaA") 178 | protein.uniprot_id = "P31522" 179 | protein.chain_id = "A" 180 | protein.alternate_location = "B" 181 | protein.expo_id = "AES" 182 | system = ProteinSystem(components=[protein]) 183 | systems.append(system) 184 | # use a protein structure from file 185 | with resources.path("kinoml.data.proteins", "4f8o_edit.pdb") as structure_path: 186 | protein = Protein.from_file(file_path=structure_path, name="PsaA") 187 | protein.uniprot_id = "P31522" 188 | system = ProteinSystem(components=[protein]) 189 | systems.append(system) 190 | 191 | with resources.path("kinoml.data.proteins", "kinoml_tests_4f8o_spruce.loop_db") as loop_db: 192 | featurizer = OEProteinStructureFeaturizer(loop_db=loop_db, use_multiprocessing=False) 193 | systems = featurizer.featurize(systems) 194 | # check number of residues 195 | assert len(systems[0].featurizations["last"].residues) == 239 196 | assert len(systems[1].featurizations["last"].residues) == 216 197 | assert len(systems[2].featurizations["last"].residues) == 109 198 | # check numbering of first residue 199 | assert systems[0].featurizations["last"].residues[0].resid == 1 200 | assert systems[1].featurizations["last"].residues[0].resid == 44 201 | assert systems[2].featurizations["last"].residues[0].resid == 47 202 | -------------------------------------------------------------------------------- /kinoml/modeling/SCHRODINGERModeling.py:
-------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | import subprocess 4 | from tempfile import NamedTemporaryFile 5 | from typing import Union 6 | 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | def run_prepwizard( 12 | schrodinger_directory: Union[Path, str], 13 | input_file: Union[Path, str], 14 | output_file: Union[Path, str], 15 | cap_termini: bool = True, 16 | build_loops: bool = True, 17 | sequence: Union[str, None] = None, 18 | chain_id: str = "", 19 | protein_pH: str = "neutral", 20 | propka_pH: float = 7.4, 21 | epik_pH: float = 7.4, 22 | force_field: str = "3", 23 | ): 24 | """ 25 | Run the prepwizard utility to prepare a protein structure. 26 | 27 | Parameters 28 | ---------- 29 | schrodinger_directory: Path or str 30 | The path to the directory of the Schrodinger installation. 31 | input_file: Path or str 32 | The path to the input file. 33 | output_file: Path or str 34 | The path to the output file. 35 | cap_termini: bool, default=True 36 | If termini should be capped. 37 | build_loops: bool, default=True 38 | If loops should be built. 39 | sequence: str or None 40 | The amino acid sequence in single letter codes that should be used for loop building. 41 | Also needs the chain_id parameter to work correctly. 42 | chain_id: str, default="" 43 | The chain ID of the protein that should be modeled based on the given sequence. 44 | protein_pH: str, default='neutral' 45 | The pH used during protonation of the protein ('very_low', 'low', 'neutral', 'high'). 46 | propka_pH: float, default=7.4 47 | Run PROPKA at given pH. 48 | epik_pH: float, default=7.4 49 | The pH used during protonation of the ligand. 
50 | force_field: str, default='3' 51 | Force field to use during minimization (2005, 3) 52 | """ 53 | schrodinger_directory = Path(schrodinger_directory) 54 | executable = str(schrodinger_directory / "utilities/prepwizard") 55 | standard_arguments = [ 56 | str(input_file), 57 | str(output_file), 58 | "-HOST", 59 | "localhost", 60 | "-WAIT", 61 | "-keepfarwat", 62 | "-disulfides", 63 | "-glycosylation", 64 | "-palmitoylation", 65 | "-mse", 66 | "-fillsidechains", 67 | "-samplewater", 68 | "-pH", 69 | protein_pH, 70 | "-propka_pH", 71 | str(propka_pH), 72 | "-minimize_adj_h", 73 | "-epik_pH", 74 | str(epik_pH), 75 | "-f", 76 | force_field, 77 | ] 78 | optional_arguments = [] 79 | if cap_termini: 80 | optional_arguments.append("-c") 81 | if build_loops: 82 | optional_arguments.append("-fillloops") 83 | 84 | if sequence: # one letter characters, 60 per line, no header 85 | with NamedTemporaryFile(mode="w", suffix=".fasta") as fasta_file: 86 | sequence = "\n".join([sequence[i : i + 60] for i in range(0, len(sequence), 60)]) 87 | fasta_file.write(f">entry:{chain_id}\n") 88 | fasta_file.write(sequence) 89 | fasta_file.flush() 90 | subprocess.run( 91 | [executable] 92 | + standard_arguments 93 | + optional_arguments 94 | + ["-fasta_file", fasta_file.name] 95 | ) 96 | else: 97 | subprocess.run([executable] + standard_arguments + optional_arguments) 98 | 99 | if logger.getEffectiveLevel() != logging.DEBUG: # remove prepwizard log 100 | paths = Path(".").glob(f"*{Path(input_file).stem}*") 101 | for path in paths: 102 | try: 103 | path.unlink() 104 | except FileNotFoundError: 105 | # may happen in multiprocessing of the same structure 106 | pass 107 | 108 | return 109 | 110 | 111 | def mae_to_pdb( 112 | schrodinger_directory: Union[str, Path], 113 | mae_file_path: Union[str, Path], 114 | pdb_file_path: Union[str, Path], 115 | ): 116 | """ 117 | Convert a structure file from MAE to PDB format. 
118 | 119 | Parameters 120 | ---------- 121 | schrodinger_directory: str or pathlib.Path 122 | The path to the directory of the Schrodinger installation. 123 | mae_file_path: str or pathlib.Path 124 | The path to the input file in MAE format. 125 | pdb_file_path: str or pathlib.Path 126 | The path to the output file in PDB format. 127 | """ 128 | schrodinger_directory = Path(schrodinger_directory) 129 | arguments = [ 130 | str(schrodinger_directory / "utilities/pdbconvert"), # executable 131 | "-imae", 132 | str(mae_file_path), 133 | "-opdb", 134 | str(pdb_file_path), # file paths 135 | ] 136 | subprocess.run(arguments) 137 | return 138 | 139 | 140 | def shape_screen( 141 | schrodinger_directory: Union[Path, str], 142 | query_path: Union[str, Path], 143 | library_path: Union[str, Path], 144 | output_sdf_path: Union[str, Path], 145 | flexible: bool = True, 146 | thorough_sampling: bool = True, 147 | keep_best_match_only: bool = True, 148 | ): 149 | """ 150 | Run the shape_screen tool to align a library of small molecules to the given shape query. 151 | 152 | Parameters 153 | ---------- 154 | schrodinger_directory: Path or str 155 | The path to the directory of the Schrodinger installation. 156 | query_path: Path or str 157 | The path to a valid shape query, e.g. an SDF file with one or more small molecules. 158 | library_path: Path or str 159 | The path to a valid ligand library for shape screening, e.g. an SDF file with one or more 160 | small molecules. 161 | output_sdf_path: Path or str 162 | The path to the output SDF file of the shape screening. 163 | flexible: bool, default=True 164 | If conformers shall be generated for the small molecule library to screen. 165 | thorough_sampling: bool, default=True 166 | If conformations shall be thoroughly sampled. 167 | keep_best_match_only: bool, default=True 168 | In case of multiple shape queries, if only the results for the best matching shape query shall 169 | be returned.
170 | """ 171 | import gzip 172 | import shutil 173 | 174 | schrodinger_directory = Path(schrodinger_directory) 175 | executable = str(schrodinger_directory / "shape_screen") 176 | standard_arguments = [ 177 | "-shape", 178 | str(query_path), 179 | "-screen", 180 | str(library_path), 181 | "-osd", 182 | "-atomtypes", 183 | "element", 184 | "-HOST", 185 | "localhost", 186 | "-WAIT", 187 | ] 188 | optional_arguments = [] 189 | if flexible: 190 | optional_arguments.append("-flex") 191 | optional_arguments.append("-max") 192 | optional_arguments.append("800") 193 | if thorough_sampling: 194 | optional_arguments += ["-sample", "thorough"] 195 | if keep_best_match_only: 196 | optional_arguments.append("-best") 197 | 198 | subprocess.run([executable] + standard_arguments + optional_arguments) 199 | if logger.getEffectiveLevel() != logging.DEBUG: # remove shape_screen log and okay 200 | paths = [ 201 | Path(".") / f"{Path(query_path).stem}_shape.log", 202 | Path(".") / f"{Path(query_path).stem}_shape.okay", 203 | ] 204 | for path in paths: 205 | try: 206 | path.unlink() 207 | except FileNotFoundError: 208 | # may happen in multiprocessing of the same query file 209 | pass 210 | 211 | logger.debug("Unzipping and renaming results ...") 212 | output_sdfgz_path = Path(".") / f"{Path(query_path).stem}_align.sdfgz" 213 | with gzip.open(output_sdfgz_path, "rb") as sdfgz: 214 | with open(output_sdf_path, "wb") as sdf: 215 | shutil.copyfileobj(sdfgz, sdf) 216 | output_sdfgz_path.unlink() 217 | 218 | return 219 | -------------------------------------------------------------------------------- /kinoml/tests/features/test_complexes.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test complex featurizers of `kinoml.features` 3 | """ 4 | import pandas as pd 5 | 6 | 7 | def test_oecomplexfeaturizer(): 8 | """Check OEComplexFeaturizer with different inputs.""" 9 | from kinoml.core.ligands import Ligand 10 | from kinoml.core.proteins import 
Protein 11 | from kinoml.core.systems import ProteinLigandComplex 12 | from kinoml.features.complexes import OEComplexFeaturizer 13 | 14 | systems = [] 15 | protein = Protein(pdb_id="4f8o", name="PsaA") 16 | ligand = Ligand(name="AEBSF") 17 | system = ProteinLigandComplex(components=[protein, ligand]) 18 | systems.append(system) 19 | protein = Protein.from_pdb(pdb_id="4f8o", name="PsaA") 20 | protein.uniprot_id = "P31522" 21 | protein.chain_id = "A" 22 | protein.alternate_location = "B" 23 | protein.expo_id = "AES" 24 | ligand = Ligand(name="AEBSF") 25 | system = ProteinLigandComplex(components=[protein, ligand]) 26 | systems.append(system) 27 | featurizer = OEComplexFeaturizer(use_multiprocessing=False) 28 | systems = featurizer.featurize(systems) 29 | # check LIG exists 30 | assert len(systems[0].featurizations["last"].select_atoms("resname LIG").residues) == 1 31 | assert len(systems[1].featurizations["last"].select_atoms("resname LIG").residues) == 1 32 | # check caps 33 | assert ( 34 | len(systems[0].featurizations["last"].select_atoms("resname ACE or resname NME").residues) 35 | == 2 36 | ) 37 | assert ( 38 | len(systems[1].featurizations["last"].select_atoms("resname ACE or resname NME").residues) 39 | == 1 40 | ) 41 | # check number of residues 42 | assert len(systems[0].featurizations["last"].residues) == 240 43 | assert len(systems[1].featurizations["last"].residues) == 217 44 | # check numbering of first residue 45 | assert systems[0].featurizations["last"].residues[0].resid == 1 46 | assert systems[1].featurizations["last"].residues[0].resid == 44 47 | 48 | 49 | def test_oedockingfeaturizer_fred(): 50 | """Check OEDockingFeaturizer with Fred and different inputs.""" 51 | from kinoml.core.ligands import Ligand 52 | from kinoml.core.proteins import Protein 53 | from kinoml.core.systems import ProteinLigandComplex 54 | from kinoml.features.complexes import OEDockingFeaturizer 55 | 56 | systems = [] 57 | # define the binding site for docking via 
co-crystallized ligand 58 | protein = Protein(pdb_id="4yne", name="NTRK1") 59 | protein.expo_id = "4EK" 60 | ligand = Ligand( 61 | smiles="C1CC(N(C1)C2=NC3=C(C=NN3C=C2)NC(=O)N4CCC(C4)O)C5=C(C=CC(=C5)F)F", 62 | name="larotrectinib_fred", 63 | ) 64 | system = ProteinLigandComplex(components=[protein, ligand]) 65 | systems.append(system) 66 | # define the binding site for docking via residue IDs 67 | protein = Protein(pdb_id="4yne", name="NTRK1") 68 | protein.pocket_resids = [ 69 | 516, 70 | 517, 71 | 521, 72 | 524, 73 | 542, 74 | 544, 75 | 573, 76 | 589, 77 | 590, 78 | 591, 79 | 592, 80 | 595, 81 | 596, 82 | 654, 83 | 655, 84 | 656, 85 | 657, 86 | 667, 87 | 668, 88 | ] 89 | ligand = Ligand( 90 | smiles="C1CC(N(C1)C2=NC3=C(C=NN3C=C2)NC(=O)N4CCC(C4)O)C5=C(C=CC(=C5)F)F", 91 | name="larotrectinib_fred_2", 92 | ) 93 | system = ProteinLigandComplex(components=[protein, ligand]) 94 | systems.append(system) 95 | featurizer = OEDockingFeaturizer(method="Fred", use_multiprocessing=False) 96 | systems = featurizer.featurize(systems) 97 | # check docking score was stored 98 | assert isinstance(systems[0].featurizations["last"]._topology.docking_score, float) 99 | # check LIG exists 100 | assert len(systems[0].featurizations["last"].select_atoms("resname LIG").residues) == 1 101 | assert len(systems[1].featurizations["last"].select_atoms("resname LIG").residues) == 1 102 | # check caps 103 | assert ( 104 | len(systems[0].featurizations["last"].select_atoms("resname ACE or resname NME").residues) 105 | == 10 106 | ) 107 | assert ( 108 | len(systems[1].featurizations["last"].select_atoms("resname ACE or resname NME").residues) 109 | == 10 110 | ) 111 | # check numbering of first residue 112 | assert systems[0].featurizations["last"].residues[0].resid == 501 113 | assert systems[1].featurizations["last"].residues[0].resid == 501 114 | 115 | 116 | def test_oedockingfeaturizer_hybrid(): 117 | """Check OEDockingFeaturizer with Hybrid.""" 118 | from kinoml.core.ligands import Ligand 119 
| from kinoml.core.proteins import Protein 120 | from kinoml.core.systems import ProteinLigandComplex 121 | from kinoml.features.complexes import OEDockingFeaturizer 122 | 123 | systems = [] 124 | protein = Protein(pdb_id="4yne", name="NTRK1") 125 | protein.expo_id = "4EK" 126 | ligand = Ligand( 127 | smiles="C1CC(N(C1)C2=NC3=C(C=NN3C=C2)NC(=O)N4CCC(C4)O)C5=C(C=CC(=C5)F)F", 128 | name="larotrectinib_hybrid", 129 | ) 130 | system = ProteinLigandComplex(components=[protein, ligand]) 131 | systems.append(system) 132 | featurizer = OEDockingFeaturizer(method="Hybrid", use_multiprocessing=False) 133 | systems = featurizer.featurize(systems) 134 | # check LIG exists 135 | assert len(systems[0].featurizations["last"].select_atoms("resname LIG").residues) == 1 136 | # check caps 137 | assert ( 138 | len(systems[0].featurizations["last"].select_atoms("resname ACE or resname NME").residues) 139 | == 10 140 | ) 141 | # check numbering of first residue 142 | assert systems[0].featurizations["last"].residues[0].resid == 501 143 | 144 | 145 | def test_oedockingfeaturizer_posit(): 146 | """Check OEDockingFeaturizer with Posit.""" 147 | from kinoml.core.ligands import Ligand 148 | from kinoml.core.proteins import Protein 149 | from kinoml.core.systems import ProteinLigandComplex 150 | from kinoml.features.complexes import OEDockingFeaturizer 151 | 152 | systems = [] 153 | protein = Protein(pdb_id="4yne", name="NTRK1") 154 | protein.expo_id = "4EK" 155 | ligand = Ligand( 156 | smiles="C1CC(N(C1)C2=NC3=C(C=NN3C=C2)NC(=O)N4CCC(C4)O)C5=C(C=CC(=C5)F)F", 157 | name="larotrectinib_posit", 158 | ) 159 | system = ProteinLigandComplex(components=[protein, ligand]) 160 | systems.append(system) 161 | featurizer = OEDockingFeaturizer(method="Posit", use_multiprocessing=False) 162 | systems = featurizer.featurize(systems) 163 | # check LIG exists 164 | assert len(systems[0].featurizations["last"].select_atoms("resname LIG").residues) == 1 165 | # check caps 166 | assert ( 167 | 
len(systems[0].featurizations["last"].select_atoms("resname ACE or resname NME").residues) 168 | == 10 169 | ) 170 | # check numbering of first residue 171 | assert systems[0].featurizations["last"].residues[0].resid == 501 172 | # check posit probability was stored 173 | assert isinstance(systems[0].featurizations["last"]._topology.posit_probability, float) 174 | 175 | 176 | def test_mostsimilarpdbligandfeaturizer(): 177 | """Check MostSimilarPDBLigandFeaturizer with different similarity metrics.""" 178 | from kinoml.core.ligands import Ligand 179 | from kinoml.core.proteins import Protein 180 | from kinoml.core.systems import ProteinLigandComplex 181 | from kinoml.features.complexes import MostSimilarPDBLigandFeaturizer 182 | 183 | for metric in ["mcs", "fingerprint", "openeye_shape"]: 184 | systems = [] 185 | protein = Protein(uniprot_id="P04629", name="NTRK1") 186 | ligand = Ligand( 187 | smiles="C1CC(N(C1)C2=NC3=C(C=NN3C=C2)NC(=O)N4CCC(C4)O)C5=C(C=CC(=C5)F)F", 188 | name="larotrectinib", 189 | ) 190 | system = ProteinLigandComplex(components=[protein, ligand]) 191 | systems.append(system) 192 | featurizer = MostSimilarPDBLigandFeaturizer( 193 | similarity_metric=metric, use_multiprocessing=False 194 | ) 195 | systems = featurizer.featurize(systems) 196 | assert isinstance(systems[0].protein.pdb_id, str) 197 | assert isinstance(systems[0].protein.chain_id, str) 198 | assert isinstance(systems[0].protein.expo_id, str) 199 | 200 | 201 | def test_klifsconformationtemplatesfeaturizer(): 202 | """Check KLIFSConformationTemplatesFeaturizer with fingerprint only.""" 203 | from kinoml.core.ligands import Ligand 204 | from kinoml.core.proteins import KLIFSKinase 205 | from kinoml.core.systems import ProteinLigandComplex 206 | from kinoml.features.complexes import KLIFSConformationTemplatesFeaturizer 207 | 208 | systems = [] 209 | protein = KLIFSKinase(uniprot_id="P04629", name="NTRK1") 210 | ligand = Ligand( 211 | 
smiles="C1CC(N(C1)C2=NC3=C(C=NN3C=C2)NC(=O)N4CCC(C4)O)C5=C(C=CC(=C5)F)F", 212 | name="larotrectinib", 213 | ) 214 | system = ProteinLigandComplex(components=[protein, ligand]) 215 | systems.append(system) 216 | featurizer = KLIFSConformationTemplatesFeaturizer( 217 | similarity_metric="fingerprint", use_multiprocessing=False 218 | ) 219 | systems = featurizer.featurize(systems) 220 | # check feature is dataframe 221 | assert isinstance(systems[0].featurizations["last"], pd.DataFrame) 222 | # check dataframe is not empty 223 | assert len(systems[0].featurizations["last"]) > 0 224 | -------------------------------------------------------------------------------- /kinoml/docking/SCHRODINGERDocking.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | import subprocess 4 | from tempfile import NamedTemporaryFile 5 | from typing import List, Union 6 | 7 | from appdirs import user_cache_dir 8 | 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | def run_glide( 14 | schrodinger_directory: Union[Path, str], 15 | input_file_mae: Union[Path, str], 16 | output_file_sdf: Union[Path, str], 17 | mols_smiles: List[str], 18 | ligand_resname: Union[str, None], 19 | n_poses: int = 1, 20 | mols_names: Union[List[str], None] = None, 21 | shape_restrain: bool = True, 22 | macrocyles: bool = False, 23 | precision: str = "XP", 24 | cache_dir: Union[Path, str] = user_cache_dir(), 25 | ): 26 | """ 27 | Run glide for protein ligand docking. 28 | 29 | Parameters 30 | ---------- 31 | schrodinger_directory: Path or str 32 | The path to the directory of the Schrodinger installation. 33 | input_file_mae: Path or str 34 | The path to the input file in MAE format containing the protein structure to dock to and a 35 | co-crystallized ligand in the binding pocket of interest. 36 | output_file_sdf: Path or str 37 | The path to the output file of the generated in docking poses in SDF format. 
38 | mols_smiles: list of str 39 | The molecules to dock as SMILES representation. 40 | ligand_resname: str or None 41 | The resname of the co-crystallized ligand, which will be used for pocket definition. 42 | mols_names: None or list of str, default=None 43 | The names of the molecules to dock. Will be used as molecule title in the SDF file. If 44 | None, names will be numbers (1,...,len(mols_smiles)). 45 | n_poses: int, default=1 46 | Number of poses to generate per molecule. 47 | shape_restrain: bool, default=True 48 | If the co-crystallized ligand shall be used for shape restrained docking. 49 | macrocyles: bool, default=False 50 | Macrocycle conformations will be sampled with an appropriate algorithm. All non- 51 | macrocyclic molecules detected by SCHRODINGER will be skipped. 52 | precision: str, default="XP" 53 | The docking precision to use ["HTVS", "SP", "XP"]. 54 | cache_dir: Path or str, default=appdirs.user_cache_dir() 55 | Path to a directory for caching grids for docking. 56 | """ 57 | import shutil 58 | 59 | from rdkit import Chem 60 | from rdkit.Chem import AllChem 61 | 62 | from ..utils import sha256_objects 63 | 64 | if precision not in ["HTVS", "SP", "XP"]: 65 | raise ValueError( 66 | f"Only 'HTVS', 'SP', 'XP' are allowed for precision, you provided {precision}!"
67 | ) 68 | 69 | schrodinger_directory = Path(schrodinger_directory).resolve() 70 | input_file_mae = Path(input_file_mae).resolve() 71 | with NamedTemporaryFile(mode="w", suffix=".mae") as protein_file_mae, NamedTemporaryFile( 72 | mode="w", suffix=".mae" 73 | ) as ligand_file_mae, NamedTemporaryFile( 74 | mode="w", suffix=".mae" 75 | ) as protein_ligand_file_mae, NamedTemporaryFile( 76 | mode="w", suffix=".sdf" 77 | ) as mols_file_sdf, NamedTemporaryFile( 78 | mode="w", suffix=".in" 79 | ) as grid_input_file, NamedTemporaryFile( 80 | mode="w", suffix=".in" 81 | ) as docking_input_file: 82 | 83 | logger.debug("Selecting and writing protein from MAE input file ...") 84 | subprocess.run( 85 | [ 86 | str(schrodinger_directory / "run"), 87 | "delete_atoms.py", 88 | str(input_file_mae), 89 | protein_file_mae.name, 90 | "-asl", 91 | "not protein", 92 | ] 93 | ) 94 | 95 | with NamedTemporaryFile(mode="w", suffix=".mae") as ligand_file_raw_mae: 96 | logger.debug("Selecting and writing ligand from MAE input file ...") 97 | subprocess.run( # first everything that could be ligand 98 | [ 99 | str(schrodinger_directory / "run"), 100 | "delete_atoms.py", 101 | str(input_file_mae), 102 | ligand_file_raw_mae.name, 103 | "-asl", 104 | f"not res. {ligand_resname}" if ligand_resname else "not ligand", 105 | ] 106 | ) 107 | subprocess.run( # then only first molecule from potential ligands 108 | [ 109 | str(schrodinger_directory / "run"), 110 | "delete_atoms.py", 111 | ligand_file_raw_mae.name, 112 | ligand_file_mae.name, 113 | "-asl", 114 | "mol. 
>1", 115 | ] 116 | ) 117 | 118 | logger.debug("Merging protein and ligand in the right order ...") 119 | subprocess.run( 120 | [ 121 | str(schrodinger_directory / "utilities/structcat"), 122 | "-i", 123 | protein_file_mae.name, 124 | ligand_file_mae.name, 125 | "-o", 126 | protein_ligand_file_mae.name, 127 | ] 128 | ) 129 | 130 | logger.debug("Writing molecules to SDF ...") 131 | if not mols_names or len(mols_names) != len(mols_smiles): 132 | logger.debug("Creating molecule names ...") 133 | mols_names = [str(x) for x in range(1, len(mols_smiles) + 1)] 134 | sd_writer = Chem.SDWriter(mols_file_sdf.name) 135 | for smiles, name in zip(mols_smiles, mols_names): 136 | mol = Chem.MolFromSmiles(smiles) 137 | if not mol: 138 | logger.debug(f"Skipping molecule {name} with erroneous smiles ...") 139 | continue 140 | mol.SetProp("_Name", name) 141 | mol = Chem.AddHs(mol) 142 | AllChem.EmbedMolecule(mol) 143 | sd_writer.write(mol) 144 | 145 | logger.debug("Writing input file for grid generation ...") 146 | grid_input_file.write(f"RECEP_FILE '{protein_ligand_file_mae.name}'\n") 147 | grid_input_file.write("LIGAND_INDEX 2\n") 148 | grid_input_file.flush() 149 | 150 | grid_file_path = Path(cache_dir) / ( 151 | sha256_objects([input_file_mae, ligand_resname]) + ".zip" 152 | ) # caching via hash based on input structure and chosen ligand 153 | if grid_file_path.is_file(): 154 | logger.debug("Found cached grid file ..") 155 | else: 156 | logger.debug("Generating grid for docking ...") 157 | subprocess.run( 158 | [ 159 | str(schrodinger_directory / "glide"), 160 | grid_input_file.name, 161 | "-HOST", 162 | "localhost", 163 | "-WAIT", 164 | "-OVERWRITE", 165 | ] 166 | ) 167 | shutil.move( 168 | str(Path(".") / (Path(grid_input_file.name).stem + ".zip")), grid_file_path 169 | ) 170 | 171 | if logger.getEffectiveLevel() != 10: # remove grid logs etc. 
172 | paths = Path(".").glob(f"*{Path(grid_input_file.name).stem}*") 173 | for path in paths: 174 | path.unlink() 175 | 176 | logger.debug("Writing input file for docking ...") 177 | docking_input_file.write(f"GRIDFILE '{str(grid_file_path)}'\n") 178 | docking_input_file.write(f"LIGANDFILE '{mols_file_sdf.name}'\n") 179 | docking_input_file.write(f"LIGPREP True\n") 180 | docking_input_file.write("POSE_OUTTYPE ligandlib_sd\n") 181 | docking_input_file.write(f"COMPRESS_POSES False\n") 182 | docking_input_file.write(f"POSES_PER_LIG {n_poses}\n") 183 | docking_input_file.write(f"PRECISION {precision}\n") 184 | if shape_restrain: 185 | docking_input_file.write(f"SHAPE_RESTRAIN True\n") 186 | docking_input_file.write(f"SHAPE_REF_LIGAND_FILE '{ligand_file_mae.name}'\n") 187 | if macrocyles: 188 | docking_input_file.write(f"MACROCYCLE True\n") 189 | docking_input_file.flush() 190 | 191 | logger.debug("Running docking ...") 192 | subprocess.run( 193 | [ 194 | str(schrodinger_directory / "glide"), 195 | docking_input_file.name, 196 | "-HOST", 197 | "localhost", 198 | "-WAIT", 199 | "-OVERWRITE", 200 | ] 201 | ) 202 | 203 | logger.debug("Filtering poses for appropriate number ...") 204 | docking_input_file_path = Path(docking_input_file.name) 205 | sd_file_path = Path(".") / (docking_input_file_path.stem + "_lib.sdf") 206 | if not sd_file_path.is_file(): 207 | logger.debug("No docking poses were generated during docking ...") 208 | return 209 | supplier = Chem.SDMolSupplier(str(sd_file_path), removeHs=False) 210 | sd_writer = Chem.SDWriter(str(output_file_sdf)) 211 | mol_counter_dict = {} 212 | for mol in supplier: 213 | # SDF from glide is sorted by docking score, but mols are in mixed order 214 | name = mol.GetProp("_Name") 215 | if name not in mol_counter_dict.keys(): 216 | mol_counter_dict[name] = 0 217 | if mol_counter_dict[name] < n_poses: 218 | sd_writer.write(mol) 219 | mol_counter_dict[name] += 1 220 | sd_file_path.unlink() # manually delete file 221 | 222 | if 
logger.getEffectiveLevel() != 10: # remove docking logs etc. 223 | paths = Path(".").glob(f"*{docking_input_file_path.stem}*") 224 | for path in paths: 225 | path.unlink() 226 | 227 | return 228 | -------------------------------------------------------------------------------- /kinoml/features/protein.py: -------------------------------------------------------------------------------- 1 | """ 2 | Featurizers that mostly concern protein-based models 3 | """ 4 | from __future__ import annotations 5 | from collections import Counter 6 | import logging 7 | from typing import Union 8 | 9 | import numpy as np 10 | 11 | from .core import ParallelBaseFeaturizer, BaseOneHotEncodingFeaturizer, OEBaseModelingFeaturizer 12 | from ..core.proteins import Protein, KLIFSKinase 13 | from ..core.sequences import AminoAcidSequence 14 | from ..core.systems import ProteinSystem, ProteinLigandComplex 15 | 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | 20 | class SingleProteinFeaturizer(ParallelBaseFeaturizer): 21 | """ 22 | Provides a minimally useful ``._supports()`` method for all Protein-like featurizers. 
23 | """ 24 | 25 | _COMPATIBLE_PROTEIN_TYPES = (Protein, KLIFSKinase) 26 | 27 | def __init__(self, **kwargs): 28 | super().__init__(**kwargs) 29 | 30 | def _supports(self, system: Union[ProteinSystem, ProteinLigandComplex]) -> bool: 31 | """ 32 | Check that exactly one protein is present in the System 33 | """ 34 | super_checks = super()._supports(system) 35 | proteins = [c for c in system.components if isinstance(c, self._COMPATIBLE_PROTEIN_TYPES)] 36 | return all([super_checks, len(proteins) == 1]) 37 | 38 | 39 | class AminoAcidCompositionFeaturizer(SingleProteinFeaturizer): 40 | 41 | """Featurizes the protein using the composition of the residues in the binding site.""" 42 | 43 | def __init__(self, **kwargs): 44 | super().__init__(**kwargs) 45 | 46 | # Initialize a Counter object with 0 counts 47 | _counter = Counter(sorted(AminoAcidSequence.ALPHABET)) 48 | for k in _counter.keys(): 49 | _counter[k] = 0 50 | 51 | def _featurize_one( 52 | self, system: Union[ProteinSystem, ProteinLigandComplex] 53 | ) -> Union[np.array, None]: 54 | """ 55 | Featurizes a protein using the residue count in the sequence. 56 | 57 | Parameters 58 | ---------- 59 | system: ProteinSystem or ProteinLigandComplex 60 | The System to be featurized. 61 | 62 | Returns 63 | ------- 64 | : np.array or None 65 | The count of amino acids in the binding site. 66 | """ 67 | count = self._counter.copy() 68 | try: 69 | sequence = system.protein.sequence 70 | except ValueError: # e.g. 
erroneous uniprot_id in lazy instantiation 71 | return None 72 | count.update(system.protein.sequence) 73 | sorted_count = sorted(count.items(), key=lambda kv: kv[0]) 74 | return np.array([number for _, number in sorted_count]) 75 | 76 | 77 | class OneHotEncodedSequenceFeaturizer(BaseOneHotEncodingFeaturizer, SingleProteinFeaturizer): 78 | 79 | """Featurizes the sequence of the protein to a one hot encoding.""" 80 | 81 | ALPHABET = AminoAcidSequence.ALPHABET 82 | 83 | def __init__(self, sequence_type: str = "full", **kwargs): 84 | """ 85 | Featurizes the sequence of the protein to a one hot encoding. 86 | 87 | Parameters 88 | ---------- 89 | sequence_type: str, default=full 90 | The sequence to use for one hot encoding ('full', 'klifs_kinase' or 'klifs_structure'). 91 | """ 92 | if sequence_type not in ["full", "klifs_kinase", "klifs_structure"]: 93 | raise ValueError( 94 | "Only 'full', 'klifs_kinase' and 'klifs_structure' are supported sequence_types, " 95 | f"you provided {sequence_type}." 96 | ) 97 | self.sequence_type = sequence_type 98 | if sequence_type != "full": 99 | self.ALPHABET += "-" # add gap symbol for KLIFS sequence to ALPHABET 100 | super().__init__(**kwargs) # update ALPHABET first 101 | 102 | def _retrieve_sequence(self, system: Union[ProteinSystem, ProteinLigandComplex]) -> str: 103 | try: 104 | if self.sequence_type == "full": 105 | sequence = system.protein.sequence 106 | elif self.sequence_type == "klifs_kinase": 107 | sequence = system.protein.kinase_klifs_sequence 108 | else: 109 | sequence = system.protein.structure_klifs_sequence 110 | except ValueError: # e.g. 
erroneous uniprot_id in lazy instantiation 111 | return "" 112 | return sequence 113 | 114 | 115 | class OEProteinStructureFeaturizer(OEBaseModelingFeaturizer, SingleProteinFeaturizer): 116 | """ 117 | Given systems with exactly one protein, prepare the protein structure by: 118 | 119 | - modeling missing loops with OESpruce according to the PDB header unless 120 | a custom sequence is specified via the `uniprot_id` or `sequence` 121 | attribute in the protein component (see below), missing sequences at 122 | N- and C-termini are not modeled 123 | - building missing side chains 124 | - substitutions, deletions and insertions, if a `uniprot_id` or `sequence` 125 | attribute is provided for the protein component alteration will be 126 | modeled with OESpruce, if an alteration could not be modeled, the 127 | corresponding mismatch in the structure will be deleted 128 | - removing everything but protein and water 129 | - protonation at pH 7.4 130 | 131 | The protein component of each system must be a `core.proteins.Protein` 132 | or a subclass thereof, must be initialized with toolkit='OpenEye' and 133 | give access to a molecular structure, e.g. via a pdb_id. Additionally, 134 | the protein component can have the following optional attributes to 135 | customize the protein modeling: 136 | 137 | - `name`: A string specifying the name of the protein, will be used for 138 | generating the output file name. 139 | - `chain_id`: A string specifying which chain should be used. 140 | - `alternate_location`: A string specifying which alternate location 141 | should be used. 142 | - `expo_id`: A string specifying a ligand bound to the protein of 143 | interest. This is especially useful if multiple proteins are found in 144 | one PDB structure. 145 | - `uniprot_id`: A string specifying the UniProt ID that will be used to 146 | fetch the amino acid sequence from UniProt, which will be used for 147 | modeling the protein. 
This will supersede the sequence information 148 | given in the PDB header. 149 | - `sequence`: A string specifying the amino acid sequence in 150 | one-letter-codes that should be used during modeling the protein. This 151 | will supersede a given `uniprot_id` and the sequence information given 152 | in the PDB header. 153 | 154 | Parameters 155 | ---------- 156 | loop_db: str 157 | The path to the loop database used by OESpruce to model missing loops. 158 | cache_dir: str, Path or None, default=None 159 | Path to directory used for saving intermediate files. If None, default 160 | location provided by `appdirs.user_cache_dir()` will be used. 161 | output_dir: str, Path or None, default=None 162 | Path to directory used for saving output files. If None, output 163 | structures will not be saved. 164 | use_multiprocessing : bool, default=True 165 | If multiprocessing to use. 166 | n_processes : int or None, default=None 167 | How many processes to use in case of multiprocessing. Defaults to 168 | number of available CPUs. 169 | """ 170 | 171 | from MDAnalysis.core.universe import Universe 172 | 173 | def __init__(self, **kwargs): 174 | super().__init__(**kwargs) 175 | 176 | def _featurize_one(self, system: ProteinSystem) -> Union[Universe, None]: 177 | """ 178 | Prepare a protein structure. 179 | 180 | Parameters 181 | ---------- 182 | system: ProteinSystem 183 | A system object holding a protein component. 184 | 185 | Returns 186 | ------- 187 | : Universe or None 188 | An MDAnalysis universe of the featurized system. None if no design unit was found. 189 | """ 190 | from pathlib import Path 191 | 192 | from ..modeling.MDAnalysisModeling import read_molecule 193 | 194 | structure = self._read_protein_structure(system.protein) 195 | if structure is None: 196 | logger.warning( 197 | f"Could not read protein structure for {system.protein}, returning None!" 
198 | ) 199 | return None 200 | 201 | logging.debug("Preparing protein structure ...") 202 | design_unit = self._get_design_unit( 203 | structure=structure, 204 | chain_id=system.protein.chain_id if hasattr(system.protein, "chain_id") else None, 205 | alternate_location=system.protein.alternate_location 206 | if hasattr(system.protein, "alternate_location") 207 | else None, 208 | has_ligand=hasattr(system.protein, "expo_id"), 209 | ligand_name=system.protein.expo_id if hasattr(system.protein, "expo_id") else None, 210 | model_loops_and_caps=False if system.protein.sequence else True, 211 | ) # if sequence is given model loops and caps separately later 212 | if not design_unit: 213 | logging.debug("No design unit found, returning None!") 214 | return None 215 | 216 | logging.debug("Extracting design unit components ...") 217 | protein, solvent = self._get_components( 218 | design_unit=design_unit, 219 | chain_id=system.protein.chain_id if hasattr(system.protein, "chain_id") else None, 220 | )[:-1] 221 | 222 | if system.protein.sequence: 223 | first_id = 1 224 | if "construct_range" in system.protein.metadata.keys(): 225 | first_id = int(system.protein.metadata["construct_range"].split("-")[0]) 226 | protein = self._process_protein( 227 | protein_structure=protein, 228 | amino_acid_sequence=system.protein.sequence, 229 | first_id=first_id, 230 | ) 231 | 232 | logging.debug("Assembling components ...") 233 | solvated_protein = self._assemble_components(protein, solvent) 234 | 235 | logging.debug("Updating pdb header ...") 236 | solvated_protein = self._update_pdb_header( 237 | solvated_protein, protein_name=system.protein.name 238 | ) 239 | 240 | logging.debug("Writing results ...") 241 | file_path = self._write_results( 242 | solvated_protein, 243 | "_".join( 244 | [ 245 | info 246 | for info in [ 247 | system.protein.name, 248 | system.protein.pdb_id 249 | if system.protein.pdb_id 250 | else Path(system.protein.metadata["file_path"]).stem, 251 | 
f"chain{system.protein.chain_id}" 252 | if hasattr(system.protein, "chain_id") 253 | else None, 254 | f"altloc{system.protein.alternate_location}" 255 | if hasattr(system.protein, "alternate_location") 256 | else None, 257 | ] 258 | if info 259 | ] 260 | ), 261 | ) 262 | 263 | logging.debug("Generating new MDAnalysis universe ...") 264 | structure = read_molecule(file_path) 265 | 266 | if not self.output_dir: 267 | logging.debug("Removing structure file ...") 268 | file_path.unlink() 269 | 270 | return structure 271 | -------------------------------------------------------------------------------- /kinoml/features/ligand.py: -------------------------------------------------------------------------------- 1 | """ 2 | Featurizers that mostly concern ligand-based models 3 | """ 4 | 5 | from __future__ import annotations 6 | from typing import Union 7 | 8 | import numpy as np 9 | from openff.toolkit.utils.exceptions import SMILESParseError, RadicalsNotSupportedError 10 | from rdkit import Chem 11 | 12 | from .core import ParallelBaseFeaturizer, BaseOneHotEncodingFeaturizer 13 | from ..core.systems import LigandSystem, ProteinLigandComplex 14 | from ..core.ligands import Ligand 15 | 16 | 17 | class SingleLigandFeaturizer(ParallelBaseFeaturizer): 18 | """ 19 | Provides a minimally useful ``._supports()`` method for all Ligand-like featurizers. 
20 | """ 21 | 22 | _COMPATIBLE_LIGAND_TYPES = (Ligand,) 23 | 24 | def __init__(self, **kwargs): 25 | super().__init__(**kwargs) 26 | 27 | def _supports(self, system: Union[LigandSystem, ProteinLigandComplex]) -> bool: 28 | """ 29 | Check that exactly one ligand is present in the System 30 | """ 31 | super_checks = super()._supports(system) 32 | ligands = [c for c in system.components if isinstance(c, self._COMPATIBLE_LIGAND_TYPES)] 33 | return all([super_checks, len(ligands) == 1]) 34 | 35 | 36 | class MorganFingerprintFeaturizer(SingleLigandFeaturizer): 37 | """ 38 | Given a ``System`` containing one ``Ligand`` component, convert it to an RDKit molecule and 39 | generate the Morgan fingerprints bitvectors. 40 | 41 | Parameters 42 | ---------- 43 | radius: int, optional=2 44 | Morgan fingerprint neighborhood radius 45 | nbits: int, optional=512 46 | Length of the resulting bit vector 47 | """ 48 | 49 | def __init__(self, radius: int = 2, nbits: int = 512, **kwargs): 50 | super().__init__(**kwargs) 51 | self.radius = radius 52 | self.nbits = nbits 53 | 54 | def _featurize_one( 55 | self, system: Union[LigandSystem, ProteinLigandComplex] 56 | ) -> Union[np.ndarray, None]: 57 | """ 58 | Return the Morgan fingerprint for the given system. 59 | 60 | Parameters 61 | ---------- 62 | system: LigandSystem or ProteinLigandComplex 63 | The System to be featurized. 
64 | 65 | Returns 66 | ------- 67 | : np.array or None 68 | """ 69 | from rdkit.Chem import RemoveHs 70 | from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect 71 | 72 | try: # catch erroneous smiles not yet interpreted in case of lazy instantiation 73 | rdkit_mol = system.ligand.molecule.to_rdkit() 74 | except (SMILESParseError, RadicalsNotSupportedError): 75 | return None 76 | 77 | rdkit_mol = RemoveHs(rdkit_mol) 78 | fp = GetMorganFingerprintAsBitVect(rdkit_mol, radius=self.radius, nBits=self.nbits) 79 | return np.asarray(fp, dtype="int64") 80 | 81 | 82 | class OneHotSMILESFeaturizer(BaseOneHotEncodingFeaturizer, SingleLigandFeaturizer): 83 | 84 | """ 85 | One-hot encodes a ``Ligand`` from a SMILES representation. 86 | 87 | Attributes 88 | ---------- 89 | ALPHABET: str 90 | Defines the character-integer mapping (as a sequence) 91 | of the one-hot encoding. 92 | """ 93 | 94 | ALPHABET = ( 95 | "BCFHIKNOPSUVWY" # atoms 96 | "acegilnosru" # aromatic atoms 97 | "-=#" # bonds 98 | "1234567890" # ring closures 99 | ".*" # disconnections 100 | "()" # branches 101 | "/+@:[]%\\" # other characters 102 | "LR$" # single-char representation of Cl, Br, @@ 103 | ) 104 | 105 | def __init__(self, smiles_type: str = "canonical", **kwargs): 106 | """ 107 | One-hot encodes a ``Ligand`` from a SMILES representation. 108 | 109 | Parameters 110 | ---------- 111 | smiles_type: str, default=canonical 112 | The smiles type to use ('canonical' or 'raw'). 113 | """ 114 | super().__init__(**kwargs) 115 | if smiles_type not in ["canonical", "raw"]: 116 | raise ValueError( 117 | "Only 'canonical' and 'raw' are supported smiles_type, you provided " 118 | f"{smiles_type}." 119 | ) 120 | self.smiles_type = smiles_type 121 | 122 | def _retrieve_sequence(self, system: Union[LigandSystem, ProteinLigandComplex]) -> str: 123 | """ 124 | Get SMILES string from a `Ligand`-like component and postprocesses it. 
125 | 126 | Double element symbols (such as `Cl`, ``Br`` for atoms and ``@@`` for chirality) 127 | are replaced with single element symbols (`L`, ``R`` and ``$`` respectively). 128 | """ 129 | try: 130 | if self.smiles_type == "canonical": 131 | smiles = system.ligand.molecule.to_smiles(explicit_hydrogens=False) 132 | else: 133 | smiles = system.ligand.metadata["smiles"] 134 | except SMILESParseError: # erroneous SMILES string 135 | return "" 136 | except KeyError: # no SMILES string given during initialization 137 | return "" 138 | 139 | return smiles.replace("Cl", "L").replace("Br", "R").replace("@@", "$") 140 | 141 | 142 | class GraphLigandFeaturizer(SingleLigandFeaturizer): 143 | 144 | """ 145 | Creates a graph representation of a `Ligand`-like component. 146 | Each node (atom) is decorated with several RDKit descriptors 147 | Check ```self._per_atom_features``` for details. 148 | 149 | Parameters 150 | ---------- 151 | max_in_ring_size: int, optional=10 152 | Maximum ring size for testing whether an atom belongs to a 153 | ring or not. 
*Currently unused* 154 | """ 155 | 156 | ALL_ATOMIC_SYMBOLS = [ 157 | "C", 158 | "N", 159 | "O", 160 | "S", 161 | "F", 162 | "Si", 163 | "P", 164 | "Cl", 165 | "Br", 166 | "Mg", 167 | "Na", 168 | "Ca", 169 | "Fe", 170 | "As", 171 | "Al", 172 | "I", 173 | "B", 174 | "V", 175 | "K", 176 | "Tl", 177 | "Yb", 178 | "Sb", 179 | "Sn", 180 | "Ag", 181 | "Pd", 182 | "Co", 183 | "Se", 184 | "Ti", 185 | "Zn", 186 | "H", 187 | "Li", 188 | "Ge", 189 | "Cu", 190 | "Au", 191 | "Ni", 192 | "Cd", 193 | "In", 194 | "Mn", 195 | "Zr", 196 | "Cr", 197 | "Pt", 198 | "Hg", 199 | "Pb", 200 | "Unknown", 201 | ] 202 | 203 | def __init__(self, max_in_ring_size: int = 10, **kwargs): 204 | super().__init__(**kwargs) 205 | self.max_in_ring_size = max_in_ring_size 206 | self._hybridization_names = sorted(Chem.rdchem.HybridizationType.names) 207 | 208 | def _featurize_one( 209 | self, system: Union[LigandSystem, ProteinLigandComplex] 210 | ) -> Union[tuple, None]: 211 | """ 212 | Featurizes ligands contained in a System as a labeled graph. 213 | 214 | Parameters 215 | ---------- 216 | system: LigandSystem or ProteinLigandComplex 217 | The System being featurized. 
218 | 219 | Returns 220 | ------- 221 | tuple of np.array or None 222 | A two-tuple with: 223 | - Graph connectivity of the molecule with shape ``(2, n_edges)`` 224 | - Feature matrix with shape ``(n_atoms, n_features)`` 225 | """ 226 | try: # catch erroneous smiles not yet interpreted in case of lazy instantiation 227 | # rdkit_mol = system.ligand.molecule.to_rdkit() 228 | # this does not work, since openff toolkit will permit implicit hydrogens when 229 | # converting to rdkit (see https://github.com/openforcefield/openff-toolkit/pull/1001) 230 | smiles = system.ligand.molecule.to_smiles(explicit_hydrogens=False) 231 | rdkit_mol = Chem.MolFromSmiles(smiles) 232 | except SMILESParseError: 233 | return None 234 | 235 | connectivity_graph = self._connectivity_COO_format(rdkit_mol) 236 | per_atom_features = np.array([self._per_atom_features(a) for a in rdkit_mol.GetAtoms()]) 237 | return connectivity_graph, per_atom_features 238 | 239 | def _per_atom_features(self, atom) -> np.ndarray: 240 | """ 241 | Computes desired features for each atom in the molecular graph. 242 | 243 | Parameters 244 | ---------- 245 | atom: rdkit.Chem.Atom 246 | Atom to extract features from 247 | 248 | Returns 249 | ------- 250 | tuple of atomic features. 251 | atomic_symbol : array 252 | the one-hot encoded atomic symbol from `ALL_ATOMIC_SYMBOLS`. 253 | formal_charge : int 254 | the formal charge of atom. 255 | hybridization_type : array 256 | the one-hot encoded hybridization type from 257 | ``rdkit.Chem.rdchem.HybridizationType``. 258 | aromatic : bool 259 | if atom is aromatic. 260 | degree : array 261 | the one-hot encoded degree of the atom in the molecule. 262 | total_h : int 263 | the total number of hydrogens on the atom (implicit and explicit). 264 | implicit_h : int 265 | the number of implicit hydrogens on the atom. 266 | radical_electrons : int 267 | the number of radical electrons. 268 | 269 | Notes 270 | ----- 271 | The atomic features are the same as in PotentialNet [1]_. 
272 | 273 | .. [1] https://doi.org/10.1021/acscentsci.8b00507 274 | """ 275 | # Return flattened array; notice how the OHE'd matrices are flattened 276 | # and iterated with the * unpacking operator -- 277 | return np.array( 278 | [ 279 | # 1. Chemical element, one-hot encoded 280 | *BaseOneHotEncodingFeaturizer.one_hot_encode( 281 | [atom.GetSymbol()], self.ALL_ATOMIC_SYMBOLS 282 | ).flatten(), 283 | # 2. Formal charge 284 | atom.GetFormalCharge(), 285 | # 3. Hybridization, one-hot encoded 286 | *BaseOneHotEncodingFeaturizer.one_hot_encode( 287 | [atom.GetHybridization().name], 288 | self._hybridization_names, 289 | ).flatten(), 290 | # 4. Aromaticity 291 | atom.GetIsAromatic(), 292 | # 5. Total numbers of bonds, one-hot encoded 293 | *BaseOneHotEncodingFeaturizer.one_hot_encode( 294 | [atom.GetDegree()], list(range(11)) 295 | ).flatten(), 296 | # 6. Total number of hydrogens 297 | atom.GetTotalNumHs(), 298 | # 7. Number of implicit hydrogens 299 | atom.GetNumImplicitHs(), 300 | # 8. Number of radical electrons 301 | atom.GetNumRadicalElectrons(), 302 | ], 303 | dtype="float64", 304 | ) 305 | 306 | @staticmethod 307 | def _connectivity_COO_format(mol: Chem.Mol) -> np.ndarray: 308 | """ 309 | Returns the connectivity of the molecular graph in COO format. 310 | 311 | Parameters 312 | ---------- 313 | mol: rdkit.Chem.Mol 314 | RDKit molecule to extract bonds from 315 | 316 | Returns 317 | ------- 318 | np.ndarray 319 | graph connectivity in COO format with shape ``[2, num_edges]`` 320 | """ 321 | 322 | row, col = [], [] 323 | 324 | for bond in mol.GetBonds(): 325 | start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx() 326 | row += [start, end] 327 | col += [end, start] 328 | 329 | return np.array([row, col]) 330 | --------------------------------------------------------------------------------