├── docs ├── _static │ ├── README │ ├── images │ │ └── custom_favicon.png │ └── css │ │ └── custom.css ├── ipython_kernel_config.py ├── notebooks │ ├── getting_started.nblink │ ├── ligand-only-smiles-EGFR.nblink │ ├── OpenEye_structural_featurizer.nblink │ ├── ligand-only-morgan1024-EGFR.nblink │ ├── Schrodinger_structural_featurizer.nblink │ ├── kinoml_object_model.nblink │ ├── kinase-ligand-informed-smiles-sequence-EGFR.nblink │ └── kinase-ligand-informed-morgan-composition-EGFR.nblink ├── Makefile ├── developers │ ├── autodocs.py │ ├── _docstrings_example.py │ └── api_docs.md ├── index.md └── conf.py ├── kinoml ├── core │ ├── __init__.py │ ├── components.py │ ├── conditions.py │ ├── systems.py │ └── ligands.py ├── ml │ ├── __init__.py │ ├── torch_trees.py │ ├── torch_geometric_models.py │ └── tensorflow_models.py ├── analysis │ ├── __init__.py │ ├── plots.py │ └── metrics.py ├── databases │ ├── __init__.py │ ├── uniprot.py │ ├── klifs.py │ └── pdb.py ├── datasets │ ├── __init__.py │ ├── torch_geometric_datasets.py │ ├── groups.py │ ├── pkis2.py │ └── chembl.py ├── docking │ ├── __init__.py │ └── SCHRODINGERDocking.py ├── modeling │ ├── __init__.py │ ├── alignment.py │ └── SCHRODINGERModeling.py ├── optimize │ └── __init__.py ├── workflows │ ├── __init__.py │ └── images │ │ ├── KinoML_Workflow_single.png │ │ └── KinoML_Workflow_multiple.png ├── data │ ├── molecules │ │ ├── __init__.py │ │ ├── chloroform.pdb │ │ ├── chloroform.sdf │ │ ├── chloroform_acetamide.pdb │ │ └── chloroform_acetamide.sdf │ ├── proteins │ │ ├── __init__.py │ │ ├── kinoml_tests_4f8o_spruce.loop_db │ │ └── README.md │ ├── electron_densities │ │ ├── __init__.py │ │ └── 4f8o_phases.mtz │ ├── object_model.png │ ├── fig_1_kinomltechpaper_v2.png │ ├── first_tutorial_scheme_v2.png │ ├── look_and_say.dat │ └── README.md ├── tests │ ├── core │ │ ├── __init__.py │ │ ├── test_conditions.py │ │ ├── test_measurements.py │ │ ├── test_ligands.py │ │ ├── test_systems.py │ │ ├── test_sequences.py │ │ └── 
test_proteins.py │ ├── data │ │ └── __init__.py │ ├── datasets │ │ ├── __init__.py │ │ ├── test_chembl.py │ │ └── test_pkis2.py │ ├── docking │ │ ├── __init__.py │ │ └── test_oedocking.py │ ├── features │ │ ├── __init__.py │ │ ├── test_ligand.py │ │ ├── test_core.py │ │ ├── test_protein.py │ │ └── test_complexes.py │ ├── modeling │ │ ├── __init__.py │ │ └── test_alignment.py │ ├── databases │ │ ├── __init__.py │ │ ├── test_klifs.py │ │ ├── test_uniprot.py │ │ └── test_pdb.py │ ├── __init__.py │ └── test_kinoml.py ├── features │ ├── __init__.py │ ├── protein.py │ └── ligand.py └── __init__.py ├── MANIFEST.in ├── devtools ├── github-actions │ └── initialize_conda.sh ├── conda-envs │ ├── docs_env.yaml │ └── test_env.yaml ├── README.md └── scripts │ └── create_conda_env.py ├── .github ├── PULL_REQUEST_TEMPLATE.md ├── workflows │ ├── cancel.yml │ ├── docs.yml │ ├── lint.yml │ └── ci.yml └── CONTRIBUTING.md ├── .codecov.yml ├── .lgtm.yml ├── CHANGELOG.md ├── setup.cfg ├── LICENSE ├── .gitignore ├── setup.py ├── tutorials └── README.md ├── CITATION.cff ├── CODE_OF_CONDUCT.md └── README.md /docs/_static/README: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /kinoml/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /kinoml/ml/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /kinoml/analysis/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /kinoml/databases/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /kinoml/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /kinoml/docking/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /kinoml/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /kinoml/optimize/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /kinoml/workflows/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /kinoml/data/molecules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /kinoml/data/proteins/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /kinoml/tests/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /kinoml/tests/data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /kinoml/tests/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /kinoml/tests/docking/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /kinoml/tests/features/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /kinoml/tests/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /kinoml/tests/databases/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /kinoml/data/electron_densities/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /kinoml/data/object_model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openkinome/kinoml/HEAD/kinoml/data/object_model.png -------------------------------------------------------------------------------- /docs/ipython_kernel_config.py: -------------------------------------------------------------------------------- 1 | c.InlineBackend.figure_formats = {"svg"} 2 | c.InlineBackend.rc = {"figure.dpi": 96} 3 | -------------------------------------------------------------------------------- /docs/notebooks/getting_started.nblink: -------------------------------------------------------------------------------- 1 | {"path": "../../tutorials/getting_started/getting_started_with_kinoml.ipynb"} 2 | -------------------------------------------------------------------------------- /docs/_static/images/custom_favicon.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/openkinome/kinoml/HEAD/docs/_static/images/custom_favicon.png -------------------------------------------------------------------------------- /kinoml/data/fig_1_kinomltechpaper_v2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openkinome/kinoml/HEAD/kinoml/data/fig_1_kinomltechpaper_v2.png -------------------------------------------------------------------------------- /kinoml/data/first_tutorial_scheme_v2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openkinome/kinoml/HEAD/kinoml/data/first_tutorial_scheme_v2.png -------------------------------------------------------------------------------- /docs/notebooks/ligand-only-smiles-EGFR.nblink: -------------------------------------------------------------------------------- 1 | {"path": "../../tutorials/experiments/ligand-only-smiles-EGFR/experiment_notebook.ipynb"} -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include MANIFEST.in 3 | include versioneer.py 4 | 5 | graft kinoml 6 | global-exclude *.py[cod] __pycache__ *.so -------------------------------------------------------------------------------- /docs/notebooks/OpenEye_structural_featurizer.nblink: -------------------------------------------------------------------------------- 1 | {"path": "../../tutorials/getting_started/OpenEye_structural_featurizer_showcase.ipynb"} 2 | -------------------------------------------------------------------------------- /docs/notebooks/ligand-only-morgan1024-EGFR.nblink: -------------------------------------------------------------------------------- 1 | {"path": 
"../../tutorials/experiments/ligand-only-morgan1024-EGFR/experiment_notebook.ipynb"} 2 | -------------------------------------------------------------------------------- /kinoml/data/electron_densities/4f8o_phases.mtz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openkinome/kinoml/HEAD/kinoml/data/electron_densities/4f8o_phases.mtz -------------------------------------------------------------------------------- /kinoml/tests/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Empty init file in case you choose a package besides PyTest such as Nose which may look for such a file 3 | """ 4 | -------------------------------------------------------------------------------- /docs/notebooks/Schrodinger_structural_featurizer.nblink: -------------------------------------------------------------------------------- 1 | {"path": "../../tutorials/getting_started/Schrodinger_structural_featurizer_showcase.ipynb"} 2 | -------------------------------------------------------------------------------- /kinoml/workflows/images/KinoML_Workflow_single.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openkinome/kinoml/HEAD/kinoml/workflows/images/KinoML_Workflow_single.png -------------------------------------------------------------------------------- /docs/notebooks/kinoml_object_model.nblink: -------------------------------------------------------------------------------- 1 | {"path": "../../tutorials/getting_started/kinoml_object_model.ipynb", "extra-media": ["../../kinoml/data/"]} 2 | -------------------------------------------------------------------------------- /kinoml/workflows/images/KinoML_Workflow_multiple.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/openkinome/kinoml/HEAD/kinoml/workflows/images/KinoML_Workflow_multiple.png -------------------------------------------------------------------------------- /kinoml/data/proteins/kinoml_tests_4f8o_spruce.loop_db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openkinome/kinoml/HEAD/kinoml/data/proteins/kinoml_tests_4f8o_spruce.loop_db -------------------------------------------------------------------------------- /kinoml/features/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Featurizers will always output arrays 3 | but they will use structure-oriented methods 4 | underneath to do it. 5 | """ 6 | -------------------------------------------------------------------------------- /docs/notebooks/kinase-ligand-informed-smiles-sequence-EGFR.nblink: -------------------------------------------------------------------------------- 1 | {"path": "../../tutorials/experiments/kinase-ligand-informed-smiles-sequence-EGFR/experiment_notebook.ipynb"} 2 | -------------------------------------------------------------------------------- /docs/notebooks/kinase-ligand-informed-morgan-composition-EGFR.nblink: -------------------------------------------------------------------------------- 1 | {"path": "../../tutorials/experiments/kinase-ligand-informed-morgan-composition-EGFR/experiments_notebook.ipynb"} 2 | -------------------------------------------------------------------------------- /devtools/github-actions/initialize_conda.sh: -------------------------------------------------------------------------------- 1 | case ${CI_OS} in 2 | windows*) 3 | eval "$(${CONDA}/condabin/conda.bat shell.bash hook)";; 4 | *) 5 | eval "$(${CONDA}/condabin/conda shell.bash hook)";; 6 | esac 7 | -------------------------------------------------------------------------------- /docs/_static/css/custom.css: 
-------------------------------------------------------------------------------- 1 | div.autodoc { 2 | padding-left: 25px; 3 | } 4 | article>div.autodoc { 5 | border-left: 4px solid rgba(230, 230, 230); 6 | } 7 | article>div.autodoc>div.autodoc { 8 | border-left: 1px solid rgba(230, 230, 230); 9 | } -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## Description 2 | Provide a brief description of the PR's purpose here. 3 | 4 | ## Todos 5 | Notable points that this PR has either accomplished or will accomplish. 6 | - [ ] TODO 1 7 | 8 | ## Questions 9 | - [ ] Question1 10 | 11 | ## Status 12 | - [ ] Ready to go -------------------------------------------------------------------------------- /kinoml/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | KinoML is a structure-informed machine learning library 3 | with a focus on kinase modeling 4 | """ 5 | 6 | # Handle versioneer 7 | from ._version import get_versions 8 | 9 | versions = get_versions() 10 | __version__ = versions["version"] 11 | __git_revision__ = versions["full-revisionid"] 12 | del get_versions, versions 13 | -------------------------------------------------------------------------------- /.codecov.yml: -------------------------------------------------------------------------------- 1 | # Codecov configuration to make it a bit less noisy 2 | coverage: 3 | status: 4 | patch: false 5 | project: 6 | default: 7 | threshold: 50% 8 | comment: 9 | layout: "header" 10 | require_changes: false 11 | branches: null 12 | behavior: default 13 | flags: null 14 | paths: null 15 | ignore: 16 | - "**/test_*.py" -------------------------------------------------------------------------------- /.lgtm.yml: -------------------------------------------------------------------------------- 1 | # Configure LGTM for this package 
2 | 3 | extraction: 4 | python: # Configure Python 5 | python_setup: # Configure the setup 6 | version: 3 # Specify Version 3 7 | path_classifiers: 8 | library: 9 | - versioneer.py # Set Versioneer.py to an external "library" (3rd party code) 10 | - devtools/* 11 | generated: 12 | - kinoml/_version.py 13 | -------------------------------------------------------------------------------- /.github/workflows/cancel.yml: -------------------------------------------------------------------------------- 1 | name: Cancel previous 2 | on: [push] 3 | jobs: 4 | cancel: 5 | if: github.repository == 'openkinome/kinoml' 6 | name: 'Cancel Previous Runs' 7 | runs-on: ubuntu-latest 8 | timeout-minutes: 3 9 | steps: 10 | - uses: styfle/cancel-workflow-action@0.4.1 11 | with: 12 | workflow_id: 231683,116359 13 | access_token: ${{ github.token }} -------------------------------------------------------------------------------- /kinoml/data/look_and_say.dat: -------------------------------------------------------------------------------- 1 | 1 2 | 11 3 | 21 4 | 1211 5 | 111221 6 | 312211 7 | 13112221 8 | 1113213211 9 | 31131211131221 10 | 13211311123113112211 11 | 11131221133112132113212221 12 | 3113112221232112111312211312113211 13 | 1321132132111213122112311311222113111221131221 14 | 11131221131211131231121113112221121321132132211331222113112211 15 | 311311222113111231131112132112311321322112111312211312111322212311322113212221 -------------------------------------------------------------------------------- /kinoml/tests/core/test_conditions.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test kinoml.core.conditions 3 | """ 4 | 5 | 6 | def test_assay_conditions(): 7 | from kinoml.core.conditions import AssayConditions 8 | 9 | conditions = AssayConditions() 10 | assert isinstance(conditions.pH, float) 11 | assert conditions.pH == 7.0 12 | 13 | assert conditions == AssayConditions(pH=7.0) 14 | assert conditions != AssayConditions(pH=8.0) 
15 | -------------------------------------------------------------------------------- /kinoml/ml/torch_trees.py: -------------------------------------------------------------------------------- 1 | """ 2 | Expose some useful objects in the sklearn-pytorch library 3 | 4 | These will provide Random Forest and Decision Tree implementations 5 | for PyTorch using the sklearn API. 6 | """ 7 | # pylint: disable=unused-import 8 | from Sklearn_PyTorch import ( 9 | TorchRandomForestClassifier, 10 | TorchRandomForestRegressor, 11 | TorchDecisionTreeClassifier, 12 | TorchDecisionTreeRegressor, 13 | ) 14 | -------------------------------------------------------------------------------- /kinoml/data/molecules/chloroform.pdb: -------------------------------------------------------------------------------- 1 | HETATM 1 C1 UNL 1 0.000 0.000 0.000 1.00 20.00 C 2 | HETATM 2 CL2 UNL 1 0.000 0.000 0.000 1.00 20.00 CL 3 | HETATM 3 CL3 UNL 1 0.000 0.000 0.000 1.00 20.00 CL 4 | HETATM 4 CL4 UNL 1 0.000 0.000 0.000 1.00 20.00 CL 5 | TER 5 UNL 1 6 | CONECT 1 2 3 4 7 | CONECT 2 1 8 | CONECT 3 1 9 | CONECT 4 1 10 | END 11 | -------------------------------------------------------------------------------- /kinoml/data/molecules/chloroform.sdf: -------------------------------------------------------------------------------- 1 | 2 | -OEChem-06022113532D 3 | 4 | 4 3 0 0 0 0 0 0 0999 V2000 5 | 0.0000 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 6 | 1.0000 0.0000 0.0000 Cl 0 0 0 0 0 0 0 0 0 0 0 0 7 | -0.5000 -0.8660 0.0000 Cl 0 0 0 0 0 0 0 0 0 0 0 0 8 | -0.5000 0.8660 0.0000 Cl 0 0 0 0 0 0 0 0 0 0 0 0 9 | 1 2 1 0 0 0 0 10 | 1 3 1 0 0 0 0 11 | 1 4 1 0 0 0 0 12 | M END 13 | $$$$ 14 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | All notable changes to this project will be documented in this file. 
3 | 4 | The format follows [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 5 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 6 | 7 | ## [v1.0.0] - 2025-10-01 8 | ### Added 9 | - First stable, reproducible release accompanying the manuscript. 10 | - Conda installation via `devtools/conda-envs/test_env.yaml`. 11 | - Docker image published on Docker Hub: `openkinome/kinoml:v1`. 12 | - Examples of code usage in tutorials. 13 | 14 | ### Notes 15 | - This release will be the reference for the manuscript submission. 16 | -------------------------------------------------------------------------------- /kinoml/tests/core/test_measurements.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test kinoml.core.measurements 3 | """ 4 | 5 | 6 | def test_measurements(): 7 | from kinoml.core.measurements import BaseMeasurement 8 | from kinoml.core.conditions import AssayConditions 9 | from kinoml.core.components import MolecularComponent 10 | from kinoml.core.systems import System 11 | 12 | conditions = AssayConditions() 13 | system = System([MolecularComponent()]) 14 | measurement = BaseMeasurement(50, conditions=conditions, system=system) 15 | assert isinstance(measurement, BaseMeasurement) 16 | assert measurement == BaseMeasurement(50, conditions=conditions, system=system) 17 | assert measurement != BaseMeasurement(10, conditions=conditions, system=system) 18 | -------------------------------------------------------------------------------- /devtools/conda-envs/docs_env.yaml: -------------------------------------------------------------------------------- 1 | name: docs 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | # Base depends 6 | - python 7 | - pip 8 | - ipython 9 | # core 10 | #- sphinx~=2.4.0 11 | - docutils=0.20 # Support for nbsphinx_link 1.3.0 12 | - sphinx 13 | - jinja2 14 | - nbsphinx 15 | - nbsphinx-link 16 | - sphinx-notfound-page 17 | - sphinx-prompt 18 | - 
sphinx-copybutton 19 | - sphinx-autoapi>=3,<4 20 | - myst-parser 21 | - sphinxcontrib-httpdomain 22 | - linkify-it-py 23 | #- sphinx-panels 24 | # themes 25 | - sphinx-material 26 | # local building 27 | - sphinx-autobuild 28 | #- pip: 29 | # core 30 | #- sphinx-version-warning 31 | #- sphinxemoji 32 | #- sphinx-last-updated-by-git 33 | -------------------------------------------------------------------------------- /kinoml/tests/test_kinoml.py: -------------------------------------------------------------------------------- 1 | """ 2 | Unit and regression test for the kinoml package. 3 | """ 4 | 5 | # Import package, test suite, and other packages as needed 6 | import kinoml # pylint: disable=unused-import 7 | import sys 8 | 9 | 10 | def test_kinoml_imported(): 11 | """ 12 | Sample test, will always pass so long as import statement worked 13 | """ 14 | assert "kinoml" in sys.modules 15 | 16 | 17 | def test_3rdparty_imports(): 18 | """ 19 | Some packages can be tricky to install. Make sure we can import them. 
20 | """ 21 | import torch # pylint: disable=unused-import 22 | 23 | assert "torch" in sys.modules 24 | 25 | import torch_geometric # pylint: disable=unused-import 26 | 27 | assert "torch_geometric" in sys.modules 28 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | # Helper file to handle all configs 2 | 3 | [coverage:run] 4 | # .coveragerc to control coverage.py and pytest-cov 5 | omit = 6 | # Omit the tests 7 | */tests/* 8 | # Omit generated versioneer 9 | kinoml/_version.py 10 | 11 | [yapf] 12 | # YAPF, in .style.yapf files this shows up as "[style]" header 13 | COLUMN_LIMIT = 119 14 | INDENT_WIDTH = 4 15 | USE_TABS = False 16 | 17 | [flake8] 18 | # Flake8, PyFlakes, etc 19 | max-line-length = 119 20 | 21 | [versioneer] 22 | # Automatic version numbering scheme 23 | VCS = git 24 | style = pep440 25 | versionfile_source = kinoml/_version.py 26 | versionfile_build = kinoml/_version.py 27 | tag_prefix = '' 28 | 29 | [aliases] 30 | test = pytest 31 | 32 | [tool:pytest] 33 | markers = 34 | slow: marks tests as slow (deselect with '-m "not slow"') -------------------------------------------------------------------------------- /kinoml/tests/modeling/test_alignment.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test alignment functionalities of `kinoml.modeling` 3 | """ 4 | import pytest 5 | 6 | from kinoml.modeling.alignment import sequence_similarity 7 | 8 | 9 | @pytest.mark.parametrize( 10 | "sequence1, sequence2, similarity", 11 | [ 12 | ( 13 | "NVG", 14 | "NVG", 15 | 16, 16 | ), 17 | ( 18 | "NVG", 19 | "NG", 20 | 1, 21 | ), 22 | ( 23 | "NVG", 24 | "VG", 25 | -1, 26 | ), 27 | ], 28 | ) 29 | def test_sequence_similarity(sequence1, sequence2, similarity): 30 | """Compare results to expected similarity.""" 31 | score = sequence_similarity(sequence1, sequence2) 32 | assert score == 
similarity 33 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | 22 | livehtml: 23 | sphinx-autobuild -b html --ignore "sphinx-notfound-page/*" --ignore "autoapi/*" --ignore ".ipynb_checkpoints/*" --ignore ".#*" $(SPHINXOPTS) $(SOURCEDIR) $(BUILDDIR)/html -------------------------------------------------------------------------------- /kinoml/core/components.py: -------------------------------------------------------------------------------- 1 | """ 2 | Base classes for all MolecularComponents. 3 | 4 | One or more components can form a System. 5 | Proteins, ligands, and other molecular entities are 6 | derived the base class ``MolecularComponent``. 7 | """ 8 | 9 | 10 | class MolecularComponent: 11 | """ 12 | Abstract base molecular entity. 13 | """ 14 | 15 | def __init__(self, name="", metadata=None, *args, **kwargs): 16 | self.name = name 17 | if metadata is None: 18 | metadata = {} 19 | self.metadata = metadata 20 | 21 | def __repr__(self) -> str: 22 | return f"<{self.__class__.__name__} name={self.name}>" 23 | 24 | 25 | class BaseLigand(MolecularComponent): 26 | """ 27 | Base class for all ligand-like entities. 
28 | """ 29 | 30 | 31 | class BaseProtein(MolecularComponent): 32 | """ 33 | Base class for all protein-like entities. 34 | """ 35 | -------------------------------------------------------------------------------- /kinoml/data/proteins/README.md: -------------------------------------------------------------------------------- 1 | ## File description 2 | 3 | ### 4f8o.pdb 4 | 5 | This protein was chosen for writing unit tests, since it contains protein and ligand residues as 6 | well as multiple chains and alternate locations. 7 | 8 | ### 4f8o_edit.pdb 9 | 10 | The 4f8o.pdb structure was altered in the following fashion: 11 | - translated along x axis by 20 A --> superposition 12 | - selected alternate location A 13 | - removed non protein atoms 14 | - deleted ASP82 --> loop modeling 15 | - deleted LYS135 --> detection of short protein segments 16 | - deleted sidechain of ASN2 --> sidechain perception and modeling 17 | - altered Chi1 dihedral of PHE4 to -1 radians --> detection of sidechain clashes 18 | 19 | ### kinoml_tests_4f8o_spruce.loop_db 20 | 21 | This loop template database was created using the loopdb_builder app based on 4f8o.pdb. It is used 22 | for testing loop modeling. 
23 | -------------------------------------------------------------------------------- /docs/developers/autodocs.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pathlib import Path 3 | from collections import defaultdict 4 | 5 | import yaml 6 | 7 | """ 8 | Create stubs for all API reference *.md files 9 | and propose a menu tree (you probably need to edit it to your liking) 10 | 11 | Redefine docs and package below and execute from repository root 12 | """ 13 | 14 | docs = "apidocs" 15 | package = "kinoml" 16 | here = Path(package) 17 | tree = defaultdict(list) 18 | 19 | Path(docs).mkdir() 20 | for py in here.rglob("*.py"): 21 | if len(py.parts) > 2: 22 | directory = Path(docs, *py.parts[1:-1]) 23 | directory.mkdir(parents=True, exist_ok=True) 24 | file = Path(docs, *py.parts[1:-1], py.stem + ".md") 25 | module = ".".join([package, *py.parts[1:-1], py.stem]) 26 | file.touch() 27 | file.write_text(f"::: {module}") 28 | tree[".".join([package, *py.parts[1:-1]])].append({module: str(file)}) 29 | 30 | print(yaml.dump(dict(tree))) 31 | 32 | -------------------------------------------------------------------------------- /kinoml/data/molecules/chloroform_acetamide.pdb: -------------------------------------------------------------------------------- 1 | HETATM 1 C1 UNL 1 0.000 0.000 0.000 1.00 20.00 C 2 | HETATM 2 CL2 UNL 1 0.000 0.000 0.000 1.00 20.00 CL 3 | HETATM 3 CL3 UNL 1 0.000 0.000 0.000 1.00 20.00 CL 4 | HETATM 4 CL4 UNL 1 0.000 0.000 0.000 1.00 20.00 CL 5 | TER 5 UNL 1 6 | CONECT 1 2 3 4 7 | CONECT 2 1 8 | CONECT 3 1 9 | CONECT 4 1 10 | END 11 | HETATM 1 C1 UNL 1 0.000 0.000 0.000 1.00 20.00 C 12 | HETATM 2 C2 UNL 1 0.000 0.000 0.000 1.00 20.00 C 13 | HETATM 3 O3 UNL 1 0.000 0.000 0.000 1.00 20.00 O 14 | HETATM 4 N4 UNL 1 0.000 0.000 0.000 1.00 20.00 N 15 | TER 5 UNL 1 16 | CONECT 1 2 17 | CONECT 2 1 3 3 4 18 | CONECT 3 2 2 19 | CONECT 4 2 20 | END 21 | 
-------------------------------------------------------------------------------- /kinoml/data/molecules/chloroform_acetamide.sdf: -------------------------------------------------------------------------------- 1 | 2 | -OEChem-06022113542D 3 | 4 | 4 3 0 0 0 0 0 0 0999 V2000 5 | 0.0000 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 6 | 1.0000 0.0000 0.0000 Cl 0 0 0 0 0 0 0 0 0 0 0 0 7 | -0.5000 -0.8660 0.0000 Cl 0 0 0 0 0 0 0 0 0 0 0 0 8 | -0.5000 0.8660 0.0000 Cl 0 0 0 0 0 0 0 0 0 0 0 0 9 | 1 2 1 0 0 0 0 10 | 1 3 1 0 0 0 0 11 | 1 4 1 0 0 0 0 12 | M END 13 | $$$$ 14 | 15 | -OEChem-06022113542D 16 | 17 | 4 3 0 0 0 0 0 0 0999 V2000 18 | 0.0000 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 19 | 1.0000 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 20 | 1.5000 0.8660 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0 21 | 1.5000 -0.8660 0.0000 N 0 0 0 0 0 0 0 0 0 0 0 0 22 | 1 2 1 0 0 0 0 23 | 2 3 2 0 0 0 0 24 | 2 4 1 0 0 0 0 25 | M END 26 | $$$$ 27 | -------------------------------------------------------------------------------- /kinoml/tests/databases/test_klifs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test klifs functionalities of `kinoml.databases` 3 | """ 4 | from contextlib import contextmanager 5 | import pytest 6 | 7 | from kinoml.databases.klifs import klifs_kinase_from_uniprot_id 8 | 9 | 10 | @contextmanager 11 | def does_not_raise(): 12 | yield 13 | 14 | 15 | @pytest.mark.parametrize( 16 | "uniprot_id, expectation, klifs_kinase_id", 17 | [ 18 | ( 19 | "P00519", 20 | does_not_raise(), 21 | 392, 22 | ), 23 | ( 24 | "XXXXX", 25 | pytest.raises(ValueError), 26 | 392, 27 | ), 28 | ], 29 | ) 30 | def test_klifs_kinase_from_uniprot_id(uniprot_id, expectation, klifs_kinase_id): 31 | """Compare klifs kinase ID for expected value.""" 32 | with expectation: 33 | kinase = klifs_kinase_from_uniprot_id(uniprot_id) 34 | assert kinase["kinase.klifs_id"] == klifs_kinase_id 35 | -------------------------------------------------------------------------------- 
def download_fasta_file(
    uniprot_id: str, directory: Union[Path, str] = user_cache_dir()
) -> Union[Path, bool]:
    """
    Download a fasta file for a given UniProt identifier.

    Parameters
    ----------
    uniprot_id: str
        The UniProt entry of interest.
    directory: Path or str
        The path to a directory for saving the file.

    Returns
    -------
    : Path or bool
        The path to the downloaded file, False if not successful.
    """
    from ..utils import download_file

    target = Path(directory) / f"{uniprot_id}.fasta"
    # Only hit the network when no cached copy exists on disk.
    if not target.is_file():
        url = f"https://www.uniprot.org/uniprot/{uniprot_id}.fasta"
        if not download_file(url, target):
            return False
    return target
biotite 20 | - bravado 21 | - biopandas 22 | - awkward 23 | - pyarrow >=6.0.1 24 | - mdanalysis >=2.0.0 25 | - opencadd 26 | - matplotlib-base 27 | - ruamel.yaml 28 | 29 | # distributed computing 30 | - dask 31 | - dask-jobqueue 32 | 33 | # reproducible workflows 34 | - papermill 2.2.* 35 | - watermark 36 | 37 | # pytorch 38 | - pytorch >=1.8.0 39 | - pyg 40 | - pytorch-lightning 41 | 42 | # development 43 | - jupyterlab 44 | - nglview 45 | 46 | # testing 47 | - pytest 48 | - pytest-cov 49 | - pytest-xdist 50 | - codecov 51 | - nbval 52 | 53 | - pip: 54 | # PyTorch trees 55 | - https://github.com/ValentinFigue/Sklearn_PyTorch/archive/1b56a43e41de331ecdf73d08418f75bb34c9fa06.tar.gz 56 | -------------------------------------------------------------------------------- /kinoml/tests/core/test_ligands.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test kinoml.core.ligands 3 | """ 4 | from importlib import resources 5 | 6 | 7 | def test_ligand(): 8 | from openeye import oechem 9 | from openff.toolkit.topology import Molecule 10 | import rdkit 11 | 12 | from kinoml.core.ligands import Ligand 13 | from kinoml.core.components import BaseLigand 14 | 15 | smiles = "CCCCC" 16 | ligand = Ligand.from_smiles(smiles=smiles) 17 | assert isinstance(ligand.molecule, Molecule) 18 | with resources.path("kinoml.data.molecules", "chloroform.sdf") as path: 19 | ligand = Ligand.from_file(str(path)) 20 | assert isinstance(ligand.molecule, Molecule) 21 | ligand = Ligand(smiles=smiles) 22 | assert isinstance(ligand, BaseLigand) 23 | assert isinstance(ligand.molecule, Molecule) 24 | assert isinstance(ligand.molecule.to_rdkit(), rdkit.Chem.Mol) 25 | assert isinstance(ligand.molecule.to_openeye(), oechem.OEMol) 26 | assert isinstance(ligand._smiles, str) 27 | assert isinstance(ligand.metadata["smiles"], str) 28 | assert isinstance(ligand.molecule.to_smiles(), str) 29 | 
def klifs_kinase_from_uniprot_id(uniprot_id: str) -> pd.Series:
    """
    Retrieve KLIFS kinase details about the kinase matching the given Uniprot ID.

    Parameters
    ----------
    uniprot_id: str
        Uniprot identifier.

    Returns
    -------
    kinase: pd.Series
        KLIFS kinase details.

    Raises
    ------
    ValueError:
        No KLIFS kinase found for UniProt ID.
    ValueError:
        Multiple KLIFS kinases found for UniProt ID.
    """
    from opencadd.databases.klifs import setup_remote

    remote = setup_remote()
    # Fetch the full kinase table and filter it locally by UniProt accession.
    kinase_ids = remote.kinases.all_kinases()["kinase.klifs_id"]
    kinases = remote.kinases.by_kinase_klifs_id(list(kinase_ids))
    kinases = kinases[kinases["kinase.uniprot"] == uniprot_id]
    if len(kinases) == 0:
        raise ValueError("No KLIFS kinase found for UniProt ID.")
    elif len(kinases) > 1:
        raise ValueError("Multiple KLIFS kinases found for UniProt ID.")
    # A single matching row remains; return it as a Series of kinase details
    # (the previous annotation claimed pd.DataFrame, contradicting both the
    # docstring and the actual .iloc[0] return value).
    kinase = kinases.iloc[0]

    return kinase
def sequence_similarity(
    sequence1: str,
    sequence2: str,
    open_gap_penalty: int = -11,
    extend_gap_penalty: int = -1,
    substitution_matrix: str = "BLOSUM62",
) -> float:
    """
    Calculate the sequence similarity of two amino acid sequences.

    Parameters
    ----------
    sequence1: str
        The first sequence.
    sequence2: str
        The second sequence.
    open_gap_penalty: int
        The penalty to open a gap.
    extend_gap_penalty: int
        The penalty to extend a gap.
    substitution_matrix: str
        The substitution matrix to use during alignment.
        Available matrices can be found via:
        >>> from Bio.Align import substitution_matrices
        >>> substitution_matrices.load()

    Returns
    -------
    score: float
        Similarity of sequences.
    """
    from Bio import pairwise2
    from Bio.Align import substitution_matrices

    # Load into a distinct local instead of reassigning the parameter, so the
    # string argument and the loaded matrix object are never conflated.
    matrix = substitution_matrices.load(substitution_matrix)
    # Replace any characters unknown to the substitution matrix by * so
    # non-standard residues do not make the alignment fail.
    sequence1_clean = "".join([x if x in matrix.alphabet else "*" for x in sequence1])
    sequence2_clean = "".join([x if x in matrix.alphabet else "*" for x in sequence2])
    # Global alignment with distinct open/extend gap penalties; score_only
    # skips traceback and returns just the float score.
    score = pairwise2.align.globalds(
        sequence1_clean,
        sequence2_clean,
        matrix,
        open_gap_penalty,
        extend_gap_penalty,
        score_only=True,
    )
    return score
37 | - name: "Additional info about the build" 38 | shell: bash 39 | run: | 40 | uname -a 41 | df -h 42 | ulimit -a 43 | 44 | - name: "Environment Information" 45 | shell: bash -l {0} 46 | run: | 47 | conda info --all 48 | conda list 49 | 50 | - name: "Build docs" 51 | shell: bash -l {0} 52 | run: | 53 | cd docs 54 | make clean 55 | SPHINXOPTS="-T --keep-going" make html 56 | 57 | - name: "Deploy" 58 | uses: peaceiris/actions-gh-pages@v3 59 | with: 60 | github_token: ${{ secrets.GITHUB_TOKEN }} 61 | publish_dir: ./docs/_build/html 62 | if: github.ref == 'refs/heads/master' 63 | -------------------------------------------------------------------------------- /docs/developers/_docstrings_example.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example module to show how docstrings are written for 3 | mkdocs + mkdocstrings 4 | """ 5 | 6 | import typing 7 | 8 | 9 | def example_function(arg1, kwarg=None) -> object: 10 | """ 11 | Example function to demonstrate how APIs are rendered 12 | 13 | Parameters: 14 | arg1 (dict): Some description for this argument. 15 | This type (in parenthesis) is ignored. 16 | kwarg: Some more descriptions 17 | 18 | Returns: 19 | A description for the returned value 20 | 21 | __Examples__ 22 | 23 | This can be automatically tested with `pytest --doctest-modules`! 24 | Syntax might change subtly in the future. 25 | Check https://github.com/pawamoy/mkdocstrings/issues/52 26 | 27 | ```python 28 | >>> 2 + 2 == 4 29 | True # this passes pytest 30 | >>> 2 + 2 == 5 31 | True # this fails pytest 32 | 33 | ``` 34 | """ 35 | pass 36 | 37 | 38 | def example_function_with_type_hints(arg1: dict, kwarg: typing.Any = None) -> object: 39 | """ 40 | Example function to demonstrate how APIs are rendered 41 | 42 | Parameters: 43 | arg1: Some description for this argument. 
44 | kwarg: Some more descriptions 45 | 46 | Returns: 47 | A description for the returned value 48 | 49 | __Examples__ 50 | 51 | This can be automatically tested with `pytest --doctest-modules`! 52 | Syntax might change subtly in the future. 53 | Check https://github.com/pawamoy/mkdocstrings/issues/52 54 | 55 | ```python 56 | >>> 2 + 2 == 4 57 | True # this passes pytest 58 | >>> 2 + 2 == 5 59 | True # this fails pytest 60 | 61 | ``` 62 | """ 63 | pass 64 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | .pytest_cache 45 | nosetests.xml 46 | coverage.xml 47 | *.cover 48 | .hypothesis/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | 58 | # Flask stuff: 59 | instance/ 60 | .webassets-cache 61 | 62 | # Scrapy stuff: 63 | .scrapy 64 | 65 | # Sphinx documentation 66 | docs/_build/ 67 | 68 | # PyBuilder 69 | target/ 70 | 71 | # Jupyter Notebook 72 | .ipynb_checkpoints 73 | 74 | # pyenv 75 | .python-version 76 | 77 | # celery beat schedule file 78 | celerybeat-schedule 79 | 80 | # SageMath parsed files 81 | *.sage.py 82 | 83 | # dotenv 84 | .env 85 | 86 | # virtualenv 87 | .venv 88 | venv/ 89 | ENV/ 90 | 91 | # Spyder project settings 92 | .spyderproject 93 | .spyproject 94 | 95 | # Rope project settings 96 | .ropeproject 97 | 98 | # mkdocs documentation 99 | /site 100 | 101 | # mypy 102 | .mypy_cache/ 103 | 104 | # VS Code 105 | .vscode/ 106 | 107 | # MacOS 108 | .DS_Store 109 | 110 | # PyCharm 111 | .idea/ 112 | -------------------------------------------------------------------------------- /kinoml/tests/core/test_systems.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test kinoml.core.systems 3 | """ 4 | 5 | import pytest 6 | 7 | 8 | def test_system(): 9 | from kinoml.core.components import MolecularComponent 10 | from kinoml.core.systems import System 11 | 12 | components = [MolecularComponent()] 13 | system = System(components=components) 14 | # This doesn't raise an error 15 | System(components=[], strict=False) 16 | # This does 17 | with pytest.raises(AssertionError): 18 | System(components=[]) 19 | 20 | 21 | def test_ligand_system(): 22 | from kinoml.core.systems import LigandSystem 23 | from kinoml.core.ligands import BaseLigand 24 | 
def test_protein_ligand_complex():
    from kinoml.core.systems import ProteinLigandComplex
    from kinoml.core.proteins import BaseProtein
    from kinoml.core.ligands import BaseLigand

    # A complex built from one protein and one ligand exposes both
    # through the singular accessors.
    complex_ = ProteinLigandComplex(components=[BaseProtein(), BaseLigand()])
    assert complex_.ligand == next(iter(complex_.ligands))
    assert complex_.protein == next(iter(complex_.proteins))

    # Empty component lists are tolerated in non-strict mode only.
    ProteinLigandComplex(components=[], strict=False)
    with pytest.raises(AssertionError):
        ProteinLigandComplex(components=[])
environment-file: devtools/conda-envs/test_env.yaml 31 | auto-activate-base: false 32 | use-mamba: true 33 | 34 | - name: Additional info about the build 35 | shell: bash 36 | run: | 37 | uname -a 38 | df -h 39 | ulimit -a 40 | 41 | - name: Environment Information 42 | shell: bash -l {0} 43 | run: | 44 | conda info --all 45 | conda list 46 | 47 | - name: Install linter / formatter 48 | shell: bash -l {0} 49 | run: | 50 | mamba install 'pylint<2.13.0' black 51 | 52 | - name: Install package 53 | shell: bash -l {0} 54 | run: | 55 | python -m pip install --no-deps . 56 | 57 | - name: Run pylint 58 | shell: bash -l {0} 59 | run: | 60 | pylint --disable=W kinoml/ 61 | 62 | - name: Run black check 63 | shell: bash -l {0} 64 | if: always() 65 | run: | 66 | black --check -l 99 kinoml/ --exclude kinoml/_version.py 67 | -------------------------------------------------------------------------------- /kinoml/tests/datasets/test_pkis2.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test kinoml.datasets.kinomescan 3 | """ 4 | 5 | 6 | def test_pkis2_protein_openeye(): 7 | from kinoml.core.proteins import Protein 8 | from kinoml.datasets.pkis2 import PKIS2DatasetProvider 9 | 10 | provider = PKIS2DatasetProvider.from_source( 11 | protein_type="Protein", 12 | toolkit="OpenEye", 13 | ) 14 | assert len(provider.measurements) == 261_870 15 | assert (provider.measurements[0].values == 14.0).all() 16 | # check order in provider matches order in file 17 | assert ( # matches line 43 in file 18 | provider[17051].system.ligand.name 19 | == "O=C1NC(C2=C(C3=CC=CC=C3)C=C4C(C(C=C(O)C=C5)=C5N4)=C21)=O" 20 | ) 21 | assert ( # matches line 44 in file 22 | provider[17052].system.ligand.name 23 | == "CN(N=C1)C=C1C(C=C2)=NN3C2=NN=C3[C@@H](C)C4=CC=C(N=CC=C5)C5=C4" 24 | ) 25 | assert isinstance(provider.systems[0].protein, Protein) 26 | assert provider.systems[0].protein.toolkit == "OpenEye" 27 | 28 | 29 | def test_pkis2_klifskinase_mdanalysis(): 30 | 
def test_biosequence_mutation():
    """Exercise substitute, delete (with and without insert) and insert."""
    from kinoml.core.sequences import Biosequence

    seq = Biosequence("ATCGTHCTCH")

    # Point substitution: C at position 3 becomes P.
    seq.substitute("C3P")
    assert seq.sequence == "ATPGTHCTCH"

    # Plain deletion of positions 2-5.
    seq.delete(2, 5)
    assert seq.sequence == "AHCTCH"

    # Deletion of positions 2-5 with a replacement insertion.
    seq.delete(2, 5, insert="AA")
    assert seq.sequence == "AAAH"

    # Single-residue insertion at position 5.
    seq.insert(5, "T")
    assert seq.sequence == "AAAHT"
== "T" 34 | 35 | sequence = AminoAcidSequence(uniprot_id="P00519", metadata={"construct_range": "229-512"}) 36 | assert len(sequence.sequence) == 284 37 | 38 | sequence = AminoAcidSequence(uniprot_id="P00519", metadata={"mutations": "T315A"}) 39 | assert sequence.sequence[314] == "A" 40 | 41 | sequence = AminoAcidSequence( 42 | uniprot_id="P00519", 43 | metadata={ 44 | "mutations": "T315A del320-322P ins321AAA", 45 | "construct_range": "229-512", 46 | }, 47 | ) 48 | assert sequence.sequence[86] == "A" 49 | assert sequence.sequence[91] == "P" 50 | assert sequence.sequence[92:95] == "AAA" 51 | assert len(sequence.sequence) == 284 52 | 53 | 54 | def test_aminoacidsequence_ncbi_to_uniprot(): 55 | from kinoml.core.proteins import AminoAcidSequence 56 | 57 | uniprot_id = AminoAcidSequence.ncbi_to_uniprot("NP_005148") 58 | assert uniprot_id == "P00519" 59 | uniprot_id = AminoAcidSequence.ncbi_to_uniprot("BBB") 60 | assert uniprot_id == "" 61 | -------------------------------------------------------------------------------- /kinoml/datasets/torch_geometric_datasets.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helper classes to convert between DatasetProvider objects and 3 | Dataset-like objects native to the PyTorch ecosystem 4 | """ 5 | 6 | import awkward as ak 7 | import torch 8 | from torch.utils.data import Dataset as _NativeTorchDataset 9 | from torch_geometric.data import Data 10 | 11 | 12 | # Disable false positive lint with torch.tensor 13 | # see https://github.com/pytorch/pytorch/issues/24807 14 | # pylint: disable=not-callable 15 | 16 | 17 | class AwkwardArrayGeometricDataset(_NativeTorchDataset): 18 | """ 19 | Loads an Awkward array of Records suitable for PyTorch Geometric. 20 | It assumes the following: 21 | 22 | - The Awkward array contains three fields: 0, 1 and 2 23 | - 0: Conn. 
matrix --> Data's ``edge_index`` 24 | - 1: Node features --> Data's ``x`` 25 | - 2: y labels 26 | 27 | If more attributes are needed, you need to modify ``__getitem__`` logic 28 | """ 29 | 30 | def __init__(self, data): 31 | assert len(data.fields) == 3, ( 32 | f"Graph datasets should only contain three groups: " 33 | "0, 1 and 2 (conn. matrix, node features, y; respectively)" 34 | ) 35 | self.data = data 36 | 37 | def __len__(self): 38 | return len(self.data) 39 | 40 | def __getitem__(self, index): 41 | if isinstance(index, int): 42 | index = [index] 43 | fields = self.data.fields 44 | edge_index = self.data[index, fields[0]] 45 | node_features = self.data[index, fields[1]] 46 | y = torch.tensor(self.data[index, fields[2]]) 47 | X = [ 48 | Data(x=torch.tensor(nf), edge_index=torch.tensor(ei).long()) 49 | for (nf, ei) in zip(node_features, edge_index) 50 | ] 51 | return X, y 52 | 53 | def __iter__(self): 54 | raise NotImplementedError 55 | 56 | def __repr__(self): 57 | return self.data.__repr__() 58 | 59 | def __str__(self): 60 | return self.data.__str__() 61 | 62 | @classmethod 63 | def from_parquet(cls, path, **kwargs): 64 | return cls(ak.from_parquet(path, **kwargs)) 65 | -------------------------------------------------------------------------------- /kinoml/ml/torch_geometric_models.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implementation of some Deep Neural Networks in Pytorch using Pytorch Geometric. 3 | """ 4 | 5 | from torch.nn import Linear 6 | import torch.nn.functional as F 7 | from torch_geometric.nn import GCNConv, global_mean_pool 8 | from .torch_models import _BaseModule 9 | 10 | 11 | class GraphConvolutionNeuralNetwork(_BaseModule): 12 | """ 13 | Builds a Graph Convolutional Network and a feed-forward pass 14 | 15 | Parameters 16 | ---------- 17 | input_shape : int 18 | Number of features per node in the graph. 19 | embedding_shape : int, default=100 20 | Dimension of latent vector. 
    def forward(self, data):
        """
        Forward pass through the network.

        Parameters
        ----------
        data : sequence
            Batch whose first element is a PyTorch Geometric data object
            providing ``x`` (node features), ``edge_index`` (connectivity)
            and ``batch`` (graph membership per node).

        Returns
        -------
        torch.Tensor
            Output of the final linear layer, one row per graph with
            ``output_shape`` columns.
        """
        # NOTE(review): only the first element of ``data`` is consumed —
        # confirm callers always pass a one-element container here.
        data = data[0]  # get the first one only?
        x, edge_index, batch = data.x.float(), data.edge_index.long(), data.batch
        # Two graph convolutions, each followed by the configured activation.
        x = self._activation(self.GraphConvLayer1(x, edge_index))
        x = self._activation(self.GraphConvLayer2(x, edge_index))
        # Mean-pool node embeddings into a single vector per graph.
        x = global_mean_pool(x, batch)
        return self.linear(x)
Use 'exclude' to prevent some specific 36 | # subpackage(s) from being added, if needed 37 | packages=find_packages(), 38 | 39 | # Optional include package data to ship with your package 40 | # Customize MANIFEST.in if the general case does not suit your needs 41 | # Comment out this line to prevent the files from being packaged with your software 42 | include_package_data=True, 43 | 44 | # Allows `setup.py test` to work correctly with pytest 45 | setup_requires=[] + pytest_runner, 46 | 47 | # Additional entries you may want simply uncomment the lines you want and fill in the data 48 | # url='http://www.my_package.com', # Website 49 | # install_requires=[], # Required packages, pulls from pip if needed; do not use for Conda deployment 50 | # platforms=['Linux', 51 | # 'Mac OS-X', 52 | # 'Unix', 53 | # 'Windows'], # Valid platforms your code works on, adjust to your flavor 54 | # python_requires=">=3.5", # Python version restrictions 55 | 56 | # Manual control if final package is compressible or not, set False to prevent the .egg from being made 57 | # zip_safe=False, 58 | 59 | ) 60 | -------------------------------------------------------------------------------- /.github/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to contribute 2 | 3 | We welcome contributions from external contributors, and this document 4 | describes how to merge code changes into this kinoml. 5 | 6 | ## Getting Started 7 | 8 | * Make sure you have a [GitHub account](https://github.com/signup/free). 9 | * [Fork](https://help.github.com/articles/fork-a-repo/) this repository on GitHub. 10 | * On your local machine, 11 | [clone](https://help.github.com/articles/cloning-a-repository/) your fork of 12 | the repository. 13 | 14 | ## Making Changes 15 | 16 | * Add some really awesome code to your local fork. 
It's usually a [good 17 | idea](http://blog.jasonmeridth.com/posts/do-not-issue-pull-requests-from-your-master-branch/) 18 | to make changes on a 19 | [branch](https://help.github.com/articles/creating-and-deleting-branches-within-your-repository/) 20 | with the branch name relating to the feature you are going to add. 21 | * When you are ready for others to examine and comment on your new feature, 22 | navigate to your fork of kinoml on GitHub and open a [pull 23 | request](https://help.github.com/articles/using-pull-requests/) (PR). Note that 24 | after you launch a PR from one of your fork's branches, all 25 | subsequent commits to that branch will be added to the open pull request 26 | automatically. Each commit added to the PR will be validated for 27 | mergability, compilation and test suite compliance; the results of these tests 28 | will be visible on the PR page. 29 | * If you're providing a new feature, you must add test cases and documentation. 30 | * When the code is ready to go, make sure you run the test suite using pytest. 31 | * When you're ready to be considered for merging, check the "Ready to go" 32 | box on the PR page to let the kinoml devs know that the changes are complete. 33 | The code will not be merged until this box is checked, the continuous 34 | integration returns checkmarks, 35 | and multiple core developers give "Approved" reviews. 
36 | 37 | # Additional Resources 38 | 39 | * [General GitHub documentation](https://help.github.com/) 40 | * [PR best practices](http://codeinthehole.com/writing/pull-requests-and-other-good-practices-for-teams-using-github/) 41 | * [A guide to contributing to software packages](http://www.contribution-guide.org) 42 | * [Thinkful PR example](http://www.thinkful.com/learn/github-pull-request-tutorial/#Time-to-Submit-Your-First-PR) 43 | -------------------------------------------------------------------------------- /tutorials/README.md: -------------------------------------------------------------------------------- 1 | How to use the tutorials folder 2 | ============================== 3 | This tutorial folder contains two subfolders: 4 | 5 | 6 | 7 | * **getting_started**: this folder contains four jupyter notebook tutorials that give the user a general overview of KinoML potential usage and capabilities. 8 | 9 | * **getting_started_with_kinoml**: this notebook aims to give a brief overview of KinoML capabilities. This notebook is divided into three parts that show how to use KinoML to: (1) filter and obtain the desired data from an external data source, (2) featurize this data to make it ML readable and (3) train and evaluate a ML model on the featurized data obtain from the previous steps. 10 | 11 | * **kinoml_object_model**: this notebook aims to guide the user through the KinoML object model, showing how to access each object. 12 | 13 | * **OpenEye_structural_featurizer_showcase**: this notebook displays all the OpenEye-based structural modeling featurizers implemented in KinoML and how to use each of them. 14 | 15 | * **Schrodinger_structural_featurizer_showcase**: this notebook introduces the structural modeling featurizers implemented in KinoML that use the molecular modeling capabilities from the Schrodinger Suite to prepare protein structures and to dock small molecules into their binding sites. 
16 | 17 | 18 | 19 | * **experiments**: this folder contains four separate structure-based experiments to predict ligand binding affinity to the EGFR kinase. The aim of these notebooks is to showcase how to use KinoML to conduct experiments end-to-end, from obtaining the data from the database to training and evaluating a ML model to predict ligand binding affinity. Note that if the user wants to run these notebooks with their own data, they can do so by adjusting the necessary parameters within the notebooks. All experiments are divided into two parts: 20 | 21 | 1. **Featurize the data set**: obtaining the data set and featurizing it with the featurization pipeline of choice. 22 | 23 | 2. **Run the experiment**: the ML model of choice, implemented in the `kinoml.ml` class is trained and evaluated. 24 | 25 | 26 | Please note that the order in which the different notebooks are displayed here is the recommended order for running them, providing a more comprehensive understanding of KinoML. 27 | 28 | ⚠️ You will need a valid OpenEye License for the tutorials to work. For the Schrodinger featurizers tutorial (`Schrodinger_structural_featurizer_showcase.ipynb`) you will also need a Schrodinger License! 29 | 30 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | ```{admonition} Warning! 2 | :class: warning 3 | 4 | This module is undergoing heavy development. None of the API calls are final. This software is provided without any guarantees of correctness, you will likely encounter bugs. 5 | 6 | If you are interested in this code, please wait for the official release to use it. In the meantime, to stay informed of development progress you are encouraged to: 7 | 8 | - Subscribe for new releases (use `Watch> Releases only` on GitHub) 9 | - Check out the [Github repository](https://github.com/openkinome/kinoml).
10 | 11 | ``` 12 | 13 | # KinoML 14 | 15 | Welcome to the Documentation of KinoML! The documentation is divided into two parts: 16 | 17 | * **User guide**: in this section you will learn how to use KinoML to filter and download data from a database, featurize your kinase data so that it is ML friendly and train and evaluate a ML model on your featurized kinase data. You will also learn about the KinoML object model, and how to access each of these objects. We also provide detailed examples of how to use every featurizer implemented within KinoML. 18 | 19 | * **Experiment tutorials**: this section shows how to use KinoML to run ML structure-based experiments. All experiments are structure-based and they are all end to end, from data collection to model training and evaluation. 20 | 21 | 22 | 23 | KinoML falls under the [OpenKinome](https://openkinome.org) initiative, which aims to leverage the increasingly available bioactivity data and scalable computational resources to perform kinase-centric drug design in the context of structure-informed machine learning and free energy calculations. `KinoML` is the main library supporting these efforts. 24 | 25 | Do you want to know more about the OpenKinome ecosystem? Check its [website](https://openkinome.org).
26 | 27 | 28 | 29 | ```{toctree} 30 | :caption: User guide 31 | :maxdepth: 3 32 | :hidden: 33 | 34 | notebooks/getting_started.nblink 35 | notebooks/kinoml_object_model.nblink 36 | notebooks/OpenEye_structural_featurizer.nblink 37 | notebooks/Schrodinger_structural_featurizer.nblink 38 | ``` 39 | 40 | ```{toctree} 41 | :caption: Experiment tutorials 42 | :maxdepth: 2 43 | :hidden: 44 | 45 | notebooks/ligand-only-smiles-EGFR.nblink 46 | notebooks/ligand-only-morgan1024-EGFR.nblink 47 | notebooks/kinase-ligand-informed-smiles-sequence-EGFR.nblink 48 | notebooks/kinase-ligand-informed-morgan-composition-EGFR.nblink 49 | ``` 50 | 51 | ```{toctree} 52 | :caption: Developers 53 | :maxdepth: 1 54 | :hidden: 55 | 56 | API Reference 57 | ``` 58 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this software, please cite it as below." 3 | title: "KinoML" 4 | version: "1.0.0" 5 | date-released: "2025-10-01" 6 | repository-code: "https://github.com/openkinome/kinoml" 7 | url: "https://github.com/openkinome/kinoml" 8 | 9 | authors: 10 | - family-names: "López-Ríos de Castro" 11 | given-names: "Raquel" 12 | - family-names: "Rodríguez-Guerra" 13 | given-names: "Jaime" 14 | - family-names: "Schaller" 15 | given-names: "David" 16 | - family-names: "Kimber" 17 | given-names: "Talia B." 18 | - family-names: "Taylor" 19 | given-names: "Corey" 20 | - family-names: "White" 21 | given-names: "Jessica B." 
22 | - family-names: "Backenköhler" 23 | given-names: "Michael" 24 | - family-names: "Groß" 25 | given-names: "Joschka" 26 | - family-names: "Payne" 27 | given-names: "Alexander" 28 | - family-names: "Kaminow" 29 | given-names: "Ben" 30 | - family-names: "Pulido" 31 | given-names: "Iván" 32 | - family-names: "Singh" 33 | given-names: "Sukrit" 34 | - family-names: "Kramer" 35 | given-names: "Paula Linh" 36 | - family-names: "Pérez-Hernández" 37 | given-names: "Guillermo" 38 | - family-names: "Volkamer" 39 | given-names: "Andrea" 40 | - family-names: "Chodera" 41 | given-names: "John D." 42 | 43 | preferred-citation: 44 | type: article 45 | title: "Lessons learned during the journey of data: from experiment to model for predicting kinase affinity, selectivity, polypharmacology, and resistance" 46 | authors: 47 | - family-names: "López-Ríos de Castro" 48 | given-names: "Raquel" 49 | - family-names: "Rodríguez-Guerra" 50 | given-names: "Jaime" 51 | - family-names: "Schaller" 52 | given-names: "David" 53 | - family-names: "Kimber" 54 | given-names: "Talia B." 55 | - family-names: "Taylor" 56 | given-names: "Corey" 57 | - family-names: "White" 58 | given-names: "Jessica B." 59 | - family-names: "Backenköhler" 60 | given-names: "Michael" 61 | - family-names: "Groß" 62 | given-names: "Joschka" 63 | - family-names: "Payne" 64 | given-names: "Alexander" 65 | - family-names: "Kaminow" 66 | given-names: "Ben" 67 | - family-names: "Pulido" 68 | given-names: "Iván" 69 | - family-names: "Singh" 70 | given-names: "Sukrit" 71 | - family-names: "Kramer" 72 | given-names: "Paula Linh" 73 | - family-names: "Pérez-Hernández" 74 | given-names: "Guillermo" 75 | - family-names: "Volkamer" 76 | given-names: "Andrea" 77 | - family-names: "Chodera" 78 | given-names: "John D." 
79 | journal: "bioRxiv" 80 | year: 2024 81 | doi: "10.1101/2024.09.10.612176" 82 | url: "https://doi.org/10.1101/2024.09.10.612176" 83 | -------------------------------------------------------------------------------- /kinoml/tests/core/test_proteins.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test kinoml.core.proteins 3 | """ 4 | from importlib import resources 5 | 6 | from MDAnalysis.core.universe import Universe 7 | from openeye import oechem 8 | import pandas as pd 9 | 10 | 11 | def test_protein_from_file(): 12 | """Check from file reading with MDAnalysis and OpenEye.""" 13 | from kinoml.core.proteins import Protein 14 | 15 | with resources.path("kinoml.data.proteins", "4f8o.pdb") as path: 16 | protein = Protein.from_file(path) 17 | assert isinstance(protein.molecule, oechem.OEGraphMol) 18 | protein = Protein.from_file(path, toolkit="MDAnalysis") 19 | assert isinstance(protein.molecule, Universe) 20 | 21 | 22 | def test_protein_from_pdb(): 23 | """Check instantation from PDB ID.""" 24 | from kinoml.core.proteins import Protein 25 | 26 | protein = Protein.from_pdb("4yne") 27 | assert isinstance(protein.molecule, oechem.OEGraphMol) 28 | protein = Protein.from_pdb("4yne", toolkit="MDAnalysis") 29 | assert isinstance(protein.molecule, Universe) 30 | 31 | 32 | def test_lazy_protein(): 33 | """Check lazy instantiation via PDB ID.""" 34 | from kinoml.core.proteins import Protein 35 | 36 | protein = Protein(pdb_id="4yne") 37 | assert isinstance(protein._molecule, type(None)) 38 | assert isinstance(protein.molecule, oechem.OEGraphMol) 39 | assert isinstance(protein._molecule, oechem.OEGraphMol) 40 | protein = Protein(pdb_id="4yne", toolkit="MDAnalysis") 41 | assert isinstance(protein.molecule, Universe) 42 | 43 | 44 | def test_klifskinase_kinase_klifs_sequence(): 45 | """Check access to kinase_klifs_sequence.""" 46 | from kinoml.core.proteins import KLIFSKinase 47 | 48 | kinase = KLIFSKinase(uniprot_id="P04629") 49 | 
assert len(kinase.kinase_klifs_sequence) == 85 50 | assert isinstance(kinase.sequence, str) 51 | kinase = KLIFSKinase(kinase_klifs_id=480) 52 | assert len(kinase.kinase_klifs_sequence) == 85 53 | assert isinstance(kinase.sequence, str) 54 | kinase = KLIFSKinase(structure_klifs_id=3620) 55 | assert len(kinase.kinase_klifs_sequence) == 85 56 | assert isinstance(kinase.sequence, str) 57 | 58 | 59 | def test_klifskinase_structure_klifs_sequence(): 60 | """Check access to structure_klifs_sequence.""" 61 | from kinoml.core.proteins import KLIFSKinase 62 | 63 | kinase = KLIFSKinase(structure_klifs_id=3620) 64 | assert len(kinase.structure_klifs_sequence) == 85 65 | 66 | 67 | def test_klifskinase_structure_klifs_residues(): 68 | """Check access to structure_klifs_residues.""" 69 | from kinoml.core.proteins import KLIFSKinase 70 | 71 | kinase = KLIFSKinase(structure_klifs_id=3620) 72 | assert isinstance(kinase.structure_klifs_residues, pd.DataFrame) is True 73 | assert len(kinase.structure_klifs_residues) == 85 74 | -------------------------------------------------------------------------------- /kinoml/core/conditions.py: -------------------------------------------------------------------------------- 1 | """ 2 | Each Measurement object can store a ``conditions`` 3 | attribute which should contain one of the classes 4 | here defined. 5 | 6 | For example, experimental measurements can have an 7 | ``AssayConditions`` object specifying the variables 8 | involved in the experiment, like pH. 9 | """ 10 | 11 | from typing import Union 12 | import json 13 | 14 | 15 | class BaseConditions: 16 | 17 | """ 18 | Contains information about the experimental conditions. 19 | We ensure objects are immutable by using read-only properties 20 | for all attributes. Do NOT modify private attributes or 21 | hashing will break. 22 | 23 | Parameters 24 | ---------- 25 | strict : bool, optional=True 26 | Whether to perform safety checks at initialization. 
27 | """ 28 | 29 | def __init__(self, strict: bool = True): 30 | if strict: 31 | self.check() 32 | 33 | def check(self): 34 | """ 35 | Perform some checks for valid values 36 | """ 37 | 38 | def _properties(self, classname: bool = True) -> dict: 39 | """ 40 | Return a dictionary with the classname and all defined properties. 41 | Used for equality comparisons in subclasses. 42 | 43 | Parameters 44 | ---------- 45 | classname : bool, optional=True 46 | Whether to include the name of the instance class 47 | 48 | Returns 49 | ------- 50 | dict 51 | """ 52 | props = {"classname": self.__class__.__name__} if classname else {} 53 | for name in dir(self): 54 | if name.startswith("_"): 55 | continue 56 | clsattr = getattr(self.__class__, name) 57 | if isinstance(clsattr, property): 58 | props[name] = getattr(self, name) 59 | return props 60 | 61 | def __hash__(self): 62 | return hash(json.dumps(self._properties())) 63 | 64 | def __eq__(self, other): 65 | return self._properties() == other._properties() 66 | 67 | def __repr__(self) -> str: 68 | return ( 69 | f"<{self.__class__.__name__} " 70 | f"{' '.join([f'{k}={v}' for k, v in self._properties(classname=False).items()])}>" 71 | ) 72 | 73 | 74 | class AssayConditions(BaseConditions): 75 | """ 76 | Contains information about the experimental conditions 77 | of a given assay. 
78 | 79 | Parameters 80 | ---------- 81 | pH : int or float, optional=7.0 82 | Acidity conditions 83 | """ 84 | 85 | def __init__(self, pH: Union[int, float] = 7.0, *args, **kwargs): 86 | self._pH = pH 87 | 88 | # Finish initialization 89 | super().__init__(*args, **kwargs) 90 | 91 | @property 92 | def pH(self): 93 | return self._pH 94 | 95 | def check(self): 96 | super().check() 97 | assert 0 <= self.pH <= 14, f"pH must be within [0, 14], but {self.pH} was specified" 98 | -------------------------------------------------------------------------------- /kinoml/tests/databases/test_pdb.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test pdb functionalities of `kinoml.databases` 3 | """ 4 | from contextlib import contextmanager 5 | from pathlib import PosixPath 6 | import pytest 7 | 8 | 9 | @contextmanager 10 | def does_not_raise(): 11 | yield 12 | 13 | 14 | @pytest.mark.parametrize( 15 | "pdb_ids, expectation, smiles_list", 16 | [ 17 | ( 18 | ["EDO"], 19 | does_not_raise(), 20 | ["C(CO)O"], 21 | ), 22 | ( 23 | ["---"], 24 | pytest.raises(KeyError), 25 | ["---"], 26 | ), 27 | ( 28 | ["EDO", "GOL"], 29 | does_not_raise(), 30 | ["C(CO)O", "C(C(CO)O)O"], 31 | ), 32 | ], 33 | ) 34 | def test_smiles_from_pdb(pdb_ids, expectation, smiles_list): 35 | """Compare results for expected SMILES.""" 36 | from kinoml.databases.pdb import smiles_from_pdb 37 | 38 | with expectation: 39 | ligands = smiles_from_pdb(pdb_ids) 40 | for pdb_id, smiles in zip(pdb_ids, smiles_list): 41 | assert ligands[pdb_id] == smiles 42 | 43 | 44 | @pytest.mark.parametrize( 45 | "pdb_id, return_type", 46 | [ 47 | ( 48 | "4YNE", 49 | PosixPath, 50 | ), # PDB and CIF format available 51 | ( 52 | "1BOS", 53 | PosixPath, 54 | ), # only CIF format available 55 | ( 56 | "XXXX", 57 | bool, 58 | ), # wrong code 59 | ], 60 | ) 61 | def test_download_pdb_structure(pdb_id, return_type): 62 | """Try to download PDB structures.""" 63 | from tempfile import 
TemporaryDirectory 64 | 65 | from kinoml.databases.pdb import download_pdb_structure 66 | 67 | with TemporaryDirectory() as temporary_directory: 68 | assert isinstance(download_pdb_structure(pdb_id, temporary_directory), return_type) 69 | 70 | 71 | @pytest.mark.parametrize( 72 | "pdb_id, chain_id, expo_id, smiles, return_type", 73 | [ 74 | ( 75 | "4YNE", # PDB and CIF format available 76 | "A", 77 | "4EK", 78 | "c1ccnc(c1)c2cnc3n2nc(cc3)N4CCC[C@@H]4c5cccc(c5)F", 79 | PosixPath, 80 | ), 81 | ( 82 | "1BOS", # only CIF format available 83 | "E", 84 | "GAL", 85 | "C([C@@H]1[C@@H]([C@@H]([C@H]([C@@H](O1)O)O)O)O)O", 86 | PosixPath, 87 | ), 88 | ( 89 | "XXXX", 90 | "X", 91 | "XXX", 92 | "xxxxx", 93 | bool, 94 | ), # wrong code 95 | ], 96 | ) 97 | def test_download_pdb_ligand(pdb_id, chain_id, expo_id, smiles, return_type): 98 | """Try to download PDB ligands.""" 99 | from tempfile import TemporaryDirectory 100 | from kinoml.databases.pdb import download_pdb_ligand 101 | 102 | with TemporaryDirectory() as temporary_directory: 103 | assert isinstance(download_pdb_ligand(pdb_id, chain_id, expo_id, smiles), return_type) 104 | -------------------------------------------------------------------------------- /kinoml/analysis/metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error 3 | 4 | 5 | def root_mean_squared_error(*args, **kwargs): 6 | """ 7 | Returns the square-root of ``scikit-learn``'s ``mean_squared_error`` metric. 8 | All arguments are forwarded to that function. 9 | """ 10 | return np.sqrt(mean_squared_error(*args, **kwargs)) 11 | 12 | 13 | def performance( 14 | predicted, 15 | observed, 16 | verbose=True, 17 | n_boot=100, 18 | confidence=0.95, 19 | sample_ratio=0.8, 20 | _seed=1234, 21 | ): 22 | """ 23 | Measure the predicted vs observed performance with different metrics (R2, MSE, MAE, RMSE).
24 | 25 | Parameters 26 | ---------- 27 | predicted : array-like 28 | Data points predicted by the model. 29 | observed : array-like 30 | Observed data points, as available in the dataset. 31 | verbose : bool, optional=True 32 | Whether to print results to stdout. 33 | n_boot : int, optional=100 34 | Number of bootstrap iterations. Set to ``1`` to disable 35 | bootstrapping. 36 | confidence : float, optional=0.95 37 | Confidence interval, relative to 1. Default is 95%. 38 | sample_ratio : float, optional=0.8 39 | Proportion of data to sample in each iteration. 40 | _seed : int, optional=1234 41 | Random seed. Each bootstrap iteration gets a different seed 42 | based on this initial one. 43 | 44 | Returns 45 | ------- 46 | results : dict of tuple 47 | This dictionary contains one item per metric (see above), 48 | with a 4-element tuple each: mean, standard deviation, and lower and 49 | upper bounds for the confidence interval. 50 | 51 | Note 52 | ---- 53 | **TODO**: Reimplement samples with ``scipy.stats.norm`` or with ``numpy``. 
54 | 55 | """ 56 | assert 0.5 <= confidence < 1, "Confidence must be in [0.5, 1)" 57 | assert 0 < sample_ratio <= 1, "Sample ratio must be in (0, 1]" 58 | 59 | high = predicted.shape[0] 60 | size = int(sample_ratio * high) 61 | metrics = { 62 | "r2": r2_score, 63 | "mse": mean_squared_error, 64 | "mae": mean_absolute_error, 65 | "rmse": root_mean_squared_error, 66 | } 67 | bootstrapped = np.empty((len(metrics), n_boot)) 68 | 69 | for i in range(n_boot): 70 | rng = np.random.RandomState(_seed + i) 71 | indices = rng.randint(low=0, high=high, size=size) 72 | obs, pred = observed[indices], predicted[indices] 73 | for j, (key, fn) in enumerate(sorted(metrics.items())): 74 | bootstrapped[j][i] = fn(obs, pred) 75 | 76 | # FIXME: Sort metrics as suggested here https://stackoverflow.com/a/40491405 77 | bootstrapped.sort(axis=1) 78 | 79 | results = {} 80 | for index, key in enumerate(sorted(metrics)): 81 | arr = bootstrapped[index] 82 | 83 | results[key] = mean, std, low, high = ( 84 | arr.mean(), 85 | arr.std(), 86 | np.quantile(arr, 1 - confidence), 87 | np.quantile(arr, confidence), 88 | ) 89 | if verbose: 90 | print( 91 | f"{key.upper():>4s}: {mean:.4f}±{std:.4f} {100*confidence:.0f}CI=({low:.4f}, {high:.4f})" 92 | ) 93 | 94 | return results 95 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - "master" 7 | - "maintenance/.+" 8 | pull_request: 9 | branches: 10 | - "master" 11 | - "maintenance/.+" 12 | schedule: 13 | # Run a cron job once weekly 14 | - cron: "0 0 * * 0" 15 | workflow_dispatch: 16 | 17 | jobs: 18 | test: 19 | name: ${{ matrix.name }} 20 | runs-on: ${{ matrix.os }} 21 | strategy: 22 | fail-fast: false 23 | matrix: 24 | include: 25 | - name: Linux, Python 3.9 26 | os: ubuntu-latest 27 | python-version: "3.9" 28 | conda-installer: 
https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh 29 | 30 | - name: Linux, Python 3.10 31 | os: ubuntu-latest 32 | python-version: "3.10" 33 | conda-installer: https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh 34 | 35 | - name: MacOS, Python 3.9 36 | os: macOS-latest 37 | python-version: "3.9" 38 | conda-installer: https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-MacOSX-x86_64.sh 39 | 40 | env: 41 | OE_LICENSE: ${{ github.workspace }}/oe_license.txt 42 | MODELLER_LICENSE: ${{ secrets.MODELLER_LICENSE }} 43 | 44 | steps: 45 | - uses: actions/checkout@v2 46 | 47 | - uses: conda-incubator/setup-miniconda@v2 48 | with: 49 | installer-url: ${{ matrix.conda-installer }} 50 | python-version: ${{ matrix.python-version }} 51 | activate-environment: test 52 | channel-priority: true 53 | environment-file: devtools/conda-envs/test_env.yaml 54 | auto-activate-base: false 55 | use-mamba: true 56 | 57 | - name: Additional info about the build 58 | shell: bash 59 | run: | 60 | uname -a 61 | df -h 62 | ulimit -a 63 | 64 | - name: Environment Information 65 | shell: bash -l {0} 66 | run: | 67 | conda info --all 68 | conda list 69 | mamba --version 70 | 71 | - name: Decrypt and check OE license 72 | shell: bash -l {0} 73 | env: 74 | OE_LICENSE_TEXT: ${{ secrets.OE_LICENSE }} 75 | run: | 76 | echo "${OE_LICENSE_TEXT}" > ${OE_LICENSE} 77 | python -c "import openeye; assert openeye.OEChemIsLicensed()" 78 | 79 | - name: Install package 80 | shell: bash -l {0} 81 | run: | 82 | python -m pip install --no-deps .
83 | 84 | - name: Run tests 85 | shell: bash -l {0} 86 | run: | 87 | pytest -v -n auto --dist load --cov=kinoml --cov-report=xml --color=yes -k "not read_electron_density" kinoml/tests/ 88 | 89 | - name: Run notebooks 90 | shell: bash -l {0} 91 | run: | 92 | pytest -v -n auto --dist loadscope --nbval-lax -k "not Schrodinger_structural_featurizer.ipynb" examples/*.ipynb 93 | 94 | - name: CodeCov 95 | uses: codecov/codecov-action@v1 96 | if: always() 97 | with: 98 | token: ${{ secrets.CODECOV_TOKEN }} 99 | file: ./coverage.xml 100 | flags: unittests 101 | yml: ./.codecov.yml 102 | -------------------------------------------------------------------------------- /devtools/README.md: -------------------------------------------------------------------------------- 1 | # Development, testing, and deployment tools 2 | 3 | This directory contains a collection of tools for running Continuous Integration (CI) tests, 4 | conda installation, and other development tools not directly related to the coding process. 5 | 6 | 7 | ## Manifest 8 | 9 | ### Continuous Integration 10 | 11 | You should test your code, but do not feel compelled to use these specific programs. You also may not need Unix and 12 | Windows testing if you only plan to deploy on specific platforms. These are just to help you get started 13 | 14 | * `travis-ci`: Linux and OSX based testing through [Travis-CI](https://about.travis-ci.com/) 15 | * `before_install.sh`: Pip/Miniconda pre-package installation script for Travis 16 | * `appveyor`: Windows based testing through [AppVeyor](https://www.appveyor.com/) (there are no files directly related to this) 17 | 18 | ### Conda Environment: 19 | 20 | This directory contains the files to setup the Conda environment for testing purposes 21 | 22 | * `conda-envs`: directory containing the YAML file(s) which fully describe Conda Environments, their dependencies, and those dependency provenance's 23 | * `test_env.yaml`: Simple test environment file with base dependencies. 
Channels are not specified here and therefore respect global Conda configuration 24 | 25 | ### Additional Scripts: 26 | 27 | This directory contains OS agnostic helper scripts which don't fall in any of the previous categories 28 | * `scripts` 29 | * `create_conda_env.py`: Helper program for spinning up new conda environments based on a starter file with Python Version and Env. Name command-line options 30 | 31 | 32 | ## How to contribute changes 33 | - Clone the repository if you have write access to the main repo, fork the repository if you are a collaborator. 34 | - Make a new branch with `git checkout -b {your branch name}` 35 | - Make changes and test your code 36 | - Ensure that the test environment dependencies (`conda-envs`) line up with the build and deploy dependencies (`conda-recipe/meta.yaml`) 37 | - Push the branch to the repo (either the main or your fork) with `git push -u origin {your branch name}` 38 | * Note that `origin` is the default name assigned to the remote, yours may be different 39 | - Make a PR on GitHub with your changes 40 | - We'll review the changes and get your code into the repo after lively discussion! 41 | 42 | 43 | ## Checklist for updates 44 | - [ ] Make sure there is an/are issue(s) opened for your specific update 45 | - [ ] Create the PR, referencing the issue 46 | - [ ] Debug the PR as needed until tests pass 47 | - [ ] Tag the final, debugged version 48 | * `git tag -a X.Y.Z [latest pushed commit] && git push --follow-tags` 49 | - [ ] Get the PR merged in 50 | 51 | ## Versioneer Auto-version 52 | [Versioneer](https://github.com/warner/python-versioneer) will automatically infer what version 53 | is installed by looking at the `git` tags and how many commits ahead this version is. 
The format follows 54 | [PEP 440](https://www.python.org/dev/peps/pep-0440/) and has the regular expression of: 55 | ```regexp 56 | \d+.\d+.\d+(?\+\d+-[a-z0-9]+) 57 | ``` 58 | If the version of this commit is the same as a `git` tag, the installed version is the same as the tag, 59 | e.g. `kinoml-0.1.2`, otherwise it will be appended with `+X` where `X` is the number of commits 60 | ahead from the last tag, and then `-YYYYYY` where the `Y`'s are replaced with the `git` commit hash. 61 | -------------------------------------------------------------------------------- /kinoml/tests/features/test_ligand.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test ligand featurizers of `kinoml.features` 3 | """ 4 | import pytest 5 | import numpy as np 6 | 7 | from kinoml.core.systems import LigandSystem 8 | from kinoml.core.ligands import Ligand 9 | from kinoml.features.ligand import ( 10 | SingleLigandFeaturizer, 11 | MorganFingerprintFeaturizer, 12 | OneHotSMILESFeaturizer, 13 | GraphLigandFeaturizer, 14 | ) 15 | 16 | 17 | def test_single_ligand_featurizer(): 18 | ligand1 = Ligand(smiles="CCCC") 19 | single_ligand_system = LigandSystem(components=[ligand1]) 20 | featurizer = SingleLigandFeaturizer() 21 | featurizer.supports(single_ligand_system) 22 | 23 | ligand2 = Ligand(smiles="COCC") 24 | double_ligand_system = LigandSystem(components=[ligand1, ligand2]) 25 | with pytest.raises(ValueError): 26 | featurizer.featurize([double_ligand_system]) 27 | 28 | 29 | @pytest.mark.parametrize( 30 | "smiles, solution", 31 | [ 32 | ( 33 | "C", 34 | 
"00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000", 35 | ), 36 | ( 37 | "B", 38 | "00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000", 39 | ), 40 | ], 41 | ) 42 | def test_ligand_MorganFingerprintFeaturizer(smiles, solution): 43 | ligand = Ligand(smiles=smiles) 44 | system = LigandSystem([ligand]) 45 | featurizer = MorganFingerprintFeaturizer(radius=2, nbits=512, use_multiprocessing=False) 46 | featurizer.featurize([system]) 47 | fingerprint = system.featurizations[featurizer.name] 48 | solution_array = np.array(list(map(int, solution)), dtype="uint8") 49 | assert (fingerprint == solution_array).all() 50 | 51 | 52 | @pytest.mark.parametrize( 53 | "smiles, solution", 54 | [ 55 | ("C", np.array([[0, 1] + [0] * 51])), 56 | ("B", np.array([[1] + [0] * 52])), 57 | ("CC", np.array([[0, 1] + [0] * 51, [0, 1] + [0] * 51])), 58 | ], 59 | ) 60 | def test_ligand_OneHotSMILESFeaturizer(smiles, solution): 61 | ligand = Ligand(smiles=smiles) 62 | system = LigandSystem([ligand]) 63 | featurizer = OneHotSMILESFeaturizer(use_multiprocessing=False) 64 | 
featurizer.featurize([system]) 65 | matrix = system.featurizations[featurizer.name] 66 | assert matrix.shape == solution.T.shape 67 | assert (matrix == solution.T).all() 68 | 69 | 70 | @pytest.mark.parametrize( 71 | "smiles, n_edges, n_nodes, n_features", 72 | [("C", 0, 1, 69), ("CC", 2, 2, 69)], 73 | ) 74 | def test_ligand_GraphLigandFeaturizer_RDKit(smiles, n_edges, n_nodes, n_features): 75 | ligand = Ligand(smiles=smiles) 76 | system = LigandSystem([ligand]) 77 | GraphLigandFeaturizer(use_multiprocessing=False).featurize([system]) 78 | connectivity, features = system.featurizations["last"] 79 | assert len(connectivity[0]) == n_edges 80 | assert len(features) == n_nodes 81 | assert len(features[0]) == n_features 82 | -------------------------------------------------------------------------------- /kinoml/ml/tensorflow_models.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example models for TensorFlow 3 | 4 | .. note:: 5 | 6 | This code is not currently in use. 
7 | """ 8 | 9 | import tensorflow as tf # pylint: disable=import-error 10 | 11 | 12 | def DNN(input_dim): 13 | """ 14 | DNN builds and compiles a TF model (a Deep Neural Network) that takes as input 'input_dim' 15 | 16 | Parameters 17 | ---------- 18 | input_dim : tuple of int 19 | Expected shape of the input data 20 | 21 | Returns 22 | ------- 23 | model : tf.keras.models.Sequential 24 | """ 25 | model = tf.keras.models.Sequential( 26 | [ 27 | tf.keras.layers.Dense(350, activation="relu", input_dim=input_dim), 28 | tf.keras.layers.Dropout(0.2), 29 | tf.keras.layers.Dense(200, activation="relu"), 30 | tf.keras.layers.Dropout(0.2), 31 | tf.keras.layers.Dense(100, activation="relu"), 32 | tf.keras.layers.Dense(50, activation="relu"), 33 | tf.keras.layers.Dense(16, activation="relu"), 34 | tf.keras.layers.Dense(1, activation="sigmoid"), 35 | ] 36 | ) 37 | 38 | model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"]) 39 | return model 40 | 41 | 42 | def CNN(input_shape): 43 | """ 44 | CNN builds and compiles a TF model (a Convolutional Neural Network) that takes as input 'input_shape' 45 | Parameters 46 | ========== 47 | input_shape : tuple of int 48 | Expected shape of the input data 49 | Returns 50 | ======= 51 | model : tf.keras.models.Sequential 52 | """ 53 | 54 | model = tf.keras.Sequential( 55 | [ 56 | tf.keras.layers.Conv2D( 57 | filters=16, 58 | kernel_size=3, 59 | activation="relu", 60 | padding="same", 61 | input_shape=input_shape, 62 | ), 63 | tf.keras.layers.MaxPooling2D(), 64 | tf.keras.layers.Flatten(), 65 | tf.keras.layers.Dropout(0.2), 66 | tf.keras.layers.Dense(64, activation="relu"), 67 | tf.keras.layers.BatchNormalization(), 68 | tf.keras.layers.Dense(32, activation="relu"), 69 | tf.keras.layers.BatchNormalization(), 70 | tf.keras.layers.Dense(1, activation="sigmoid"), 71 | ] 72 | ) 73 | 74 | model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"]) 75 | return model 76 | 77 | 78 | def 
MPNN(input_shape): 79 | """ 80 | MPNN builds and compiles a TF model (a Message Passing Neural Network) that takes as input 'input_shape' 81 | Parameters 82 | ========== 83 | input_shape : tuple of int 84 | Expected shape of the input data 85 | Returns 86 | ======= 87 | model : tf.keras.models.Sequential 88 | """ 89 | 90 | model = tf.keras.Sequential( 91 | [ 92 | tf.keras.layers.Conv2D( 93 | filters=8, 94 | kernel_size=3, 95 | activation="relu", 96 | padding="same", 97 | input_shape=input_shape, 98 | ), 99 | tf.keras.layers.MaxPooling2D(), 100 | tf.keras.layers.Flatten(), 101 | tf.keras.layers.Dropout(0.2), 102 | tf.keras.layers.Dense(64, activation="relu"), 103 | tf.keras.layers.BatchNormalization(), 104 | tf.keras.layers.Dense(32, activation="relu"), 105 | tf.keras.layers.BatchNormalization(), 106 | tf.keras.layers.Dense(1, activation="sigmoid"), 107 | ] 108 | ) 109 | model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"]) 110 | return model 111 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, 8 | body size, disability, ethnicity, gender identity and expression, level of 9 | experience, nationality, personal appearance, race, religion, or sexual 10 | identity and orientation. 
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment include: 15 | 16 | * Using welcoming and inclusive language 17 | * Being respectful of differing viewpoints and experiences 18 | * Gracefully accepting constructive criticism 19 | * Focusing on what is best for the community 20 | * Showing empathy towards other community members 21 | 22 | Examples of unacceptable behavior by participants include: 23 | 24 | * The use of sexualized language or imagery and unwelcome sexual attention or advances 25 | * Trolling, insulting/derogatory comments, and personal or political attacks 26 | * Public or private harassment 27 | * Publishing others' private information, such as a physical or electronic address, without explicit permission 28 | * Other conduct which could reasonably be considered inappropriate in a professional setting 29 | 30 | ## Our Responsibilities 31 | 32 | Project maintainers are responsible for clarifying the standards of acceptable 33 | behavior and are expected to take appropriate and fair corrective action in 34 | response to any instances of unacceptable behavior. 35 | 36 | Project maintainers have the right and responsibility to remove, edit, or 37 | reject comments, commits, code, wiki edits, issues, and other contributions 38 | that are not aligned to this Code of Conduct, or to ban temporarily or 39 | permanently any contributor for other behaviors that they deem inappropriate, 40 | threatening, offensive, or harmful. 41 | 42 | Moreover, project maintainers will strive to offer feedback and advice to 43 | ensure quality and consistency of contributions to the code. Contributions 44 | from outside the group of project maintainers are strongly welcomed but the 45 | final decision as to whether commits are merged into the codebase rests with 46 | the team of project maintainers. 
47 | 48 | ## Scope 49 | 50 | This Code of Conduct applies both within project spaces and in public spaces 51 | when an individual is representing the project or its community. Examples of 52 | representing a project or community include using an official project e-mail 53 | address, posting via an official social media account, or acting as an 54 | appointed representative at an online or offline event. Representation of a 55 | project may be further defined and clarified by project maintainers. 56 | 57 | ## Enforcement 58 | 59 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 60 | reported by contacting the project team at 'jaime.rodriguez-guerra@choderalab.org'. The project team will 61 | review and investigate all complaints, and will respond in a way that it deems 62 | appropriate to the circumstances. The project team is obligated to maintain 63 | confidentiality with regard to the reporter of an incident. Further details of 64 | specific enforcement policies may be posted separately. 65 | 66 | Project maintainers who do not follow or enforce the Code of Conduct in good 67 | faith may face temporary or permanent repercussions as determined by other 68 | members of the project's leadership. 69 | 70 | ## Attribution 71 | 72 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 73 | version 1.4, available at 74 | [http://contributor-covenant.org/version/1/4][version] 75 | 76 | [homepage]: http://contributor-covenant.org 77 | [version]: http://contributor-covenant.org/version/1/4/ 78 | -------------------------------------------------------------------------------- /docs/developers/api_docs.md: -------------------------------------------------------------------------------- 1 | # How to write docs with Sphinx, MyST and Material Theme 2 | 3 | We are using Sphinx for our documentation. 
However, instead of using the default RST, 4 | you can also use Markdown syntax thanks to the [MyST parser](https://myst-parser.readthedocs.io/). 5 | The theme is [Material for Sphinx](https://github.com/bashtage/sphinx-material/). 6 | 7 | ## Basics 8 | 9 | - `cd docs && make livebuild` - Start the live-reloading docs server locally 10 | 11 | Project layout: 12 | 13 | docs/ 14 | index.md # The documentation homepage. 15 | conf.py # The configuration file 16 | ... # Other markdown pages, images and other files. 17 | 18 | We prefer using Markdown for the documentation, but the Python docstrings 19 | use RST with [NumpyDoc](https://numpydoc.readthedocs.io/en/latest/format.html#docstring-standard) 20 | conventions. Check the existing docstrings for syntax examples. 21 | 22 | ## Supported extensions 23 | 24 | MyST adds some [extra stuff](https://myst-parser.readthedocs.io/en/latest/using/syntax.html#) 25 | on top of plain Markdown. Some examples: 26 | 27 | ### Admonitions 28 | 29 | ```{note} 30 | This is so cool huh? Check all styles [here](https://docutils.sourceforge.io/docs/ref/rst/directives.html#specific-admonitions). 31 | ``` 32 | 33 | ````md 34 | ```{note} 35 | This is so cool huh? Check all styles [here](https://docutils.sourceforge.io/docs/ref/rst/directives.html#specific-admonitions). 36 | ``` 37 | ```` 38 | 39 | ### Footnotes 40 | 41 | > This is a very important finding.[^1] 42 | 43 | > This is yet another finding.[^jaimergp1990] 44 | 45 | [^1]: Lorem ipsum dolor sit amet, consectetur adipiscing elit. 46 | [^jaimergp1990]: A kid named Jaime. 47 | 48 | These are written with labels like this: 49 | 50 | ```md 51 | > This is a very important finding.[^1] 52 | 53 | > This is yet another finding.[^jaimergp1990] 54 | 55 | [^1]: Lorem ipsum dolor sit amet, consectetur adipiscing elit. 56 | [^jaimergp1990]: A kid named Jaime. 
57 | ``` 58 | 59 | ### LaTeX 60 | 61 | Either in blocks 62 | 63 | $$ 64 | \frac{n!}{k!(n-k)!} = \binom{n}{k} * KinoML 65 | $$ 66 | 67 | ```latex 68 | $$ 69 | \frac{n!}{k!(n-k)!} = \binom{n}{k} * KinoML 70 | $$ 71 | ``` 72 | 73 | or inline: 74 | 75 | This my best equation ever: $p(x|y) = \frac{p(y|x)p(x)}{p(y)}$ 76 | 77 | ```latex 78 | This my best equation ever: $p(x|y) = \frac{p(y|x)p(x)}{p(y)}$ 79 | ``` 80 | 81 | ### Tabbed fences 82 | 83 | :::{tabbed} Step 1 84 | 85 | This is the step 1 86 | ::: 87 | 88 | :::{tabbed} Step 2 89 | 90 | ```python 91 | # This is the step 2 with python code highlighting 92 | he = Element("Helium") 93 | ``` 94 | 95 | ::: 96 | 97 | :::{tabbed} Step 3 98 | 99 | This is the step 3 100 | ::: 101 | 102 | This line interrupts the fences and creates a new block of tabs 103 | 104 | :::{tabbed} Step 4 105 | 106 | ```python 107 | # This is the step 4 with python code highlighting 108 | 109 | be = Element("Beryllium") 110 | ``` 111 | 112 | ::: 113 | 114 | Obtained with: 115 | 116 | ```` 117 | :::{tabbed} Step 1 118 | 119 | This is the step 1 120 | ::: 121 | 122 | ::::{tabbed} Step 2 123 | ```python 124 | # This is the step 2 with python code highlighting 125 | he = Element("Helium") 126 | ``` 127 | :::: 128 | 129 | :::{tabbed} Step 3 130 | 131 | This is the step 3 132 | ::: 133 | 134 | This line interrupts the fences and creates a new block of tabs 135 | 136 | :::{tabbed} Step 4 137 | ```python 138 | # This is the step 4 with python code highlighting 139 | 140 | be = Element("Beryllium") 141 | ``` 142 | ::: 143 | 144 | ```` 145 | 146 | ### Extra inline markup 147 | 148 | | Code | Result | 149 | | --------- | ------- | 150 | | `==hey==` | ==hey== | 151 | | `~~hey~~` | ~~hey~~ | 152 | | `^^hey^^` | ^^hey^^ | 153 | | `a^migo^` | a^migo^ | 154 | | `-->` | --> | 155 | -------------------------------------------------------------------------------- /devtools/scripts/create_conda_env.py: 
-------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import re 4 | import glob 5 | import shutil 6 | import subprocess as sp 7 | from tempfile import TemporaryDirectory 8 | from contextlib import contextmanager 9 | # YAML imports 10 | try: 11 | import yaml # PyYAML 12 | loader = yaml.load 13 | except ImportError: 14 | try: 15 | import ruamel_yaml as yaml # Ruamel YAML 16 | except ImportError: 17 | try: 18 | # Load Ruamel YAML from the base conda environment 19 | from importlib import util as import_util 20 | CONDA_BIN = os.path.dirname(os.environ['CONDA_EXE']) 21 | ruamel_yaml_path = glob.glob(os.path.join(CONDA_BIN, '..', 22 | 'lib', 'python*.*', 'site-packages', 23 | 'ruamel_yaml', '__init__.py'))[0] 24 | # Based on importlib example, but only needs to load_module since its the whole package, not just 25 | # a module 26 | spec = import_util.spec_from_file_location('ruamel_yaml', ruamel_yaml_path) 27 | yaml = spec.loader.load_module() 28 | except (KeyError, ImportError, IndexError): 29 | raise ImportError("No YAML parser could be found in this or the conda environment. " 30 | "Could not find PyYAML or Ruamel YAML in the current environment, " 31 | "AND could not find Ruamel YAML in the base conda environment through CONDA_EXE path. 
" 32 | "Environment not created!") 33 | loader = yaml.YAML(typ="safe").load # typ="safe" avoids odd typing on output 34 | 35 | 36 | @contextmanager 37 | def temp_cd(): 38 | """Temporary CD Helper""" 39 | cwd = os.getcwd() 40 | with TemporaryDirectory() as td: 41 | try: 42 | os.chdir(td) 43 | yield 44 | finally: 45 | os.chdir(cwd) 46 | 47 | 48 | # Args 49 | parser = argparse.ArgumentParser(description='Creates a conda environment from file for a given Python version.') 50 | parser.add_argument('-n', '--name', type=str, 51 | help='The name of the created Python environment') 52 | parser.add_argument('-p', '--python', type=str, 53 | help='The version of the created Python environment') 54 | parser.add_argument('conda_file', 55 | help='The file for the created Python environment') 56 | 57 | args = parser.parse_args() 58 | 59 | # Open the base file 60 | with open(args.conda_file, "r") as handle: 61 | yaml_script = loader(handle.read()) 62 | 63 | python_replacement_string = "python {}*".format(args.python) 64 | 65 | try: 66 | for dep_index, dep_value in enumerate(yaml_script['dependencies']): 67 | if re.match('python([ ><=*]+[0-9.*]*)?$', dep_value): # Match explicitly 'python' and its formats 68 | yaml_script['dependencies'].pop(dep_index) 69 | break # Making the assumption there is only one Python entry, also avoids need to enumerate in reverse 70 | except (KeyError, TypeError): 71 | # Case of no dependencies key, or dependencies: None 72 | yaml_script['dependencies'] = [] 73 | finally: 74 | # Ensure the python version is added in. 
Even if the code does not need it, we assume the env does 75 | yaml_script['dependencies'].insert(0, python_replacement_string) 76 | 77 | # Figure out conda path 78 | if "CONDA_EXE" in os.environ: 79 | conda_path = os.environ["CONDA_EXE"] 80 | else: 81 | conda_path = shutil.which("conda") 82 | if conda_path is None: 83 | raise RuntimeError("Could not find a conda binary in CONDA_EXE variable or in executable search path") 84 | 85 | print("CONDA ENV NAME {}".format(args.name)) 86 | print("PYTHON VERSION {}".format(args.python)) 87 | print("CONDA FILE NAME {}".format(args.conda_file)) 88 | print("CONDA PATH {}".format(conda_path)) 89 | 90 | # Write to a temp directory which will always be cleaned up 91 | with temp_cd(): 92 | temp_file_name = "temp_script.yaml" 93 | with open(temp_file_name, 'w') as f: 94 | f.write(yaml.dump(yaml_script)) 95 | sp.call("{} env create -n {} -f {}".format(conda_path, args.name, temp_file_name), shell=True) 96 | -------------------------------------------------------------------------------- /kinoml/datasets/groups.py: -------------------------------------------------------------------------------- 1 | """ 2 | Splitting strategies for datasets 3 | """ 4 | import random 5 | from collections import defaultdict 6 | 7 | from tqdm.auto import tqdm 8 | 9 | 10 | class BaseGrouper: 11 | """ 12 | Base class to assign groups to measurements in a DatasetProvider 13 | """ 14 | 15 | def __init__(self): 16 | pass 17 | 18 | def assign(self, dataset, overwrite=False, **kwargs): 19 | """ 20 | Given a DatasetProvider, assign a key to the elements 21 | of each group, as provided by ``.indices()`` 22 | 23 | Parameters 24 | ---------- 25 | dataset : DatasetProvider 26 | overwrite : bool, optional=False 27 | If a measurement has been assigned a group already, 28 | do not overwrite unless this option is set to True. 29 | 30 | Returns 31 | ------- 32 | dataset : DatasetProvider 33 | The same dataset passed in the input, with 34 | measurements modified in place. 
35 | """ 36 | groups = self.indices(dataset, **kwargs) 37 | measurements = dataset.measurements 38 | for key, indices in groups.items(): 39 | for index in indices: 40 | ms = measurements[index] 41 | if not overwrite and ms.group is not None: 42 | raise ValueError( 43 | f"Cannot assign group to `{ms}` because a group is " 44 | f"already assigned: {ms.group}. Choose `overwrite=True` " 45 | f"to ignore existing groups." 46 | ) 47 | ms.group = key 48 | return dataset 49 | 50 | def indices(self, dataset, **kwargs): 51 | """ 52 | Given a dataset, create a dictionary that maps keys or labels 53 | to a set of numerical indices. The strategy to follow will 54 | depend on the subclass. 55 | 56 | Parameters 57 | ---------- 58 | dataset : DatasetProvider 59 | 60 | Returns 61 | ------- 62 | dict 63 | Maps ``int` or ``str`` to a list of ``int`` 64 | """ 65 | raise NotImplementedError("Implement in your subclass") 66 | 67 | 68 | class RandomGrouper(BaseGrouper): 69 | 70 | """ 71 | Randomized groups following a split proportional to the provided ratios 72 | 73 | Parameters 74 | ---------- 75 | ratios : tuple or dict 76 | 1-based ratios for the different groups. They must sum 1.0. If a 77 | dict is provided, the keys are used to label the resulting groups. 78 | Otherwise, the groups are 0-enumerated. 
79 | 80 | """ 81 | 82 | def __init__(self, ratios): 83 | if isinstance(ratios, (list, tuple)): 84 | ratios = {i: ratio for i, ratio in enumerate(ratios)} 85 | assert sum(ratios.values()) == 1, f"`ratios` must sum 1, but you provided {ratios}" 86 | self.ratios = ratios 87 | 88 | def indices(self, dataset, **kwargs): 89 | length = len(dataset) 90 | indices = list(range(length)) 91 | random.shuffle(indices) 92 | groups = {} 93 | start = 0 94 | for key, ratio in self.ratios.items(): 95 | end = start + int(round(ratio * length, 0)) 96 | groups[key] = indices[start:end] 97 | start = end 98 | return groups 99 | 100 | 101 | class CallableGrouper(BaseGrouper): 102 | """ 103 | A grouper that applies a user-provided function to each Measurement 104 | in the Dataset. Returned value should be the name of the group. 105 | 106 | Parameters 107 | ---------- 108 | function : callable 109 | This function must be able to take a ``Measurement`` object 110 | and return a ``str`` or ``int``. 111 | """ 112 | 113 | def __init__(self, function): 114 | self.function = function 115 | 116 | def indices(self, dataset, progress=True): 117 | iterator = enumerate(dataset.measurements) 118 | if progress: 119 | iterator = tqdm(iterator) 120 | 121 | groups = defaultdict(list) 122 | for i, measurement in iterator: 123 | key = self.function(measurement) 124 | groups[key].append(i) 125 | return groups 126 | 127 | 128 | class BaseFilter(BaseGrouper): 129 | pass 130 | -------------------------------------------------------------------------------- /kinoml/core/systems.py: -------------------------------------------------------------------------------- 1 | """ 2 | ``System`` objects define a collection of related 3 | ``MolecularComponent`` instances. They are normally 4 | attached to a ``Measurement``, and, in the context 5 | of a machine learning exercise, will be featurized 6 | with different classes found under ``kinoml.features``. 
7 | Featurization turns a ``System`` into a tensor-like 8 | object, like Numpy arrays. 9 | """ 10 | from __future__ import annotations 11 | 12 | from typing import Iterable 13 | 14 | from .components import MolecularComponent 15 | from .ligands import BaseLigand 16 | from .proteins import BaseProtein 17 | 18 | 19 | class System: 20 | 21 | """ 22 | System objects host one or more MolecularComponent. 23 | 24 | Parameters 25 | ---------- 26 | components : list of MolecularComponent 27 | Molecular entities defining this system 28 | strict: bool, optional=True 29 | Whether to perform sanity checks (default) or not. 30 | 31 | Attributes 32 | ---------- 33 | featurizations : dict 34 | This dictionary will store the different featurization 35 | steps a ``System`` is submitted to. The keys for this 36 | dictionary are usually the *name* of the featurizer 37 | class. Additionally, a ``Pipeline`` might define 38 | a ``last`` key, indicating that particular object 39 | was the final result of a chain of featurizers. 
40 | """ 41 | 42 | def __init__( 43 | self, 44 | components: Iterable[MolecularComponent], 45 | strict: bool = True, 46 | *args, 47 | **kwargs, 48 | ): 49 | super().__init__(*args, **kwargs) 50 | self.components = components 51 | self.featurizations = {} 52 | if strict: 53 | self.check() 54 | 55 | def _components_by_type(self, type_): 56 | """ 57 | Yield MolecularComponent objects of a given type only 58 | """ 59 | for component in self.components: 60 | if isinstance(component, type_): 61 | yield component 62 | 63 | def check(self): 64 | assert self.components, "`System` must specify at least one component" 65 | return True 66 | 67 | @property 68 | def name(self) -> str: 69 | """ 70 | Generates a readable name out of the components names 71 | """ 72 | return " & ".join([str(c.name) for c in self.components]) 73 | 74 | @property 75 | def weight(self) -> float: 76 | """ 77 | Calculate the molecular weight of the system 78 | 79 | Note: This is just an example on how/why this level of 80 | abstraction can be useful. 81 | """ 82 | mass = 0 83 | for component in self.components: 84 | if not hasattr(component, "mass"): # It will be unimplemented for some types! 85 | raise TypeError("This system contains at least one component without mass.") 86 | mass += component.mass 87 | return mass 88 | 89 | def __repr__(self) -> str: 90 | return ( 91 | f"<{self.__class__.__name__} with " 92 | f"{len(self.components)} components ({', '.join([repr(c) for c in self.components])})>" 93 | ) 94 | 95 | 96 | class ProteinSystem(System): 97 | """ 98 | A System that contains Protein objects. 
It defines two properties: 99 | 100 | - ``protein``: get the first Protein found in the components 101 | - ``proteins``: get all Protein objects found in the components 102 | """ 103 | 104 | @property 105 | def protein(self): 106 | return next(self._components_by_type(BaseProtein)) 107 | 108 | @property 109 | def proteins(self): 110 | return list(self._components_by_type(BaseProtein)) 111 | 112 | def check(self): # this is a requirement 113 | super().check() 114 | assert ( 115 | len(self.proteins) >= 1 116 | ), f"A ProteinSystem must specify at least one Protein. Current contents: {self}." 117 | return True 118 | 119 | 120 | class LigandSystem(System): 121 | """ 122 | A System that contains Ligand objects. It defines two properties: 123 | 124 | - ``ligand``: get the first Ligand found in the components 125 | - ``ligands``: get all Ligand objects found in the components 126 | """ 127 | 128 | @property 129 | def ligand(self): 130 | return next(self._components_by_type(BaseLigand)) 131 | 132 | @property 133 | def ligands(self): 134 | return list(self._components_by_type(BaseLigand)) 135 | 136 | def check(self): # this is a requirement 137 | super().check() 138 | assert ( 139 | len(self.ligands) >= 1 140 | ), f"A LigandSystem must specify at least one Ligand. Current contents: {self}." 141 | return True 142 | 143 | 144 | class ProteinLigandComplex(ProteinSystem, LigandSystem): 145 | """ 146 | A system with at least one protein and one ligand 147 | """ 148 | 149 | def check(self): 150 | assert ProteinSystem.check(self) and LigandSystem.check(self), ( 151 | "A ProteinLigandComplex must specify at least one Protein and one Ligand. 
" 152 | f"Current contents: {self}" 153 | ) 154 | -------------------------------------------------------------------------------- /kinoml/datasets/pkis2.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | from typing import Union 4 | 5 | import pandas as pd 6 | 7 | from .core import DatasetProvider 8 | from ..core.proteins import Protein, KLIFSKinase 9 | from ..core.ligands import Ligand 10 | from ..core.systems import ProteinLigandComplex 11 | from ..core.measurements import PercentageDisplacementMeasurement 12 | from ..core.conditions import AssayConditions 13 | from ..utils import datapath 14 | 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | class PKIS2DatasetProvider(DatasetProvider): 20 | 21 | """ 22 | Loads the PKIS2 dataset as provided in _Progress towards a public chemogenomic set for protein 23 | kinases and a call for contributions [1]. 24 | 25 | [1]: DOI: 10.1371/journal.pone.0181585 26 | 27 | Examples 28 | -------- 29 | >>> from kinoml.datasets.pkis2 import PKIS2DatasetProvider 30 | >>> provider = PKIS2DatasetProvider.from_source() 31 | >>> provider 32 | """ 33 | 34 | @classmethod 35 | def from_source( 36 | cls, 37 | path_or_url: Union[str, Path] = datapath("kinomescan/journal.pone.0181585.s004.csv"), 38 | path_or_url_constructs: Union[str, Path] = datapath( 39 | "kinomescan/DiscoverX_489_Kinase_Assay_Construct_Information.csv" 40 | ), 41 | protein_type: str = "KLIFSKinase", 42 | toolkit: str = "OpenEye", 43 | ): 44 | """ 45 | Create a PKIS2 DatasetProvider from the raw data. 46 | 47 | Parameters 48 | ---------- 49 | path_or_url: str or pathlib.Path 50 | CSV file with the protein-ligand measurements. 51 | path_or_url_constructs: str or pathlib.Path 52 | CSV file with the construct information. 53 | protein_type: str, default=KLIFSKinase 54 | The protein object type to use ('Protein' or 'KLIFSKinase'). 
55 | toolkit: str, default=OpenEye 56 | The toolkit to use for creating protein objects (e.g. 'OpenEye', 'MDAnalysis'), 57 | allowed values depend on the specified `protein_type`. 58 | 59 | Raises 60 | ------ 61 | ValueError 62 | Given protein_type {protein_type} is not valid, only {protein_type_classes.keys()} are 63 | allowed. 64 | """ 65 | logger.debug("Checking protein type ...") 66 | protein_type_classes = {"Protein": Protein, "KLIFSKinase": KLIFSKinase} 67 | if protein_type not in protein_type_classes.keys(): 68 | raise ValueError( 69 | f"Given protein_type {protein_type} is not valid, " 70 | f"only {protein_type_classes.keys()} are allowed." 71 | ) 72 | 73 | logger.debug("Loading CSV with construct information ...") 74 | constructs_df = pd.read_csv(path_or_url_constructs) 75 | 76 | logger.debug("Creating protein objects ...") 77 | kinases = dict() 78 | for _, construct in constructs_df.iterrows(): 79 | if construct["Construct Description"] != "Wild Type": 80 | # mutants not in measurements 81 | continue 82 | discoverx_id = construct["DiscoverX Gene Symbol"] 83 | ncbi_id = construct["Accession Number"] 84 | if construct["AA Start/Stop"] == "Null": 85 | # ambiguous, will consider full sequence 86 | kinase = protein_type_classes[protein_type]( 87 | name=discoverx_id, 88 | ncbi_id=ncbi_id, 89 | toolkit=toolkit, 90 | ) 91 | else: 92 | first, last = [x[1:] for x in construct["AA Start/Stop"].split("/")] 93 | kinase = protein_type_classes[protein_type]( 94 | name=discoverx_id, 95 | ncbi_id=ncbi_id, 96 | metadata={"construct_range": f"{first}-{last}"}, 97 | toolkit=toolkit, 98 | ) 99 | kinases[discoverx_id] = kinase 100 | 101 | logger.debug("Loading CSV with measurements ...") 102 | # column 0 is name, column 3 is smiles, column 7 - 412 are measurements for each kinase 103 | measurements_df = pd.read_csv(path_or_url, usecols=[0, 3] + list(range(7, 413))) 104 | 105 | logger.debug("Creating systems and measurements ...") 106 | measurements = [] 107 | kinase_names = 
measurements_df.columns[2:] 108 | for _, ligand_measurements in measurements_df.iterrows(): 109 | ligand_name = ligand_measurements["Regno"] 110 | smiles = ligand_measurements["Smiles"] 111 | if ligand_name == "0": 112 | ligand_name = smiles 113 | ligand = Ligand(smiles=smiles, name=ligand_name) 114 | for kinase_name, inhibition_value in zip(kinase_names, ligand_measurements.values[2:]): 115 | measurement = PercentageDisplacementMeasurement( 116 | inhibition_value, 117 | conditions=AssayConditions(pH=7.0), 118 | system=ProteinLigandComplex(components=[ligand, kinases[kinase_name]]), 119 | ) 120 | measurements.append(measurement) 121 | 122 | return cls(measurements=measurements, metadata={"path_or_url": path_or_url}) 123 | -------------------------------------------------------------------------------- /kinoml/tests/docking/test_oedocking.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test OEDocking functionalities of `kinoml.docking` 3 | """ 4 | from contextlib import contextmanager 5 | from importlib import resources 6 | import pytest 7 | 8 | 9 | @contextmanager 10 | def does_not_raise(): 11 | yield 12 | 13 | 14 | @pytest.mark.parametrize( 15 | "package, resource, resids, expectation, min_x", 16 | [ 17 | ( 18 | "kinoml.data.proteins", 19 | "4f8o_edit.pdb", 20 | [50, 51, 52, 62, 63, 64, 70, 77], 21 | does_not_raise(), 22 | 21.225000381469727, 23 | ), 24 | ( 25 | "kinoml.data.proteins", 26 | "4f8o_edit.pdb", 27 | [700, 701, 702], 28 | pytest.raises(ValueError), 29 | 21.225000381469727, 30 | ), 31 | ], 32 | ) 33 | def test_resids_to_box_molecule(package, resource, resids, expectation, min_x): 34 | """Compare results to expected minimal x_coordinate.""" 35 | from kinoml.modeling.OEModeling import read_molecules 36 | from kinoml.docking.OEDocking import resids_to_box_molecule 37 | 38 | with resources.path(package, resource) as path: 39 | with expectation: 40 | protein = read_molecules(str(path))[0] 41 | box_molecule = 
resids_to_box_molecule(protein, resids) 42 | x_coordinates = [coordinates[0] for coordinates in box_molecule.GetCoords().values()] 43 | assert round(min(x_coordinates), 3) == round(min_x, 3) 44 | 45 | 46 | @pytest.mark.parametrize( 47 | "package, resource, smiles_list, n_poses", 48 | [ 49 | ( 50 | "kinoml.data.proteins", 51 | "4f8o.pdb", 52 | ["c1cc(ccc1CCN)S(=O)(=O)F", "c1cc(ccc1CCN)S(=O)(=O)N"], 53 | 3, 54 | ), 55 | ], 56 | ) 57 | def test_hybrid_docking(package, resource, smiles_list, n_poses): 58 | """Compare results to expected number of docked molecules and docking poses""" 59 | from openeye import oedocking 60 | 61 | from kinoml.docking.OEDocking import hybrid_docking 62 | from kinoml.modeling.OEModeling import read_molecules, read_smiles, prepare_complex 63 | 64 | with resources.path(package, resource) as path: 65 | structure = read_molecules(str(path))[0] 66 | design_unit = prepare_complex(structure) 67 | if not design_unit.HasReceptor(): 68 | oedocking.OEMakeReceptor(design_unit) 69 | docking_poses = hybrid_docking( 70 | design_unit, [read_smiles(smiles) for smiles in smiles_list], n_poses 71 | ) 72 | assert len(docking_poses) == len(smiles_list) * n_poses 73 | 74 | 75 | @pytest.mark.parametrize( 76 | "package, resource, resids, smiles_list, n_poses", 77 | [ 78 | ( 79 | "kinoml.data.proteins", 80 | "4f8o_edit.pdb", 81 | [50, 51, 52, 62, 63, 64, 70, 77], 82 | ["c1cc(ccc1CCN)S(=O)(=O)F", "c1cc(ccc1CCN)S(=O)(=O)N"], 83 | 3, 84 | ), 85 | ], 86 | ) 87 | def test_fred_docking(package, resource, resids, smiles_list, n_poses): 88 | """Compare results to expected number of docked molecules and docking poses""" 89 | from openeye import oechem, oedocking 90 | 91 | from kinoml.docking.OEDocking import fred_docking, resids_to_box_molecule 92 | from kinoml.modeling.OEModeling import read_molecules, read_smiles, prepare_protein 93 | 94 | with resources.path(package, resource) as path: 95 | structure = read_molecules(str(path))[0] 96 | design_unit = 
prepare_protein(structure) 97 | protein = oechem.OEGraphMol() 98 | design_unit.GetProtein(protein) 99 | box_molecule = resids_to_box_molecule(protein, resids) 100 | options = oedocking.OEMakeReceptorOptions() 101 | options.SetBoxMol(box_molecule) 102 | oedocking.OEMakeReceptor(design_unit, options) 103 | docking_poses = fred_docking( 104 | design_unit, [read_smiles(smiles) for smiles in smiles_list], n_poses 105 | ) 106 | assert len(docking_poses) == len(smiles_list) * n_poses 107 | 108 | 109 | @pytest.mark.parametrize( 110 | "package, resource, smiles_list", 111 | [ 112 | ( 113 | "kinoml.data.proteins", 114 | "4f8o.pdb", 115 | ["c1cc(ccc1CCN)S(=O)(=O)F", "c1cc(ccc1CCN)S(=O)(=O)N"], 116 | ), 117 | ], 118 | ) 119 | def test_pose_molecules(package, resource, smiles_list): 120 | """Compare results to expected number of docked molecules and docking poses""" 121 | from openeye import oechem, oedocking 122 | 123 | from kinoml.docking.OEDocking import pose_molecules 124 | from kinoml.modeling.OEModeling import read_molecules, read_smiles, prepare_complex 125 | 126 | with resources.path(package, resource) as path: 127 | structure = read_molecules(str(path))[0] 128 | design_unit = prepare_complex(structure) 129 | if not design_unit.HasReceptor(): 130 | oedocking.OEMakeReceptor(design_unit) 131 | docking_poses = pose_molecules( 132 | design_unit, 133 | [read_smiles(smiles) for smiles in smiles_list], 134 | score_pose=True, 135 | ) 136 | assert len(docking_poses) == len(smiles_list) 137 | assert all( 138 | [oechem.OEHasSDData(docking_pose, "Chemgauss4") for docking_pose in docking_poses] 139 | ) 140 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | KinoML 2 | ============================== 3 | [//]: # (Badges) 4 | [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT) 5 | 
[![CI](https://github.com/openkinome/kinoml/actions/workflows/ci.yml/badge.svg?branch=master)](https://github.com/openkinome/kinoml/actions/workflows/ci.yml) 6 | [![DOCS](https://github.com/openkinome/kinoml/actions/workflows/docs.yml/badge.svg?branch=master)](https://github.com/openkinome/kinoml/actions/workflows/docs.yml) 7 | [![codecov](https://codecov.io/gh/openkinome/KinoML/branch/master/graph/badge.svg)](https://codecov.io/gh/openkinome/KinoML/branch/master) 8 | 9 | ![GitHub closed pr](https://img.shields.io/github/issues-pr-closed-raw/openkinome/kinoml) 10 | ![GitHub open pr](https://img.shields.io/github/issues-pr-raw/openkinome/kinoml) 11 | ![GitHub closed issues](https://img.shields.io/github/issues-closed-raw/openkinome/kinoml) 12 | ![GitHub open issues](https://img.shields.io/github/issues/openkinome/kinoml) 13 | 14 | **KinoML** is a modular and extensible framework for machine learning (ML) in small molecule drug discovery with a special focus on kinases. The publication can be found [here](https://www.biorxiv.org/content/10.1101/2024.09.10.612176v1). KinoML enables users to easily: 15 | 1. **Access and download data**: from online data sources, such as ChEMBL or PubChem as well as from their own files, with a focus on data availability and immutability. 16 | 2. **Featurize data**: so that it is ML readable. KinoML offers a wide variety of featurization schemes, from ligand-only to ligand:kinase complexes. 17 | 3. **Run structure-based experiments**: using KinoML's implemented models, with a special focus on reproducibility. 18 | 19 | 20 | 21 | The purpose of KinoML is to help users conduct ML kinase experiments, from data collection to model evaluation. Tutorials on how to use KinoML as well as working examples showcasing how to use KinoML to perform experiments end-to-end can be found [here.](https://github.com/raquellrios/kinoml/tree/master/tutorials) Note that despite KinoML's focus being on kinases, it can be applied to any protein system.
For more detailed instructions, please refer to the [Documentation](https://openkinome.org/kinoml/index.html). 22 | 23 | A KinoML workflow to achieve points **1, 2** and **3** is illustrated in the following image: 24 | 25 | ![KinoML object model](kinoml/data/fig_1_kinomltechpaper_v2.png) 26 | **Fig. 1:** KinoML workflow overview. Colors represent objects of the same class. 27 | 28 | 29 | 30 | ### Notice 31 | 32 | Please be aware that this code is work in progress and is not guaranteed to provide the expected results. The API can change at any time without warning. 33 | 34 | ### Installation 35 | 36 | #### Option 1: Install with Docker 37 | 38 | A prebuilt Docker image of this software is available on Docker Hub: 39 | 40 | **Image:** `openkinome/kinoml:v1` 41 | **Link:** [Docker Hub page](https://hub.docker.com/r/openkinome/kinoml) 42 | 43 | ```bash 44 | # Download the container image 45 | docker pull openkinome/kinoml:v1 46 | 47 | # Run the software 48 | docker run --rm openkinome/kinoml:v1 --help 49 | ``` 50 | 51 | #### Option 2: Install with conda/mamba 52 | 53 | KinoML and its dependencies can be installed via conda/mamba. 54 | 55 | ```bash 56 | git clone https://github.com/openkinome/kinoml.git # clone the repo 57 | cd kinoml # change directory to local copy of repo 58 | mamba env create -n kinoml -f devtools/conda-envs/test_env.yaml 59 | conda activate kinoml 60 | python -m pip install git+https://github.com/openkinome/kinoml.git 61 | ``` 62 | 63 | ### Usage 64 | 65 | The tutorials folder is divided into two parts: 66 | 67 | 1. [**Getting started**](https://github.com/raquellrios/kinoml/tree/master/tutorials/getting_started): the notebooks in this folder aim to give the user an understanding of how to use KinoML to: (1) **access and download** data, (2) **featurize** data, and (3) **run a** (simple) **ML model** on the featurized data obtained with KinoML to predict ligand binding affinity. 
Additionally, this folder contains notebooks that explain the **KinoML object model** and how to access the different objects, as well as notebooks **showcasing all the different featurizers** implemented within KinoML and how to use each of them. 68 | 69 | 2. [**Experiments**](https://github.com/raquellrios/kinoml/tree/master/tutorials/experiments): this folder contains four individual structure-based experiments to predict ligand binding affinity. All experiments use KinoML to obtain the data, featurize it, and train and evaluate an ML model implemented within the `kinoml.ml` module. The purpose of these experiments is to display usage examples of KinoML to conduct end-to-end structure-based kinase experiments. 70 | 71 | 72 | ⚠️ You will need a valid OpenEye License for the tutorials to work. For the Schrodinger featurizers tutorial you will also need a Schrodinger License! 73 | 74 | 75 | For users interested in more KinoML usage examples, they can check out other repositories under the initiative [OpenKinome](https://github.com/openkinome/). In particular, two other repositories that may be of interest are: 76 | 77 | 78 | - [kinodata](https://github.com/openkinome/kinodata): repository with ready-to-use kinase-focused datasets from ChEMBL, as well as tutorials explaining how to process kinase data for ML applications. 79 | - [experiments-binding-affinity](https://github.com/openkinome/experiments-binding-affinity): more advanced and reproducible ML experiments using KinoML. 80 | 81 | 82 | 83 | Copyright (c) 2019, OpenKinome 84 | 85 | 86 | #### Acknowledgements 87 | 88 | Project based on the 89 | [Computational Molecular Science Python Cookiecutter](https://github.com/molssi/cookiecutter-cms) version 1.1.
class Ligand(BaseLigand):
    """
    Create a new Ligand object. An openff representation is accessible via the molecule attribute.

    Examples
    --------

    Create a ligand from file:

    >>> ligand = Ligand.from_file("data/molecules/chloroform.sdf", name="chloroform")

    Create a ligand from an openff molecule:

    >>> from openff.toolkit.topology import Molecule
    >>> molecule = Molecule.from_file("data/molecules/chloroform.sdf")
    >>> ligand = Ligand(molecule=molecule, name="chloroform")

    Create a ligand from SMILES:

    >>> ligand = Ligand.from_smiles("C(Cl)(Cl)Cl", name="chloroform")

    Create a ligand from SMILES with lazy instantiation:

    >>> ligand = Ligand(smiles="C(Cl)(Cl)Cl", name="chloroform")

    """

    def __init__(
        self,
        molecule: Union[Molecule, None] = None,
        smiles: str = "",
        name: str = "",
        metadata: Union[dict, None] = None,
        **kwargs
    ):
        """
        Create a new Ligand object. Lazy instantiation is possible via the smiles parameter.

        Parameters
        ----------
        molecule: openff.toolkit.topology.Molecule or None, default=None
            An openff representation of the ligand.
        smiles: str, default=""
            The SMILES representation of the ligand. Can be used for lazy instantiation, i.e.
            will be interpreted when calling the molecule attribute the first time.
        name: str, default=""
            The name of the ligand.
        metadata: dict or None, default=None
            Additional metadata needed for e.g. featurizers or provenance.
        """
        BaseLigand.__init__(self, name=name, metadata=metadata, **kwargs)
        # _molecule stays None on lazy instantiation until the getter builds it from _smiles.
        self._molecule = molecule
        self._smiles = smiles

    @property
    def molecule(self):
        """Decorate molecule to modify setter and getter."""
        return self._molecule

    @molecule.setter
    def molecule(self, new_value: Union[Molecule, None]):
        """
        Store a new value for molecule in the _molecule attribute.

        Parameters
        ----------
        new_value: openff.toolkit.topology.Molecule or None
            The new openff molecule.
        """
        self._molecule = new_value

    @molecule.getter
    def molecule(self):
        """
        Get the _molecule attribute. If the _smiles attribute is given and _molecule is None, a
        new openff molecule will be created from smiles, e.g. in case of lazy instantiation.

        Returns
        -------
        : openff.toolkit.topology.Molecule or None
            The openff molecular representation of the ligand.
        """
        if not self._molecule and self._smiles:
            self._molecule = Molecule.from_smiles(smiles=self._smiles, allow_undefined_stereo=True)
            # Mirror the behavior of from_smiles: fall back to the SMILES as name
            # and record it in the metadata for provenance.
            if not self.name:
                self.name = self._smiles
            if self.metadata is None:
                self.metadata = {"smiles": self._smiles}
            else:
                self.metadata.update({"smiles": self._smiles})
        return self._molecule

    @classmethod
    def from_smiles(
        cls, smiles: str, name: str = "", allow_undefined_stereo: bool = True, **kwargs
    ):
        """
        Create a Ligand from a SMILES representation.

        Parameters
        ----------
        smiles: str
            The SMILES representation of the ligand.
        name: str, default=""
            The name of the ligand. Falls back to the SMILES if not given.
        allow_undefined_stereo: bool, default=True
            If undefined stereo centers should be allowed.
        kwargs:
            Any keyword arguments allowed for the from_smiles method of the openff molecule class.

        Returns
        -------
        : Ligand
            The ligand with an eagerly created openff molecule.
        """
        molecule = Molecule.from_smiles(
            smiles=smiles, allow_undefined_stereo=allow_undefined_stereo, **kwargs
        )
        if not name:
            name = smiles
        return cls(molecule=molecule, name=name, metadata={"smiles": smiles})

    @classmethod
    def from_file(
        cls,
        file_path: Union[Path, str],
        name: str = "",
        allow_undefined_stereo: bool = True,
        **kwargs
    ):
        """
        Create a Ligand from file.

        Parameters
        ----------
        file_path: pathlib.Path or str
            The path to the molecular file. For supported formats see the openff molecule
            documentation.
        name: str, default=""
            The name of the ligand. Falls back to the canonical SMILES if not given.
        allow_undefined_stereo: bool, default=True
            If undefined stereo centers should be allowed.
        kwargs:
            Any keyword arguments allowed for the from_file method of the openff molecule class.

        Returns
        -------
        : Ligand
            The ligand with an eagerly created openff molecule.
        """
        molecule = Molecule.from_file(
            file_path=file_path, allow_undefined_stereo=allow_undefined_stereo, **kwargs
        )
        if not name:
            name = molecule.to_smiles(explicit_hydrogens=False)
        return cls(molecule=molecule, name=name, metadata={"file_path": file_path})
def test_BaseFeaturizer():
    """The abstract base featurizer must refuse to featurize."""
    ligand = Ligand(smiles="CCCC")
    systems = [
        LigandSystem(components=[ligand]),
        LigandSystem(components=[ligand]),
        LigandSystem(components=[ligand]),
    ]
    featurizer = BaseFeaturizer()
    with pytest.raises(NotImplementedError):
        featurizer(systems)

    with pytest.raises(NotImplementedError):
        featurizer.featurize(systems)


def test_Pipeline():
    """A pipeline of null featurizers must return the systems unchanged."""
    # Use the smiles keyword explicitly; the first positional argument of
    # Ligand is an openff molecule, not a SMILES string.
    ligand = Ligand(smiles="CCCC")
    systems = [
        LigandSystem(components=[ligand]),
        LigandSystem(components=[ligand]),
        LigandSystem(components=[ligand]),
    ]
    featurizers = (NullFeaturizer(), NullFeaturizer())
    pipeline = Pipeline(featurizers)
    pipeline.featurize(systems)
    assert [s.featurizations["last"] for s in systems] == systems


def test_Concatenated():
    """Concatenating two 512-bit fingerprints must yield a 1024-long vector."""
    from kinoml.features.ligand import MorganFingerprintFeaturizer

    ligand = Ligand(smiles="CCCC")
    system = LigandSystem([ligand])
    featurizer1 = MorganFingerprintFeaturizer(radius=2, nbits=512, use_multiprocessing=False)
    featurizer2 = MorganFingerprintFeaturizer(radius=2, nbits=512, use_multiprocessing=False)
    concatenated = Concatenated([featurizer1, featurizer2], axis=1)
    concatenated.featurize([system])
    assert system.featurizations["last"].shape[0] == 1024


def test_TupleOfArrays():
    """Aggregation must keep the individual featurizer outputs separate."""
    from kinoml.features.ligand import MorganFingerprintFeaturizer

    ligand = Ligand(smiles="CCCC")
    system = LigandSystem([ligand])
    featurizer1 = MorganFingerprintFeaturizer(radius=2, nbits=512, use_multiprocessing=False)
    featurizer2 = MorganFingerprintFeaturizer(radius=2, nbits=1024, use_multiprocessing=False)
    aggregated = TupleOfArrays([featurizer1, featurizer2])
    aggregated.featurize([system])
    assert len(system.featurizations["last"]) == 2
    assert system.featurizations["last"][0].shape[0] == 512
    assert system.featurizations["last"][1].shape[0] == 1024


def test_BaseOneHotEncodingFeaturizer():
    """One-hot encoding must accept str, dict and list dictionaries."""
    assert (
        BaseOneHotEncodingFeaturizer.one_hot_encode("AAA", "ABC") == np.array([[1, 0, 0]] * 3).T
    ).all()
    assert (
        BaseOneHotEncodingFeaturizer.one_hot_encode("AAA", {"A": 0, "B": 1, "C": 2})
        == np.array([[1, 0, 0]] * 3).T
    ).all()
    assert (
        BaseOneHotEncodingFeaturizer.one_hot_encode(["A", "A", "A"], ["A", "B", "C"])
        == np.array([[1, 0, 0]] * 3).T
    ).all()


def test_PadFeaturizer():
    """Padding must bring all one-hot matrices to the same shape."""
    systems = (
        LigandSystem([Ligand(smiles="C")]),
        LigandSystem([Ligand(smiles="CC")]),
        LigandSystem([Ligand(smiles="CCC")]),
    )
    OneHotSMILESFeaturizer(use_multiprocessing=False).featurize(systems)
    PadFeaturizer(use_multiprocessing=False).featurize(systems)

    for s in systems:
        assert s.featurizations["last"].shape == (53, 3)
    # NOTE: test functions must not return a value; pytest warns on (and will
    # eventually reject) non-None returns, so the former `return systems` is gone.


def test_HashFeaturizer():
    """Hashing a canonical SMILES must give a stable normalized value."""
    system = LigandSystem([Ligand(smiles="CCC")])
    HashFeaturizer(getter=lambda s: s.ligand.molecule.to_smiles(), normalize=True).featurize(
        [system]
    )
    assert system.featurizations["last"] == pytest.approx(0.62342903)


def test_NullFeaturizer():
    """The null featurizer must return the system itself."""
    system = LigandSystem([Ligand(smiles="CCC")])
    NullFeaturizer().featurize([system])

    assert system == system.featurizations["last"]


def test_CallableFeaturizer():
    """An arbitrary callable must be usable as a featurization step."""
    from sklearn.preprocessing import scale

    systems = (
        LigandSystem([Ligand(smiles="C")]),
        LigandSystem([Ligand(smiles="CC")]),
        LigandSystem([Ligand(smiles="CCC")]),
    )
    HashFeaturizer(getter=lambda s: s.ligand.molecule.to_smiles(), normalize=False).featurize(
        systems
    )
    CallableFeaturizer(lambda s: scale(s.featurizations["last"].reshape((1,)))).featurize(systems)

    for s in systems:
        assert s.featurizations["last"].shape


def test_ClearFeaturizations_keeplast():
    """By default only the 'last' featurization entry survives clearing."""
    systems = (
        LigandSystem([Ligand(smiles="C")]),
        LigandSystem([Ligand(smiles="CC")]),
        LigandSystem([Ligand(smiles="CCC")]),
    )
    OneHotSMILESFeaturizer(use_multiprocessing=False).featurize(systems)
    PadFeaturizer(use_multiprocessing=False).featurize(systems)
    ClearFeaturizations().featurize(systems)

    for s in systems:
        assert len(s.featurizations) == 1
        assert "last" in s.featurizations


def test_ClearFeaturizations_removeall():
    """Keeping an empty key set must wipe all featurizations."""
    systems = (
        LigandSystem([Ligand(smiles="C")]),
        LigandSystem([Ligand(smiles="CC")]),
        LigandSystem([Ligand(smiles="CCC")]),
    )
    OneHotSMILESFeaturizer(use_multiprocessing=False).featurize(systems)
    PadFeaturizer(use_multiprocessing=False).featurize(systems)
    ClearFeaturizations(keys=tuple(), style="keep").featurize(systems)

    for s in systems:
        assert not s.featurizations
# Sphinx extensions: autodoc via AutoAPI, notebooks via nbsphinx(-link),
# markdown sources via MyST; commented entries are kept for easy re-enabling.
extensions = [
    "sphinx.ext.autosectionlabel",
    "sphinx.ext.todo",
    "sphinx.ext.napoleon",
    # "sphinxemoji.sphinxemoji",
    "sphinx-prompt",
    "sphinx_copybutton",
    # "notfound.extension",
    "myst_parser",
    # "sphinxcontrib.httpdomain",
    "autoapi.extension",
    "nbsphinx",
    "nbsphinx_link",
    # "sphinx_last_updated_by_git",
    # "sphinx_panels",
    "IPython.sphinxext.ipython_console_highlighting",
]

autosectionlabel_prefix_document = True

# Execute notebooks at build time unless they carry stored outputs.
nbsphinx_execute = "auto"
nbsphinx_execute_arguments = [
    "--InlineBackend.figure_formats={'svg', 'pdf'}",
    "--InlineBackend.rc={'figure.dpi': 96}",
]

sphinxemoji_style = "twemoji"


# AutoAPI: generate API docs for the whole package, excluding tests and data.
autoapi_dirs = ["../kinoml"]
autoapi_root = "api"
autoapi_add_toctree_entry = False
autoapi_ignore = [
    "*migrations*",
    "*_version*",
    "*tests*",
    "*/data/*",
]
autoapi_options = [
    "members",
    "undoc-members",
    "private-members",
    "show-inheritance",
    # "show-module-summary",
    "special-members",
    "imported-members",
]
autoapi_keep_files = False

# Napoleon settings
napoleon_google_docstring = True
napoleon_numpy_docstring = True
napoleon_include_init_with_doc = False
napoleon_include_private_with_doc = False
napoleon_include_special_with_doc = True
napoleon_use_admonition_for_examples = True
napoleon_use_admonition_for_notes = True
napoleon_use_admonition_for_references = False
napoleon_type_aliases = None
napoleon_attr_annotations = True

# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = [
    "_build",
    "Thumbs.db",
    ".DS_Store",
    "sphinx-notfound-page",
    ".ipynb_checkpoints/*",
    "__pycache__",
    "kinoml/data",
    "developers",
]


# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
import sphinx_material

# Choose the material theme
html_theme = "sphinx_material"
# Get the theme path
html_theme_path = sphinx_material.html_theme_path()
# Register the required helpers for the html context
html_context = sphinx_material.get_html_context()
version_dropdown = False

# Material theme options (see theme.conf for more information)
html_theme_options = {
    "nav_title": "KinoML",
    "repo_url": "https://github.com/openkinome/kinoml/",
    "repo_name": "KinoML",
    "logo_icon": "",
    "base_url": "https://openkinome.org/kinoml/",
    # "google_analytics_account": "UA-XXXXX",
    "html_minify": False,
    "html_prettify": True,
    "css_minify": True,
    "repo_type": "github",
    "globaltoc_depth": 3,
    "color_primary": "#3f51b5",
    "color_accent": "blue",
    "touch_icon": "images/custom_favicon.png",
    "theme_color": "#3f51b5",
    "master_doc": False,
    "nav_links": [
        {"href": "index", "internal": True, "title": "User guide"},
        {"href": "api/kinoml/index", "internal": True, "title": "API Reference"},
        {
            "href": "https://openkinome.org",
            "internal": False,
            "title": "OpenKinome",
        },
    ],
    "heroes": {
        "index": "Structure-informed machine learning for kinase modeling",
    },
    "version_dropdown": False,
    "version_json": "_static/versions.json",
    "version_info": {
        "Release": "",
        "Development": "",
        "Release (rel)": "",
| "Development (rel)": "", 164 | }, 165 | "table_classes": ["plain"], 166 | } 167 | 168 | # globaltoc seems it's not added by default 169 | html_sidebars = { 170 | "**": [ 171 | "globaltoc.html", 172 | "localtoc.html", 173 | "searchbox.html", 174 | ] 175 | } 176 | 177 | 178 | # Add any paths that contain custom static files (such as style sheets) here, 179 | # relative to this directory. They are copied after the builtin static files, 180 | # so a file named "default.css" will overwrite the builtin "default.css". 181 | html_static_path = ["_static"] 182 | html_favicon = "_static/images/custom_favicon.png" 183 | 184 | # ------- 185 | # MyST 186 | # ------- 187 | myst_enable_extensions = [ 188 | "amsmath", 189 | "colon_fence", 190 | "deflist", 191 | "dollarmath", 192 | "html_admonition", 193 | "html_image", 194 | "linkify", 195 | "replacements", 196 | "smartquotes", 197 | "substitution", 198 | ] 199 | 200 | myst_update_mathjax = False 201 | mathjax3_config = { 202 | "tex2jax": { 203 | "inlineMath": [["\\(", "\\)"]], 204 | "displayMath": [["\\[", "\\]"]], 205 | "processRefs": False, 206 | "processEnvironments": False, 207 | } 208 | } 209 | -------------------------------------------------------------------------------- /kinoml/datasets/chembl.py: -------------------------------------------------------------------------------- 1 | """ 2 | Creates DatasetProvider objects from ChEMBL activity data 3 | """ 4 | import logging 5 | import random 6 | 7 | import pandas as pd 8 | from tqdm.auto import tqdm 9 | 10 | from .core import MultiDatasetProvider 11 | from ..core.conditions import AssayConditions 12 | from ..core.proteins import Protein, KLIFSKinase 13 | from ..core.ligands import Ligand 14 | from ..core.systems import ProteinLigandComplex 15 | from ..core.measurements import pIC50Measurement, pKiMeasurement, pKdMeasurement 16 | 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | class ChEMBLDatasetProvider(MultiDatasetProvider): 22 | 23 | """ 24 | This 
provider relies heavily on ``openkinome/kinodata`` data ingestion 25 | pipelines. It will load ChEMBL activities from its releases page. 26 | """ 27 | 28 | @classmethod 29 | def from_source( 30 | cls, 31 | path_or_url="https://github.com/openkinome/datascripts/releases/download/v0.2/activities-chembl28_v0.2.zip", 32 | measurement_types=("pIC50", "pKi", "pKd"), 33 | uniprot_ids=None, 34 | sample=None, 35 | protein_type: str = "KLIFSKinase", 36 | toolkit: str = "OpenEye", 37 | ): 38 | """ 39 | Create a MultiDatasetProvider out of the raw data contained in the zip file. 40 | 41 | Parameters 42 | ---------- 43 | path_or_url: str, optional 44 | path or URL to a (zipped) CSV file containing activities from ChEMBL, 45 | using schema detailed below. 46 | measurement_types: tuple of str, optional 47 | Which measurement types must be imported from the CSV. By default, all 48 | three (pIC50, pKi, pKd) will be loaded, but you can choose a subset ( 49 | e.g. ``("pIC50",)``). 50 | uniprot_ids: None or list of str, default=None 51 | Restrict measurements to the given UniProt IDs. 52 | sample: int, optional=None 53 | If set to larger than zero, load only N data points from the dataset. 54 | protein_type: str, default=KLIFSKinase 55 | The protein object type to use ('Protein' or 'KLIFSKinase'). 56 | toolkit: str, default=OpenEye 57 | The toolkit to use for creating protein objects (e.g. 'OpenEye', 'MDAnalysis'), 58 | allowed values depend on the specified `protein_type`. 59 | 60 | Raises 61 | ------ 62 | ValueError 63 | Given protein_type {protein_type} is not valid, only {allowed_protein_types} are 64 | allowed. 65 | 66 | Note 67 | ---- 68 | ChEMBL aggregates data from lots of sources, so conditions are guaranteed 69 | to be different across experiments. 
70 | """ 71 | logger.debug("Checking protein type ...") 72 | protein_type_classes = {"Protein": Protein, "KLIFSKinase": KLIFSKinase} 73 | if protein_type not in protein_type_classes.keys(): 74 | raise ValueError( 75 | f"Given protein_type {protein_type} is not valid, " 76 | f"only {protein_type_classes.keys()} are allowed." 77 | ) 78 | 79 | logger.debug("Retrieving and reading CSV ...") 80 | cached_path = cls._download_to_cache_or_retrieve(path_or_url) 81 | df = pd.read_csv(cached_path) 82 | df = df.dropna( 83 | subset=[ 84 | "compound_structures.canonical_smiles", 85 | "component_sequences.sequence", 86 | "activities.standard_type", 87 | ] 88 | ) 89 | 90 | if uniprot_ids: 91 | logger.debug(f"Filtering for UniProt IDs {uniprot_ids}...") 92 | df = df[df["UniprotID"].isin(uniprot_ids)] 93 | 94 | logger.debug(f"Filtering for measurement types {measurement_types} ...") 95 | chosen_types_labels = df["activities.standard_type"].isin(set(measurement_types)) 96 | filtered_records = df[chosen_types_labels].to_dict("records") 97 | 98 | if sample is not None: 99 | logger.debug(f"Getting sample of size {sample} ...") 100 | filtered_records = random.sample(filtered_records, sample) 101 | 102 | measurement_type_classes = { 103 | "pIC50": pIC50Measurement, 104 | "pKi": pKiMeasurement, 105 | "pKd": pKdMeasurement, 106 | } 107 | measurements = [] 108 | systems = {} 109 | proteins = {} 110 | ligands = {} 111 | logger.debug(f"Creating systems and measurements ...") 112 | for row in tqdm(filtered_records): 113 | try: 114 | measurement_type_key = row["activities.standard_type"] 115 | protein_key = row["component_sequences.sequence"] 116 | ligand_key = row["compound_structures.canonical_smiles"] 117 | system_key = (protein_key, ligand_key) 118 | if protein_key not in proteins: 119 | metadata = { 120 | "uniprot_id": row["UniprotID"], 121 | "chembl_target_id": row["target_dictionary.chembl_id"], 122 | } 123 | protein = protein_type_classes[protein_type]( 124 | sequence=protein_key, 125 | 
name=row["UniprotID"], 126 | uniprot_id=row["UniprotID"], 127 | metadata=metadata, 128 | toolkit=toolkit, 129 | ) 130 | proteins[protein_key] = protein 131 | if ligand_key not in ligands: 132 | ligands[ligand_key] = Ligand(smiles=ligand_key, name=ligand_key) 133 | if system_key not in systems: 134 | systems[system_key] = ProteinLigandComplex( 135 | [proteins[protein_key], ligands[ligand_key]] 136 | ) 137 | 138 | MeasurementType = measurement_type_classes[measurement_type_key] 139 | conditions = AssayConditions(pH=7) 140 | system = systems[system_key] 141 | metadata = { 142 | "unit": f"-log10({row['activities.standard_units']}E-9)", 143 | "confidence": row["assays.confidence_score"], 144 | "chembl_activity": row["activities.activity_id"], 145 | "chembl_document": row["docs.chembl_id"], 146 | "year": row["docs.year"], 147 | } 148 | measurement = MeasurementType( 149 | values=row["activities.standard_value"], 150 | system=system, 151 | conditions=conditions, 152 | metadata=metadata, 153 | ) 154 | measurements.append(measurement) 155 | except Exception as exc: 156 | print("Couldn't process record", row) 157 | print("Exception:", exc) 158 | 159 | return cls( 160 | measurements, 161 | metadata={ 162 | "path_or_url": path_or_url, 163 | "measurement_types": measurement_types, 164 | "sample": sample, 165 | }, 166 | ) 167 | -------------------------------------------------------------------------------- /kinoml/databases/pdb.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | from typing import Iterable, Union 4 | 5 | from appdirs import user_cache_dir 6 | 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | def smiles_from_pdb(ligand_ids: Iterable[str]) -> dict: 12 | """ 13 | Retrieve SMILES of molecules defined by their PDB chemical identifier. 14 | 15 | Parameters 16 | ---------- 17 | ligand_ids: iterable of str 18 | PDB chemical identifier. 
def smiles_from_pdb(ligand_ids: Iterable[str], batch_size: int = 50) -> dict:
    """
    Retrieve SMILES of molecules defined by their PDB chemical identifier.

    Parameters
    ----------
    ligand_ids: iterable of str
        PDB chemical identifier.
    batch_size: int, default=50
        Maximal number of identifiers to request per query against the RCSB
        GraphQL endpoint.

    Returns
    -------
    ligands: dict
        Dictionary with PDB chemical identifier as keys and SMILES as values.
        Identifiers without a SMILES entry are silently omitted.
    """
    import json
    import math
    import requests
    import urllib

    # Deduplicate to avoid redundant requests; order is irrelevant for a dict result.
    ligand_ids = list(set(ligand_ids))
    ligands = {}
    base_url = "https://data.rcsb.org/graphql?query="
    n_batches = math.ceil(len(ligand_ids) / batch_size)
    for i in range(n_batches):
        ligand_ids_batch = ligand_ids[i * batch_size : (i + 1) * batch_size]
        logger.debug(f"Batch {i}\n{ligand_ids_batch}")
        query = (
            "{chem_comps(comp_ids:["
            + ",".join(['"' + ligand_id + '"' for ligand_id in ligand_ids_batch])
            + "]){chem_comp{id}rcsb_chem_comp_descriptor{SMILES_stereo}}}"
        )
        response = requests.get(base_url + urllib.parse.quote(query))
        for ligand in json.loads(response.text)["data"]["chem_comps"]:
            try:
                ligands[ligand["chem_comp"]["id"]] = ligand["rcsb_chem_comp_descriptor"][
                    "SMILES_stereo"
                ]
            except TypeError:
                # rcsb_chem_comp_descriptor is null when no SMILES is available
                pass

    return ligands
def download_pdb_structure(
    pdb_id: str, directory: Union[str, Path] = user_cache_dir()
) -> Union[Path, bool]:
    """
    Download a PDB structure. If the structure is not available in PDB format, it will be
    downloaded in CIF format.

    Parameters
    ----------
    pdb_id: str
        The PDB ID of interest.
    directory: str or Path, default=user_cache_dir
        The directory for saving the downloaded structure.

    Returns
    -------
    : Path or False
        The path to the downloaded file if successful, else False.
    """
    from pathlib import Path

    from ..utils import LocalFileStorage, FileDownloader

    directory = Path(directory)

    # check for structure in PDB format
    pdb_path = LocalFileStorage.rcsb_structure_pdb(pdb_id, directory)
    if not pdb_path.is_file():
        logger.debug("Downloading PDB entry in PDB format ...")
        if FileDownloader.rcsb_structure_pdb(pdb_id, directory):
            return pdb_path
        # PDB download failed -> fall through and try the CIF format instead
    else:
        # a cached copy counts as success
        return pdb_path

    # check for structure in CIF format
    cif_path = LocalFileStorage.rcsb_structure_cif(pdb_id, directory)
    if not cif_path.is_file():
        logger.debug("Downloading PDB entry in CIF format ...")
        if FileDownloader.rcsb_structure_cif(pdb_id, directory):
            return cif_path
    else:
        return cif_path
    # neither format could be retrieved
    logger.debug(f"Could not download PDB entry {pdb_id}.")
    return False
def download_pdb_ligand(
    pdb_id: str,
    chain_id: str,
    expo_id: str,
    smiles: str = "",
    directory: Union[str, Path] = user_cache_dir(),
) -> Union[Path, bool]:
    """
    Download a ligand co-crystallized to a PDB structure and save in SDF format. If a SMILES is
    provided, the connectivity and protonation will be adjusted accordingly.

    Parameters
    ----------
    pdb_id: str
        The PDB ID of interest.
    chain_id: str
        The chain ID of the ligand.
    expo_id: str
        The residue name of the ligand.
    smiles: str, default=""
        The smiles of the small molecule describing the connectivity and protonation of the
        ligand.
    directory: str or Path, default=user_cache_dir
        The directory for saving the downloaded structure.

    Returns
    -------
    : Path or False
        The path to the processed ligand file in SDF format if successful, else False.
    """
    from rdkit import Chem
    from rdkit.Chem import AllChem
    from ..utils import LocalFileStorage

    directory = Path(directory)
    sdf_path = LocalFileStorage.rcsb_ligand_sdf(
        pdb_id=pdb_id,
        chain_id=chain_id,
        expo_id=expo_id,
        altloc=None,
        directory=directory,
    )
    # short-circuit on a previously extracted ligand
    if sdf_path.is_file():
        logger.debug(
            f"Found cached ligand file for PDB entry {pdb_id}, chain {chain_id}, ligand {expo_id}."
        )
        return sdf_path

    pdb_path = download_pdb_structure(pdb_id=pdb_id, directory=directory)
    if not pdb_path:
        return False

    # RDKit cannot read CIF, so convert the requested chain to PDB first
    suffix = str(pdb_path).split(".")[-1]
    if suffix == "cif":
        cif_path = str(pdb_path)
        pdb_path = LocalFileStorage.rcsb_structure_pdb(
            pdb_id=f"{pdb_id}_chain{chain_id}", directory=directory
        )
        if not pdb_path.is_file():
            from Bio.PDB import MMCIFParser, PDBIO

            logger.debug("Converting CIF to PDB format ...")
            parser = MMCIFParser()
            try:
                structure = parser.get_structure("", cif_path)[0][chain_id]
            except KeyError:
                logger.debug(f"Could not find chain {chain_id} in CIF file!")
                return False
            io = PDBIO()
            io.set_structure(structure)
            io.save(str(pdb_path))

    logger.debug("Extracting ligand with RDKit ...")
    try:
        pdb_mol = Chem.MolFromPDBFile(str(pdb_path), sanitize=False)
        if pdb_mol is None:
            logger.debug(f"Could not read {pdb_path} with RDKit.")
            return False
        pdb_mol_chains = Chem.SplitMolByPDBChainId(pdb_mol)
        chain = pdb_mol_chains[chain_id]
        chain_residues = Chem.SplitMolByPDBResidues(chain)
        ligand = chain_residues[expo_id]
    except KeyError:
        logger.debug(
            f"Could not find ligand {expo_id} for chain {chain_id} in PDB entry {pdb_id}."
        )
        return False

    if smiles:
        logger.debug("Adjusting connectivity and protonation according to given SMILES ...")
        ligand = Chem.RemoveHs(ligand)
        reference_mol = Chem.MolFromSmiles(smiles)
        if reference_mol is None:
            # an unparsable template SMILES would otherwise raise deep inside
            # AssignBondOrdersFromTemplate; fail softly like the other branches
            logger.debug(f"Could not parse given SMILES {smiles}.")
            return False
        ligand = AllChem.AssignBondOrdersFromTemplate(reference_mol, ligand)
        ligand = Chem.AddHs(ligand, addCoords=True)

    logger.debug("Writing extracted ligand to SDF file ...")
    writer = Chem.SDWriter(str(sdf_path))
    writer.write(ligand)
    # SDWriter buffers output; close to flush the record to disk
    writer.close()

    return sdf_path
66 | ] 67 | assert list(featurized_systems[2].featurizations["last"]) == [ 68 | 97, 69 | 14, 70 | 42, 71 | 90, 72 | 29, 73 | 88, 74 | 24, 75 | 33, 76 | 82, 77 | 96, 78 | 18, 79 | 40, 80 | 90, 81 | 33, 82 | 64, 83 | 120, 84 | 65, 85 | 62, 86 | 13, 87 | 30, 88 | ] 89 | 90 | 91 | def test_onehotencodedsequencefeaturizer_full(): 92 | """Check OneHotEncodedSequenceFeaturizer with full sequence.""" 93 | from kinoml.core.proteins import Protein 94 | from kinoml.core.systems import ProteinSystem 95 | from kinoml.features.protein import OneHotEncodedSequenceFeaturizer 96 | 97 | systems = [ 98 | ProteinSystem([Protein(sequence="")]), 99 | ProteinSystem([Protein(sequence="A")]), 100 | ProteinSystem([Protein(uniprot_id="P00519")]), 101 | ProteinSystem([Protein(uniprot_id="xxxxx")]), 102 | ] 103 | featurizer = OneHotEncodedSequenceFeaturizer(use_multiprocessing=False) 104 | featurized_systems = featurizer.featurize(systems) 105 | 106 | assert len(featurized_systems) == 2 # filter protein with wrong UniProt ID and empty string 107 | assert list(featurized_systems[0].featurizations["last"])[0][0] == 1 108 | assert list(featurized_systems[1].featurizations["last"])[3][2] == 1 109 | 110 | 111 | def test_onehotencodedsequencefeaturizer_klifs_kinase(): 112 | """Check OneHotEncodedSequenceFeaturizer with kinase KLIFS sequence.""" 113 | from kinoml.core.proteins import KLIFSKinase 114 | from kinoml.core.systems import ProteinSystem 115 | from kinoml.features.protein import OneHotEncodedSequenceFeaturizer 116 | 117 | systems = [ 118 | ProteinSystem([KLIFSKinase(sequence="")]), 119 | ProteinSystem([KLIFSKinase(kinase_klifs_sequence="A")]), 120 | ProteinSystem([KLIFSKinase(uniprot_id="P00519")]), 121 | ProteinSystem([KLIFSKinase(uniprot_id="xxxxx")]), 122 | ProteinSystem([KLIFSKinase(ncbi_id="NP_005148.2")]), 123 | ProteinSystem([KLIFSKinase(kinase_klifs_id=480)]), 124 | ProteinSystem([KLIFSKinase(structure_klifs_id=3620)]), 125 | ] 126 | featurizer = OneHotEncodedSequenceFeaturizer( 127 
| sequence_type="klifs_kinase", use_multiprocessing=False 128 | ) 129 | featurized_systems = featurizer.featurize(systems) 130 | 131 | assert len(featurized_systems) == 5 # filter protein with wrong UniProt ID and empty string 132 | assert list(featurized_systems[0].featurizations["last"])[0][0] == 1 133 | assert list(featurized_systems[1].featurizations["last"])[0][14] == 1 134 | assert list(featurized_systems[2].featurizations["last"])[0][14] == 1 135 | assert list(featurized_systems[3].featurizations["last"])[0][14] == 1 136 | assert list(featurized_systems[4].featurizations["last"])[0][14] == 1 137 | 138 | 139 | def test_onehotencodedsequencefeaturizer_klifs_structure(): 140 | """Check OneHotEncodedSequenceFeaturizer with structure KLIFS sequence.""" 141 | from kinoml.core.proteins import KLIFSKinase 142 | from kinoml.core.systems import ProteinSystem 143 | from kinoml.features.protein import OneHotEncodedSequenceFeaturizer 144 | 145 | systems = [ 146 | ProteinSystem([KLIFSKinase(sequence="")]), 147 | ProteinSystem([KLIFSKinase(structure_klifs_sequence="A")]), 148 | ProteinSystem([KLIFSKinase(uniprot_id="P00519")]), 149 | ProteinSystem([KLIFSKinase(kinase_klifs_id=480)]), 150 | ProteinSystem([KLIFSKinase(structure_klifs_id=3620)]), 151 | ] 152 | featurizer = OneHotEncodedSequenceFeaturizer( 153 | sequence_type="klifs_structure", use_multiprocessing=False 154 | ) 155 | featurized_systems = featurizer.featurize(systems) 156 | 157 | assert len(featurized_systems) == 2 # needs structure_klifs_sequence or structure_klifs_id 158 | assert list(featurized_systems[0].featurizations["last"])[0][0] == 1 159 | assert list(featurized_systems[1].featurizations["last"])[0][14] == 1 160 | 161 | 162 | def test_oeproteinstructurefeaturizer(): 163 | """Check OEProteinStructureFeaturizer with different inputs.""" 164 | from kinoml.core.proteins import Protein 165 | from kinoml.core.systems import ProteinSystem 166 | from kinoml.features.protein import OEProteinStructureFeaturizer 
167 | 168 | systems = [] 169 | # unspecific definition of the system, only via PDB ID 170 | # modeling will be performed according to the sequence stored in the PDB Header 171 | protein = Protein(pdb_id="4f8o", name="PsaA") 172 | system = ProteinSystem(components=[protein]) 173 | systems.append(system) 174 | # more specific definition of the system, protein of chain A co-crystallized with ligand AES 175 | # and alternate location B, modeling will be performed according to the sequence of the given 176 | # UniProt ID 177 | protein = Protein.from_pdb(pdb_id="4f8o", name="PsaA") 178 | protein.uniprot_id = "P31522" 179 | protein.chain_id = "A" 180 | protein.alternate_location = "B" 181 | protein.expo_id = "AES" 182 | system = ProteinSystem(components=[protein]) 183 | systems.append(system) 184 | # use a protein structure from file 185 | with resources.path("kinoml.data.proteins", "4f8o_edit.pdb") as structure_path: 186 | protein = Protein.from_file(file_path=structure_path, name="PsaA") 187 | protein.uniprot_id = "P31522" 188 | system = ProteinSystem(components=[protein]) 189 | systems.append(system) 190 | 191 | with resources.path("kinoml.data.proteins", "kinoml_tests_4f8o_spruce.loop_db") as loop_db: 192 | featurizer = OEProteinStructureFeaturizer(loop_db=loop_db, use_multiprocessing=False) 193 | systems = featurizer.featurize(systems) 194 | # check number of residues 195 | assert len(systems[0].featurizations["last"].residues) == 239 196 | assert len(systems[1].featurizations["last"].residues) == 216 197 | assert len(systems[2].featurizations["last"].residues) == 109 198 | # check numbering of first residue 199 | assert systems[0].featurizations["last"].residues[0].resid == 1 200 | assert systems[1].featurizations["last"].residues[0].resid == 44 201 | assert systems[2].featurizations["last"].residues[0].resid == 47 202 | -------------------------------------------------------------------------------- /kinoml/modeling/SCHRODINGERModeling.py:
-------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | import subprocess 4 | from tempfile import NamedTemporaryFile 5 | from typing import Union 6 | 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | def run_prepwizard( 12 | schrodinger_directory: Union[Path, str], 13 | input_file: Union[Path, str], 14 | output_file: Union[Path, str], 15 | cap_termini: bool = True, 16 | build_loops: bool = True, 17 | sequence: Union[str, None] = None, 18 | chain_id: str = "", 19 | protein_pH: str = "neutral", 20 | propka_pH: float = 7.4, 21 | epik_pH: float = 7.4, 22 | force_field: str = "3", 23 | ): 24 | """ 25 | Run the prepwizard utility to prepare a protein structure. 26 | 27 | Parameters 28 | ---------- 29 | schrodinger_directory: Path or str 30 | The path to the directory of the Schrodinger installation. 31 | input_file: Path or str 32 | The path to the input file. 33 | output_file: Path or str 34 | The path to the output file. 35 | cap_termini: bool, default=True 36 | If termini should be capped. 37 | build_loops: bool, default=True 38 | If loops should be built. 39 | sequence: str or None 40 | The amino acid sequence in single letter codes that should be used for loop building. 41 | Also needs the chain_id parameter to work correctly. 42 | chain_id: str, default="" 43 | The chain ID of the protein that should be modeled based on the given sequence. 44 | protein_pH: str, default='neutral' 45 | The pH used during protonation of the protein ('very_low', 'low', 'neutral', 'high'). 46 | propka_pH: float, default=7.4 47 | Run PROPKA at given pH. 48 | epik_pH: float, default=7.4 49 | The pH used during protonation of the ligand. 
50 | force_field: str, default='3' 51 | Force field to use during minimization (2005, 3) 52 | """ 53 | schrodinger_directory = Path(schrodinger_directory) 54 | executable = str(schrodinger_directory / "utilities/prepwizard") 55 | standard_arguments = [ 56 | str(input_file), 57 | str(output_file), 58 | "-HOST", 59 | "localhost", 60 | "-WAIT", 61 | "-keepfarwat", 62 | "-disulfides", 63 | "-glycosylation", 64 | "-palmitoylation", 65 | "-mse", 66 | "-fillsidechains", 67 | "-samplewater", 68 | "-pH", 69 | protein_pH, 70 | "-propka_pH", 71 | str(propka_pH), 72 | "-minimize_adj_h", 73 | "-epik_pH", 74 | str(epik_pH), 75 | "-f", 76 | force_field, 77 | ] 78 | optional_arguments = [] 79 | if cap_termini: 80 | optional_arguments.append("-c") 81 | if build_loops: 82 | optional_arguments.append("-fillloops") 83 | 84 | if sequence: # one letter characters, 60 per line, no header 85 | with NamedTemporaryFile(mode="w", suffix=".fasta") as fasta_file: 86 | sequence = "\n".join([sequence[i : i + 60] for i in range(0, len(sequence), 60)]) 87 | fasta_file.write(f">entry:{chain_id}\n") 88 | fasta_file.write(sequence) 89 | fasta_file.flush() 90 | subprocess.run( 91 | [executable] 92 | + standard_arguments 93 | + optional_arguments 94 | + ["-fasta_file", fasta_file.name] 95 | ) 96 | else: 97 | subprocess.run([executable] + standard_arguments + optional_arguments) 98 | 99 | if logger.getEffectiveLevel() != logging.DEBUG: # remove prepwizard log 100 | paths = Path(".").glob(f"*{Path(input_file).stem}*") 101 | for path in paths: 102 | try: 103 | path.unlink() 104 | except FileNotFoundError: 105 | # may happen in multiprocessing of the same structure 106 | pass 107 | 108 | return 109 | 110 | 111 | def mae_to_pdb( 112 | schrodinger_directory: Union[str, Path], 113 | mae_file_path: Union[str, Path], 114 | pdb_file_path: Union[str, Path], 115 | ): 116 | """ 117 | Convert a structure file from MAE to PDB format. 
118 | 119 | Parameters 120 | ---------- 121 | schrodinger_directory: str or pathlib.Path 122 | The path to the directory of the Schrodinger installation. 123 | mae_file_path: str or pathlib.Path 124 | The path to the input file in MAE format. 125 | pdb_file_path: str or pathlib.Path 126 | The path to the output file in PDB format. 127 | """ 128 | schrodinger_directory = Path(schrodinger_directory) 129 | arguments = [ 130 | str(schrodinger_directory / "utilities/pdbconvert"), # executable 131 | "-imae", 132 | str(mae_file_path), 133 | "-opdb", 134 | str(pdb_file_path), # file paths 135 | ] 136 | subprocess.run(arguments) 137 | return 138 | 139 | 140 | def shape_screen( 141 | schrodinger_directory: Union[Path, str], 142 | query_path: Union[str, Path], 143 | library_path: Union[str, Path], 144 | output_sdf_path: Union[str, Path], 145 | flexible: bool = True, 146 | thorough_sampling: bool = True, 147 | keep_best_match_only: bool = True, 148 | ): 149 | """ 150 | Run the shape_screen tool to align a library of small molecules to the given shape query. 151 | 152 | Parameters 153 | ---------- 154 | schrodinger_directory: Path or str 155 | The path to the directory of the Schrodinger installation. 156 | query_path: Path or str 157 | The path to a valid shape query, e.g. an SDF file with one or more small molecules. 158 | library_path: Path or str 159 | The path to a valid ligand library for shape screening, e.g. an SDF file with one or more 160 | small molecules. 161 | output_sdf_path: Path or str 162 | The path to the output SDF file of the shape screening. 163 | flexible: bool, default=True 164 | If conformers shall be generated for the small molecule library to screen. 165 | thorough_sampling: bool, default=True 166 | If conformations shall be thoroughly sampled. 167 | keep_best_match_only: bool, default=True 168 | In case of multiple shape queries, if only the results for the best matching shape query shall 169 | be returned.
170 | """ 171 | import gzip 172 | import shutil 173 | 174 | schrodinger_directory = Path(schrodinger_directory) 175 | executable = str(schrodinger_directory / "shape_screen") 176 | standard_arguments = [ 177 | "-shape", 178 | str(query_path), 179 | "-screen", 180 | str(library_path), 181 | "-osd", 182 | "-atomtypes", 183 | "element", 184 | "-HOST", 185 | "localhost", 186 | "-WAIT", 187 | ] 188 | optional_arguments = [] 189 | if flexible: 190 | optional_arguments.append("-flex") 191 | optional_arguments.append("-max") 192 | optional_arguments.append("800") 193 | if thorough_sampling: 194 | optional_arguments += ["-sample", "thorough"] 195 | if keep_best_match_only: 196 | optional_arguments.append("-best") 197 | 198 | subprocess.run([executable] + standard_arguments + optional_arguments) 199 | if logger.getEffectiveLevel() != logging.DEBUG: # remove shape_screen log and okay 200 | paths = [ 201 | Path(".") / f"{Path(query_path).stem}_shape.log", 202 | Path(".") / f"{Path(query_path).stem}_shape.okay", 203 | ] 204 | for path in paths: 205 | try: 206 | path.unlink() 207 | except FileNotFoundError: 208 | # may happen in multiprocessing of the same query file 209 | pass 210 | 211 | logger.debug("Unzipping and renaming results ...") 212 | output_sdfgz_path = Path(".") / f"{Path(query_path).stem}_align.sdfgz" 213 | with gzip.open(output_sdfgz_path, "rb") as sdfgz: 214 | with open(output_sdf_path, "wb") as sdf: 215 | shutil.copyfileobj(sdfgz, sdf) 216 | output_sdfgz_path.unlink() 217 | 218 | return 219 | -------------------------------------------------------------------------------- /kinoml/tests/features/test_complexes.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test complex featurizers of `kinoml.features` 3 | """ 4 | import pandas as pd 5 | 6 | 7 | def test_oecomplexfeaturizer(): 8 | """Check OEComplexFeaturizer with different inputs.""" 9 | from kinoml.core.ligands import Ligand 10 | from kinoml.core.proteins import 
Protein 11 | from kinoml.core.systems import ProteinLigandComplex 12 | from kinoml.features.complexes import OEComplexFeaturizer 13 | 14 | systems = [] 15 | protein = Protein(pdb_id="4f8o", name="PsaA") 16 | ligand = Ligand(name="AEBSF") 17 | system = ProteinLigandComplex(components=[protein, ligand]) 18 | systems.append(system) 19 | protein = Protein.from_pdb(pdb_id="4f8o", name="PsaA") 20 | protein.uniprot_id = "P31522" 21 | protein.chain_id = "A" 22 | protein.alternate_location = "B" 23 | protein.expo_id = "AES" 24 | ligand = Ligand(name="AEBSF") 25 | system = ProteinLigandComplex(components=[protein, ligand]) 26 | systems.append(system) 27 | featurizer = OEComplexFeaturizer(use_multiprocessing=False) 28 | systems = featurizer.featurize(systems) 29 | # check LIG exists 30 | assert len(systems[0].featurizations["last"].select_atoms("resname LIG").residues) == 1 31 | assert len(systems[1].featurizations["last"].select_atoms("resname LIG").residues) == 1 32 | # check caps 33 | assert ( 34 | len(systems[0].featurizations["last"].select_atoms("resname ACE or resname NME").residues) 35 | == 2 36 | ) 37 | assert ( 38 | len(systems[1].featurizations["last"].select_atoms("resname ACE or resname NME").residues) 39 | == 1 40 | ) 41 | # check number of residues 42 | assert len(systems[0].featurizations["last"].residues) == 240 43 | assert len(systems[1].featurizations["last"].residues) == 217 44 | # check numbering of first residue 45 | assert systems[0].featurizations["last"].residues[0].resid == 1 46 | assert systems[1].featurizations["last"].residues[0].resid == 44 47 | 48 | 49 | def test_oedockingfeaturizer_fred(): 50 | """Check OEDockingFeaturizer with Fred and different inputs.""" 51 | from kinoml.core.ligands import Ligand 52 | from kinoml.core.proteins import Protein 53 | from kinoml.core.systems import ProteinLigandComplex 54 | from kinoml.features.complexes import OEDockingFeaturizer 55 | 56 | systems = [] 57 | # define the binding site for docking via 
co-crystallized ligand 58 | protein = Protein(pdb_id="4yne", name="NTRK1") 59 | protein.expo_id = "4EK" 60 | ligand = Ligand( 61 | smiles="C1CC(N(C1)C2=NC3=C(C=NN3C=C2)NC(=O)N4CCC(C4)O)C5=C(C=CC(=C5)F)F", 62 | name="larotrectinib_fred", 63 | ) 64 | system = ProteinLigandComplex(components=[protein, ligand]) 65 | systems.append(system) 66 | # define the binding site for docking via residue IDs 67 | protein = Protein(pdb_id="4yne", name="NTRK1") 68 | protein.pocket_resids = [ 69 | 516, 70 | 517, 71 | 521, 72 | 524, 73 | 542, 74 | 544, 75 | 573, 76 | 589, 77 | 590, 78 | 591, 79 | 592, 80 | 595, 81 | 596, 82 | 654, 83 | 655, 84 | 656, 85 | 657, 86 | 667, 87 | 668, 88 | ] 89 | ligand = Ligand( 90 | smiles="C1CC(N(C1)C2=NC3=C(C=NN3C=C2)NC(=O)N4CCC(C4)O)C5=C(C=CC(=C5)F)F", 91 | name="larotrectinib_fred_2", 92 | ) 93 | system = ProteinLigandComplex(components=[protein, ligand]) 94 | systems.append(system) 95 | featurizer = OEDockingFeaturizer(method="Fred", use_multiprocessing=False) 96 | systems = featurizer.featurize(systems) 97 | # check docking score was stored 98 | assert isinstance(systems[0].featurizations["last"]._topology.docking_score, float) 99 | # check LIG exists 100 | assert len(systems[0].featurizations["last"].select_atoms("resname LIG").residues) == 1 101 | assert len(systems[1].featurizations["last"].select_atoms("resname LIG").residues) == 1 102 | # check caps 103 | assert ( 104 | len(systems[0].featurizations["last"].select_atoms("resname ACE or resname NME").residues) 105 | == 10 106 | ) 107 | assert ( 108 | len(systems[1].featurizations["last"].select_atoms("resname ACE or resname NME").residues) 109 | == 10 110 | ) 111 | # check numbering of first residue 112 | assert systems[0].featurizations["last"].residues[0].resid == 501 113 | assert systems[1].featurizations["last"].residues[0].resid == 501 114 | 115 | 116 | def test_oedockingfeaturizer_hybrid(): 117 | """Check OEDockingFeaturizer with Hybrid.""" 118 | from kinoml.core.ligands import Ligand 119 
| from kinoml.core.proteins import Protein 120 | from kinoml.core.systems import ProteinLigandComplex 121 | from kinoml.features.complexes import OEDockingFeaturizer 122 | 123 | systems = [] 124 | protein = Protein(pdb_id="4yne", name="NTRK1") 125 | protein.expo_id = "4EK" 126 | ligand = Ligand( 127 | smiles="C1CC(N(C1)C2=NC3=C(C=NN3C=C2)NC(=O)N4CCC(C4)O)C5=C(C=CC(=C5)F)F", 128 | name="larotrectinib_hybrid", 129 | ) 130 | system = ProteinLigandComplex(components=[protein, ligand]) 131 | systems.append(system) 132 | featurizer = OEDockingFeaturizer(method="Hybrid", use_multiprocessing=False) 133 | systems = featurizer.featurize(systems) 134 | # check LIG exists 135 | assert len(systems[0].featurizations["last"].select_atoms("resname LIG").residues) == 1 136 | # check caps 137 | assert ( 138 | len(systems[0].featurizations["last"].select_atoms("resname ACE or resname NME").residues) 139 | == 10 140 | ) 141 | # check numbering of first residue 142 | assert systems[0].featurizations["last"].residues[0].resid == 501 143 | 144 | 145 | def test_oedockingfeaturizer_posit(): 146 | """Check OEDockingFeaturizer with Posit.""" 147 | from kinoml.core.ligands import Ligand 148 | from kinoml.core.proteins import Protein 149 | from kinoml.core.systems import ProteinLigandComplex 150 | from kinoml.features.complexes import OEDockingFeaturizer 151 | 152 | systems = [] 153 | protein = Protein(pdb_id="4yne", name="NTRK1") 154 | protein.expo_id = "4EK" 155 | ligand = Ligand( 156 | smiles="C1CC(N(C1)C2=NC3=C(C=NN3C=C2)NC(=O)N4CCC(C4)O)C5=C(C=CC(=C5)F)F", 157 | name="larotrectinib_posit", 158 | ) 159 | system = ProteinLigandComplex(components=[protein, ligand]) 160 | systems.append(system) 161 | featurizer = OEDockingFeaturizer(method="Posit", use_multiprocessing=False) 162 | systems = featurizer.featurize(systems) 163 | # check LIG exists 164 | assert len(systems[0].featurizations["last"].select_atoms("resname LIG").residues) == 1 165 | # check caps 166 | assert ( 167 | 
len(systems[0].featurizations["last"].select_atoms("resname ACE or resname NME").residues) 168 | == 10 169 | ) 170 | # check numbering of first residue 171 | assert systems[0].featurizations["last"].residues[0].resid == 501 172 | # check posit probability was stored 173 | assert isinstance(systems[0].featurizations["last"]._topology.posit_probability, float) 174 | 175 | 176 | def test_mostsimilarpdbligandfeaturizer(): 177 | """Check MostSimilarPDBLigandFeaturizer with different similarity metrics.""" 178 | from kinoml.core.ligands import Ligand 179 | from kinoml.core.proteins import Protein 180 | from kinoml.core.systems import ProteinLigandComplex 181 | from kinoml.features.complexes import MostSimilarPDBLigandFeaturizer 182 | 183 | for metric in ["mcs", "fingerprint", "openeye_shape"]: 184 | systems = [] 185 | protein = Protein(uniprot_id="P04629", name="NTRK1") 186 | ligand = Ligand( 187 | smiles="C1CC(N(C1)C2=NC3=C(C=NN3C=C2)NC(=O)N4CCC(C4)O)C5=C(C=CC(=C5)F)F", 188 | name="larotrectinib", 189 | ) 190 | system = ProteinLigandComplex(components=[protein, ligand]) 191 | systems.append(system) 192 | featurizer = MostSimilarPDBLigandFeaturizer( 193 | similarity_metric=metric, use_multiprocessing=False 194 | ) 195 | systems = featurizer.featurize(systems) 196 | assert isinstance(systems[0].protein.pdb_id, str) 197 | assert isinstance(systems[0].protein.chain_id, str) 198 | assert isinstance(systems[0].protein.expo_id, str) 199 | 200 | 201 | def test_klifsconformationtemplatesfeaturizer(): 202 | """Check KLIFSConformationTemplatesFeaturizer with fingerprint only.""" 203 | from kinoml.core.ligands import Ligand 204 | from kinoml.core.proteins import KLIFSKinase 205 | from kinoml.core.systems import ProteinLigandComplex 206 | from kinoml.features.complexes import KLIFSConformationTemplatesFeaturizer 207 | 208 | systems = [] 209 | protein = KLIFSKinase(uniprot_id="P04629", name="NTRK1") 210 | ligand = Ligand( 211 | 
smiles="C1CC(N(C1)C2=NC3=C(C=NN3C=C2)NC(=O)N4CCC(C4)O)C5=C(C=CC(=C5)F)F", 212 | name="larotrectinib", 213 | ) 214 | system = ProteinLigandComplex(components=[protein, ligand]) 215 | systems.append(system) 216 | featurizer = KLIFSConformationTemplatesFeaturizer( 217 | similarity_metric="fingerprint", use_multiprocessing=False 218 | ) 219 | systems = featurizer.featurize(systems) 220 | # check feature is dataframe 221 | assert isinstance(systems[0].featurizations["last"], pd.DataFrame) 222 | # check dataframe is not empty 223 | assert len(systems[0].featurizations["last"]) > 0 224 | -------------------------------------------------------------------------------- /kinoml/docking/SCHRODINGERDocking.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | import subprocess 4 | from tempfile import NamedTemporaryFile 5 | from typing import List, Union 6 | 7 | from appdirs import user_cache_dir 8 | 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | def run_glide( 14 | schrodinger_directory: Union[Path, str], 15 | input_file_mae: Union[Path, str], 16 | output_file_sdf: Union[Path, str], 17 | mols_smiles: List[str], 18 | ligand_resname: Union[str, None], 19 | n_poses: int = 1, 20 | mols_names: Union[List[str], None] = None, 21 | shape_restrain: bool = True, 22 | macrocyles: bool = False, 23 | precision: str = "XP", 24 | cache_dir: Union[Path, str] = user_cache_dir(), 25 | ): 26 | """ 27 | Run glide for protein ligand docking. 28 | 29 | Parameters 30 | ---------- 31 | schrodinger_directory: Path or str 32 | The path to the directory of the Schrodinger installation. 33 | input_file_mae: Path or str 34 | The path to the input file in MAE format containing the protein structure to dock to and a 35 | co-crystallized ligand in the binding pocket of interest. 36 | output_file_sdf: Path or str 37 | The path to the output file of the generated in docking poses in SDF format. 
38 | mols_smiles: list of str 39 | The molecules to dock as SMILES representation. 40 | ligand_resname: str or None 41 | The resname of the co-crystallized ligand, which will be used for pocket definition. 42 | mols_names: None or list of str, default=None 43 | The names of the molecules to dock. Will be used as molecule title in the SDF file. If 44 | None, names will be numbers (1,...,len(mols_smiles)). 45 | n_poses: int, default=1 46 | Number of poses to generate per molecule. 47 | shape_restrain: bool, default=True 48 | If the co-crystallized ligand shall be used for shape restrained docking. 49 | macrocyles: bool, default=False 50 | Macrocycle conformations will be sampled with an appropriate algorithm. All non- 51 | macrocyclic molecules detected by SCHRODINGER will be skipped. 52 | precision: str, default="XP" 53 | The docking precision to use ["HTVS", "SP", "XP"]. 54 | cache_dir: Path or str, default=appdirs.user_cache_dir() 55 | Path to a directory for caching grids for docking. 56 | """ 57 | import shutil 58 | 59 | from rdkit import Chem 60 | from rdkit.Chem import AllChem 61 | 62 | from ..utils import sha256_objects 63 | 64 | if precision not in ["HTVS", "SP", "XP"]: 65 | raise ValueError( 66 | f"Only 'HTVS', 'SP', 'XP' are allowed for precision, you provided {precision}!"
67 | ) 68 | 69 | schrodinger_directory = Path(schrodinger_directory).resolve() 70 | input_file_mae = Path(input_file_mae).resolve() 71 | with NamedTemporaryFile(mode="w", suffix=".mae") as protein_file_mae, NamedTemporaryFile( 72 | mode="w", suffix=".mae" 73 | ) as ligand_file_mae, NamedTemporaryFile( 74 | mode="w", suffix=".mae" 75 | ) as protein_ligand_file_mae, NamedTemporaryFile( 76 | mode="w", suffix=".sdf" 77 | ) as mols_file_sdf, NamedTemporaryFile( 78 | mode="w", suffix=".in" 79 | ) as grid_input_file, NamedTemporaryFile( 80 | mode="w", suffix=".in" 81 | ) as docking_input_file: 82 | 83 | logger.debug("Selecting and writing protein from MAE input file ...") 84 | subprocess.run( 85 | [ 86 | str(schrodinger_directory / "run"), 87 | "delete_atoms.py", 88 | str(input_file_mae), 89 | protein_file_mae.name, 90 | "-asl", 91 | "not protein", 92 | ] 93 | ) 94 | 95 | with NamedTemporaryFile(mode="w", suffix=".mae") as ligand_file_raw_mae: 96 | logger.debug("Selecting and writing ligand from MAE input file ...") 97 | subprocess.run( # first everything that could be ligand 98 | [ 99 | str(schrodinger_directory / "run"), 100 | "delete_atoms.py", 101 | str(input_file_mae), 102 | ligand_file_raw_mae.name, 103 | "-asl", 104 | f"not res. {ligand_resname}" if ligand_resname else "not ligand", 105 | ] 106 | ) 107 | subprocess.run( # then only first molecule from potential ligands 108 | [ 109 | str(schrodinger_directory / "run"), 110 | "delete_atoms.py", 111 | ligand_file_raw_mae.name, 112 | ligand_file_mae.name, 113 | "-asl", 114 | "mol. 
>1", 115 | ] 116 | ) 117 | 118 | logger.debug("Merging protein and ligand in the right order ...") 119 | subprocess.run( 120 | [ 121 | str(schrodinger_directory / "utilities/structcat"), 122 | "-i", 123 | protein_file_mae.name, 124 | ligand_file_mae.name, 125 | "-o", 126 | protein_ligand_file_mae.name, 127 | ] 128 | ) 129 | 130 | logger.debug("Writing molecules to SDF ...") 131 | if not mols_names or len(mols_names) != len(mols_smiles): 132 | logger.debug("Creating molecule names ...") 133 | mols_names = [str(x) for x in range(1, len(mols_smiles) + 1)] 134 | sd_writer = Chem.SDWriter(mols_file_sdf.name) 135 | for smiles, name in zip(mols_smiles, mols_names): 136 | mol = Chem.MolFromSmiles(smiles) 137 | if not mol: 138 | logger.debug(f"Skipping molecule {name} with erroneous smiles ...") 139 | continue 140 | mol.SetProp("_Name", name) 141 | mol = Chem.AddHs(mol) 142 | AllChem.EmbedMolecule(mol) 143 | sd_writer.write(mol) 144 | 145 | logger.debug("Writing input file for grid generation ...") 146 | grid_input_file.write(f"RECEP_FILE '{protein_ligand_file_mae.name}'\n") 147 | grid_input_file.write("LIGAND_INDEX 2\n") 148 | grid_input_file.flush() 149 | 150 | grid_file_path = Path(cache_dir) / ( 151 | sha256_objects([input_file_mae, ligand_resname]) + ".zip" 152 | ) # caching via hash based on input structure and chosen ligand 153 | if grid_file_path.is_file(): 154 | logger.debug("Found cached grid file ..") 155 | else: 156 | logger.debug("Generating grid for docking ...") 157 | subprocess.run( 158 | [ 159 | str(schrodinger_directory / "glide"), 160 | grid_input_file.name, 161 | "-HOST", 162 | "localhost", 163 | "-WAIT", 164 | "-OVERWRITE", 165 | ] 166 | ) 167 | shutil.move( 168 | str(Path(".") / (Path(grid_input_file.name).stem + ".zip")), grid_file_path 169 | ) 170 | 171 | if logger.getEffectiveLevel() != 10: # remove grid logs etc. 
172 | paths = Path(".").glob(f"*{Path(grid_input_file.name).stem}*") 173 | for path in paths: 174 | path.unlink() 175 | 176 | logger.debug("Writing input file for docking ...") 177 | docking_input_file.write(f"GRIDFILE '{str(grid_file_path)}'\n") 178 | docking_input_file.write(f"LIGANDFILE '{mols_file_sdf.name}'\n") 179 | docking_input_file.write(f"LIGPREP True\n") 180 | docking_input_file.write("POSE_OUTTYPE ligandlib_sd\n") 181 | docking_input_file.write(f"COMPRESS_POSES False\n") 182 | docking_input_file.write(f"POSES_PER_LIG {n_poses}\n") 183 | docking_input_file.write(f"PRECISION {precision}\n") 184 | if shape_restrain: 185 | docking_input_file.write(f"SHAPE_RESTRAIN True\n") 186 | docking_input_file.write(f"SHAPE_REF_LIGAND_FILE '{ligand_file_mae.name}'\n") 187 | if macrocyles: 188 | docking_input_file.write(f"MACROCYCLE True\n") 189 | docking_input_file.flush() 190 | 191 | logger.debug("Running docking ...") 192 | subprocess.run( 193 | [ 194 | str(schrodinger_directory / "glide"), 195 | docking_input_file.name, 196 | "-HOST", 197 | "localhost", 198 | "-WAIT", 199 | "-OVERWRITE", 200 | ] 201 | ) 202 | 203 | logger.debug("Filtering poses for appropriate number ...") 204 | docking_input_file_path = Path(docking_input_file.name) 205 | sd_file_path = Path(".") / (docking_input_file_path.stem + "_lib.sdf") 206 | if not sd_file_path.is_file(): 207 | logger.debug("No docking poses were generated during docking ...") 208 | return 209 | supplier = Chem.SDMolSupplier(str(sd_file_path), removeHs=False) 210 | sd_writer = Chem.SDWriter(str(output_file_sdf)) 211 | mol_counter_dict = {} 212 | for mol in supplier: 213 | # SDF from glide is sorted by docking score, but mols are in mixed order 214 | name = mol.GetProp("_Name") 215 | if name not in mol_counter_dict.keys(): 216 | mol_counter_dict[name] = 0 217 | if mol_counter_dict[name] < n_poses: 218 | sd_writer.write(mol) 219 | mol_counter_dict[name] += 1 220 | sd_file_path.unlink() # manually delete file 221 | 222 | if 
logger.getEffectiveLevel() != 10: # remove docking logs etc. 223 | paths = Path(".").glob(f"*{docking_input_file_path.stem}*") 224 | for path in paths: 225 | path.unlink() 226 | 227 | return 228 | -------------------------------------------------------------------------------- /kinoml/features/protein.py: -------------------------------------------------------------------------------- 1 | """ 2 | Featurizers that mostly concern protein-based models 3 | """ 4 | from __future__ import annotations 5 | from collections import Counter 6 | import logging 7 | from typing import Union 8 | 9 | import numpy as np 10 | 11 | from .core import ParallelBaseFeaturizer, BaseOneHotEncodingFeaturizer, OEBaseModelingFeaturizer 12 | from ..core.proteins import Protein, KLIFSKinase 13 | from ..core.sequences import AminoAcidSequence 14 | from ..core.systems import ProteinSystem, ProteinLigandComplex 15 | 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | 20 | class SingleProteinFeaturizer(ParallelBaseFeaturizer): 21 | """ 22 | Provides a minimally useful ``._supports()`` method for all Protein-like featurizers. 
23 | """ 24 | 25 | _COMPATIBLE_PROTEIN_TYPES = (Protein, KLIFSKinase) 26 | 27 | def __init__(self, **kwargs): 28 | super().__init__(**kwargs) 29 | 30 | def _supports(self, system: Union[ProteinSystem, ProteinLigandComplex]) -> bool: 31 | """ 32 | Check that exactly one protein is present in the System 33 | """ 34 | super_checks = super()._supports(system) 35 | proteins = [c for c in system.components if isinstance(c, self._COMPATIBLE_PROTEIN_TYPES)] 36 | return all([super_checks, len(proteins) == 1]) 37 | 38 | 39 | class AminoAcidCompositionFeaturizer(SingleProteinFeaturizer): 40 | 41 | """Featurizes the protein using the composition of the residues in the binding site.""" 42 | 43 | def __init__(self, **kwargs): 44 | super().__init__(**kwargs) 45 | 46 | # Initialize a Counter object with 0 counts 47 | _counter = Counter(sorted(AminoAcidSequence.ALPHABET)) 48 | for k in _counter.keys(): 49 | _counter[k] = 0 50 | 51 | def _featurize_one( 52 | self, system: Union[ProteinSystem, ProteinLigandComplex] 53 | ) -> Union[np.array, None]: 54 | """ 55 | Featurizes a protein using the residue count in the sequence. 56 | 57 | Parameters 58 | ---------- 59 | system: ProteinSystem or ProteinLigandComplex 60 | The System to be featurized. 61 | 62 | Returns 63 | ------- 64 | : np.array or None 65 | The count of amino acids in the binding site. 66 | """ 67 | count = self._counter.copy() 68 | try: 69 | sequence = system.protein.sequence 70 | except ValueError: # e.g. 
erroneous uniprot_id in lazy instantiation 71 | return None 72 | count.update(system.protein.sequence) 73 | sorted_count = sorted(count.items(), key=lambda kv: kv[0]) 74 | return np.array([number for _, number in sorted_count]) 75 | 76 | 77 | class OneHotEncodedSequenceFeaturizer(BaseOneHotEncodingFeaturizer, SingleProteinFeaturizer): 78 | 79 | """Featurizes the sequence of the protein to a one hot encoding.""" 80 | 81 | ALPHABET = AminoAcidSequence.ALPHABET 82 | 83 | def __init__(self, sequence_type: str = "full", **kwargs): 84 | """ 85 | Featurizes the sequence of the protein to a one hot encoding. 86 | 87 | Parameters 88 | ---------- 89 | sequence_type: str, default=full 90 | The sequence to use for one hot encoding ('full', 'klifs_kinase' or 'klifs_structure'). 91 | """ 92 | if sequence_type not in ["full", "klifs_kinase", "klifs_structure"]: 93 | raise ValueError( 94 | "Only 'full', 'klifs_kinase' and 'klifs_structure' are supported sequence_types, " 95 | f"you provided {sequence_type}." 96 | ) 97 | self.sequence_type = sequence_type 98 | if sequence_type != "full": 99 | self.ALPHABET += "-" # add gap symbol for KLIFS sequence to ALPHABET 100 | super().__init__(**kwargs) # update ALPHABET first 101 | 102 | def _retrieve_sequence(self, system: Union[ProteinSystem, ProteinLigandComplex]) -> str: 103 | try: 104 | if self.sequence_type == "full": 105 | sequence = system.protein.sequence 106 | elif self.sequence_type == "klifs_kinase": 107 | sequence = system.protein.kinase_klifs_sequence 108 | else: 109 | sequence = system.protein.structure_klifs_sequence 110 | except ValueError: # e.g. 
erroneous uniprot_id in lazy instantiation 111 | return "" 112 | return sequence 113 | 114 | 115 | class OEProteinStructureFeaturizer(OEBaseModelingFeaturizer, SingleProteinFeaturizer): 116 | """ 117 | Given systems with exactly one protein, prepare the protein structure by: 118 | 119 | - modeling missing loops with OESpruce according to the PDB header unless 120 | a custom sequence is specified via the `uniprot_id` or `sequence` 121 | attribute in the protein component (see below), missing sequences at 122 | N- and C-termini are not modeled 123 | - building missing side chains 124 | - substitutions, deletions and insertions, if a `uniprot_id` or `sequence` 125 | attribute is provided for the protein component alteration will be 126 | modeled with OESpruce, if an alteration could not be modeled, the 127 | corresponding mismatch in the structure will be deleted 128 | - removing everything but protein and water 129 | - protonation at pH 7.4 130 | 131 | The protein component of each system must be a `core.proteins.Protein` 132 | or a subclass thereof, must be initialized with toolkit='OpenEye' and 133 | give access to a molecular structure, e.g. via a pdb_id. Additionally, 134 | the protein component can have the following optional attributes to 135 | customize the protein modeling: 136 | 137 | - `name`: A string specifying the name of the protein, will be used for 138 | generating the output file name. 139 | - `chain_id`: A string specifying which chain should be used. 140 | - `alternate_location`: A string specifying which alternate location 141 | should be used. 142 | - `expo_id`: A string specifying a ligand bound to the protein of 143 | interest. This is especially useful if multiple proteins are found in 144 | one PDB structure. 145 | - `uniprot_id`: A string specifying the UniProt ID that will be used to 146 | fetch the amino acid sequence from UniProt, which will be used for 147 | modeling the protein. 
This will supersede the sequence information 148 | given in the PDB header. 149 | - `sequence`: A string specifying the amino acid sequence in 150 | one-letter-codes that should be used during modeling the protein. This 151 | will supersede a given `uniprot_id` and the sequence information given 152 | in the PDB header. 153 | 154 | Parameters 155 | ---------- 156 | loop_db: str 157 | The path to the loop database used by OESpruce to model missing loops. 158 | cache_dir: str, Path or None, default=None 159 | Path to directory used for saving intermediate files. If None, default 160 | location provided by `appdirs.user_cache_dir()` will be used. 161 | output_dir: str, Path or None, default=None 162 | Path to directory used for saving output files. If None, output 163 | structures will not be saved. 164 | use_multiprocessing : bool, default=True 165 | If multiprocessing to use. 166 | n_processes : int or None, default=None 167 | How many processes to use in case of multiprocessing. Defaults to 168 | number of available CPUs. 169 | """ 170 | 171 | from MDAnalysis.core.universe import Universe 172 | 173 | def __init__(self, **kwargs): 174 | super().__init__(**kwargs) 175 | 176 | def _featurize_one(self, system: ProteinSystem) -> Union[Universe, None]: 177 | """ 178 | Prepare a protein structure. 179 | 180 | Parameters 181 | ---------- 182 | system: ProteinSystem 183 | A system object holding a protein component. 184 | 185 | Returns 186 | ------- 187 | : Universe or None 188 | An MDAnalysis universe of the featurized system. None if no design unit was found. 189 | """ 190 | from pathlib import Path 191 | 192 | from ..modeling.MDAnalysisModeling import read_molecule 193 | 194 | structure = self._read_protein_structure(system.protein) 195 | if structure is None: 196 | logger.warning( 197 | f"Could not read protein structure for {system.protein}, returning None!" 
198 | ) 199 | return None 200 | 201 | logging.debug("Preparing protein structure ...") 202 | design_unit = self._get_design_unit( 203 | structure=structure, 204 | chain_id=system.protein.chain_id if hasattr(system.protein, "chain_id") else None, 205 | alternate_location=system.protein.alternate_location 206 | if hasattr(system.protein, "alternate_location") 207 | else None, 208 | has_ligand=hasattr(system.protein, "expo_id"), 209 | ligand_name=system.protein.expo_id if hasattr(system.protein, "expo_id") else None, 210 | model_loops_and_caps=False if system.protein.sequence else True, 211 | ) # if sequence is given model loops and caps separately later 212 | if not design_unit: 213 | logging.debug("No design unit found, returning None!") 214 | return None 215 | 216 | logging.debug("Extracting design unit components ...") 217 | protein, solvent = self._get_components( 218 | design_unit=design_unit, 219 | chain_id=system.protein.chain_id if hasattr(system.protein, "chain_id") else None, 220 | )[:-1] 221 | 222 | if system.protein.sequence: 223 | first_id = 1 224 | if "construct_range" in system.protein.metadata.keys(): 225 | first_id = int(system.protein.metadata["construct_range"].split("-")[0]) 226 | protein = self._process_protein( 227 | protein_structure=protein, 228 | amino_acid_sequence=system.protein.sequence, 229 | first_id=first_id, 230 | ) 231 | 232 | logging.debug("Assembling components ...") 233 | solvated_protein = self._assemble_components(protein, solvent) 234 | 235 | logging.debug("Updating pdb header ...") 236 | solvated_protein = self._update_pdb_header( 237 | solvated_protein, protein_name=system.protein.name 238 | ) 239 | 240 | logging.debug("Writing results ...") 241 | file_path = self._write_results( 242 | solvated_protein, 243 | "_".join( 244 | [ 245 | info 246 | for info in [ 247 | system.protein.name, 248 | system.protein.pdb_id 249 | if system.protein.pdb_id 250 | else Path(system.protein.metadata["file_path"]).stem, 251 | 
f"chain{system.protein.chain_id}" 252 | if hasattr(system.protein, "chain_id") 253 | else None, 254 | f"altloc{system.protein.alternate_location}" 255 | if hasattr(system.protein, "alternate_location") 256 | else None, 257 | ] 258 | if info 259 | ] 260 | ), 261 | ) 262 | 263 | logging.debug("Generating new MDAnalysis universe ...") 264 | structure = read_molecule(file_path) 265 | 266 | if not self.output_dir: 267 | logging.debug("Removing structure file ...") 268 | file_path.unlink() 269 | 270 | return structure 271 | -------------------------------------------------------------------------------- /kinoml/features/ligand.py: -------------------------------------------------------------------------------- 1 | """ 2 | Featurizers that mostly concern ligand-based models 3 | """ 4 | 5 | from __future__ import annotations 6 | from typing import Union 7 | 8 | import numpy as np 9 | from openff.toolkit.utils.exceptions import SMILESParseError, RadicalsNotSupportedError 10 | from rdkit import Chem 11 | 12 | from .core import ParallelBaseFeaturizer, BaseOneHotEncodingFeaturizer 13 | from ..core.systems import LigandSystem, ProteinLigandComplex 14 | from ..core.ligands import Ligand 15 | 16 | 17 | class SingleLigandFeaturizer(ParallelBaseFeaturizer): 18 | """ 19 | Provides a minimally useful ``._supports()`` method for all Ligand-like featurizers. 
20 | """ 21 | 22 | _COMPATIBLE_LIGAND_TYPES = (Ligand,) 23 | 24 | def __init__(self, **kwargs): 25 | super().__init__(**kwargs) 26 | 27 | def _supports(self, system: Union[LigandSystem, ProteinLigandComplex]) -> bool: 28 | """ 29 | Check that exactly one ligand is present in the System 30 | """ 31 | super_checks = super()._supports(system) 32 | ligands = [c for c in system.components if isinstance(c, self._COMPATIBLE_LIGAND_TYPES)] 33 | return all([super_checks, len(ligands) == 1]) 34 | 35 | 36 | class MorganFingerprintFeaturizer(SingleLigandFeaturizer): 37 | """ 38 | Given a ``System`` containing one ``Ligand`` component, convert it to an RDKit molecule and 39 | generate the Morgan fingerprints bitvectors. 40 | 41 | Parameters 42 | ---------- 43 | radius: int, optional=2 44 | Morgan fingerprint neighborhood radius 45 | nbits: int, optional=512 46 | Length of the resulting bit vector 47 | """ 48 | 49 | def __init__(self, radius: int = 2, nbits: int = 512, **kwargs): 50 | super().__init__(**kwargs) 51 | self.radius = radius 52 | self.nbits = nbits 53 | 54 | def _featurize_one( 55 | self, system: Union[LigandSystem, ProteinLigandComplex] 56 | ) -> Union[np.ndarray, None]: 57 | """ 58 | Return the Morgan fingerprint for the given system. 59 | 60 | Parameters 61 | ---------- 62 | system: LigandSystem or ProteinLigandComplex 63 | The System to be featurized. 
64 | 65 | Returns 66 | ------- 67 | : np.array or None 68 | """ 69 | from rdkit.Chem import RemoveHs 70 | from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect 71 | 72 | try: # catch erroneous smiles not yet interpreted in case of lazy instantiation 73 | rdkit_mol = system.ligand.molecule.to_rdkit() 74 | except (SMILESParseError, RadicalsNotSupportedError): 75 | return None 76 | 77 | rdkit_mol = RemoveHs(rdkit_mol) 78 | fp = GetMorganFingerprintAsBitVect(rdkit_mol, radius=self.radius, nBits=self.nbits) 79 | return np.asarray(fp, dtype="int64") 80 | 81 | 82 | class OneHotSMILESFeaturizer(BaseOneHotEncodingFeaturizer, SingleLigandFeaturizer): 83 | 84 | """ 85 | One-hot encodes a ``Ligand`` from a SMILES representation. 86 | 87 | Attributes 88 | ---------- 89 | ALPHABET: str 90 | Defines the character-integer mapping (as a sequence) 91 | of the one-hot encoding. 92 | """ 93 | 94 | ALPHABET = ( 95 | "BCFHIKNOPSUVWY" # atoms 96 | "acegilnosru" # aromatic atoms 97 | "-=#" # bonds 98 | "1234567890" # ring closures 99 | ".*" # disconnections 100 | "()" # branches 101 | "/+@:[]%\\" # other characters 102 | "LR$" # single-char representation of Cl, Br, @@ 103 | ) 104 | 105 | def __init__(self, smiles_type: str = "canonical", **kwargs): 106 | """ 107 | One-hot encodes a ``Ligand`` from a SMILES representation. 108 | 109 | Parameters 110 | ---------- 111 | smiles_type: str, default=canonical 112 | The smiles type to use ('canonical' or 'raw'). 113 | """ 114 | super().__init__(**kwargs) 115 | if smiles_type not in ["canonical", "raw"]: 116 | raise ValueError( 117 | "Only 'canonical' and 'raw' are supported smiles_type, you provided " 118 | f"{smiles_type}." 119 | ) 120 | self.smiles_type = smiles_type 121 | 122 | def _retrieve_sequence(self, system: Union[LigandSystem, ProteinLigandComplex]) -> str: 123 | """ 124 | Get SMILES string from a `Ligand`-like component and postprocesses it. 
125 | 126 | Double element symbols (such as `Cl`, ``Br`` for atoms and ``@@`` for chirality) 127 | are replaced with single element symbols (`L`, ``R`` and ``$`` respectively). 128 | """ 129 | try: 130 | if self.smiles_type == "canonical": 131 | smiles = system.ligand.molecule.to_smiles(explicit_hydrogens=False) 132 | else: 133 | smiles = system.ligand.metadata["smiles"] 134 | except SMILESParseError: # erroneous SMILES string 135 | return "" 136 | except KeyError: # no SMILES string given during initialization 137 | return "" 138 | 139 | return smiles.replace("Cl", "L").replace("Br", "R").replace("@@", "$") 140 | 141 | 142 | class GraphLigandFeaturizer(SingleLigandFeaturizer): 143 | 144 | """ 145 | Creates a graph representation of a `Ligand`-like component. 146 | Each node (atom) is decorated with several RDKit descriptors 147 | Check ```self._per_atom_features``` for details. 148 | 149 | Parameters 150 | ---------- 151 | max_in_ring_size: int, optional=10 152 | Maximum ring size for testing whether an atom belongs to a 153 | ring or not. 
*Currently unused* 154 | """ 155 | 156 | ALL_ATOMIC_SYMBOLS = [ 157 | "C", 158 | "N", 159 | "O", 160 | "S", 161 | "F", 162 | "Si", 163 | "P", 164 | "Cl", 165 | "Br", 166 | "Mg", 167 | "Na", 168 | "Ca", 169 | "Fe", 170 | "As", 171 | "Al", 172 | "I", 173 | "B", 174 | "V", 175 | "K", 176 | "Tl", 177 | "Yb", 178 | "Sb", 179 | "Sn", 180 | "Ag", 181 | "Pd", 182 | "Co", 183 | "Se", 184 | "Ti", 185 | "Zn", 186 | "H", 187 | "Li", 188 | "Ge", 189 | "Cu", 190 | "Au", 191 | "Ni", 192 | "Cd", 193 | "In", 194 | "Mn", 195 | "Zr", 196 | "Cr", 197 | "Pt", 198 | "Hg", 199 | "Pb", 200 | "Unknown", 201 | ] 202 | 203 | def __init__(self, max_in_ring_size: int = 10, **kwargs): 204 | super().__init__(**kwargs) 205 | self.max_in_ring_size = max_in_ring_size 206 | self._hybridization_names = sorted(Chem.rdchem.HybridizationType.names) 207 | 208 | def _featurize_one( 209 | self, system: Union[LigandSystem, ProteinLigandComplex] 210 | ) -> Union[tuple, None]: 211 | """ 212 | Featurizes ligands contained in a System as a labeled graph. 213 | 214 | Parameters 215 | ---------- 216 | system: LigandSystem or ProteinLigandComplex 217 | The System being featurized. 
218 | 219 | Returns 220 | ------- 221 | tuple of np.array or None 222 | A two-tuple with: 223 | - Graph connectivity of the molecule with shape ``(2, n_edges)`` 224 | - Feature matrix with shape ``(n_atoms, n_features)`` 225 | """ 226 | try: # catch erroneous smiles not yet interpreted in case of lazy instantiation 227 | # rdkit_mol = system.ligand.molecule.to_rdkit() 228 | # this does not work, since openff toolkit will permit implicit hydrogens when 229 | # converting to rdkit (see https://github.com/openforcefield/openff-toolkit/pull/1001) 230 | smiles = system.ligand.molecule.to_smiles(explicit_hydrogens=False) 231 | rdkit_mol = Chem.MolFromSmiles(smiles) 232 | except SMILESParseError: 233 | return None 234 | 235 | connectivity_graph = self._connectivity_COO_format(rdkit_mol) 236 | per_atom_features = np.array([self._per_atom_features(a) for a in rdkit_mol.GetAtoms()]) 237 | return connectivity_graph, per_atom_features 238 | 239 | def _per_atom_features(self, atom) -> np.ndarray: 240 | """ 241 | Computes desired features for each atom in the molecular graph. 242 | 243 | Parameters 244 | ---------- 245 | atom: rdkit.Chem.Atom 246 | Atom to extract features from 247 | 248 | Returns 249 | ------- 250 | tuple of atomic features. 251 | atomic_symbol : array 252 | the one-hot encoded atomic symbol from `ALL_ATOMIC_SYMBOLS`. 253 | formal_charge : int 254 | the formal charge of atom. 255 | hybridization_type : array 256 | the one-hot encoded hybridization type from 257 | ``rdkit.Chem.rdchem.HybridizationType``. 258 | aromatic : bool 259 | if atom is aromatic. 260 | degree : array 261 | the one-hot encoded degree of the atom in the molecule. 262 | total_h : int 263 | the total number of hydrogens on the atom (implicit and explicit). 264 | implicit_h : int 265 | the number of implicit hydrogens on the atom. 266 | radical_electrons : int 267 | the number of radical electrons. 268 | 269 | Notes 270 | ----- 271 | The atomic features are the same as in PotentialNet [1]_. 
272 | 273 | .. [1] https://doi.org/10.1021/acscentsci.8b00507 274 | """ 275 | # Return flattened array; notice how the OHE'd matrices are flattened 276 | # and iterated with the * unpacking operator -- 277 | return np.array( 278 | [ 279 | # 1. Chemical element, one-hot encoded 280 | *BaseOneHotEncodingFeaturizer.one_hot_encode( 281 | [atom.GetSymbol()], self.ALL_ATOMIC_SYMBOLS 282 | ).flatten(), 283 | # 2. Formal charge 284 | atom.GetFormalCharge(), 285 | # 3. Hybridization, one-hot encoded 286 | *BaseOneHotEncodingFeaturizer.one_hot_encode( 287 | [atom.GetHybridization().name], 288 | self._hybridization_names, 289 | ).flatten(), 290 | # 4. Aromaticity 291 | atom.GetIsAromatic(), 292 | # 5. Total numbers of bonds, one-hot encoded 293 | *BaseOneHotEncodingFeaturizer.one_hot_encode( 294 | [atom.GetDegree()], list(range(11)) 295 | ).flatten(), 296 | # 6. Total number of hydrogens 297 | atom.GetTotalNumHs(), 298 | # 7. Number of implicit hydrogens 299 | atom.GetNumImplicitHs(), 300 | # 8. Number of radical electrons 301 | atom.GetNumRadicalElectrons(), 302 | ], 303 | dtype="float64", 304 | ) 305 | 306 | @staticmethod 307 | def _connectivity_COO_format(mol: Chem.Mol) -> np.ndarray: 308 | """ 309 | Returns the connectivity of the molecular graph in COO format. 310 | 311 | Parameters 312 | ---------- 313 | mol: rdkit.Chem.Mol 314 | RDKit molecule to extract bonds from 315 | 316 | Returns 317 | ------- 318 | np.ndarray 319 | graph connectivity in COO format with shape ``[2, num_edges]`` 320 | """ 321 | 322 | row, col = [], [] 323 | 324 | for bond in mol.GetBonds(): 325 | start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx() 326 | row += [start, end] 327 | col += [end, start] 328 | 329 | return np.array([row, col]) 330 | --------------------------------------------------------------------------------