├── tests
│   ├── __init__.py
│   ├── data
│   │   ├── stereo1.sdf.bz2
│   │   ├── stereo2.sdf.bz2
│   │   ├── caffeine_planar.sdf.bz2
│   │   ├── ritalin_nonplanar.sdf.bz2
│   │   ├── rand_sdf_files
│   │   │   ├── CHEMBL116226.sdf.bz2
│   │   │   ├── CHEMBL197946.sdf.bz2
│   │   │   ├── CHEMBL282186.sdf.bz2
│   │   │   ├── CHEMBL501745.sdf.bz2
│   │   │   └── CHEMBL2114064.sdf.bz2
│   │   └── ritalin_nonplanar.sdf
│   ├── test_config.py
│   ├── test_dependencies.py
│   ├── test_util.py
│   ├── test_conformer.py
│   ├── test_fingerprint.py
│   ├── test_metrics.py
│   └── test_struct.py
├── src
│   └── e3fp
│       ├── config
│       │   ├── __init__.py
│       │   ├── defaults.cfg
│       │   └── params.py
│       ├── conformer
│       │   ├── __init__.py
│       │   ├── protonation.py
│       │   ├── util.py
│       │   └── generator.py
│       ├── fingerprint
│       │   ├── __init__.py
│       │   ├── metrics
│       │   │   ├── __pycache__
│       │   │   │   ├── array_metrics._dense_soergel-225.py312.nbi
│       │   │   │   ├── array_metrics._sparse_soergel-246.py312.nbi
│       │   │   │   ├── array_metrics._dense_soergel-225.py312.1.nbc
│       │   │   │   └── array_metrics._sparse_soergel-246.py312.1.nbc
│       │   │   ├── fprint_metrics.py
│       │   │   ├── __init__.py
│       │   │   └── array_metrics.py
│       │   ├── util.py
│       │   ├── array_ops.py
│       │   └── structs.py
│       ├── __init__.py
│       ├── pipeline.py
│       └── util.py
├── doc
│   ├── source
│   │   ├── examples
│   │   │   └── data
│   │   │       ├── caffeine.smi
│   │   │       ├── new_params.cfg
│   │   │       └── test_smiles.smi
│   │   ├── api
│   │   │   ├── index.rst
│   │   │   ├── e3fp.util.rst
│   │   │   ├── e3fp.pipeline.rst
│   │   │   ├── e3fp.config.params.rst
│   │   │   ├── e3fp.conformer.util.rst
│   │   │   ├── e3fp.fingerprint.db.rst
│   │   │   ├── e3fp.fingerprint.util.rst
│   │   │   ├── e3fp.conformer.generate.rst
│   │   │   ├── e3fp.conformer.generator.rst
│   │   │   ├── e3fp.fingerprint.fprint.rst
│   │   │   ├── e3fp.fingerprint.structs.rst
│   │   │   ├── e3fp.fingerprint.fprinter.rst
│   │   │   ├── e3fp.fingerprint.generate.rst
│   │   │   ├── e3fp.conformer.protonation.rst
│   │   │   ├── e3fp.fingerprint.array_ops.rst
│   │   │   ├── e3fp.fingerprint.metrics.array_metrics.rst
│   │   │   ├── e3fp.fingerprint.metrics.fprint_metrics.rst
│   │   │   ├── e3fp.config.rst
│   │   │   ├── e3fp.conformer.rst
│   │   │   ├── e3fp.fingerprint.metrics.rst
│   │   │   ├── e3fp.rst
│   │   │   └── e3fp.fingerprint.rst
│   │   ├── _static
│   │   │   └── ritalin3d.png
│   │   ├── index.rst
│   │   ├── usage
│   │   │   ├── fingerprints
│   │   │   │   ├── index.rst
│   │   │   │   ├── comparison.rst
│   │   │   │   ├── storage.rst
│   │   │   │   └── fprints.rst
│   │   │   ├── index.rst
│   │   │   ├── config.rst
│   │   │   ├── pipeline.rst
│   │   │   └── cli.rst
│   │   ├── _templates
│   │   │   └── layout.html
│   │   ├── overview.rst
│   │   ├── substitutions.rst
│   │   ├── install.rst
│   │   ├── conf.py
│   │   └── dev
│   │       └── index.rst
│   └── Makefile
├── pytest.ini
├── .gitignore
├── .coveragerc
├── .readthedocs.yml
├── .github
│   └── workflows
│       ├── publish.yml
│       └── ci.yml
├── pyproject.toml
├── README.rst
└── LICENSE.txt
/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/e3fp/config/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/e3fp/conformer/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/e3fp/fingerprint/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/doc/source/examples/data/caffeine.smi:
--------------------------------------------------------------------------------
1 | CN1C=NC2=C1C(=O)N(C(=O)N2C)C caffeine
2 |
--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | addopts = --doctest-modules
3 | doctest_optionflags = ELLIPSIS
4 |
--------------------------------------------------------------------------------
/tests/data/stereo1.sdf.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/keiserlab/e3fp/HEAD/tests/data/stereo1.sdf.bz2
--------------------------------------------------------------------------------
/tests/data/stereo2.sdf.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/keiserlab/e3fp/HEAD/tests/data/stereo2.sdf.bz2
--------------------------------------------------------------------------------
/doc/source/api/index.rst:
--------------------------------------------------------------------------------
1 | e3fp API
2 | ========
3 |
4 | .. toctree::
5 | :maxdepth: 5
6 |
7 | e3fp
8 |
--------------------------------------------------------------------------------
/doc/source/_static/ritalin3d.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/keiserlab/e3fp/HEAD/doc/source/_static/ritalin3d.png
--------------------------------------------------------------------------------
/tests/data/caffeine_planar.sdf.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/keiserlab/e3fp/HEAD/tests/data/caffeine_planar.sdf.bz2
--------------------------------------------------------------------------------
/tests/data/ritalin_nonplanar.sdf.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/keiserlab/e3fp/HEAD/tests/data/ritalin_nonplanar.sdf.bz2
--------------------------------------------------------------------------------
/doc/source/examples/data/new_params.cfg:
--------------------------------------------------------------------------------
1 | [conformer_generation]
2 | first = 10
3 |
4 | [fingerprinting]
5 | bits = 4096
6 | first = 10
--------------------------------------------------------------------------------
/tests/data/rand_sdf_files/CHEMBL116226.sdf.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/keiserlab/e3fp/HEAD/tests/data/rand_sdf_files/CHEMBL116226.sdf.bz2
--------------------------------------------------------------------------------
/tests/data/rand_sdf_files/CHEMBL197946.sdf.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/keiserlab/e3fp/HEAD/tests/data/rand_sdf_files/CHEMBL197946.sdf.bz2
--------------------------------------------------------------------------------
/tests/data/rand_sdf_files/CHEMBL282186.sdf.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/keiserlab/e3fp/HEAD/tests/data/rand_sdf_files/CHEMBL282186.sdf.bz2
--------------------------------------------------------------------------------
/tests/data/rand_sdf_files/CHEMBL501745.sdf.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/keiserlab/e3fp/HEAD/tests/data/rand_sdf_files/CHEMBL501745.sdf.bz2
--------------------------------------------------------------------------------
/tests/data/rand_sdf_files/CHEMBL2114064.sdf.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/keiserlab/e3fp/HEAD/tests/data/rand_sdf_files/CHEMBL2114064.sdf.bz2
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | build/*
2 | dist/*
3 | doc/_build/*
4 | .cache/*
5 | .coverage
6 | .DS_Store
7 | *egg*
8 | *.pyc
9 | *.so
10 | *.o
11 | uv.lock
12 | docs
13 |
--------------------------------------------------------------------------------
/.coveragerc:
--------------------------------------------------------------------------------
1 | # .coveragerc to control coverage.py
2 | [run]
3 | branch = True
4 | source = e3fp
5 | omit =
6 | */e3fp/test/*
7 | */setup.py
8 | */doc/*
9 |
--------------------------------------------------------------------------------
/src/e3fp/__init__.py:
--------------------------------------------------------------------------------
1 | import importlib.metadata
2 | from .util import E3FPWarning, E3FPDeprecationWarning
3 |
4 | __version__ = importlib.metadata.version("e3fp")
5 |
--------------------------------------------------------------------------------
/doc/source/api/e3fp.util.rst:
--------------------------------------------------------------------------------
1 | e3fp\.util module
2 | =================
3 |
4 | .. automodule:: e3fp.util
5 | :members:
6 | :undoc-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/doc/source/api/e3fp.pipeline.rst:
--------------------------------------------------------------------------------
1 | e3fp\.pipeline module
2 | =====================
3 |
4 | .. automodule:: e3fp.pipeline
5 | :members:
6 | :undoc-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/doc/source/api/e3fp.config.params.rst:
--------------------------------------------------------------------------------
1 | e3fp\.config\.params module
2 | ===========================
3 |
4 | .. automodule:: e3fp.config.params
5 | :members:
6 | :undoc-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/doc/source/api/e3fp.conformer.util.rst:
--------------------------------------------------------------------------------
1 | e3fp\.conformer\.util module
2 | ============================
3 |
4 | .. automodule:: e3fp.conformer.util
5 | :members:
6 | :undoc-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/doc/source/api/e3fp.fingerprint.db.rst:
--------------------------------------------------------------------------------
1 | e3fp\.fingerprint\.db module
2 | ============================
3 |
4 | .. automodule:: e3fp.fingerprint.db
5 | :members:
6 | :undoc-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/src/e3fp/fingerprint/metrics/__pycache__/array_metrics._dense_soergel-225.py312.nbi:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/keiserlab/e3fp/HEAD/src/e3fp/fingerprint/metrics/__pycache__/array_metrics._dense_soergel-225.py312.nbi
--------------------------------------------------------------------------------
/src/e3fp/fingerprint/metrics/__pycache__/array_metrics._sparse_soergel-246.py312.nbi:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/keiserlab/e3fp/HEAD/src/e3fp/fingerprint/metrics/__pycache__/array_metrics._sparse_soergel-246.py312.nbi
--------------------------------------------------------------------------------
/src/e3fp/fingerprint/metrics/__pycache__/array_metrics._dense_soergel-225.py312.1.nbc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/keiserlab/e3fp/HEAD/src/e3fp/fingerprint/metrics/__pycache__/array_metrics._dense_soergel-225.py312.1.nbc
--------------------------------------------------------------------------------
/src/e3fp/fingerprint/metrics/__pycache__/array_metrics._sparse_soergel-246.py312.1.nbc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/keiserlab/e3fp/HEAD/src/e3fp/fingerprint/metrics/__pycache__/array_metrics._sparse_soergel-246.py312.1.nbc
--------------------------------------------------------------------------------
/doc/source/api/e3fp.fingerprint.util.rst:
--------------------------------------------------------------------------------
1 | e3fp\.fingerprint\.util module
2 | ==============================
3 |
4 | .. automodule:: e3fp.fingerprint.util
5 | :members:
6 | :undoc-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/doc/source/api/e3fp.conformer.generate.rst:
--------------------------------------------------------------------------------
1 | e3fp\.conformer\.generate module
2 | ================================
3 |
4 | .. automodule:: e3fp.conformer.generate
5 | :members:
6 | :undoc-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/doc/source/api/e3fp.conformer.generator.rst:
--------------------------------------------------------------------------------
1 | e3fp\.conformer\.generator module
2 | =================================
3 |
4 | .. automodule:: e3fp.conformer.generator
5 | :members:
6 | :undoc-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/doc/source/api/e3fp.fingerprint.fprint.rst:
--------------------------------------------------------------------------------
1 | e3fp\.fingerprint\.fprint module
2 | ================================
3 |
4 | .. automodule:: e3fp.fingerprint.fprint
5 | :members:
6 | :undoc-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/doc/source/api/e3fp.fingerprint.structs.rst:
--------------------------------------------------------------------------------
1 | e3fp\.fingerprint\.structs module
2 | =================================
3 |
4 | .. automodule:: e3fp.fingerprint.structs
5 | :members:
6 | :undoc-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/doc/source/api/e3fp.fingerprint.fprinter.rst:
--------------------------------------------------------------------------------
1 | e3fp\.fingerprint\.fprinter module
2 | ==================================
3 |
4 | .. automodule:: e3fp.fingerprint.fprinter
5 | :members:
6 | :undoc-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/doc/source/api/e3fp.fingerprint.generate.rst:
--------------------------------------------------------------------------------
1 | e3fp\.fingerprint\.generate module
2 | ==================================
3 |
4 | .. automodule:: e3fp.fingerprint.generate
5 | :members:
6 | :undoc-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/doc/source/api/e3fp.conformer.protonation.rst:
--------------------------------------------------------------------------------
1 | e3fp\.conformer\.protonation module
2 | ===================================
3 |
4 | .. automodule:: e3fp.conformer.protonation
5 | :members:
6 | :undoc-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/doc/source/api/e3fp.fingerprint.array_ops.rst:
--------------------------------------------------------------------------------
1 | e3fp\.fingerprint\.array\_ops module
2 | ====================================
3 |
4 | .. automodule:: e3fp.fingerprint.array_ops
5 | :members:
6 | :undoc-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/doc/source/api/e3fp.fingerprint.metrics.array_metrics.rst:
--------------------------------------------------------------------------------
1 | e3fp\.fingerprint\.metrics\.array\_metrics module
2 | =================================================
3 |
4 | .. automodule:: e3fp.fingerprint.metrics.array_metrics
5 | :members:
6 | :undoc-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/doc/source/api/e3fp.fingerprint.metrics.fprint_metrics.rst:
--------------------------------------------------------------------------------
1 | e3fp\.fingerprint\.metrics\.fprint\_metrics module
2 | ==================================================
3 |
4 | .. automodule:: e3fp.fingerprint.metrics.fprint_metrics
5 | :members:
6 | :undoc-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/doc/source/index.rst:
--------------------------------------------------------------------------------
1 | .. e3fp documentation master file
2 |
3 | e3fp
4 | ====
5 |
6 | :Release: |version|
7 | :Date: |today|
8 |
9 | Contents
10 | -----------------
11 |
12 | .. toctree::
13 | :maxdepth: 2
14 |
15 | overview
16 | install
17 | usage/index
18 | dev/index
19 | api/index
20 |
--------------------------------------------------------------------------------
/doc/source/api/e3fp.config.rst:
--------------------------------------------------------------------------------
1 | e3fp\.config package
2 | ====================
3 |
4 | Submodules
5 | ----------
6 |
7 | .. toctree::
8 |
9 | e3fp.config.params
10 |
11 | Module contents
12 | ---------------
13 |
14 | .. automodule:: e3fp.config
15 | :members:
16 | :undoc-members:
17 | :show-inheritance:
18 |
--------------------------------------------------------------------------------
/tests/test_config.py:
--------------------------------------------------------------------------------
1 | """Tests for loading config files.
2 |
3 | Author: Seth Axen
4 | E-mail: seth.axen@gmail.com
5 | """
6 | import os
7 |
8 |
9 | class TestConfig:
10 | def test_config_file_exists(self):
11 | from e3fp.config.params import DEF_PARAM_FILE
12 |
13 | assert os.path.isfile(DEF_PARAM_FILE)
14 |
--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 |
3 | build:
4 | os: "ubuntu-22.04"
5 | tools:
6 | python: "3.13"
7 | commands:
8 | - asdf plugin add uv
9 | - asdf install uv latest
10 | - asdf global uv latest
11 | - uv sync --extra docs
12 | - uv run -m sphinx -T -b html -d docs/_build/doctrees doc/source $READTHEDOCS_OUTPUT/html
13 |
--------------------------------------------------------------------------------
/doc/source/usage/fingerprints/index.rst:
--------------------------------------------------------------------------------
1 | Using Fingerprints
2 | ==================
3 |
4 | While molecular fingerprints are widely used, few packages provide simple
5 | interfaces for working with them and for connecting them to machine learning
6 | libraries. E3FP provides a number of general utility classes and methods for
7 | doing precisely this.
8 |
9 | .. toctree::
10 | fprints
11 | storage
12 | comparison
13 |
--------------------------------------------------------------------------------
/doc/source/api/e3fp.conformer.rst:
--------------------------------------------------------------------------------
1 | e3fp\.conformer package
2 | =======================
3 |
4 | Submodules
5 | ----------
6 |
7 | .. toctree::
8 |
9 | e3fp.conformer.generate
10 | e3fp.conformer.generator
11 | e3fp.conformer.protonation
12 | e3fp.conformer.util
13 |
14 | Module contents
15 | ---------------
16 |
17 | .. automodule:: e3fp.conformer
18 | :members:
19 | :undoc-members:
20 | :show-inheritance:
21 |
--------------------------------------------------------------------------------
/doc/source/api/e3fp.fingerprint.metrics.rst:
--------------------------------------------------------------------------------
1 | e3fp\.fingerprint\.metrics package
2 | ==================================
3 |
4 | Submodules
5 | ----------
6 |
7 | .. toctree::
8 |
9 | e3fp.fingerprint.metrics.array_metrics
10 | e3fp.fingerprint.metrics.fprint_metrics
11 |
12 | Module contents
13 | ---------------
14 |
15 | .. automodule:: e3fp.fingerprint.metrics
16 | :members:
17 | :undoc-members:
18 | :show-inheritance:
19 |
--------------------------------------------------------------------------------
/doc/source/api/e3fp.rst:
--------------------------------------------------------------------------------
1 | e3fp package
2 | ============
3 |
4 | Subpackages
5 | -----------
6 |
7 | .. toctree::
8 |
9 | e3fp.config
10 | e3fp.conformer
11 | e3fp.fingerprint
12 |
13 | Submodules
14 | ----------
15 |
16 | .. toctree::
17 |
18 | e3fp.pipeline
19 | e3fp.util
20 |
21 | Module contents
22 | ---------------
23 |
24 | .. automodule:: e3fp
25 | :members:
26 | :undoc-members:
27 | :show-inheritance:
28 |
--------------------------------------------------------------------------------
/doc/source/usage/index.rst:
--------------------------------------------------------------------------------
1 | Usage and Examples
2 | ==================
3 |
4 | To facilitate flexible use of the E3FP package, we provide multiple interfaces
5 | for performing the same tasks. We have organized these below in the order in
6 | which we expect them to be of most use to the average user.
7 |
8 | .. toctree::
9 | :caption: Sections
10 | :maxdepth: 2
11 |
12 | config
13 | cli
14 | pipeline
15 | fingerprints/index
16 |
--------------------------------------------------------------------------------
/doc/source/_templates/layout.html:
--------------------------------------------------------------------------------
1 | {% extends "!layout.html" %}
2 |
3 | {% block footer %}
4 | {{ super() }}
5 |
14 | {% endblock %}
--------------------------------------------------------------------------------
/doc/source/api/e3fp.fingerprint.rst:
--------------------------------------------------------------------------------
1 | e3fp\.fingerprint package
2 | =========================
3 |
4 | Subpackages
5 | -----------
6 |
7 | .. toctree::
8 |
9 | e3fp.fingerprint.metrics
10 |
11 | Submodules
12 | ----------
13 |
14 | .. toctree::
15 |
16 | e3fp.fingerprint.array_ops
17 | e3fp.fingerprint.db
18 | e3fp.fingerprint.fprint
19 | e3fp.fingerprint.fprinter
20 | e3fp.fingerprint.generate
21 | e3fp.fingerprint.structs
22 | e3fp.fingerprint.util
23 |
24 | Module contents
25 | ---------------
26 |
27 | .. automodule:: e3fp.fingerprint
28 | :members:
29 | :undoc-members:
30 | :show-inheritance:
31 |
--------------------------------------------------------------------------------
/src/e3fp/config/defaults.cfg:
--------------------------------------------------------------------------------
1 | [preprocessing]
2 | standardise = False
3 | protonate = False
4 |
5 | [conformer_generation]
6 | num_conf = -1
7 | first = -1
8 | pool_multiplier = 1
9 | rmsd_cutoff = 0.5
10 | max_energy_diff = None
11 | forcefield = uff
12 | out_dir = conformers
13 | compress = 2
14 | seed = -1
15 |
16 | ; Optimized parameters used in
17 | ; Axen et al. 2017
18 | [fingerprinting]
19 | bits = 1024
20 | level = 5
21 | first = 3
22 | radius_multiplier = 1.718
23 | stereo = True
24 | counts = False
25 | include_disconnected = True
26 | rdkit_invariants = False
27 | remove_duplicate_substructs = True
28 | exclude_floating = True
29 |
--------------------------------------------------------------------------------
/src/e3fp/fingerprint/util.py:
--------------------------------------------------------------------------------
1 | """Utility methods and class for fingerprinting-related functions.
2 |
3 | Author: Seth Axen
4 | E-mail: seth.axen@gmail.com
5 | """
6 | from ..util import E3FPError
7 |
8 |
9 | class E3FPInvalidFingerprintError(E3FPError, TypeError):
10 | """Fingerprint is incorrectly formatted."""
11 |
12 |
13 | class E3FPMolError(E3FPError, TypeError):
14 | """Mol is of incorrect type."""
15 |
16 |
17 | class E3FPBitsValueError(E3FPError, ValueError):
18 | """Bits value is invalid."""
19 |
20 |
21 | class E3FPCountsError(E3FPError, ValueError):
22 | """Index in counts is invalid."""
23 |
24 |
25 | class E3FPOptionError(E3FPError, ValueError):
26 | """Option provided is invalid."""
27 |
--------------------------------------------------------------------------------
/doc/source/examples/data/test_smiles.smi:
--------------------------------------------------------------------------------
1 | CCCC[C@H](CN(O)C=O)C(=O)[C@@H](NC(=O)c1ccccc1)C(C)C CHEMBL1643865
2 | CCCC[C@H](CN(O)C=O)C(=O)[C@@H](NC(=O)C(C)C)C(C)C CHEMBL1643866
3 | CCCC[C@H](CN(O)C=O)C(=O)[C@@H](NC(=O)c1ccccn1)C(C)C CHEMBL1643867
4 | CCCC[C@H](CN(O)C=O)C(=O)[C@@H](NC(=O)c1cccc(OC)c1)C(C)C CHEMBL1643868
5 | CCCC[C@H](CN(O)C=O)C(=O)[C@@H](NC(=O)c1cccc(F)c1)C(C)C CHEMBL1643869
6 | CN1CCN(C(=O)c2ccc3n2Cc2ccccc2N(C(=O)c2ccc(NC(=O)c4ccccc4-c4ccccc4)cc2Cl)C3)CC1 CHEMBL164387
7 | CCCC[C@H](CN(O)C=O)C(=O)[C@@H](NC(=O)c1cccnc1)C(C)C CHEMBL1643870
8 | CCCC[C@H](CN(O)C=O)C(=O)[C@@H](NC(=O)c1ccncc1)C(C)C CHEMBL1643871
9 | CCCC[C@H](CN(O)C=O)C(=O)[C@@H](NC(=O)c1ccc2ccccc2n1)C(C)C CHEMBL1643872
10 | CCCC[C@H](CN(O)C=O)C(=O)[C@@H](NC(=O)c1nccc2ccccc21)C(C)C CHEMBL1643873
11 |
--------------------------------------------------------------------------------
/doc/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = sphinx-build
7 | SPHINXPROJ = e3fp
8 | SOURCEDIR = source
9 | BUILDDIR = _build
10 |
11 | # Internal variables
12 | PAPEROPT_a4 = -D latex_paper_size=a4
13 | PAPEROPT_letter = -D latex_paper_size=letter
14 | ALLSPHINXOPTS = -d build/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source
15 |
16 | # Put it first so that "make" without argument is like "make help".
17 | help:
18 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
19 |
20 | .PHONY: help Makefile
21 |
22 | # Catch-all target: route all unknown targets to Sphinx using the new
23 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
24 | %: Makefile
25 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
26 |
--------------------------------------------------------------------------------
/tests/test_dependencies.py:
--------------------------------------------------------------------------------
1 | """Integration tests for dependencies.
2 |
3 | Author: Seth Axen
4 | E-mail: seth.axen@gmail.com
5 | """
6 |
7 |
8 | class TestRequiredDependencies:
9 | def test_rdkit(self):
10 | import rdkit
11 |
12 | def test_numpy(self):
13 | import numpy
14 |
15 | def test_scipy(self):
16 | import scipy
17 |
18 | def test_murmurhash(self):
19 | import mmh3
20 |
21 | def test_python_utilities(self):
22 | import python_utilities
23 |
24 |
25 | class TestOptionalFeatureDependencies:
26 | def test_h5py(self):
27 | import h5py
28 |
29 | def test_standardiser(self):
30 | import standardiser
31 |
32 |
33 | class TestOptionalParallelDependencies:
34 | def test_mpi4py(self):
35 | import mpi4py
36 |
37 | def test_concurrent(self):
38 | import concurrent.futures
39 |
40 | def test_python_utilities(self):
41 | import python_utilities.parallel
42 |
--------------------------------------------------------------------------------
/.github/workflows/publish.yml:
--------------------------------------------------------------------------------
1 | name: Publish
2 |
3 | on:
4 | release:
5 | types:
6 | - published
7 |
8 | jobs:
9 | build-test:
10 | runs-on: ubuntu-latest
11 | env:
12 | uv_version: "0.5.2"
13 | python_version: "3.13"
14 | steps:
15 | - uses: actions/checkout@v2
16 | - name: Setup MPI
17 | uses: mpi4py/setup-mpi@v1
18 | - name: Install uv
19 | uses: astral-sh/setup-uv@v3
20 | with:
21 | version: ${{ env.uv_version }}
22 | - name: Build the project
23 | run: uv build --no-sources --python ${{ env.python_version }}
24 | - name: Sync only the test dependencies
25 | run: uv sync --no-install-project --extra test
26 | - name: Install and test source distribution
27 | run: |
28 | uv pip install dist/*.tar.gz
29 | uv run --no-sync pytest
30 | uv pip uninstall e3fp
31 | - name: Install and test wheel
32 | run: |
33 | uv pip install dist/*.whl
34 | uv run --no-sync pytest
35 | - name: Publish to PyPI
36 | run: uv publish --token ${{ secrets.PYPI_API_TOKEN }}
37 |
--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | name: CI
2 | on:
3 | push:
4 | branches: [master]
5 | pull_request:
6 |
7 | env:
8 | # Setting RDMAV_FORK_SAFE=1 to avoid libfabric EFA provider issues with
9 | # fork() on Python 3.9 and Ubuntu.
10 | RDMAV_FORK_SAFE: 1
11 |
12 | jobs:
13 | test:
14 | name: Python ${{ matrix.python-version }} - ${{ matrix.os }}
15 | runs-on: ${{ matrix.os }}
16 | env:
17 | uv_version: "0.5.2"
18 | strategy:
19 | matrix:
20 | os: ["ubuntu-latest", "macos-latest"]
21 | python-version: ["3.9", "3.13"]
22 | fail-fast: false
23 | steps:
24 | - uses: actions/checkout@v2
25 | with:
26 | fetch-depth: 2
27 | - name: Setup MPI
28 | uses: mpi4py/setup-mpi@v1
29 | - name: Install uv
30 | uses: astral-sh/setup-uv@v3
31 | with:
32 | version: ${{ env.uv_version }}
33 | - name: Install the project
34 | run: uv sync --extra test --python ${{ matrix.python-version }}
35 | - name: Run tests
36 | run: uv run pytest --cov=e3fp --cov-report=xml
37 | - name: Upload coverage to Codecov
38 | uses: codecov/codecov-action@v5
39 | with:
40 | fail_ci_if_error: false
41 |
--------------------------------------------------------------------------------
/doc/source/overview.rst:
--------------------------------------------------------------------------------
1 | Overview of E3FP
2 | ================
3 |
4 | Introduction
5 | ------------
6 |
7 | The Extended 3-Dimensional FingerPrint (E3FP) [1]_ is a 3D molecular
8 | fingerprinting method inspired by Extended Connectivity FingerPrints (ECFP)
9 | [2]_, integrating tightly with the RDKit_. It is developed by the
10 | `Keiser Lab`_ at UCSF_ and maintained primarily by `Seth Axen`_.
11 |
12 | For a thorough description of E3FP, please consult the original paper [1]_ and
13 | `paper repository`_ or :ref:`usage/index:Usage and Examples`.
14 |
15 | Documentation is hosted by ReadTheDocs_.
16 |
17 | Contributing
18 | ------------
19 |
20 | Development occurs on GitHub_.
21 | Contributions, feature requests, and bug reports are greatly appreciated.
22 | Please consult the `issue tracker`_.
23 |
24 | License
25 | -------
26 | E3FP is released under the |license_long| (|license|).
27 |
28 | Briefly, this means E3FP can be used in any manner without modification,
29 | with proper attribution. However, if the source code is modified for an
30 | application, this modified source must also be released under |license| so that
31 | the community may benefit.
32 |
33 | Citing E3FP
34 | -----------
35 |
36 | To cite E3FP, please reference the original paper [1]_.
37 |
38 | .. rubric:: References
39 |
40 | .. [1] |axen2017|
41 | .. [2] |rogers2010|
42 |
43 | .. include:: substitutions.rst
44 | .. _GitHub: https://github.com/keiserlab/e3fp
45 |
--------------------------------------------------------------------------------
/tests/test_util.py:
--------------------------------------------------------------------------------
1 | """Tests for util methods.
2 |
3 | Author: Seth Axen
4 | E-mail: seth.axen@gmail.com
5 | """
6 | import pytest
7 | import warnings
8 |
9 |
10 | class TestUtil:
11 | def test_deprecated(self):
12 | from e3fp.util import deprecated, E3FPDeprecationWarning
13 |
14 | @deprecated("1.1", remove_version="1.3", msg="DEPRECATED!!!")
15 | def dep_method():
16 | pass
17 |
18 | with warnings.catch_warnings(record=True) as w:
19 | warnings.simplefilter("always")
20 | dep_method()
21 | assert len(w) == 1
22 | assert issubclass(w[-1].category, E3FPDeprecationWarning)
23 | message = str(w[-1].message)
24 | assert "deprecated in 1.1" in message
25 | assert "removed in 1.3" in message
26 | assert "DEPRECATED!!!" in str(w[-1].message)
27 |
28 | assert "\t.. deprecated:: 1.1\n\t DEPRECATED!!!" in dep_method.__doc__
29 |
30 | def test_efficiency_warning(self):
31 | from e3fp.util import E3FPEfficiencyWarning
32 |
33 | def test(warn=False):
34 | if warn:
35 | raise E3FPEfficiencyWarning("Inefficient!")
36 |
37 | with warnings.catch_warnings(record=True):
38 | warnings.simplefilter("error")
39 | test(warn=False)
40 |
41 | with pytest.raises(E3FPEfficiencyWarning):
42 | test(warn=True)
43 |
--------------------------------------------------------------------------------
/doc/source/usage/fingerprints/comparison.rst:
--------------------------------------------------------------------------------
1 | Fingerprint Comparison
2 | ======================
3 |
4 | The `e3fp.fingerprint.metrics` sub-package provides several useful methods for
5 | batch comparison of fingerprints in various representations.
6 |
7 | Fingerprint Metrics
8 | -------------------
9 |
10 | These metrics operate directly on pairs of :py:class:`.Fingerprint` and
11 | :py:class:`.FingerprintDatabase` objects or on a combination of each. If
12 | only a single variable is specified, self-comparison is performed. The
13 | implemented methods are common functions for fingerprint similarity in the
14 | literature.
15 |
16 | .. todo::
17 |
18 | Document examples
19 |
20 | Array Metrics
21 | -------------
22 |
23 | To efficiently compare the fingerprint databases described above, we provide
24 | comparison metrics that can operate directly on the internal sparse matrix
25 | representation without the need to "densify" it. We describe these here, as
26 | they have several additional features.
27 |
28 | The array metrics in `e3fp.fingerprint.metrics.array_metrics` are implemented
29 | such that they may take any combination of dense and sparse inputs.
30 | Additionally, they are designed to function as
31 | `scikit-learn-compatible kernels `_
32 | for machine learning tasks. For example, one might perform an analysis using a
33 | support vector machine (SVM) and Tanimoto kernel.
34 |
35 | .. code:: python
36 |
37 | >>> from sklearn.svm import SVC
38 | >>> from e3fp.fingerprint.metrics.array_metrics import tanimoto
39 | >>> clf = SVC(kernel=tanimoto)
40 | >>> clf.fit(X, y)
41 | ...
42 | >>> clf.predict(test)
43 | ...
44 |
45 | Most common fingerprint comparison metrics only apply to binary fingerprints.
46 | We include several that operate equally well on count- and float-based
47 | fingerprints. For example, to our knowledge, we provide the only open source
48 | implementation of Soergel similarity, the analog to the Tanimoto coefficient
49 | for non-binary fingerprints that can efficiently operate on sparse inputs.
50 |
51 | .. code:: python
52 |
53 | >>> from e3fp.fingerprint.metrics.array_metrics import soergel
54 | >>> clf = SVC(kernel=soergel)
55 | >>> clf.fit(X, y)
56 | ...
57 | >>> clf.predict(test)
58 | ...
59 |
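60 | The fingerprint-level metrics described above under "Fingerprint Metrics" can
61 | be called directly on :py:class:`.Fingerprint` and
62 | :py:class:`.FingerprintDatabase` objects. The following is a minimal sketch
63 | only; it assumes that `e3fp.fingerprint.metrics` exports a top-level
64 | ``tanimoto`` function that dispatches on these types, and that ``fprints`` and
65 | ``db`` hold previously generated fingerprints and a fingerprint database.
66 |
67 | .. code:: python
68 |
69 |     >>> from e3fp.fingerprint.metrics import tanimoto
70 |     >>> tanimoto(fprints[0], fprints[1])  # doctest: +SKIP
71 |     ...
72 |     >>> tanimoto(db)  # doctest: +SKIP
73 |     ...
74 |
75 | As noted above, passing a single argument performs a self-comparison.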
--------------------------------------------------------------------------------
/doc/source/substitutions.rst:
--------------------------------------------------------------------------------
1 | .. Common substitutions used throughout the documentation
2 |
3 | .. URLs
4 | .. _RDKit: http://www.rdkit.org
5 | .. _NumPy: https://www.numpy.org
6 | .. _SciPy: https://www.scipy.org
7 | .. _mmh3: https://pypi.python.org/pypi/mmh3
8 | .. _python_utilities: https://github.com/sdaxen/python_utilities
9 | .. _mpi4py: http://mpi4py.scipy.org
10 | .. _smart_open: https://github.com/RaRe-Technologies/smart_open
11 | .. _standardiser: https://wwwdev.ebi.ac.uk/chembl/extra/francis/standardiser
12 | .. _cxcalc: https://docs.chemaxon.com/display/CALCPLUGS/cxcalc+command+line+tool
13 | .. _h5py: http://www.h5py.org/
14 | .. _numba: https://numba.pydata.org/
15 | .. _Anaconda: https://anaconda.org/conda-forge/e3fp
16 | .. _uv: https://docs.astral.sh/uv/
17 | .. _repository: https://github.com/keiserlab/e3fp
18 | .. _paper repository: https://github.com/keiserlab/e3fp-paper
19 | .. _issue tracker: https://github.com/keiserlab/e3fp/issues
20 | .. _ReadTheDocs: http://e3fp.readthedocs.io
21 | .. _Keiser Lab: http://www.keiserlab.org
22 | .. _UCSF: https://www.ucsf.edu
23 | .. _Seth Axen: http://sethaxen.com
24 |
25 | .. Badges
26 | .. |bioRxiv| image:: https://img.shields.io/badge/bioRxiv-136705-blue.svg
27 | :target: https://doi.org/10.1101/136705
28 | :alt: Access the preprint on bioRxiv
29 |
30 | .. References
31 | .. |axen2017_doi| image:: https://img.shields.io/badge/doi-10.1021/acs.jmedchem.7b00696-blue.svg
32 | :target: http://dx.doi.org/10.1021/acs.jmedchem.7b00696
33 | :alt: Access the paper
34 | .. |axen2017| replace:: Axen SD, Huang XP, Caceres EL, Gendelev L, Roth BL, Keiser MJ. A Simple Representation Of Three-Dimensional Molecular Structure. *J. Med. Chem.* **60** (17): 7393–7409 (2017). |axen2017_doi| |bioRxiv|
35 | .. |rogers2010_doi| image:: https://img.shields.io/badge/doi-10.1021/ci100050t-blue.svg
36 | :target: http://dx.doi.org/10.1021/ci100050t
37 | :alt: Access the paper
38 | .. |rogers2010| replace:: Rogers D & Hahn M. Extended-connectivity fingerprints. *J. Chem. Inf. Model.* **50**: 742-54 (2010). |rogers2010_doi|
39 |
40 | .. Misc
41 | .. |license_link| replace:: https://github.com/keiserlab/e3fp/blob/master/LICENSE.txt
42 | .. |license_long| replace:: `GNU Lesser General Public License version 3.0`_
43 | .. _GNU Lesser General Public License version 3.0: https://github.com/keiserlab/e3fp/blob/master/LICENSE.txt
44 | .. |license| replace:: LGPLv3
45 |
--------------------------------------------------------------------------------
/doc/source/install.rst:
--------------------------------------------------------------------------------
1 | Setup and Installation
2 | ======================
3 |
4 | Dependencies
5 | ------------
6 |
7 | E3FP is compatible with Python 3.9 through 3.13. It additionally has the
8 | following dependencies:
9 |
10 | Required
11 | ~~~~~~~~
12 |
13 | - NumPy_
14 | - SciPy_
15 | - RDKit_
16 | - mmh3_
17 | - python_utilities_
18 | - smart_open_
19 |
20 | Optional
21 | ~~~~~~~~
22 |
23 | The following packages are required for the specified features:
24 |
25 | - parallelization:
26 |
27 | + mpi4py_
28 |
29 | - molecular standardisation:
30 |
31 | + standardiser_
32 |
33 | - protonation states:
34 |
35 | + cxcalc_
36 |
37 | - storing conformer energies:
38 |
39 | + h5py_
40 |
41 | - faster fingerprint metric calculations:
42 |
43 | + numba_
44 |
45 |
46 | Installation
47 | ------------
48 |
49 | The following installation approaches are listed in order of recommendation.
50 |
51 | Option 1: Install with Pip
52 | ~~~~~~~~~~~~~~~~~~~~~~~~~~
53 |
54 | Basic installation:
55 |
56 | .. code:: bash
57 |
58 | $ pip install e3fp
59 |
60 | With optional dependencies:
61 |
62 | .. code:: bash
63 |
64 | $ pip install e3fp[optional]
65 |
66 |
67 | Option 2: Install from conda-forge
68 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
69 |
70 | E3FP is available on conda-forge.
71 |
72 | .. code:: bash
73 |
74 | $ conda create -n e3fp_env -c conda-forge e3fp
75 | $ conda activate e3fp_env
76 |
77 | To install optional dependencies:
78 |
79 | .. code:: bash
80 |
81 | $ conda install -c conda-forge mpi4py h5py standardiser
82 |
83 | Option 3: Install from source
84 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
85 |
86 | 1. Clone the repository:
87 |
88 | .. code:: bash
89 |
90 | $ git clone https://github.com/keiserlab/e3fp.git
91 | $ cd e3fp
92 |
93 | 2. Install for development in an already-activated environment.
94 |
95 | You can do this using pip:
96 |
97 | .. code:: bash
98 |
99 | $ pip install -e .[dev]
100 |
101 | Or use uv_ to set up a development environment:
102 |
103 | .. code:: bash
104 |
105 | $ uv sync --extra dev
106 |
107 | Testing
108 | -------
109 |
110 | Run the test suite with pytest from the repository root:
111 |
112 | .. code:: bash
113 |
114 | $ pip install pytest # if not already installed
115 | $ pytest
116 |
117 |
118 | .. include:: substitutions.rst
119 |
--------------------------------------------------------------------------------
/doc/source/usage/config.rst:
--------------------------------------------------------------------------------
1 | Configuration
2 | =============
3 |
4 | E3FP configurational parameters are stored in the widely used INI_ file
5 | format. These may be passed to :ref:`usage/cli:Command Line Interface` programs
6 | or parsed to Python dicts for :ref:`usage/pipeline:Pipeline Methods` or other
7 | lower-level functions.
8 |
9 | Loading Default Parameters
10 | --------------------------
11 |
12 | The below example shows all default parameters, accessed via the
13 | :py:mod:`e3fp.config` module.
14 |
15 | .. literalinclude:: ../../../src/e3fp/config/defaults.cfg
16 | :caption: `defaults.cfg `_
17 |
18 | :py:mod:`configparser` is used internally to parse and store these
19 | config parameters.
20 |
21 | >>> from e3fp.config.params import default_params
22 | >>> default_params
23 |
24 | >>> print(default_params.sections())
25 | ['preprocessing', 'conformer_generation', 'fingerprinting']
26 | >>> default_params.items('fingerprinting')
27 | [('bits', '1024'), ('level', '5'), ('first', '3'), ('radius_multiplier', '1.718'), ('stereo', 'True'), ('counts', 'False'), ('include_disconnected', 'True'), ('rdkit_invariants', 'False'), ('remove_duplicate_substructs', 'True'), ('exclude_floating', 'True')]
28 |
29 | Parsing User-Provided Parameters
30 | --------------------------------
31 |
32 | A user may provide a custom config file.
33 |
34 | .. literalinclude:: ../examples/data/new_params.cfg
35 | :caption: new_params.cfg
36 |
37 | .. doctest::
38 |
39 | >>> from e3fp.config.params import read_params
40 | >>> config = read_params("source/examples/data/new_params.cfg")
41 | >>> config.items('fingerprinting')
42 | [('bits', '4096'), ('first', '10')]
43 |
44 | When these parameters are passed to downstream methods, default values are
45 | used for any options not specified here.
46 |
47 | Converting Parameters to Argument Dicts
48 | ---------------------------------------
49 |
50 | To pass the parameters to Python methods for fingerprinting and conformer
51 | generation, we need to convert them to Python dicts.
52 |
53 | >>> from e3fp.pipeline import params_to_dicts
54 | >>> confgen_params, fprint_params = params_to_dicts(config)
55 | >>> fprint_params
56 | {'bits': 4096, 'first': 10}
57 |
58 | .. _INI: https://en.wikipedia.org/wiki/INI_file
59 |
60 |
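61 | These dicts can be passed directly to the pipeline functions. A minimal sketch,
62 | using a placeholder SMILES string and name (see
63 | :ref:`usage/pipeline:Pipeline Methods` for a complete walkthrough):
64 |
65 | >>> from e3fp.pipeline import fprints_from_smiles
66 | >>> fprints = fprints_from_smiles("OCC", "ethanol", confgen_params=confgen_params, fprint_params=fprint_params)  # doctest: +SKIP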
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["flit_core >=3.2,<4"]
3 | build-backend = "flit_core.buildapi"
4 |
5 | [project]
6 | name = "e3fp"
7 | version = "1.2.7"
8 | requires-python = ">=3.9, <3.14"
9 | description = "Molecular 3D fingerprinting"
10 | readme = "README.rst"
11 | authors = [
12 | {name = "Seth Axen", email = "seth.axen@gmail.com"},
13 | ]
14 | license = {file = "LICENSE.txt"}
15 | keywords = ["e3fp", "3d", "molecule", "fingerprint", "conformer"]
16 | classifiers = [
17 | "Programming Language :: Python",
18 | "Programming Language :: Python :: 3.9",
19 | "Programming Language :: Python :: 3.10",
20 | "Programming Language :: Python :: 3.11",
21 | "Programming Language :: Python :: 3.12",
22 | "Programming Language :: Python :: 3.13",
23 | "License :: OSI Approved :: GNU Lesser General Public License v3 (LGPLv3)",
24 | "Operating System :: OS Independent",
25 | "Development Status :: 4 - Beta",
26 | "Intended Audience :: Science/Research",
27 | "Intended Audience :: Developers",
28 | "Topic :: Scientific/Engineering :: Chemistry",
29 | "Topic :: Software Development :: Libraries :: Python Modules",
30 | ]
31 | dependencies = [
32 | "mmh3>=2.3.1",
33 | "numpy>=1.11.3",
34 | "rdkit>=2016.03.4",
35 | "scipy>=0.18.0",
36 | "sdaxen_python_utilities>=0.1.5",
37 | "smart_open>=1.8.3",
38 | ]
39 |
40 | [project.optional-dependencies]
41 | optional = [
42 | "h5py",
43 | "mpi4py",
44 | "numba",
45 | "six", # needed by standardiser, but not listed as a dependency
46 | "standardiser",
47 | ]
48 | test = [
49 | "mock",
50 | "pytest",
51 | "pytest-cov",
52 | "e3fp[optional]",
53 | ]
54 | docs = [
55 | "sphinx",
56 | "sphinxcontrib-programoutput",
57 | "sphinx-rtd-theme",
58 | ]
59 | dev = [
60 | "e3fp[docs]",
61 | "e3fp[test]",
62 | ]
63 |
64 | [project.urls]
65 | Homepage = "https://github.com/keiserlab/e3fp"
66 | Download = "https://github.com/keiserlab/e3fp/tarball/{version}"
67 |
68 | [project.scripts]
69 | e3fp-fingerprint = "e3fp.fingerprint.generate:main"
70 | e3fp-conformer = "e3fp.conformer.generate:main"
71 |
72 | [tool.pytest.ini_options]
73 | addopts = "-ra -q"
74 | testpaths = ["tests"]
75 |
76 | # https://github.com/astral-sh/uv/issues/6281
77 | [tool.uv]
78 | constraint-dependencies = ["numba>=0.60.0"]
79 | # Resolve dependencies separately for each Python version
80 | environments = [
81 | "python_version>='3.13'",
82 | "python_version=='3.12'",
83 | "python_version=='3.11'",
84 | "python_version=='3.10'",
85 | "python_version=='3.9'",
86 | ]
87 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | E3FP: Extended 3-Dimensional FingerPrint
2 | ========================================
3 |
4 | |Docs Status| |CI Status| |Codecov Status| |PyPi Version| |Conda Version| |License|
5 |
6 | E3FP [1]_ is a 3D molecular fingerprinting method inspired by Extended
7 | Connectivity FingerPrints (ECFP) [2]_, integrating tightly with the RDKit_.
8 |
9 | Documentation is hosted by ReadTheDocs_, and development occurs on GitHub_.
10 |
11 | Installation and Usage
12 | ----------------------
13 |
14 | For installation and usage instructions, see the
15 | `documentation `__.
16 |
17 | See the E3FP `paper repository`_ for an application of E3FP and all code used
18 | for the E3FP paper [1]_.
19 |
20 | License
21 | -------
22 |
23 | E3FP is available under the `GNU Lesser General Public License version 3.0
24 | `_ (LGPLv3). See the
25 | `documentation `__
26 | for more details.
27 |
28 |
29 | References
30 | ----------
31 |
32 | .. [1] |axen2017|
33 | .. [2] |rogers2010|
34 |
35 | .. substitutions
36 |
37 | .. _RDKit: http://www.rdkit.org
38 | .. _GitHub: https://github.com/keiserlab/e3fp
39 | .. _paper repository: https://github.com/keiserlab/e3fp-paper
40 | .. _ReadTheDocs: http://e3fp.readthedocs.io
41 | .. |axen2017_doi| image:: https://img.shields.io/badge/doi-10.1021/acs.jmedchem.7b00696-blue.svg
42 | :target: http://dx.doi.org/10.1021/acs.jmedchem.7b00696
43 | :alt: Access the paper
44 | .. |axen2017| replace:: Axen SD, Huang XP, Caceres EL, Gendelev L, Roth BL, Keiser MJ. A Simple Representation Of Three-Dimensional Molecular Structure. *J. Med. Chem.* **60** (17): 7393–7409 (2017). |axen2017_doi| |bioRxiv|
45 | .. |rogers2010_doi| image:: https://img.shields.io/badge/doi-10.1021/ci100050t-blue.svg
46 | :target: http://dx.doi.org/10.1021/ci100050t
47 | :alt: Access the paper
48 | .. |rogers2010| replace:: Rogers D & Hahn M. Extended-connectivity fingerprints. *J. Chem. Inf. Model.* **50**: 742-54 (2010). |rogers2010_doi|
49 | .. |CI Status| image:: https://github.com/keiserlab/e3fp/actions/workflows/ci.yml/badge.svg
50 | :target: https://github.com/keiserlab/e3fp/actions?query=workflow%3ACI
51 | :alt: CI Status
52 | .. |Docs Status| image:: http://readthedocs.org/projects/e3fp/badge/?version=latest
53 | :target: http://e3fp.readthedocs.io/en/latest/?badge=latest
54 | :alt: Documentation Status
55 | .. |Codecov Status| image:: https://codecov.io/github/keiserlab/e3fp/coverage.svg?branch=master
56 | :target: https://codecov.io/github/keiserlab/e3fp?branch=master
57 | :alt: Code Coverage
58 | .. |PyPi Version| image:: https://img.shields.io/pypi/v/e3fp.svg
59 | :target: https://pypi.python.org/pypi/e3fp
60 | :alt: Package on PyPi
61 | .. |Conda Version| image:: https://img.shields.io/conda/v/conda-forge/e3fp.svg
62 | :target: https://anaconda.org/conda-forge/e3fp
63 | :alt: Package on Anaconda
64 | .. |License| image:: https://img.shields.io/badge/license-LGPLv3-blue.svg
65 | :target: https://github.com/keiserlab/e3fp/blob/master/LICENSE.txt
66 | .. |bioRxiv| image:: https://img.shields.io/badge/bioRxiv-136705-blue.svg
67 | :target: https://doi.org/10.1101/136705
68 | :alt: Access the preprint on bioRxiv
69 |
--------------------------------------------------------------------------------
/src/e3fp/pipeline.py:
--------------------------------------------------------------------------------
1 | """Functions for various pipeline use cases.
2 |
3 | Author: Seth Axen
4 | E-mail: seth.axen@gmail.com
5 | """
6 | from .config.params import params_to_sections_dict
7 | from .conformer.util import mol_from_smiles, mol_from_sdf, mol_to_sdf
8 | from .conformer.generate import generate_conformers
9 | from .fingerprint.generate import fprints_dict_from_mol
10 |
11 |
12 | def params_to_dicts(params):
13 | """Get params dicts for pipeline functions from INI format params file."""
14 | sections_dict = params_to_sections_dict(params, auto=True)
15 |
16 | # preproc_params will eventually be returned separately, when there's a
17 | # pipeline function for protonation
18 | preproc_params = sections_dict.get("preprocessing", {})
19 | confgen_params = sections_dict.get("conformer_generation", {})
20 | confgen_params.update(preproc_params)
21 | fprint_params = sections_dict.get("fingerprinting", {})
22 | return confgen_params, fprint_params
23 |
24 |
25 | def confs_from_smiles(smiles, name, confgen_params={}, save=False):
26 | """Generate conformations of molecule from SMILES string."""
27 | mol = mol_from_smiles(smiles, name)
28 | confgen_result = generate_conformers(
29 | mol, name, save=save, **confgen_params
30 | )
31 | mol = confgen_result[0]
32 | return mol
33 |
34 |
35 | def sdf_from_smiles(
36 | smiles, name, confgen_params={}, out_file=None, out_ext=".sdf.bz2"
37 | ):
38 | """Generate conformations from SMILES string and save to SDF file."""
39 | mol = confs_from_smiles(
40 | smiles, name, confgen_params=confgen_params, save=False
41 | )
42 | if out_file is None:
43 | out_file = name + out_ext
44 | mol_to_sdf(mol, out_file)
45 |
46 |
47 | def fprints_from_fprints_dict(fprints_dict, level=-1):
48 | """Get fingerprint at `level` from dict of level to fingerprint."""
49 | fprints_list = fprints_dict.get(
50 | level, fprints_dict[max(fprints_dict.keys())]
51 | )
52 | return fprints_list
53 |
54 |
55 | def fprints_from_mol(mol, fprint_params={}, save=False):
56 | """Generate fingerprints for all `first` conformers in mol."""
57 | fprints_dict = fprints_dict_from_mol(mol, save=save, **fprint_params)
58 | level = fprint_params.get("level", -1)
59 | fprints_list = fprints_from_fprints_dict(fprints_dict, level=level)
60 | return fprints_list
61 |
62 |
63 | def fprints_from_smiles(
64 | smiles, name, confgen_params={}, fprint_params={}, save=False
65 | ):
66 | """Generate conformers and fingerprints from a SMILES string."""
67 | if save is False and "first" not in confgen_params:
68 | confgen_params["first"] = fprint_params.get("first", -1)
69 | mol = confs_from_smiles(
70 | smiles, name, confgen_params=confgen_params, save=save
71 | )
72 | fprints_list = fprints_from_mol(
73 | mol, fprint_params=fprint_params, save=save
74 | )
75 | return fprints_list
76 |
77 |
78 | def fprints_from_sdf(sdf_file, fprint_params={}, save=False):
79 | """Generate fingerprints from conformers in an SDF file."""
80 | mol = mol_from_sdf(sdf_file)
81 | fprints_list = fprints_from_mol(
82 | mol, fprint_params=fprint_params, save=save
83 | )
84 | return fprints_list
85 |
--------------------------------------------------------------------------------
/tests/test_conformer.py:
--------------------------------------------------------------------------------
1 | """Tests for conformer generation.
2 |
3 | Author: Seth Axen
4 | E-mail: seth.axen@gmail.com
5 | """
6 |
7 | import os
8 |
9 | DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
10 | SDF_FILE_COMPRESSED = os.path.join(DATA_DIR, "ritalin_nonplanar.sdf.bz2")
11 | SDF_FILE_UNCOMPRESSED = os.path.join(DATA_DIR, "ritalin_nonplanar.sdf")
12 |
13 | class TestConformer:
14 | def test_standardisation(self):
15 | import rdkit.Chem
16 | from e3fp.conformer.util import (
17 | mol_from_smiles,
18 | mol_to_standardised_mol,
19 | )
20 |
21 | smiles = "C[N-]c1cccc[n+]1C"
22 | mol = mol_from_smiles(smiles, "tmp")
23 | assert rdkit.Chem.MolToSmiles(mol) == smiles
24 |
25 | mol = mol_to_standardised_mol(mol)
26 | assert rdkit.Chem.MolToSmiles(mol) == "CN=c1ccccn1C"
27 |
28 | def test_default_is_unseeded(self):
29 | import rdkit.Chem
30 | from rdkit.Chem import AllChem
31 | from e3fp.conformer.util import (
32 | mol_from_smiles,
33 | mol_to_standardised_mol,
34 | )
35 | from e3fp.conformer.generate import generate_conformers
36 |
37 | ntrials = 10
38 | confgen_params = {"num_conf": 1}
39 | smiles = "C" * 20 # long flexible molecule
40 | mol = mol_from_smiles(smiles, "tmp")
41 | mols = [
42 | generate_conformers(mol, **confgen_params)[0]
43 | for i in range(ntrials)
44 | ]
45 |
46 | fail = True
47 | for i in range(ntrials):
48 | for j in range(i + 1, ntrials):
49 | rms = AllChem.GetBestRMS(mols[i], mols[j])
50 | if rms > 1e-2:
51 | fail = False
52 | break
53 | assert not fail
54 |
55 | def test_seed_produces_same_conformers(self):
56 | import rdkit.Chem
57 | from rdkit.Chem import AllChem
58 | from e3fp.conformer.util import (
59 | mol_from_smiles,
60 | mol_to_standardised_mol,
61 | )
62 | from e3fp.conformer.generate import generate_conformers
63 |
64 | ntrials = 10
65 | confgen_params = {"num_conf": 1, "seed": 42}
66 | smiles = "C" * 20 # long flexible molecule
67 | mol = mol_from_smiles(smiles, "tmp")
68 | mols = [
69 | generate_conformers(mol, **confgen_params)[0]
70 | for i in range(ntrials)
71 | ]
72 |
73 | fail = False
74 | for i in range(ntrials):
75 | for j in range(i + 1, ntrials):
76 | rms = AllChem.GetBestRMS(mols[i], mols[j])
77 | if rms > 1e-2:
78 | fail = True
79 | break
80 | assert not fail
81 |
82 | def test_compressed_sdf_reads_same_as_uncompressed(self):
83 | from rdkit import Chem
84 | from e3fp.conformer.util import mol_from_sdf
85 |
86 | sdf_files = [SDF_FILE_COMPRESSED, SDF_FILE_UNCOMPRESSED]
87 | smiles = [Chem.MolToSmiles(mol_from_sdf(f)) for f in sdf_files]
88 | assert smiles[0] == smiles[1]
89 |
90 | def test_conformer_generation_without_name(self):
91 | from e3fp.conformer.util import mol_from_smiles
92 | from e3fp.conformer.generate import generate_conformers
93 |
94 | confgen_params = {"num_conf": 1, "seed": 42}
95 | smiles = "C" * 20 # long flexible molecule
96 | mol = mol_from_smiles(smiles, "tmp")
97 | mol.ClearProp("_Name")
98 | assert not mol.HasProp("_Name")
99 | generate_conformers(mol, **confgen_params)
100 |
--------------------------------------------------------------------------------
/doc/source/usage/pipeline.rst:
--------------------------------------------------------------------------------
1 | Pipeline Methods
2 | ================
3 |
4 | E3FP can be easily plugged into an existing pipeline using the methods in the
5 | `e3fp.pipeline` module. Each of these methods wraps functionality in other
6 | modules for generating various outputs from inputs and specified options.
7 |
8 | .. note::
9 |
10 | As fingerprinting many molecules is embarrassingly parallel, we highly
11 | recommend employing a parallelization strategy. We use our own
12 | python_utilities_ package.
13 |
14 | First we must choose configuration options. See :ref:`usage/config:Configuration` for
15 | detailed instructions. Here we will use defaults for all but a few options.
16 |
17 | .. testsetup:: *
18 |
19 | smiles_file = "source/examples/data/test_smiles.smi"
20 |
21 | .. doctest::
22 |
23 | >>> fprint_params = {'bits': 4096, 'radius_multiplier': 1.5, 'rdkit_invariants': True}
24 | >>> confgen_params = {'max_energy_diff': 20.0, 'first': 3}
25 | >>> smiles = "COC(=O)C(C1CCCCN1)C2=CC=CC=C2"
26 |
27 | Generating Conformers from SMILES
28 | ---------------------------------
29 |
30 | The following code snippet generates a multi-conformer molecule:
31 |
32 | >>> from e3fp.pipeline import confs_from_smiles
33 | >>> mol = confs_from_smiles(smiles, "ritalin", confgen_params=confgen_params)
34 | >>> mol.GetNumConformers()
35 | 3
36 |
37 | This produces the following conformers:
38 |
39 | .. image:: ../_static/ritalin3d.png
40 | :width: 300px
41 | :height: 300px
42 | :alt: ritalin conformers
43 |
44 | Generating Fingerprints from Conformers
45 | ---------------------------------------
46 |
47 | >>> from e3fp.pipeline import fprints_from_mol
48 | >>> fprints = fprints_from_mol(mol, fprint_params=fprint_params)
49 | >>> len(fprints)
50 | 3
51 | >>> fprints[0]
52 | Fingerprint(indices=array([188, 224, ..., 3775, 4053]), level=5, bits=4096, name=ritalin_0)
53 | >>> fprints[1]
54 | Fingerprint(indices=array([125, 188, ..., 3693, 4053]), level=5, bits=4096, name=ritalin_1)
55 | >>> fprints[2]
56 | Fingerprint(indices=array([188, 206, ..., 3743, 4053]), level=5, bits=4096, name=ritalin_2)
57 |
58 | Generating Fingerprints from SMILES
59 | -----------------------------------
60 |
61 | >>> from e3fp.pipeline import fprints_from_smiles
62 | >>> fprints = fprints_from_smiles(smiles, "ritalin", confgen_params=confgen_params, fprint_params=fprint_params)
63 | >>> fprints[0]
64 | Fingerprint(indices=array([188, 224, ..., 3775, 4053]), level=5, bits=4096, name=ritalin_0)
65 |
66 | Parallel Fingerprinting
67 | -----------------------
68 |
69 | The following script demonstrates use of python_utilities_ for fingerprinting
70 | all SMILES in a file in parallel. This is essentially the same as the
71 | :ref:`usage/cli:Command Line Interface`, albeit with a less convenient interface.
72 |
73 | >>> from e3fp.pipeline import fprints_from_smiles
74 | >>> from python_utilities.parallel import Parallelizer
75 | >>> from e3fp.conformer.util import smiles_to_dict
76 | >>> smiles_dict = smiles_to_dict(smiles_file)
77 | >>> print(smiles_dict)
78 | {'CHEMBL1643866': 'CCCC[C@H](CN(O)C=O)C(=O)[C@@H](NC(=O)C(C)C)C(C)C', ...}
79 | >>> len(smiles_dict)
80 | 10
81 | >>> smiles_iter = ((smiles, name) for name, smiles in smiles_dict.items())
82 | >>> kwargs = {"confgen_params": confgen_params, "fprint_params": fprint_params}
83 | >>> parallelizer = Parallelizer(parallel_mode="processes")
84 | >>> fprints_list = parallelizer.run(fprints_from_smiles, smiles_iter, kwargs=kwargs) # doctest: +SKIP
85 | >>> len(fprints_list) # doctest: +SKIP
86 | 10
87 |
88 | For all pipeline methods, please see the `e3fp.pipeline` module API.
89 |
90 | .. include:: ../substitutions.rst
91 |
--------------------------------------------------------------------------------
/tests/data/ritalin_nonplanar.sdf:
--------------------------------------------------------------------------------
1 | ZINC00896711
2 | -OEChem-11081520323D
3 |
4 | 37 38 0 1 0 0 0 0 0999 V2000
5 | -0.0173 1.4248 0.0099 C 0 0 0 0 0 0 0 0 0 0 0 0
6 | 0.0021 -0.0041 0.0020 O 0 0 0 0 0 0 0 0 0 0 0 0
7 | -1.1855 -0.6297 0.0100 C 0 0 0 0 0 0 0 0 0 0 0 0
8 | -2.2076 0.0145 0.0232 O 0 0 0 0 0 0 0 0 0 0 0 0
9 | -1.2439 -2.1355 0.0025 C 0 0 2 0 0 0 0 0 0 0 0 0
10 | -0.7531 -2.5137 -0.8943 H 0 0 0 0 0 0 0 0 0 0 0 0
11 | -2.6831 -2.5824 0.0138 C 0 0 0 0 0 0 0 0 0 0 0 0
12 | -3.5122 -2.2166 1.0577 C 0 0 0 0 0 0 0 0 0 0 0 0
13 | -4.8323 -2.6265 1.0681 C 0 0 0 0 0 0 0 0 0 0 0 0
14 | -5.3235 -3.4019 0.0344 C 0 0 0 0 0 0 0 0 0 0 0 0
15 | -4.4946 -3.7670 -1.0099 C 0 0 0 0 0 0 0 0 0 0 0 0
16 | -3.1756 -3.3535 -1.0225 C 0 0 0 0 0 0 0 0 0 0 0 0
17 | -0.5311 -2.6798 1.2421 C 0 0 1 0 0 0 0 0 0 0 0 0
18 | -1.0223 -2.3014 2.1385 H 0 0 0 0 0 0 0 0 0 0 0 0
19 | -0.5921 -4.2087 1.2346 C 0 0 0 0 0 0 0 0 0 0 0 0
20 | 0.1222 -4.7487 2.4770 C 0 0 0 0 0 0 0 0 0 0 0 0
21 | 1.5613 -4.2254 2.4942 C 0 0 0 0 0 0 0 0 0 0 0 0
22 | 1.5425 -2.6958 2.4549 C 0 0 0 0 0 0 0 0 0 0 0 0
23 | 0.8702 -2.2430 1.2312 N 0 3 0 0 0 0 0 0 0 0 0 0
24 | 1.0053 1.8021 0.0021 H 0 0 0 0 0 0 0 0 0 0 0 0
25 | -0.5445 1.7859 -0.8732 H 0 0 0 0 0 0 0 0 0 0 0 0
26 | -0.5275 1.7763 0.9067 H 0 0 0 0 0 0 0 0 0 0 0 0
27 | -3.1285 -1.6108 1.8652 H 0 0 0 0 0 0 0 0 0 0 0 0
28 | -5.4799 -2.3413 1.8840 H 0 0 0 0 0 0 0 0 0 0 0 0
29 | -6.3547 -3.7228 0.0429 H 0 0 0 0 0 0 0 0 0 0 0 0
30 | -4.8782 -4.3731 -1.8173 H 0 0 0 0 0 0 0 0 0 0 0 0
31 | -2.5290 -3.6356 -1.8402 H 0 0 0 0 0 0 0 0 0 0 0 0
32 | -1.6332 -4.5315 1.2442 H 0 0 0 0 0 0 0 0 0 0 0 0
33 | -0.1010 -4.5882 0.3386 H 0 0 0 0 0 0 0 0 0 0 0 0
34 | -0.3992 -4.4111 3.3727 H 0 0 0 0 0 0 0 0 0 0 0 0
35 | 0.1309 -5.8383 2.4477 H 0 0 0 0 0 0 0 0 0 0 0 0
36 | 2.0594 -4.5600 3.4041 H 0 0 0 0 0 0 0 0 0 0 0 0
37 | 2.0971 -4.6052 1.6243 H 0 0 0 0 0 0 0 0 0 0 0 0
38 | 1.0067 -2.3170 3.3253 H 0 0 0 0 0 0 0 0 0 0 0 0
39 | 2.5655 -2.3199 2.4673 H 0 0 0 0 0 0 0 0 0 0 0 0
40 | 1.3372 -2.6344 0.4270 H 0 0 0 0 0 0 0 0 0 0 0 0
41 | 0.9071 -1.2358 1.1819 H 0 0 0 0 0 0 0 0 0 0 0 0
42 | 1 2 1 0 0 0 0
43 | 1 20 1 0 0 0 0
44 | 1 21 1 0 0 0 0
45 | 1 22 1 0 0 0 0
46 | 2 3 1 0 0 0 0
47 | 3 4 2 0 0 0 0
48 | 3 5 1 0 0 0 0
49 | 5 6 1 0 0 0 0
50 | 5 7 1 0 0 0 0
51 | 5 13 1 0 0 0 0
52 | 7 12 2 0 0 0 0
53 | 7 8 1 0 0 0 0
54 | 8 9 2 0 0 0 0
55 | 8 23 1 0 0 0 0
56 | 9 10 1 0 0 0 0
57 | 9 24 1 0 0 0 0
58 | 10 11 2 0 0 0 0
59 | 10 25 1 0 0 0 0
60 | 11 12 1 0 0 0 0
61 | 11 26 1 0 0 0 0
62 | 12 27 1 0 0 0 0
63 | 13 14 1 0 0 0 0
64 | 13 19 1 0 0 0 0
65 | 13 15 1 0 0 0 0
66 | 15 16 1 0 0 0 0
67 | 15 28 1 0 0 0 0
68 | 15 29 1 0 0 0 0
69 | 16 17 1 0 0 0 0
70 | 16 30 1 0 0 0 0
71 | 16 31 1 0 0 0 0
72 | 17 18 1 0 0 0 0
73 | 17 32 1 0 0 0 0
74 | 17 33 1 0 0 0 0
75 | 18 19 1 0 0 0 0
76 | 18 34 1 0 0 0 0
77 | 18 35 1 0 0 0 0
78 | 19 36 1 0 0 0 0
79 | 19 37 1 0 0 0 0
80 | M CHG 1 19 1
81 | M END
82 | $$$$
83 |
--------------------------------------------------------------------------------
/doc/source/usage/cli.rst:
--------------------------------------------------------------------------------
1 | Command Line Interface
2 | ======================
3 |
4 | Command line interfaces (CLIs) are provided for the two most common tasks:
5 | conformer generation and fingerprinting.
6 | When e3fp is installed, the CLI commands are available as ``e3fp-conformer`` and
7 | ``e3fp-fingerprint``.
8 |
9 | Conformer Generation CLI
10 | ------------------------
11 |
12 | To see all available options, run
13 |
14 | .. command-output:: e3fp-conformer --help
15 | :shell:
16 |
17 | We will generate conformers for the molecule whose SMILES string is defined in
18 | ``caffeine.smi``.
19 |
20 | .. literalinclude:: ../examples/data/caffeine.smi
21 | :caption: caffeine.smi
22 |
23 | The below example generates at most 3 conformers for this molecule.
24 |
25 | .. code-block:: shell-session
26 |
27 | $ e3fp-conformer -s caffeine.smi --num_conf 3 -o ./
28 | 2017-07-17 00:11:05,743|WARNING|Only 1 processes available. 'mpi' mode not available.
29 | 2017-07-17 00:11:05,748|INFO|num_proc is not specified. 'processes' mode will use all 8 processes
30 | 2017-07-17 00:11:05,748|INFO|Parallelizer initialized with mode 'processes' and 8 processors.
31 | 2017-07-17 00:11:05,748|INFO|Input type: Detected SMILES file(s)
32 | 2017-07-17 00:11:05,748|INFO|Input file number: 1
33 | 2017-07-17 00:11:05,748|INFO|Parallel Type: processes
34 | 2017-07-17 00:11:05,748|INFO|Out Directory: ./
35 | 2017-07-17 00:11:05,749|INFO|Overwrite Existing Files: False
36 | 2017-07-17 00:11:05,749|INFO|Target Conformer Number: 3
37 | 2017-07-17 00:11:05,749|INFO|First Conformers Number: all
38 | 2017-07-17 00:11:05,749|INFO|Pool Multiplier: 1
39 | 2017-07-17 00:11:05,749|INFO|RMSD Cutoff: 0.5
40 | 2017-07-17 00:11:05,749|INFO|Maximum Energy Difference: None
41 | 2017-07-17 00:11:05,749|INFO|Forcefield: UFF
42 | 2017-07-17 00:11:05,749|INFO|Starting.
43 | 2017-07-17 00:11:05,779|INFO|Generating conformers for caffeine.
44 | 2017-07-17 00:11:05,823|INFO|Generated 1 conformers for caffeine.
45 | 2017-07-17 00:11:05,829|INFO|Saved conformers for caffeine to ./caffeine.sdf.bz2.
46 |
47 | The result is a multi-conformer SDF file called ``caffeine.sdf.bz2`` in the
48 | current directory.
49 |
50 | Fingerprinting CLI
51 | ------------------
52 |
53 | To see all available options, run
54 |
55 | .. command-output:: e3fp-fingerprint --help
56 | :shell:
57 |
58 | To continue the above example, we will fingerprint our caffeine conformers.
59 |
60 | .. code-block:: shell-session
61 |
62 | $ e3fp-fingerprint caffeine.sdf.bz2 --bits 1024
63 | 2017-07-17 00:12:33,797|WARNING|Only 1 processes available. 'mpi' mode not available.
64 | 2017-07-17 00:12:33,801|INFO|num_proc is not specified. 'processes' mode will use all 8 processes
65 | 2017-07-17 00:12:33,801|INFO|Parallelizer initialized with mode 'processes' and 8 processors.
66 | 2017-07-17 00:12:33,801|INFO|Initializing E3FP generation.
67 | 2017-07-17 00:12:33,801|INFO|Getting SDF files
68 | 2017-07-17 00:12:33,801|INFO|SDF File Number: 1
69 | 2017-07-17 00:12:33,802|INFO|Database File: fingerprints.fpz
70 | 2017-07-17 00:12:33,802|INFO|Max First Conformers: 3
71 | 2017-07-17 00:12:33,802|INFO|Bits: 1024
72 | 2017-07-17 00:12:33,802|INFO|Level/Max Iterations: 5
73 | 2017-07-17 00:12:33,802|INFO|Shell Radius Multiplier: 1.718
74 | 2017-07-17 00:12:33,802|INFO|Stereo Mode: True
75 | 2017-07-17 00:12:33,802|INFO|Connected-only mode: on
76 | 2017-07-17 00:12:33,802|INFO|Invariant type: Daylight
77 | 2017-07-17 00:12:33,802|INFO|Parallel Mode: processes
78 | 2017-07-17 00:12:33,802|INFO|Starting
79 | 2017-07-17 00:12:33,829|INFO|Generating fingerprints for caffeine.
80 | 2017-07-17 00:12:33,935|INFO|Generated 1 fingerprints for caffeine.
81 | 2017-07-17 00:12:34,011|INFO|Saved FingerprintDatabase with fingerprints to fingerprints.fpz
82 |
83 | The result is a file ``fingerprints.fpz`` containing a
84 | :py:class:`.FingerprintDatabase`. To use such a database, consult
85 | :ref:`usage/fingerprints/storage:Fingerprint Storage`.
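86 |
87 | As a quick sketch (assuming a ``load`` classmethod on
88 | :py:class:`.FingerprintDatabase`; see
89 | :ref:`usage/fingerprints/storage:Fingerprint Storage` for details), the file
90 | can be read back into Python with:
91 |
92 | .. code-block:: python
93 |
94 |     from e3fp.fingerprint.db import FingerprintDatabase
95 |
96 |     db = FingerprintDatabase.load("fingerprints.fpz")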
86 |
--------------------------------------------------------------------------------
/src/e3fp/util.py:
--------------------------------------------------------------------------------
1 | """Utility classes/methods.
2 |
3 | Author: Seth Axen
4 | E-mail: seth.axen@gmail.com
5 | """
6 | import inspect
7 | import warnings
8 |
9 |
10 | class E3FPError(Exception):
11 | """Base class for E3FP-specific errors.
12 |
13 | This class is provided for future E3FP-specific functionality.
14 | """
15 |
16 |
17 | class E3FPWarning(Warning):
18 | """Base E3FP warning class.
19 |
20 | Unlike normal warnings, these are by default always set to on.
21 | """
22 |
23 |
24 | # Always show custom warnings for this package
25 | warnings.filterwarnings("always", category=E3FPWarning)
26 |
27 |
28 | class E3FPDeprecationWarning(E3FPWarning, DeprecationWarning):
29 | """A warning class for a deprecated method or class."""
30 |
31 |
32 | class E3FPEfficiencyWarning(E3FPWarning, RuntimeWarning):
33 | """A warning class for a potentially inefficient process."""
34 |
35 |
36 | def maybe_jit(*args, **kwargs):
37 | """Decorator to jit a function using Numba if available.
38 |
39 | Usage is identical to `numba.jit`.
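40 |
41 |     Examples
42 |     --------
43 |     A sketch of typical usage; ``nopython`` is a standard `numba.jit`
44 |     keyword, and the decorated function runs unchanged if Numba is absent.
45 |
46 |     >>> @maybe_jit(nopython=True)
47 |     ... def total(arr):
48 |     ...     return arr.sum()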
40 | """
41 | def wrapper(func):
42 | try:
43 | import numba
44 | has_numba = True
45 | except ImportError:
46 | has_numba = False
47 |
48 | if has_numba:
49 | return numba.jit(*args, **kwargs)(func)
50 | else:
51 | return func
52 | return wrapper
53 |
54 |
55 | class deprecated(object):
56 | """Decorator to mark a function as deprecated.
57 |
58 | Issue a deprecation warning when a function is called, and update the
59 | documentation. A deprecation version must be provided.
60 |
61 | Examples
62 | --------
63 | >>> from e3fp.util import deprecated
64 | >>> @deprecated("1.1", remove_version="1.3",
65 | ... msg="Function no longer needed")
66 | ... def my_function():
67 | ... pass
68 |
69 | Notes
70 | -----
71 | Adapted from https://wiki.python.org/moin/PythonDecoratorLibrary
72 | """
73 |
74 | def __init__(self, deprecated_version, remove_version=None, msg=None):
75 | """Constructor.
76 |
77 | Parameters
78 | ----------
79 | deprecated_version : str
80 | Version in which object was deprecated (e.g. '1.1')
81 | remove_version : str, optional
82 | Version in which object will be removed (e.g. '1.2'). If not
83 | specified, it is assumed the object will be removed in the next
84 | release (e.g. '1.2' if `deprecated_version` is '1.1')
85 | msg : str, optional
86 | Message to include with deprecation warning, to explain deprecation
87 | or point to newer version.
88 | """
89 | self.deprecated_version = deprecated_version
90 | if remove_version is None:
91 | version_info = deprecated_version.split(".")
92 | version_info[1] = str(int(version_info[1]) + 1)
93 | for i in range(2, len(version_info)):
94 | version_info[i] = "0"
95 | remove_version = ".".join(version_info)
96 | self.remove_version = remove_version
97 | if msg is None:
98 | self.extra = ""
99 | else:
100 | self.extra = " {0}".format(msg)
101 |
102 | def __call__(self, obj):
103 | if inspect.isfunction(obj):
104 | return self.deprecate_function(obj)
105 | else:
106 | raise ValueError("Deprecated object is not a function.")
107 |
108 | def deprecate_function(self, f):
109 | """Return the decorated function."""
110 | msg = (
111 | "Function `{0}` was deprecated in {1} and will be removed "
112 | "in {2}.{3}"
113 | ).format(
114 | f.__name__,
115 | self.deprecated_version,
116 | self.remove_version,
117 | self.extra,
118 | )
119 |
120 | def new_func(*args, **kwargs):
121 | warnings.warn(msg, category=E3FPDeprecationWarning, stacklevel=2)
122 | return f(*args, **kwargs)
123 |
124 | new_func.__name__ = f.__name__
125 | new_func.__dict__ = f.__dict__
126 | new_func.__doc__ = f.__doc__
127 | self.update_docstring(new_func)
128 | return new_func
129 |
130 | def update_docstring(self, obj):
131 | """Add deprecation note to docstring."""
132 | # print(obj.__doc__)
133 | msg = (
134 | f"\t.. deprecated:: {self.deprecated_version}\n"
135 | f"\t {self.extra}"
136 | )
137 | obj.__doc__ = f"{obj.__doc__}\n\n{msg}"
138 | return obj
139 |
--------------------------------------------------------------------------------
/doc/source/usage/fingerprints/storage.rst:
--------------------------------------------------------------------------------
1 | Fingerprint Storage
2 | ===================
3 |
4 | The most efficient way to store and interact with fingerprints is through the
5 | :py:class:`.FingerprintDatabase` class. This class wraps a matrix with
6 | sparse rows (:py:class:`scipy.sparse.csr_matrix`), where each row is a
7 | fingerprint. This enables rapid I/O of the database while also minimizing the
8 | memory footprint. Accessing the underlying sparse representation with the
9 | :py:attr:`.FingerprintDatabase.array` attribute is convenient for machine learning
10 | purposes, while the database class itself provides several useful functions.
11 |
12 | .. note::
13 |
14 | We strongly recommend upgrading to at least SciPy v1.0.0 when working with
15 | large fingerprint databases, as old versions are much slower and have
16 | several bugs for database loading.
17 |
18 |
19 | Database I/O and Indexing
20 | -------------------------
21 |
22 | See the full :py:class:`.FingerprintDatabase` documentation for a
23 | description of basic database usage, attributes, and methods. Below, several
24 | additional use cases are documented.
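25 |
26 | For example, a database can be written to and restored from an ``fpz`` file.
27 | The snippet below is a minimal sketch; it assumes ``save`` and ``load``
28 | methods on :py:class:`.FingerprintDatabase` (see the class API) and uses the
29 | hypothetical file name ``example.fpz``.
30 |
31 | >>> db.save("example.fpz")  # doctest: +SKIP
32 | >>> db2 = FingerprintDatabase.load("example.fpz")  # doctest: +SKIP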
25 |
26 | Batch Database Operations
27 | -------------------------
28 |
29 | Due to the sparse representation of the underlying data structure, an
30 | unfolded database (one whose fingerprints have not been folded) does not use
31 | significantly more disk space than a database with folded fingerprints. However,
32 | it is usually necessary to fold fingerprints for machine learning tasks. The
33 | :py:class:`.FingerprintDatabase` does this very quickly.
34 |
35 | .. testsetup::
36 |
37 | import numpy as np
38 | np.random.seed(3)
39 |
40 | .. doctest::
41 |
42 | >>> from e3fp.fingerprint.db import FingerprintDatabase
43 | >>> from e3fp.fingerprint.fprint import Fingerprint
44 | >>> import numpy as np
45 | >>> db = FingerprintDatabase(fp_type=Fingerprint, name="TestDB")
46 | >>> print(db)
47 | FingerprintDatabase[name: TestDB, fp_type: Fingerprint, level: -1, bits: None, fp_num: 0]
48 | >>> on_inds = [np.random.uniform(0, 2**32, size=30) for i in range(5)]
49 | >>> fps = [Fingerprint(x, bits=2**32) for x in on_inds]
50 | >>> db.add_fingerprints(fps)
51 | >>> print(db)
52 | FingerprintDatabase[name: TestDB, fp_type: Fingerprint, level: -1, bits: 4294967296, fp_num: 5]
53 | >>> db.get_density()
54 | 6.984919309616089e-09
55 | >>> fold_db = db.fold(1024)
56 | >>> print(fold_db)
57 | FingerprintDatabase[name: TestDB, fp_type: Fingerprint, level: -1, bits: 1024, fp_num: 5]
58 | >>> fold_db.get_density()
59 | 0.0287109375
60 |
61 | A database can be converted to a different fingerprint type:
62 |
63 | >>> from e3fp.fingerprint.fprint import CountFingerprint
64 | >>> count_db = db.as_type(CountFingerprint)
65 | >>> print(count_db)
66 | FingerprintDatabase[name: TestDB, fp_type: CountFingerprint, level: -1, bits: 4294967296, fp_num: 5]
67 | >>> count_db[0]
68 | CountFingerprint(counts={2977004690: 1, ..., 3041471738: 1}, level=-1, bits=4294967296, name=None)
69 |
70 | The :py:func:`e3fp.fingerprint.db.concat` method allows efficient joining of multiple
71 | databases.
72 |
73 | >>> from e3fp.fingerprint.db import concat
74 | >>> dbs = []
75 | >>> for i in range(10):
76 | ... db = FingerprintDatabase(fp_type=Fingerprint)
77 | ... on_inds = [np.random.uniform(0, 1024, size=30) for j in range(5)]
78 | ... fps = [Fingerprint(x, bits=2**32, name="Mol{}".format(i)) for x in on_inds]
79 | ... db.add_fingerprints(fps)
80 | ... dbs.append(db)
81 | >>> dbs[0][0]
82 | Fingerprint(indices=array([94, 97, ..., 988, 994]), level=-1, bits=4294967296, name=Mol0)
83 | >>> print(dbs[0])
84 | FingerprintDatabase[name: None, fp_type: Fingerprint, level: -1, bits: 4294967296, fp_num: 5]
85 | >>> merge_db = concat(dbs)
86 | >>> print(merge_db)
87 | FingerprintDatabase[name: None, fp_type: Fingerprint, level: -1, bits: 4294967296, fp_num: 50]
88 |
89 | Database Comparison
90 | -------------------
91 |
92 | Two databases may be compared using various metrics in
93 | :py:mod:`e3fp.fingerprint.metrics`. Additionally, all fingerprints in a database
94 | may be compared to each other simply by only providing a single database.
95 | See :ref:`usage/fingerprints/comparison:Fingerprint Comparison` for more details.
96 |
97 | Performing Machine Learning on the Database
98 | -------------------------------------------
99 |
100 | The underlying sparse matrix may be passed directly to machine learning tools
101 | in any package that is compatible with SciPy sparse matrices, such as
102 | `scikit-learn <https://scikit-learn.org>`_. In the example below, ``ypred`` and
103 | ``db2`` are placeholders for training labels and a second fingerprint database.
103 |
104 | >>> from sklearn.naive_bayes import BernoulliNB
105 | >>> clf = BernoulliNB()
106 | >>> clf.fit(db.array, ypred) # doctest: +SKIP
107 | BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
108 | >>> clf.predict(db2.array) # doctest: +SKIP
109 | ...
110 |
--------------------------------------------------------------------------------
/src/e3fp/fingerprint/metrics/fprint_metrics.py:
--------------------------------------------------------------------------------
1 | """Fingerprint comparison metrics.
2 |
3 | Author: Seth Axen
4 | E-mail: seth.axen@gmail.com
5 | """
6 | from __future__ import division
7 |
8 | import numpy as np
9 | from ..fprint import CountFingerprint, diff_counts_dict
10 |
11 |
12 | def tanimoto(fp1, fp2):
13 | """Calculate Tanimoto coefficient between fingerprints.
14 |
15 | Parameters
16 | ----------
17 | fp1 : Fingerprint
18 | Fingerprint 1
19 | fp2 : Fingerprint
20 | Fingerprint 2
21 |
22 | Returns
23 | -------
24 | float : Tanimoto coefficient.
25 | """
26 | try:
27 | intersect = np.intersect1d(
28 | fp1.indices, fp2.indices, assume_unique=True
29 | ).shape[0]
30 | return intersect / (fp1.bit_count + fp2.bit_count - intersect)
31 | except ZeroDivisionError:
32 | return 0.0
33 |
34 |
35 | def soergel(fp1, fp2):
36 | """Calculate Soergel similarity between fingerprints.
37 |
38 | Soergel similarity is the complement of Soergel distance and can be
39 | thought of as the analog of the Tanimoto coefficient for count/float-based
40 | fingerprints. For `Fingerprint`, it is equivalent to the Tanimoto
41 | coefficient.
42 |
43 | Parameters
44 | ----------
45 | fp1 : Fingerprint
46 | Fingerprint 1
47 | fp2 : Fingerprint
48 | Fingerprint 2
49 |
50 | Returns
51 | -------
52 | float : Soergel similarity.
53 |
54 |     See Also
55 |     --------
56 |     tanimoto
57 |     """
58 | if not (
59 | isinstance(fp1, CountFingerprint) and isinstance(fp2, CountFingerprint)
60 | ):
61 | return tanimoto(fp1, fp2)
62 |
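63 |     # Soergel similarity from per-bit counts:
64 |     #     1 - sum_i |c1_i - c2_i| / sum_i max(c1_i, c2_i)
65 |     # where c1_i and c2_i are the counts for bit i in fp1 and fp2.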
63 | counts_diff = diff_counts_dict(fp1, fp2)
64 | temp = np.asarray(
65 | [
66 | (abs(counts_diff[x]), max(fp1.get_count(x), fp2.get_count(x)))
67 | for x in counts_diff.keys()
68 | ],
69 | dtype=float,
70 | ).T
71 | soergel = 1 - np.sum(temp[0, :]) / np.sum(temp[1, :])
72 |
73 | return soergel
74 |
75 |
76 | def dice(fp1, fp2):
77 | """Calculate Dice coefficient between fingerprints.
78 |
79 | Parameters
80 | ----------
81 | fp1 : Fingerprint
82 | Fingerprint 1
83 | fp2 : Fingerprint
84 | Fingerprint 2
85 |
86 | Returns
87 | -------
88 | float : Dice coefficient.
89 | """
90 | try:
91 | intersect = np.intersect1d(
92 | fp1.indices, fp2.indices, assume_unique=True
93 | ).shape[0]
94 | return 2 * intersect / (fp1.bit_count + fp2.bit_count)
95 | except ZeroDivisionError:
96 | return 0.0
97 |
98 |
99 | def cosine(fp1, fp2):
100 | """Calculate cosine similarity between fingerprints.
101 |
102 | Parameters
103 | ----------
104 | fp1 : Fingerprint
105 | Fingerprint 1
106 | fp2 : Fingerprint
107 | Fingerprint 2
108 |
109 | Returns
110 | -------
111 | float : Cosine similarity.
112 | """
113 | try:
114 | dot = sum(v * fp2.get_count(k) for k, v in fp1.counts.items())
115 | root_norm = (
116 | sum(v ** 2 for v in fp1.counts.values())
117 | * sum(v ** 2 for v in fp2.counts.values())
118 | ) ** 0.5
119 | return dot / root_norm
120 | except ZeroDivisionError:
121 | return 0.0
122 |
123 |
124 | def pearson(fp1, fp2):
125 | """Calculate Pearson correlation between fingerprints.
126 |
127 | Parameters
128 | ----------
129 | fp1 : Fingerprint
130 | Fingerprint 1
131 | fp2 : Fingerprint
132 | Fingerprint 2
133 |
134 | Returns
135 | -------
136 | float : Pearson correlation.
137 | """
138 | try:
139 | dot = sum(v * fp2.get_count(k) for k, v in fp1.counts.items())
140 | return (dot / fp1.bits - fp1.mean() * fp2.mean()) / (
141 | fp1.std() * fp2.std()
142 | )
143 | except ZeroDivisionError:
144 | return 0.0
145 |
146 | # intersect = np.intersect1d(fp1.indices, fp2.indices,
147 | # assume_unique=True).shape[0]
148 | # return ((intersect / fp1.bits) -
149 | # ((fp1.mean() * fp2.mean()) / (fp1.std() * fp2.std())))
150 |
151 |
152 | def hamming(fp1, fp2):
153 | """Calculate Hamming distance between fingerprints.
154 |
155 | Parameters
156 | ----------
157 | fp1 : Fingerprint
158 | Fingerprint 1
159 | fp2 : Fingerprint
160 | Fingerprint 2
161 |
162 | Returns
163 | -------
164 | float : Hamming distance.
165 | """
166 | intersect = np.intersect1d(
167 | fp1.indices, fp2.indices, assume_unique=True
168 | ).shape[0]
169 | return fp1.bit_count + fp2.bit_count - 2 * intersect
170 |
171 |
172 | def distance(fp1, fp2):
173 | """Calculate Euclidean distance between fingerprints.
174 |
175 | Parameters
176 | ----------
177 | fp1 : Fingerprint
178 | Fingerprint 1
179 | fp2 : Fingerprint
180 | Fingerprint 2
181 |
182 | Returns
183 | -------
184 |     float : Euclidean distance.
185 | """
186 | return hamming(fp1, fp2) ** 0.5
187 |
--------------------------------------------------------------------------------
/tests/test_fingerprint.py:
--------------------------------------------------------------------------------
1 | """Tests for E3FP fingerprints.
2 |
3 | Author: Seth Axen
4 | E-mail: seth.axen@gmail.com
5 | """
6 | import pytest
7 |
8 | class TestFingerprintIO:
9 | def test_fprint_from_indices(self):
10 | from e3fp.fingerprint.fprint import (
11 | Fingerprint,
12 | CountFingerprint,
13 | FloatFingerprint,
14 | )
15 |
16 | for fp_type in (Fingerprint, CountFingerprint, FloatFingerprint):
17 | in_indices = [3, 1, 4, 5]
18 | bits = 32
19 | fprint = fp_type.from_indices(in_indices, bits=bits)
20 | assert sorted(in_indices) == sorted(fprint.indices)
21 |
22 | def test_fprint_from_fprint(self):
23 | from e3fp.fingerprint.fprint import (
24 | Fingerprint,
25 | CountFingerprint,
26 | FloatFingerprint,
27 | )
28 |
29 | for fp_type in (Fingerprint, CountFingerprint, FloatFingerprint):
30 | in_indices = [3, 1, 4, 5, 1, 5, 9]
31 | bits = 32
32 | fprint1 = fp_type.from_indices(in_indices, bits=bits)
33 | fprint2 = fp_type.from_fingerprint(fprint1)
34 | assert fprint1 == fprint2
35 |
36 | def test_countfprint_from_counts(self):
37 | from e3fp.fingerprint.fprint import CountFingerprint
38 |
39 | in_counts = {3: 1, 1: 4, 5: 1}
40 | bits = 32
41 | fprint = CountFingerprint.from_counts(in_counts, bits=bits)
42 | out_counts = fprint.counts
43 | assert in_counts == out_counts
44 |
45 | def test_floatfprint_from_counts(self):
46 | from e3fp.fingerprint.fprint import FloatFingerprint
47 |
48 | in_counts = {3: 1.0, 1: 4.0, 5: 1.0}
49 | bits = 32
50 | fprint = FloatFingerprint.from_counts(in_counts, bits=bits)
51 | out_counts = fprint.counts
52 | assert in_counts == out_counts
53 |
54 | def test_unique_indices(self):
55 | from e3fp.fingerprint.fprint import (
56 | Fingerprint,
57 | CountFingerprint,
58 | FloatFingerprint,
59 | )
60 |
61 | for fp_type in (Fingerprint, CountFingerprint, FloatFingerprint):
62 | in_indices = [3, 1, 4, 5, 1, 5, 9]
63 | bits = 32
64 | fprint = fp_type.from_indices(in_indices, bits=bits)
65 | assert sorted(set(in_indices)) == sorted(fprint.indices)
66 |
67 | def test_bitstring_io(self):
68 | from e3fp.fingerprint.fprint import (
69 | Fingerprint,
70 | CountFingerprint,
71 | FloatFingerprint,
72 | )
73 |
74 | for fp_type in (Fingerprint, CountFingerprint, FloatFingerprint):
75 | in_bitstring = "1001001111011000"
76 | fprint = fp_type.from_bitstring(in_bitstring)
77 | out_bitstring = fprint.to_bitstring()
78 | assert in_bitstring == out_bitstring
79 |
80 | def test_vector_io(self):
81 | from e3fp.fingerprint.fprint import (
82 | Fingerprint,
83 | CountFingerprint,
84 | FloatFingerprint,
85 | )
86 | import numpy as np
87 |
88 | for fp_type in (Fingerprint, CountFingerprint, FloatFingerprint):
89 | in_vector = np.array([0, 0, 1, 0, 1, 0, 1, 0, 0], dtype=np.bool_)
90 | fprint = fp_type.from_vector(in_vector)
91 | out_vector = fprint.to_vector(sparse=False)
92 | np.testing.assert_array_equal(in_vector, out_vector)
93 |
94 | def test_rdkit_io(self):
95 | from e3fp.fingerprint.fprint import (
96 | Fingerprint,
97 | CountFingerprint,
98 | FloatFingerprint,
99 | )
100 |
101 | for fp_type in (Fingerprint, CountFingerprint, FloatFingerprint):
102 | indices = [3, 1, 4, 5]
103 | bits = 32
104 | fprint1 = fp_type.from_indices(indices, bits=bits)
105 | rdkit_fprint1 = fprint1.to_rdkit()
106 | fprint2 = fp_type.from_rdkit(rdkit_fprint1)
107 | rdkit_fprint2 = fprint2.to_rdkit()
108 | assert rdkit_fprint1 == rdkit_fprint2
109 |
110 | def test_basic_properties(self):
111 | from e3fp.fingerprint.fprint import (
112 | Fingerprint,
113 | CountFingerprint,
114 | FloatFingerprint,
115 | )
116 | import numpy as np
117 |
118 | bits = 1024
119 | for i in range(10):
120 | indices = np.random.randint(0, bits, 30)
121 | unique_inds = np.unique(indices)
122 | level = int(np.random.randint(0, 10))
123 | for fp_type in (Fingerprint, CountFingerprint, FloatFingerprint):
124 | fp = fp_type.from_indices(indices, bits=bits, level=level)
125 | assert fp.bits == bits
126 | assert len(fp) == bits
127 | assert fp.bit_count == unique_inds.size
128 | assert fp.density == pytest.approx(float(unique_inds.size) / bits)
129 |
130 |
131 | class TestFingerprintAlgebra:
132 | pass
133 |
134 |
135 | class TestFingerprintComparison:
136 | pass
137 |
--------------------------------------------------------------------------------
/src/e3fp/fingerprint/metrics/__init__.py:
--------------------------------------------------------------------------------
1 | """Efficient comparison metrics for fingerprints and their databases.
2 |
3 | Author: Seth Axen
4 | E-mail: seth.axen@gmail.com
5 | """
6 | import logging
7 |
8 | from ..fprint import Fingerprint
9 | from ..util import E3FPBitsValueError
10 | from ..db import FingerprintDatabase
11 | from . import array_metrics
12 | from . import fprint_metrics
13 |
14 |
15 | def tanimoto(A, B=None):
16 | """Compute Tanimoto coefficients between fingerprints.
17 |
18 | Fingerprints must have same number of bits. If not bit-fingerprints,
19 | arrays will be cast to binary. For non-binary data, use `soergel`. If only
20 | one fingerprint/database is provided, it is compared to self.
21 |
22 | Parameters
23 | ----------
24 | A, B : Fingerprint or FingerprintDatabase
25 | Fingerprint(s) to be compared
26 |
27 | Returns
28 | -------
29 | tanimoto : float or ndarray [shape (num_fps_A, num_fps_B)]
30 | Pairwise tanimoto(s) between fingerprint(s) in `A` and `B`.
31 |
32 | See Also
33 | --------
34 | cosine, dice, pearson, soergel
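35 |
36 |     Examples
37 |     --------
38 |     A small sketch with two bit fingerprints (indices chosen arbitrarily):
39 |
40 |     >>> from e3fp.fingerprint.fprint import Fingerprint
41 |     >>> fp_a = Fingerprint.from_indices([0, 1, 5], bits=16)
42 |     >>> fp_b = Fingerprint.from_indices([1, 5, 8], bits=16)
43 |     >>> tanimoto(fp_a, fp_b)
44 |     0.5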
35 | """
36 | A, B = _check_item_pair(A, B, fp_type=Fingerprint)
37 | if isinstance(A, Fingerprint):
38 | return fprint_metrics.tanimoto(A, B)
39 | return array_metrics.tanimoto(A.array, B.array)
40 |
41 |
42 | def soergel(A, B=None):
43 | """Compute Soergel similarities between fingerprints.
44 |
45 | Soergel similarity is the complement of the Soergel distance and is
46 | analogous to the Tanimoto coefficient for count/float fingerprints. For
47 | binary data, it is equivalent to `tanimoto`.
48 |
49 | Parameters
50 | ----------
51 | A, B : Fingerprint or FingerprintDatabase
52 | Fingerprint(s) to be compared
53 |
54 | Returns
55 | -------
56 | soergel : float or ndarray [shape (num_fps_A, num_fps_B)]
57 |
58 | See Also
59 | --------
60 | cosine, dice, pearson, tanimoto
61 |
62 | """
63 | A, B = _check_item_pair(A, B)
64 | if isinstance(A, Fingerprint):
65 | return fprint_metrics.soergel(A, B)
66 | return array_metrics.soergel(A.array, B.array)
67 |
68 |
69 | def dice(A, B=None):
70 | """Compute Dice coefficients between fingerprints.
71 |
72 | Fingerprints must have same number of bits. If not bit-fingerprints,
73 | arrays will be cast to binary. If only one fingerprint/database is
74 | provided, it is compared to self.
75 |
76 | Parameters
77 | ----------
78 | A, B : Fingerprint or FingerprintDatabase
79 | Fingerprint(s) to be compared
80 |
81 | Returns
82 | -------
83 | dice : float or ndarray [shape (num_fps_A, num_fps_B)]
84 |
85 | See Also
86 | --------
87 | cosine, pearson, soergel, tanimoto
88 | """
89 | A, B = _check_item_pair(A, B, fp_type=Fingerprint)
90 | if isinstance(A, Fingerprint):
91 | return fprint_metrics.dice(A, B)
92 | return array_metrics.dice(A.array, B.array)
93 |
94 |
95 | def cosine(A, B=None):
96 | """Compute cosine similarities between fingerprints.
97 |
98 | Fingerprints must have same number of bits. If only one
99 | fingerprint/database is provided, it is compared to self.
100 |
101 | Parameters
102 | ----------
103 | A, B : Fingerprint or FingerprintDatabase
104 | Fingerprint(s) to be compared
105 |
106 | Returns
107 | -------
108 | cosine : float or ndarray [shape (num_fps_A, num_fps_B)]
109 |
110 | See Also
111 | --------
112 | dice, pearson, soergel, tanimoto
113 | """
114 | A, B = _check_item_pair(A, B)
115 | if isinstance(A, Fingerprint):
116 | return fprint_metrics.cosine(A, B)
117 | return array_metrics.cosine(A.array, B.array)
118 |
119 |
120 | def pearson(A, B=None):
121 | """Compute Pearson correlation between fingerprints.
122 |
123 | Fingerprints must have same number of bits. If only one
124 | fingerprint/database is provided, it is compared to self.
125 |
126 | Parameters
127 | ----------
128 | A, B : Fingerprint or FingerprintDatabase
129 | Fingerprint(s) to be compared
130 |
131 | Returns
132 | -------
133 | pearson : float or ndarray [shape (num_fps_A, num_fps_B)]
134 |
135 | See Also
136 | --------
137 | cosine, dice, soergel, tanimoto
138 | """
139 | A, B = _check_item_pair(A, B)
140 | if isinstance(A, Fingerprint):
141 | return fprint_metrics.pearson(A, B)
142 | return array_metrics.pearson(A.array, B.array)
143 |
144 |
145 | def _check_item(item, fp_type=None, force_db=False):
146 | if force_db and isinstance(item, Fingerprint):
147 | if not fp_type:
148 | fp_type = item.__class__
149 | db = FingerprintDatabase(fp_type=fp_type)
150 | db.add_fingerprints([item])
151 | item = db
152 | elif fp_type and isinstance(item, FingerprintDatabase):
153 | logging.debug(
154 | "Casting database fingerprints to {}.".format(fp_type.__name__)
155 | )
156 | item = item.as_type(fp_type, copy=False)
157 | return item
158 |
159 |
160 | def _check_item_pair(A, B, fp_type=None, force_db=False):
161 | try:
162 | if B is not None and A.bits != B.bits:
163 | raise E3FPBitsValueError(
164 | "Fingerprints must have same number of bits."
165 | )
166 | except AttributeError:
167 | raise TypeError("Items must be Fingerprint or FingerprintDatabase.")
168 | if isinstance(A, FingerprintDatabase) or isinstance(
169 | B, FingerprintDatabase
170 | ):
171 | force_db = True
172 | A = _check_item(A, fp_type=fp_type, force_db=force_db)
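173 |     # If only one item was provided, compare it against itself.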
173 | if B is None:
174 | B = A
175 | else:
176 | B = _check_item(B, fp_type=fp_type, force_db=force_db)
177 | return A, B
178 |
--------------------------------------------------------------------------------
/src/e3fp/config/params.py:
--------------------------------------------------------------------------------
1 | """Get E3FP default parameters and read parameters from files.
2 |
3 | Author: Seth Axen
4 | E-mail: seth.axen@gmail.com
5 | """
6 | import os
7 | import copy
8 | import ast
9 |
10 | from configparser import (
11 | ConfigParser,
12 | NoSectionError,
13 | DuplicateSectionError,
14 | )
15 |
16 | CONFIG_DIR = os.path.dirname(os.path.realpath(__file__))
17 | DEF_PARAM_FILE = os.path.join(CONFIG_DIR, "defaults.cfg")
18 |
19 |
20 | def read_params(params=None, fill_defaults=False):
21 | """Get combination of provided parameters and default parameters.
22 |
23 | Parameters
24 | ----------
25 | params : str or ConfigParser, optional
26 | User provided parameters as an INI file or `ConfigParser`.
27 | Any parameters provided will replace default parameters.
28 | fill_defaults : bool, optional
29 | Fill values that aren't provided with package defaults, if `params`
30 | is file.
31 |
32 | Returns
33 | -------
34 | all_params : ConfigParser
35 | Combination of default and user-provided parameters.
36 | """
37 | if isinstance(params, ConfigParser):
38 | return copy.copy(params)
39 |
40 | params_list = []
41 | if fill_defaults:
42 | params_list.append(DEF_PARAM_FILE)
43 | if params is not None:
44 | params_list.append(params)
45 |
46 | all_params = ConfigParser()
47 | all_params.read(params_list)
48 |
49 | return all_params
50 |
51 |
52 | def write_params(params, params_file="params.cfg"):
53 | """Write params to file.
54 |
55 | Parameters
56 | ----------
57 | params : ConfigParser
58 | Params
59 | params_file : str
60 | Params file
61 | """
62 | with open(params_file, "w") as f:
63 | params.write(f)
64 |
65 |
66 | def get_value(
67 | params, section_name, param_name, dtype=str, auto=False, fallback=None
68 | ):
69 | """Get value from params with fallback.
70 |
71 | Parameters
72 | ----------
73 | params : ConfigParser
74 | Parameters
75 | section_name : str
76 | Name of section in `params`
77 | param_name : str
78 | Name of parameter in `section`
79 | dtype : type, optional
80 | Type to return data as.
81 | auto : bool, optional
82 | Auto-discover type of value. If provided, `dtype` is ignored.
83 | fallback : any, optional
84 | Value to return if getting value fails.
85 |
86 | Returns
87 | -------
88 | value : any
89 | Value of parameter or `fallback`.
90 | """
91 | if auto:
92 | try:
93 | value = params.get(section_name, param_name)
94 | except ValueError:
95 | return fallback
96 |
97 | try:
98 | return ast.literal_eval(value)
99 | except (ValueError, SyntaxError):
100 | return value
101 | else:
102 | get_function = params.get
103 | if dtype is int:
104 | get_function = params.getint
105 | elif dtype is float:
106 | get_function = params.getfloat
107 | elif dtype is bool:
108 | get_function = params.getboolean
109 |
110 | try:
111 | return get_function(section_name, param_name)
112 | except ValueError:
113 | return fallback
114 |
115 |
116 | def get_default_value(*args, **kwargs):
117 | global default_params
118 | return get_value(default_params, *args, **kwargs)
119 |
120 |
121 | def update_params(
122 | params_dict, params=None, section_name=None, fill_defaults=False
123 | ):
124 | """Set `ConfigParser` values from a sections dict.
125 |
126 | Sections dict key must be parameter sections, and value must be dict
127 | matching parameter name to value. If existing `ConfigParser` is
128 | provided, parameter values are updated.
129 |
130 | Parameters
131 | ----------
132 | params_dict : dict
133 | If `section_name` is provided, dict must match parameter names to
134 | values. If `section_name` is not provided, dict key(s) must be
135 | parameter sections, and value(s) must be parameter dict.
136 | params : ConfigParser, optional
137 | Existing parameters.
138 | section_name : str, optional
139 | Name of section to which to add parameters in `params_dict`
140 | fill_defaults : bool, optional
141 | Fill values that aren't provided with package defaults, if `params`
142 | is file.
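143 |
144 |     Examples
145 |     --------
146 |     A minimal sketch; the section name ``fingerprinting`` is illustrative.
147 |
148 |     >>> p = update_params({"bits": 1024}, section_name="fingerprinting")
149 |     >>> p.get("fingerprinting", "bits")
150 |     '1024'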
143 | """
144 | if params is None:
145 | params = ConfigParser()
146 | else:
147 | params = read_params(params, fill_defaults=fill_defaults)
148 |
149 | if section_name is not None:
150 | try:
151 | params.add_section(section_name)
152 | except DuplicateSectionError:
153 | pass
154 |
155 | for param_name, param_value in params_dict.items():
156 | params.set(section_name, param_name, str(param_value))
157 | else:
158 | sections_dict = params_dict
159 | for section_name, params_dict in sections_dict.items():
160 | for param_name, param_value in params_dict.items():
161 |                 params.set(section_name, param_name, str(param_value))
162 | return params
163 |
164 |
165 | def params_to_sections_dict(params, auto=True):
166 | """Get dict of sections dicts in params, with optional type discovery.
167 |
168 | Parameters
169 | ----------
170 | params : str or ConfigParser
171 | Params to read
172 | auto : bool, optional
173 | Auto typing of parameter values.
174 |
175 | Returns
176 |     -------
177 | dict : dict matching sections to parameters to values.
178 | """
179 | params = read_params(params)
180 | sections = default_params.sections()
181 | params_dicts = {}
182 | for section in sections:
183 | try:
184 | params_dict = dict(params.items(section))
185 | except NoSectionError:
186 | continue
187 | if auto:
188 | params_dict = {
189 | param_name: get_value(params, section, param_name, auto=True)
190 | for param_name in params_dict
191 | }
192 | params_dicts[section] = params_dict
193 | return params_dicts
194 |
195 |
196 | default_params = read_params(fill_defaults=True)
197 |
--------------------------------------------------------------------------------
/src/e3fp/conformer/protonation.py:
--------------------------------------------------------------------------------
1 | """Functions for generating protonation states of molecules.
2 |
3 | Author: Seth Axen
4 | E-mail: seth.axen@gmail.com
5 | """
6 | import os
7 | import tempfile
8 | import subprocess
9 | import itertools
10 | import logging
11 |
12 | from .util import iter_to_smiles, MolItemName
13 |
14 |
15 | def smiles_dict_to_proto_smiles_dict(
16 | in_smiles_dict,
17 | max_states=3,
18 | pka=7.4,
19 | dist_cutoff=20.0,
20 | add_missing=False,
21 | parallelizer=None,
22 | chunk_size=100,
23 | ):
24 | """Generate dict of SMILES for protonated states from SMILES dict."""
25 | kwargs = {"max_states": max_states, "pka": pka, "dist_cutoff": dist_cutoff}
26 | in_smiles_iter = (
27 | (smiles, mol_name) for mol_name, smiles in in_smiles_dict.items()
28 | )
29 | if parallelizer is None:
30 | proto_smiles_iter = iter(
31 | smiles_list_to_proto_smiles_list(in_smiles_iter, **kwargs)
32 | )
33 | else:
34 | smiles_chunks_iter = (
35 | (chunk,)
36 | for chunk in _chunk_iter_to_lists(
37 | in_smiles_iter, chunk_size=chunk_size
38 | )
39 | )
40 | results_iter = (
41 | result
42 | for result, data in parallelizer.run_gen(
43 | smiles_list_to_proto_smiles_list,
44 | smiles_chunks_iter,
45 | kwargs=kwargs,
46 | )
47 | if result is not False
48 | )
49 | proto_smiles_iter = itertools.chain.from_iterable(results_iter)
50 |
51 | proto_smiles_dict = {
52 | mol_name: smiles for smiles, mol_name in proto_smiles_iter
53 | }
54 | if add_missing:
55 | for mol_name, smiles in in_smiles_dict.items():
56 | proto_name = MolItemName(mol_name, proto_state_num=0).proto_name
57 | if proto_name not in proto_smiles_dict:
58 | logging.debug(
59 | (
60 | "Protonated SMILES for {} could not be generated. "
61 | "Returning input SMILES."
62 | ).format(mol_name)
63 | )
64 | proto_smiles_dict[mol_name] = smiles
65 |
66 | return proto_smiles_dict
67 |
68 |
69 | def smiles_list_to_proto_smiles_list(
70 | in_smiles_list, max_states=3, pka=7.4, dist_cutoff=20.0
71 | ):
72 | """Generate list of SMILES for protonated states from single SMILES."""
73 | in_smiles_file = tempfile.mkstemp(suffix=".smi")[1]
74 | iter_to_smiles(
75 | in_smiles_file,
76 | ((mol_name, smiles) for smiles, mol_name in in_smiles_list),
77 | )
78 | logging.debug("Protonating SMILES in %s" % (in_smiles_file))
79 | proc = subprocess.Popen(
80 | (
81 | "cxcalc %s --ignore-error dominanttautomerdistribution -H %g -C "
82 | 'false -t dist -f "smiles:n,T:dist"'
83 |         ) % (in_smiles_file, pka),
84 | shell=True,
85 | stdout=subprocess.PIPE,
86 | )
87 |
88 | proto_smiles_list = []
89 | try:
90 | stdout_iter = iter(proc.stdout.readline, b"")
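91 |         # Skip the cxcalc header line; each subsequent line is expected to
92 |         # hold a SMILES string, the molecule name, and a distribution value.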
91 | next(stdout_iter)
92 | curr_mol_name = None
93 | curr_states_count = 0
94 | for line in stdout_iter:
95 | try:
96 |                 smiles, mol_name, dist = line.decode().rstrip("\r\n").split()
97 | except ValueError:
98 | logging.warning("Error parsing line:\n%s" % line)
99 | continue
100 | if mol_name != curr_mol_name:
101 | curr_states_count = 0
102 | curr_mol_name = mol_name
103 | if curr_states_count >= max_states:
104 | continue
105 | if float(dist) > dist_cutoff:
106 | proto_name = MolItemName(
107 | mol_name, proto_state_num=curr_states_count
108 | ).proto_name
109 | curr_states_count += 1
110 | proto_smiles_list.append((smiles, proto_name))
111 | logging.debug("Finished protonating SMILES in %s" % (in_smiles_file))
112 | except Exception:
113 | logging.exception("Error running cxcalc", exc_info=True)
114 |
115 | proc.kill()
116 | os.remove(in_smiles_file)
117 | return proto_smiles_list
118 |
119 |
120 | def smiles_to_proto_smiles(
121 | smiles, mol_name, max_states=3, pka=7.4, dist_cutoff=20.0
122 | ):
123 | """Generate list of SMILES for protonated states from single SMILES.
124 |
125 | This is very inefficient in batch.
126 | """
127 | logging.debug("Protonating SMILES in %s" % (mol_name))
128 | proc = subprocess.Popen(
129 | (
130 | 'cxcalc "%s %s" --ignore-error dominanttautomerdistribution -H %g '
131 | '-C false -t dist -f "smiles:n,T:dist"'
132 |         ) % (smiles, mol_name, pka),
133 | shell=True,
134 | stdout=subprocess.PIPE,
135 | )
136 | states_count = 0
137 | proto_smiles_list = []
138 | try:
139 | stdout_iter = iter(proc.stdout.readline, b"")
140 | next(stdout_iter)
141 | for line in stdout_iter:
142 | try:
143 |                 this_smiles, this_name, dist = line.decode().rstrip("\r\n").split()
144 | except ValueError:
145 | logging.warning("Error parsing line:\n%s" % line)
146 | continue
147 | if states_count >= max_states:
148 | break
149 | if float(dist) > dist_cutoff:
150 | proto_name = MolItemName(
151 | mol_name, proto_state_num=states_count
152 | ).proto_name
153 | states_count += 1
154 |                 proto_smiles_list.append((this_smiles, proto_name))
155 | logging.debug("Finished protonating SMILES in %s" % (mol_name))
156 | except OSError:
157 | logging.exception(
158 | "Error running cxcalc on %s" % (mol_name), exc_info=True
159 | )
160 |
161 | proc.kill()
162 | return proto_smiles_list
163 |
164 |
165 | def _chunk_iter_to_lists(iterable, chunk_size=100):
166 | """Yield chunks of size `chunk_size` from iterator."""
167 | i = 0
168 | chunk = []
169 | for item in iterable:
170 | if i >= chunk_size:
171 | yield chunk
172 | chunk = []
173 | i = 0
174 | chunk.append(item)
175 | i += 1
176 | if len(chunk) != 0:
177 | yield chunk
178 |
--------------------------------------------------------------------------------
/doc/source/conf.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # e3fp documentation build configuration file, created by
4 | # sphinx-quickstart on Sun Jun 25 01:13:34 2017.
5 | #
6 | # This file is execfile()d with the current directory set to its
7 | # containing dir.
8 | #
9 | # Note that not all possible configuration values are present in this
10 | # autogenerated file.
11 | #
12 | # All configuration values have a default; values that are commented out
13 | # serve to show the default.
14 |
15 | # If extensions (or modules to document with autodoc) are in another directory,
16 | # add these directories to sys.path here. If the directory is relative to the
17 | # documentation root, use os.path.abspath to make it absolute, like shown here.
18 | #
19 | import importlib.metadata
20 | import os
21 | import sys
22 |
23 | e3fp_version = importlib.metadata.version('e3fp')
24 |
25 | # Set-up environment variable for programoutput
26 | os.environ['E3FP_REPO'] = os.path.abspath("../..")
27 |
28 | # -- General configuration ------------------------------------------------
29 |
30 | # If your documentation needs a minimal Sphinx version, state it here.
31 | #
32 | # needs_sphinx = '1.0'
33 |
34 | # Add any Sphinx extension module names here, as strings. They can be
35 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
36 | # ones.
37 | extensions = [
38 | 'sphinx.ext.autosectionlabel',
39 | 'sphinx.ext.autosummary',
40 | 'sphinx.ext.intersphinx',
41 | 'sphinx.ext.coverage',
42 | 'sphinx.ext.ifconfig',
43 | 'sphinx.ext.viewcode',
44 | 'sphinx.ext.githubpages',
45 | 'sphinx.ext.autodoc',
46 | 'sphinx.ext.napoleon',
47 | 'sphinx.ext.doctest',
48 | 'sphinx.ext.todo',
49 | 'sphinx.ext.imgconverter',
50 | 'sphinxcontrib.programoutput']
51 |
52 | napoleon_google_docstring = False
53 | napoleon_numpy_docstring = True
54 | napoleon_use_param = False
55 | napoleon_use_ivar = True
56 |
57 | autosummary_generate = True
58 |
59 | add_module_names = False
60 |
61 | # Add any paths that contain templates here, relative to this directory.
62 | templates_path = ['_templates']
63 |
64 | # The suffix(es) of source filenames.
65 | # You can specify multiple suffix as a list of string:
66 | #
67 | # source_suffix = ['.rst', '.md']
68 | source_suffix = {'.rst': 'restructuredtext'}
69 |
70 | # The master toctree document.
71 | master_doc = 'index'
72 |
73 | # General information about the project.
74 | project = u'e3fp'
75 | copyright = u'2017, Seth Axen'
76 | author = u'Seth Axen'
77 |
78 | # The version info for the project you're documenting, acts as replacement for
79 | # |version| and |release|, also used in various other places throughout the
80 | # built documents.
81 | #
82 | # The short X.Y version.
83 | version = '%s' % (e3fp_version)
84 | # The full version, including alpha/beta/rc tags.
85 | release = version
86 |
87 | # The language for content autogenerated by Sphinx. Refer to documentation
88 | # for a list of supported languages.
89 | #
90 | # This is also used if you do content translation via gettext catalogs.
91 | # Usually you set "language" from the command line for these cases.
92 | language = "en"
93 |
94 | # The reST default role (used for this markup: `text`) to use for all
95 | # documents. The autolink role functions as :obj: when the name referred can
96 | # be resolved to a Python object
97 | default_role = "autolink"
98 |
99 | # List of patterns, relative to source directory, that match files and
100 | # directories to ignore when looking for source files.
101 | # This patterns also effect to html_static_path and html_extra_path
102 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
103 |
104 | # The name of the Pygments (syntax highlighting) style to use.
105 | pygments_style = 'sphinx'
106 |
107 | # If true, `todo` and `todoList` produce output, else they produce nothing.
108 | todo_include_todos = False
109 |
110 | # Add unique prefixes to autosectionlabel to avoid duplicate labels
111 | autosectionlabel_prefix_document = True
112 |
113 |
114 | # -- Options for HTML output ----------------------------------------------
115 |
116 | # The theme to use for HTML and HTML Help pages. See the documentation for
117 | # a list of builtin themes.
118 | html_theme = 'sphinx_rtd_theme'
119 |
120 | # Theme options are theme-specific and customize the look and feel of a theme
121 | # further. For a list of options available for each theme, see the
122 | # documentation.
123 | #
124 | # html_theme_options = {}
125 |
126 | # Add any paths that contain custom static files (such as style sheets) here,
127 | # relative to this directory. They are copied after the builtin static files,
128 | # so a file named "default.css" will overwrite the builtin "default.css".
129 | html_static_path = ['_static']
130 |
131 |
132 | # -- Options for HTMLHelp output ------------------------------------------
133 |
134 | # Output file base name for HTML help builder.
135 | htmlhelp_basename = 'e3fpdoc'
136 |
137 |
138 | # -- Options for LaTeX output ---------------------------------------------
139 |
140 | latex_elements = {
141 | # The paper size ('letterpaper' or 'a4paper').
142 | #
143 | # 'papersize': 'letterpaper',
144 |
145 | # The font size ('10pt', '11pt' or '12pt').
146 | #
147 | # 'pointsize': '10pt',
148 |
149 | # Additional stuff for the LaTeX preamble.
150 | #
151 | # 'preamble': '',
152 |
153 | # Latex figure (float) alignment
154 | #
155 | # 'figure_align': 'htbp',
156 | }
157 |
158 | # Grouping the document tree into LaTeX files. List of tuples
159 | # (source start file, target name, title,
160 | # author, documentclass [howto, manual, or own class]).
161 | latex_documents = [
162 | (master_doc, 'e3fp.tex', u'e3fp Documentation',
163 | u'Seth Axen', 'manual'),
164 | ]
165 |
166 |
167 | # -- Options for manual page output ---------------------------------------
168 |
169 | # One entry per manual page. List of tuples
170 | # (source start file, name, description, authors, manual section).
171 | man_pages = [
172 | (master_doc, 'e3fp', u'e3fp Documentation',
173 | [author], 1)
174 | ]
175 |
176 |
177 | # -- Options for Texinfo output -------------------------------------------
178 |
179 | # Grouping the document tree into Texinfo files. List of tuples
180 | # (source start file, target name, title, author,
181 | # dir menu entry, description, category)
182 | texinfo_documents = [
183 | (master_doc, 'e3fp', u'e3fp Documentation',
184 |      author, 'e3fp', 'Extended 3-Dimensional FingerPrint (E3FP)',
185 | 'Miscellaneous'),
186 | ]
187 |
188 |
189 | # Example configuration for intersphinx: refer to the Python standard library.
190 | intersphinx_mapping = {
191 | 'python': ('https://docs.python.org/3/', None),
192 | 'numpy': ('https://numpy.org/doc/stable/', None),
193 | 'scipy': ('https://docs.scipy.org/doc/scipy/', None),
194 | }
195 |
--------------------------------------------------------------------------------
/doc/source/dev/index.rst:
--------------------------------------------------------------------------------
1 | Developer Notes
2 | ===============
3 |
4 | We welcome contributions to E3FP! These notes are designed to help developers
5 | contribute code.
6 |
7 | Authoring Code
8 | --------------
9 |
10 | Code Formatting
11 | ~~~~~~~~~~~~~~~
12 |
13 | E3FP's code should be *readable*. To ensure this, we rigorously follow the
14 | PEP8_ style conventions and PEP257_ docstring conventions, which maximize
15 | readability of the code and ease of future development. You may check your
16 | code for conformance to these conventions with the pycodestyle_ and
17 | pydocstyle_ utilities, respectively. Where the code is necessarily
18 | complicated, inline comments should reorient the reader.
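19 |
20 | For example, both checkers can be run on the package source from the
21 | repository root:
22 |
23 | .. code-block:: shell-session
24 |
25 |     $ pycodestyle src/e3fp
26 |     $ pydocstyle src/e3fp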
19 |
20 | Utility Methods and Classes
21 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~
22 |
23 | Three sets of utility methods and classes are provided: `e3fp.util`,
24 | `e3fp.conformer.util`, and `e3fp.fingerprint.util`. These provide general and
25 | often-used functionality in their corresponding packages. Additionally, they
26 | provide E3FP-specific errors and exceptions.
27 |
28 | Warnings and Errors
29 | ~~~~~~~~~~~~~~~~~~~
30 |
31 | By default, warnings in Python are silent. We therefore provide a warning base
32 | class `e3fp.util.E3FPWarning` that is not silent by default. We provide several
33 | general warnings:
34 |
35 | :py:class:`.E3FPDeprecationWarning`
36 | warns when a deprecated method is called or class is instantiated.
37 |
38 | .. seealso::
39 |
40 | `Deprecation`_
41 |
42 | :py:class:`.E3FPEfficiencyWarning`
43 | warns when a method, module version, or combination of parameters is known
44 | to be inefficient.
45 |
46 | .. note::
47 |
48 | If possible, the warning message should advise on a more efficient
49 | approach.
50 |
51 | E3FP-specific errors should inherit from the `e3fp.util.E3FPError` base class. Several
52 | fingerprinting-specific errors are defined in `e3fp.fingerprint.util`.
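53 |
54 | For instance, a new error type for a hypothetical feature would be declared as
55 | follows (the class name here is purely illustrative):
56 |
57 | .. code-block:: python
58 |
59 |     from e3fp.util import E3FPError
60 |
61 |     class E3FPExampleError(E3FPError):
62 |         """Raised when the illustrative example condition is encountered."""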
53 |
54 | Deprecation
55 | ~~~~~~~~~~~
56 |
57 | Whenever changing the interface or behavior of a user-facing method or class,
58 | it is proper to deprecate it for at least one release, so that users have
59 | time to update their scripts accordingly. A deprecated method should issue
60 | an `e3fp.util.E3FPDeprecationWarning`, notifying the user in which release
61 | the method or class will be removed, and its documentation should be updated
62 | accordingly. This functionality is automated with the `e3fp.util.deprecated`
63 | decorator, as shown in this example:
64 |
65 | >>> import sys
66 | >>> sys.stderr = sys.stdout
67 | >>> from e3fp.util import deprecated
68 | >>> @deprecated("1.1", remove_version="1.3", msg="Function no longer needed.")
69 | ... def deprecated_method():
70 | ... """A method to demonstrate method deprecation."""
71 | ... pass
72 | >>> deprecated_method()
73 | ...: E3FPDeprecationWarning: Function `deprecated_method` was deprecated in 1.1 and will be removed in 1.3. Function no longer needed.
74 |
75 | In the api documentation, the method will appear as:
76 |
77 | .. function:: deprecated_method()
78 |
79 | .. note:: Deprecated in e3fp 1.1.
80 | `deprecated_method` will be removed in e3fp 1.3. Function no longer needed.
81 |
82 | A method to demonstrate method deprecation.
83 |
84 | .. note::
85 | If no `remove_version` is specified, then the remove version defaults to the
86 | next release after deprecation. For example, if the method was deprecated in
87 | 1.1, it is by default marked for removal in 1.2.
88 |
89 | Contributing Code
90 | ~~~~~~~~~~~~~~~~~
91 |
92 | Before contributing major modifications to E3FP, it is advisable to
93 | submit an issue to the
94 | `issue tracker`_ to enable other
95 | developers to contribute to the design of the code and to reduce the amount of
96 | work necessary to conform the code to E3FP's standards. After writing the code,
97 | create a `pull request`_. This is best even if you have push access to the
98 | E3FP repo, as it enables the test suite to be run on the new code prior to
99 | merging it with the remaining code base.
100 |
101 | Writing Tests
102 | ~~~~~~~~~~~~~
103 |
104 | The standard in E3FP is to commit a test for new functionality simultaneously
105 | with the new functionality or within the same pull request. While this slows
106 | development, it prevents building a large backlog of untested methods and
107 | classes.
108 |
109 | These should ideally be unit tests, though for some complicated
110 | functionalities, such as fingerprinting, integration tests are also
111 | necessary. For these complicated functions, specific units may still be
112 | tested using :py:mod:`unittest.mock`. For example,
113 | :py:meth:`unittest.mock.patch` may be used to force a high level method to
114 | produce a specific output. For examples, see the fingerprinting tests in
115 | ``tests/test_fingerprint.py``.
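116 |
117 | A minimal sketch of this pattern is shown below; the patched target
118 | ``e3fp.pipeline.fprints_from_mol`` and the surrounding test logic are
119 | illustrative, not part of the E3FP test suite.
120 |
121 | .. code-block:: python
122 |
123 |     from unittest import mock
124 |
125 |     from e3fp import pipeline
126 |     from e3fp.fingerprint.fprint import Fingerprint
127 |
128 |     def test_with_patched_fingerprinter():
129 |         # Force the high-level method to return a known fingerprint so the
130 |         # code that consumes its output can be tested in isolation.
131 |         fake_fprint = Fingerprint.from_indices([1, 2, 3], bits=32)
132 |         with mock.patch(
133 |             "e3fp.pipeline.fprints_from_mol", return_value=[fake_fprint]
134 |         ) as patched:
135 |             result = pipeline.fprints_from_mol(None)
136 |         assert result == [fake_fprint]
137 |         patched.assert_called_once()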
116 |
117 | Continuous Integration
118 | ~~~~~~~~~~~~~~~~~~~~~~
119 |
120 | E3FP uses `GitHub Actions`_ for continuous integration. This ensures that each commit
121 | and pull request passes all tests on a variety of systems and for all
122 | supported versions of Python. Additionally, GitHub Actions updates code coverage on
123 | Codecov_ and tests all usage examples in the documentation using `doctest`.
124 |
125 | Documentation
126 | -------------
127 |
128 | In general, it is best to document the rationale and basic usage of a module,
129 | class, or method in its docstring instead of in a separate documentation file.
130 | See, for example, the docstring for `e3fp.fingerprint.db.FingerprintDatabase`.
131 | We use a variety of tools to ensure that our documentation is always
132 | up-to-date. The official documentation is hosted on ReadtheDocs_ and is
133 | automatically generated when new code is committed to the repository.
134 |
135 | Documenting Code
136 | ~~~~~~~~~~~~~~~~
137 |
138 | E3FP uses NumPy's `docstring conventions`_ for all docstrings. These are
139 | parsed by Sphinx_ using Napoleon_. All usage examples must be fully
140 | functional, as these are tested using `doctest`.
141 |
142 | A docstring should explain the purpose of a class/method, any
143 | relevant implementation details, its parameters, its attributes, its outputs,
144 | and its usage. The goal is clarity. For self-evident methods with descriptive
145 | variables, a simple one-line summary is all that is needed. For complicated use
146 | cases, often involving other methods/classes, it is better to document the
147 | usage elsewhere in the documentation.
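
For reference, a minimal docstring in this style might look like the following
(a generic sketch, not an actual E3FP function)::

    import numpy as np

    def count_on_bits(vector):
        """Count the "on" bits in a binary fingerprint vector.

        Parameters
        ----------
        vector : ndarray of bool
            Binary fingerprint vector.

        Returns
        -------
        int
            Number of bits set to ``True``.

        Examples
        --------
        >>> import numpy as np
        >>> count_on_bits(np.array([True, False, True]))
        2
        """
        return int(np.sum(vector))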
148 |
149 | Documentation Usage
150 | ~~~~~~~~~~~~~~~~~~~
151 |
152 | Coming soon.
153 |
154 | .. todo::
155 | Write documentation usage
156 |
157 | Releasing Code
158 | --------------
159 |
160 | .. todo::
161 | Write release protocol
162 |
163 | .. _PEP8: https://www.python.org/dev/peps/pep-0008/
164 | .. _PEP257: https://www.python.org/dev/peps/pep-0257/
165 | .. _pycodestyle: http://pycodestyle.pycqa.org/en/latest/
166 | .. _pydocstyle: http://pydocstyle.pycqa.org/en/latest/
167 | .. _docstring conventions: https://github.com/numpy/numpy/blob/master/doc/HOWTO_DOCUMENT.rst.txt
168 | .. _Napoleon: http://www.sphinx-doc.org/en/stable/ext/napoleon.html
169 | .. _Sphinx: http://www.sphinx-doc.org/en/stable/index.html
170 | .. _doctest: https://docs.python.org/3/library/doctest.html
171 | .. _pull request: https://help.github.com/articles/creating-a-pull-request/
172 | .. _GitHub Actions: https://github.com/keiserlab/e3fp/actions
173 | .. _Codecov: https://codecov.io/github/keiserlab/e3fp
174 |
175 | .. include:: ../substitutions.rst
176 |
--------------------------------------------------------------------------------
/doc/source/usage/fingerprints/fprints.rst:
--------------------------------------------------------------------------------
1 | Fingerprints
2 | ============
3 |
4 | The simplest interface for molecular fingerprints are through three classes in
5 | `e3fp.fingerprint.fprint`:
6 |
7 | :py:class:`.Fingerprint`
8 | a fingerprint with "on" bits
9 |
10 | :py:class:`.CountFingerprint`
11 | a fingerprint with counts for each "on" bit
12 |
13 | :py:class:`.FloatFingerprint`
14 | a fingerprint with float values for each "on" bit, generated for example by
15 | averaging conformer fingerprints.
16 |
17 | In addition to storing "on" indices and, for the latter two, corresponding
18 | values, they store fingerprint properties, such as name, level, and any
19 | arbitrary property. They also provide simple interfaces for fingerprint
20 | comparison and some basic processing.
21 |
22 | .. note:: Many of these operations are more efficient when operating on a
23 | :py:class:`.FingerprintDatabase`. See
24 | :ref:`usage/fingerprints/storage:Fingerprint Storage` for more information.
25 |
26 | In the examples below, we will focus on :py:class:`.Fingerprint` and
27 | :py:class:`.CountFingerprint`. First, we execute the necessary imports.
28 |
29 | .. testsetup::
30 |
31 | import numpy as np
32 | np.random.seed(0)
33 |
34 | .. doctest::
35 |
36 | >>> from e3fp.fingerprint.fprint import Fingerprint, CountFingerprint
37 | >>> import numpy as np
38 |
39 | .. seealso::
40 |
41 | :ref:`usage/fingerprints/storage:Fingerprint Storage`,
42 | :ref:`usage/fingerprints/comparison:Fingerprint Comparison`
43 |
44 | Creation and Conversion
45 | -----------------------
46 |
47 | Here we create a bit-fingerprint with random "on" indices.
48 |
49 | >>> bits = 2**32
50 | >>> indices = np.sort(np.random.randint(0, bits, 30))
51 | >>> indices
52 | array([ 243580376, 305097549, ..., 3975407269, 4138900056])
53 | >>> fp1 = Fingerprint(indices, bits=bits, level=0)
54 | >>> fp1
55 | Fingerprint(indices=array([243580376, ..., 4138900056]), level=0, bits=4294967296, name=None)
56 |
57 | This fingerprint is extremely sparse.
58 |
59 | >>> fp1.bit_count
60 | 30
61 | >>> fp1.density
62 | 6.984919309616089e-09
63 |
64 | We can therefore "fold" the fingerprint through a series of bitwise "OR"
65 | operations on halves of the sparse vector until it is of a specified length,
66 | with minimal collision of bits.
67 |
68 | >>> fp_folded = fp1.fold(1024)
69 | >>> fp_folded
70 | Fingerprint(indices=array([9, 70, ..., 845, 849]), level=0, bits=1024, name=None)
71 | >>> fp_folded.bit_count
72 | 29
73 | >>> fp_folded.density
74 | 0.0283203125
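
Since both the original and folded lengths are powers of 2, this folding is
conceptually equivalent to taking each index modulo the folded length. A quick
check (a conceptual sketch, not how :py:meth:`.Fingerprint.fold` is implemented
internally):

>>> np.array_equal(np.unique(indices % 1024), fp_folded.indices)
True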
75 |
76 | A :py:class:`.CountFingerprint` may be created by instead providing a
77 | dictionary mapping each nonzero index to its count.
78 |
79 | >>> indices2 = np.sort(np.random.randint(0, bits, 60))
80 | >>> counts = dict(zip(indices2, np.random.randint(1, 10, indices2.size)))
81 | >>> counts
82 | {80701568: 8, 580757632: 7, ..., 800291326: 5, 4057322111: 7}
83 | >>> cfp1 = CountFingerprint(counts=counts, bits=bits, level=0)
84 | >>> cfp1
85 | CountFingerprint(counts={80701568: 8, 580757632: 7, ..., 3342157822: 2, 4057322111: 7}, level=0, bits=4294967296, name=None)
86 |
87 | Unlike folding a bit fingerprint, by default, folding a count fingerprint
88 | performs a "SUM" operation on colliding counts.
89 |
90 | >>> cfp1.bit_count
91 | 60
92 | >>> cfp_folded = cfp1.fold(1024)
93 | >>> cfp_folded
94 | CountFingerprint(counts={128: 15, 257: 4, ..., 1022: 2, 639: 7}, level=0, bits=1024, name=None)
95 | >>> cfp_folded.bit_count
96 | 57
97 |
98 | It is trivial to interconvert the fingerprints.
99 |
100 | >>> cfp_folded2 = CountFingerprint.from_fingerprint(fp_folded)
101 | >>> cfp_folded2
102 | CountFingerprint(counts={9: 1, 87: 1, ..., 629: 1, 763: 1}, level=0, bits=1024, name=None)
103 | >>> cfp_folded2.indices[:5]
104 | array([ 9, 70, 72, 87, 174])
105 | >>> fp_folded.indices[:5]
106 | array([ 9, 70, 72, 87, 174])
107 |
108 | RDKit Morgan fingerprints (analogous to ECFP) may easily be converted to a
109 | :py:class:`.Fingerprint`.
110 |
111 | >>> from rdkit import Chem
112 | >>> from rdkit.Chem import AllChem
113 | >>> mol = Chem.MolFromSmiles('Cc1ccccc1')
114 | >>> mfp = AllChem.GetMorganFingerprintAsBitVect(mol, 2)
115 | >>> mfp
116 | <rdkit.DataStructs.cDataStructs.ExplicitBitVect object at 0x...>
117 | >>> Fingerprint.from_rdkit(mfp)
118 | Fingerprint(indices=array([389, 1055, ..., 1873, 1920]), level=-1, bits=2048, name=None)
119 |
120 | Likewise, :py:class:`.Fingerprint` can be easily converted to a NumPy ndarray or
121 | SciPy sparse matrix.
122 |
123 | >>> fp_folded.to_vector()
124 | <1x1024 sparse matrix of type '<class 'numpy.bool_'>'
125 | ...with 29 stored elements in Compressed Sparse Row format>
126 | >>> fp_folded.to_vector(sparse=False)
127 | array([False, False, False, ..., False, False, False], dtype=bool)
128 | >>> np.where(fp_folded.to_vector(sparse=False))[0]
129 | array([ 9, 70, 72, 87, ...])
130 | >>> cfp_folded.to_vector(sparse=False)
131 | array([0, 0, 0, ..., 0, 2, 0], dtype=uint16)
132 | >>> cfp_folded.to_vector(sparse=False).sum()
133 | 252
134 |
135 | Algebra
136 | -------
137 |
138 | Basic algebraic functions may be performed on fingerprints. If either
139 | fingerprint is a bit fingerprint, all algebraic functions are bit-wise.
140 | The following bit-wise operations are supported:
141 |
142 | Equality
143 | >>> fp1 = Fingerprint([0, 1, 6, 8, 12], bits=16)
144 | >>> fp2 = Fingerprint([1, 2, 4, 8, 11, 12], bits=16)
145 | >>> fp1 == fp2
146 | False
147 | >>> fp1_copy = Fingerprint.from_fingerprint(fp1)
148 | >>> fp1 == fp1_copy
149 | True
150 | >>> fp1_copy.level = 5
151 | >>> fp1 == fp1_copy
152 | False
153 |
154 | Union/OR
155 | >>> fp1 + fp2
156 | Fingerprint(indices=array([0, 1, 2, 4, 6, 8, 11, 12]), level=-1, bits=16, name=None)
157 | >>> fp1 | fp2
158 | Fingerprint(indices=array([0, 1, 2, 4, 6, 8, 11, 12]), level=-1, bits=16, name=None)
159 |
160 | Intersection/AND
161 | >>> fp1 & fp2
162 | Fingerprint(indices=array([1, 8, 12]), level=-1, bits=16, name=None)
163 |
164 | Difference/AND NOT
165 | >>> fp1 - fp2
166 | Fingerprint(indices=array([0, 6]), level=-1, bits=16, name=None)
167 | >>> fp2 - fp1
168 | Fingerprint(indices=array([2, 4, 11]), level=-1, bits=16, name=None)
169 |
170 | XOR
171 | >>> fp1 ^ fp2
172 | Fingerprint(indices=array([0, 2, 4, 6, 11]), level=-1, bits=16, name=None)
173 |
174 | With count or float fingerprints, bit-wise operations are still possible, but
175 | algebraic operations are applied to counts.
176 |
177 | >>> fp1 = CountFingerprint(counts={0: 3, 1: 2, 5: 1, 9: 3}, bits=16)
178 | >>> fp2 = CountFingerprint(counts={1: 2, 5: 2, 7: 3, 10: 7}, bits=16)
179 | >>> fp1 + fp2
180 | CountFingerprint(counts={0: 3, 1: 4, 5: 3, 7: 3, 9: 3, 10: 7}, level=-1, bits=16, name=None)
181 | >>> fp1 - fp2
182 | CountFingerprint(counts={0: 3, 1: 0, 5: -1, 7: -3, 9: 3, 10: -7}, level=-1, bits=16, name=None)
183 | >>> fp1 * 3
184 | CountFingerprint(counts={0: 9, 1: 6, 5: 3, 9: 9}, level=-1, bits=16, name=None)
185 | >>> fp1 / 2
186 | FloatFingerprint(counts={0: 1.5, 1: 1.0, 5: 0.5, 9: 1.5}, level=-1, bits=16, name=None)
187 |
188 | Finally, fingerprints may be batch added and averaged, producing either a count
189 | or float fingerprint when sensible.
190 |
191 | >>> from e3fp.fingerprint.fprint import add, mean
192 | >>> fps = [Fingerprint(np.random.randint(0, 32, 8), bits=32) for i in range(100)]
193 | >>> add(fps)
194 | CountFingerprint(counts={0: 23, 1: 23, ..., 30: 20, 31: 14}, level=-1, bits=32, name=None)
195 | >>> mean(fps)
196 | FloatFingerprint(counts={0: 0.23, 1: 0.23, ..., 30: 0.2, 31: 0.14}, level=-1, bits=32, name=None)
197 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | GNU LESSER GENERAL PUBLIC LICENSE
2 | Version 3, 29 June 2007
3 |
4 | Copyright (C) 2007 Free Software Foundation, Inc.
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 |
8 |
9 | This version of the GNU Lesser General Public License incorporates
10 | the terms and conditions of version 3 of the GNU General Public
11 | License, supplemented by the additional permissions listed below.
12 |
13 | 0. Additional Definitions.
14 |
15 | As used herein, "this License" refers to version 3 of the GNU Lesser
16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU
17 | General Public License.
18 |
19 | "The Library" refers to a covered work governed by this License,
20 | other than an Application or a Combined Work as defined below.
21 |
22 | An "Application" is any work that makes use of an interface provided
23 | by the Library, but which is not otherwise based on the Library.
24 | Defining a subclass of a class defined by the Library is deemed a mode
25 | of using an interface provided by the Library.
26 |
27 | A "Combined Work" is a work produced by combining or linking an
28 | Application with the Library. The particular version of the Library
29 | with which the Combined Work was made is also called the "Linked
30 | Version".
31 |
32 | The "Minimal Corresponding Source" for a Combined Work means the
33 | Corresponding Source for the Combined Work, excluding any source code
34 | for portions of the Combined Work that, considered in isolation, are
35 | based on the Application, and not on the Linked Version.
36 |
37 | The "Corresponding Application Code" for a Combined Work means the
38 | object code and/or source code for the Application, including any data
39 | and utility programs needed for reproducing the Combined Work from the
40 | Application, but excluding the System Libraries of the Combined Work.
41 |
42 | 1. Exception to Section 3 of the GNU GPL.
43 |
44 | You may convey a covered work under sections 3 and 4 of this License
45 | without being bound by section 3 of the GNU GPL.
46 |
47 | 2. Conveying Modified Versions.
48 |
49 | If you modify a copy of the Library, and, in your modifications, a
50 | facility refers to a function or data to be supplied by an Application
51 | that uses the facility (other than as an argument passed when the
52 | facility is invoked), then you may convey a copy of the modified
53 | version:
54 |
55 | a) under this License, provided that you make a good faith effort to
56 | ensure that, in the event an Application does not supply the
57 | function or data, the facility still operates, and performs
58 | whatever part of its purpose remains meaningful, or
59 |
60 | b) under the GNU GPL, with none of the additional permissions of
61 | this License applicable to that copy.
62 |
63 | 3. Object Code Incorporating Material from Library Header Files.
64 |
65 | The object code form of an Application may incorporate material from
66 | a header file that is part of the Library. You may convey such object
67 | code under terms of your choice, provided that, if the incorporated
68 | material is not limited to numerical parameters, data structure
69 | layouts and accessors, or small macros, inline functions and templates
70 | (ten or fewer lines in length), you do both of the following:
71 |
72 | a) Give prominent notice with each copy of the object code that the
73 | Library is used in it and that the Library and its use are
74 | covered by this License.
75 |
76 | b) Accompany the object code with a copy of the GNU GPL and this license
77 | document.
78 |
79 | 4. Combined Works.
80 |
81 | You may convey a Combined Work under terms of your choice that,
82 | taken together, effectively do not restrict modification of the
83 | portions of the Library contained in the Combined Work and reverse
84 | engineering for debugging such modifications, if you also do each of
85 | the following:
86 |
87 | a) Give prominent notice with each copy of the Combined Work that
88 | the Library is used in it and that the Library and its use are
89 | covered by this License.
90 |
91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license
92 | document.
93 |
94 | c) For a Combined Work that displays copyright notices during
95 | execution, include the copyright notice for the Library among
96 | these notices, as well as a reference directing the user to the
97 | copies of the GNU GPL and this license document.
98 |
99 | d) Do one of the following:
100 |
101 | 0) Convey the Minimal Corresponding Source under the terms of this
102 | License, and the Corresponding Application Code in a form
103 | suitable for, and under terms that permit, the user to
104 | recombine or relink the Application with a modified version of
105 | the Linked Version to produce a modified Combined Work, in the
106 | manner specified by section 6 of the GNU GPL for conveying
107 | Corresponding Source.
108 |
109 | 1) Use a suitable shared library mechanism for linking with the
110 | Library. A suitable mechanism is one that (a) uses at run time
111 | a copy of the Library already present on the user's computer
112 | system, and (b) will operate properly with a modified version
113 | of the Library that is interface-compatible with the Linked
114 | Version.
115 |
116 | e) Provide Installation Information, but only if you would otherwise
117 | be required to provide such information under section 6 of the
118 | GNU GPL, and only to the extent that such information is
119 | necessary to install and execute a modified version of the
120 | Combined Work produced by recombining or relinking the
121 | Application with a modified version of the Linked Version. (If
122 | you use option 4d0, the Installation Information must accompany
123 | the Minimal Corresponding Source and Corresponding Application
124 | Code. If you use option 4d1, you must provide the Installation
125 | Information in the manner specified by section 6 of the GNU GPL
126 | for conveying Corresponding Source.)
127 |
128 | 5. Combined Libraries.
129 |
130 | You may place library facilities that are a work based on the
131 | Library side by side in a single library together with other library
132 | facilities that are not Applications and are not covered by this
133 | License, and convey such a combined library under terms of your
134 | choice, if you do both of the following:
135 |
136 | a) Accompany the combined library with a copy of the same work based
137 | on the Library, uncombined with any other library facilities,
138 | conveyed under the terms of this License.
139 |
140 | b) Give prominent notice with the combined library that part of it
141 | is a work based on the Library, and explaining where to find the
142 | accompanying uncombined form of the same work.
143 |
144 | 6. Revised Versions of the GNU Lesser General Public License.
145 |
146 | The Free Software Foundation may publish revised and/or new versions
147 | of the GNU Lesser General Public License from time to time. Such new
148 | versions will be similar in spirit to the present version, but may
149 | differ in detail to address new problems or concerns.
150 |
151 | Each version is given a distinguishing version number. If the
152 | Library as you received it specifies that a certain numbered version
153 | of the GNU Lesser General Public License "or any later version"
154 | applies to it, you have the option of following the terms and
155 | conditions either of that published version or of any later version
156 | published by the Free Software Foundation. If the Library as you
157 | received it does not specify a version number of the GNU Lesser
158 | General Public License, you may choose any version of the GNU Lesser
159 | General Public License ever published by the Free Software Foundation.
160 |
161 | If the Library as you received it specifies that a proxy can decide
162 | whether future versions of the GNU Lesser General Public License shall
163 | apply, that proxy's public statement of acceptance of any version is
164 | permanent authorization for you to choose that version for the
165 | Library.
166 |
--------------------------------------------------------------------------------
/tests/test_metrics.py:
--------------------------------------------------------------------------------
1 | """Tests for fingerprint comparison metrics.
2 |
3 | Author: Seth Axen
4 | E-mail: seth.axen@gmail.com
5 | """
6 | import pytest
7 |
8 | import numpy as np
9 | from scipy.sparse import csr_matrix
10 | from scipy.spatial.distance import cdist
11 | from e3fp.fingerprint import metrics, fprint, db
12 | from e3fp.fingerprint.metrics import array_metrics, fprint_metrics
13 |
14 |
15 | def _create_random_sparse(nrows, nbits=1024, perc_pos=0.1, counts=False):
16 | arr = csr_matrix(
17 | np.random.uniform(0, 1, (nrows, nbits)) > (1 - perc_pos),
18 | dtype=np.double,
19 | )
20 | if counts:
21 | arr.data = np.random.randint(1, 30, arr.data.shape[0]).astype(
22 | np.double
23 | )
24 | return arr
25 |
26 | def soergeldist(x, y):
27 | return np.abs(x - y).sum() / np.maximum(x, y).sum()
28 |
29 |
30 | class TestArrayMetrics:
31 |
32 | """Tests for array comparison metrics"""
33 |
34 | @staticmethod
35 | def _eval(func, X, Y=None, dense=False, **kwargs):
36 | if dense:
37 | X = X.toarray()
38 | if Y is not None:
39 | Y = Y.toarray()
40 | return func(X, Y, **kwargs)
41 |
42 | @pytest.mark.parametrize("dense", [True, False])
43 | @pytest.mark.parametrize(
44 | "func,cdist_metric,counts",
45 | [
46 | (array_metrics.tanimoto, "jaccard", False),
47 | (array_metrics.dice, "dice", False),
48 | (array_metrics.cosine, "cosine", False),
49 | (array_metrics.cosine, "cosine", True),
50 | (array_metrics.pearson, "correlation", False),
51 | (array_metrics.pearson, "correlation", True),
52 | (array_metrics.soergel, soergeldist, False),
53 | (array_metrics.soergel, soergeldist, True),
54 | ],
55 | )
56 | def test_metrics_vs_cdist(self, func, cdist_metric, counts, dense):
57 | X = _create_random_sparse(10, counts=counts)
58 | Y = _create_random_sparse(8, counts=counts)
59 | expect_score = 1.0 - cdist(X.toarray(), Y.toarray(), metric=cdist_metric)
60 | score = self._eval(func, X, Y, dense=dense)
61 | assert type(score) is np.ndarray
62 | np.testing.assert_allclose(score, expect_score)
63 | # test self-comparison
64 | expect_score = 1.0 - cdist(X.toarray(), X.toarray(), metric=cdist_metric)
65 | score = self._eval(func, X, dense=dense)
66 | np.testing.assert_allclose(score, expect_score)
67 |
68 | @pytest.mark.parametrize("dense", [True, False])
69 | def test_tanimoto_soergel_equal_for_binary(self, dense):
70 | X = _create_random_sparse(10, counts=False)
71 | Y = _create_random_sparse(8, counts=False)
72 | tscore = self._eval(array_metrics.tanimoto, X, Y, dense=dense)
73 | sscore = self._eval(array_metrics.soergel, X, Y, dense=dense)
74 | np.testing.assert_allclose(tscore, sscore)
75 |
76 |
77 | class TestFlexibleMetrics:
78 |
79 | """Tests for flexible comparison metrics"""
80 |
81 | metric_names = ["tanimoto", "soergel", "dice", "cosine", "pearson"]
82 | count_metric_names = ["soergel", "cosine", "pearson"]
83 |
84 | def test_binary_fprint_vs_fprint(self):
85 | fp1 = fprint.Fingerprint.from_vector(
86 | _create_random_sparse(1, counts=False, perc_pos=0.5)
87 | )
88 | fp2 = fprint.Fingerprint.from_vector(
89 | _create_random_sparse(1, counts=False, perc_pos=0.5)
90 | )
91 | for metric_name in self.metric_names:
92 | gen_score = getattr(metrics, metric_name)(fp1, fp2)
93 | fp_score = getattr(fprint_metrics, metric_name)(fp1, fp2)
94 | assert gen_score == pytest.approx(fp_score)
95 | array_score = getattr(array_metrics, metric_name)(
96 | fp1.to_vector(sparse=True), fp2.to_vector(sparse=True)
97 | )
98 | assert gen_score == pytest.approx(array_score[0][0])
99 |
100 | def test_count_fprint_vs_fprint(self):
101 | fp1 = fprint.CountFingerprint.from_vector(
102 | _create_random_sparse(1, nbits=32, counts=True, perc_pos=0.5)
103 | )
104 | fp2 = fprint.CountFingerprint.from_vector(
105 | _create_random_sparse(1, nbits=32, counts=True, perc_pos=0.5)
106 | )
107 | for metric_name in self.count_metric_names:
108 | gen_score = getattr(metrics, metric_name)(fp1, fp2)
109 | fp_score = getattr(fprint_metrics, metric_name)(fp1, fp2)
110 | assert gen_score == pytest.approx(fp_score)
111 | array_score = getattr(array_metrics, metric_name)(
112 | fp1.to_vector(sparse=True), fp2.to_vector(sparse=True)
113 | )
114 | assert gen_score == pytest.approx(array_score[0][0])
115 |
116 | def test_binary_fprint_vs_db(self):
117 | fp_array = _create_random_sparse(1, counts=False, perc_pos=0.5)
118 | fp = fprint.Fingerprint.from_vector(fp_array)
119 | db_array = _create_random_sparse(10, counts=False, perc_pos=0.5)
120 | fp_names = [str(i) for i in range(db_array.shape[0])]
121 | fdb = db.FingerprintDatabase.from_array(
122 | db_array, fp_names, fp_type=fprint.Fingerprint
123 | )
124 | for metric_name in self.metric_names:
125 | gen_score = getattr(metrics, metric_name)(fp, fdb)
126 | array_score = getattr(array_metrics, metric_name)(
127 | fp_array, db_array
128 | )
129 | np.testing.assert_allclose(gen_score, array_score)
130 | gen_score = getattr(metrics, metric_name)(fdb, fp)
131 | np.testing.assert_allclose(gen_score.T, array_score)
132 |
133 | def test_count_fprint_vs_db(self):
134 | fp_array = _create_random_sparse(1, counts=True, perc_pos=0.5)
135 | fp = fprint.CountFingerprint.from_vector(fp_array)
136 | db_array = _create_random_sparse(10, counts=True, perc_pos=0.5)
137 | fp_names = [str(i) for i in range(db_array.shape[0])]
138 | fdb = db.FingerprintDatabase.from_array(
139 | db_array, fp_names, fp_type=fprint.CountFingerprint
140 | )
141 | for metric_name in self.count_metric_names:
142 | gen_score = getattr(metrics, metric_name)(fp, fdb)
143 | array_score = getattr(array_metrics, metric_name)(
144 | fp_array, db_array
145 | )
146 | np.testing.assert_allclose(gen_score, array_score)
147 | # Check if reverse order produces transpose
148 | gen_score = getattr(metrics, metric_name)(fdb, fp)
149 | np.testing.assert_allclose(gen_score.T, array_score)
150 |
151 | def test_binary_db_vs_db(self):
152 | db_array1 = _create_random_sparse(1, counts=False, perc_pos=0.5)
153 | fp_names = [str(i) for i in range(db_array1.shape[0])]
154 | db1 = db.FingerprintDatabase.from_array(
155 | db_array1, fp_names, fp_type=fprint.Fingerprint
156 | )
157 | db_array2 = _create_random_sparse(1, counts=False, perc_pos=0.5)
158 | fp_names = [str(i) for i in range(db_array2.shape[0])]
159 | db2 = db.FingerprintDatabase.from_array(
160 | db_array2, fp_names, fp_type=fprint.Fingerprint
161 | )
162 | for metric_name in self.metric_names:
163 | gen_score = getattr(metrics, metric_name)(db1, db2)
164 | array_score = getattr(array_metrics, metric_name)(
165 | db_array1, db_array2
166 | )
167 | np.testing.assert_allclose(gen_score, array_score)
168 |
169 | def test_count_db_vs_db(self):
170 | db_array1 = _create_random_sparse(1, counts=True, perc_pos=0.5)
171 | fp_names = [str(i) for i in range(db_array1.shape[0])]
172 | db1 = db.FingerprintDatabase.from_array(
173 | db_array1, fp_names, fp_type=fprint.CountFingerprint
174 | )
175 | db_array2 = _create_random_sparse(1, counts=True, perc_pos=0.5)
176 | fp_names = [str(i) for i in range(db_array2.shape[0])]
177 | db2 = db.FingerprintDatabase.from_array(
178 | db_array2, fp_names, fp_type=fprint.CountFingerprint
179 | )
180 | for metric_name in self.count_metric_names:
181 | gen_score = getattr(metrics, metric_name)(db1, db2)
182 | array_score = getattr(array_metrics, metric_name)(
183 | db_array1, db_array2
184 | )
185 | np.testing.assert_allclose(gen_score, array_score)
186 |
--------------------------------------------------------------------------------
/src/e3fp/fingerprint/metrics/array_metrics.py:
--------------------------------------------------------------------------------
1 | """Fingerprint array comparison metrics.
2 |
3 | Each is fully compatible with both dense and sparse inputs.
4 |
5 | Author: Seth Axen
6 | E-mail: seth.axen@gmail.com
7 | """
8 | from __future__ import division
9 |
10 | import numpy as np
11 | import scipy
12 | from scipy.sparse import csr_matrix, issparse, vstack
13 | import scipy.sparse.linalg
14 | import scipy.spatial
15 | from e3fp.util import maybe_jit
16 |
17 |
18 | def tanimoto(X, Y=None):
19 | """Compute the Tanimoto coefficients between `X` and `Y`.
20 |
21 | Data must be binary. This is not checked.
22 |
23 | Parameters
24 | ----------
25 | X : array_like or sparse matrix
26 | with shape (`n_fprints_X`, `n_bits`).
27 | Y : array_like or sparse matrix, optional
28 | with shape (`n_fprints_Y`, `n_bits`).
29 |
30 | Returns
31 | -------
32 | tanimoto : array of shape (`n_fprints_X`, `n_fprints_Y`)
33 |
34 | See Also
35 | --------
36 | soergel: Analog to Tanimoto for non-binary data.
37 | cosine, dice, pearson
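
Notes
-----
For binary vectors x and y, this computes |x AND y| / (|x| + |y| - |x AND y|),
i.e. the number of shared "on" bits divided by the total number of "on" bits
in either vector; pairs of rows with no "on" bits yield a similarity of 0.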
38 | """
39 | X, Y = _check_array_pair(X, Y)
40 | Xbits, Ybits, XYbits = _get_bitcount_arrays(X, Y, return_XYbits=True)
41 | with np.errstate(divide="ignore"): # handle 0 in denominator
42 | return np.asarray(np.nan_to_num(XYbits / (Xbits + Ybits.T - XYbits)))
43 |
44 |
45 | def soergel(X, Y=None):
46 | """Compute the Soergel similarities between `X` and `Y`.
47 |
48 | Soergel similarity is the complement of Soergel distance and can be
49 | thought of as the analog of the Tanimoto coefficient for count/float-based
50 | data. For binary data, it is equivalent to the Tanimoto coefficient.
51 |
52 | Parameters
53 | ----------
54 | X : array_like or sparse matrix
55 | with shape (`n_fprints_X`, `n_bits`).
56 | Y : array_like or sparse matrix, optional
57 | with shape (`n_fprints_Y`, `n_bits`).
58 |
59 | Returns
60 | -------
61 | soergel : array of shape (`n_fprints_X`, `n_fprints_Y`)
62 |
63 | Notes
64 | -----
65 | If Numba is available, this function is jit-compiled and much more efficient.
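
For fingerprint vectors x and y, the value computed here is
1 - sum_i |x_i - y_i| / sum_i max(x_i, y_i); rows whose denominator is zero
yield a similarity of 0.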
66 |
67 | See Also
68 | --------
69 | tanimoto: A fast version of this function for binary data.
70 | pearson: Pearson correlation, also appropriate for non-binary data.
71 | cosine, dice
72 | """
73 | X, Y = _check_array_pair(X, Y)
74 | S = np.empty((X.shape[0], Y.shape[0]), dtype=float)
75 | if issparse(X):
76 | return _sparse_soergel(X.data, X.indices, X.indptr,
77 | Y.data, Y.indices, Y.indptr, S)
78 | return _dense_soergel(X, Y, S)
79 |
80 | def dice(X, Y=None):
81 | """Compute the Dice coefficients between `X` and `Y`.
82 |
83 | Data must be binary. This is not checked.
84 |
85 | Parameters
86 | ----------
87 | X : array_like or sparse matrix
88 | with shape (`n_fprints_X`, `n_bits`).
89 | Y : array_like or sparse matrix, optional
90 | with shape (`n_fprints_Y`, `n_bits`).
91 |
92 | Returns
93 | -------
94 | dice : array of shape (`n_fprints_X`, `n_fprints_Y`)
95 |
96 | See Also
97 | --------
98 | cosine, soergel, tanimoto, pearson
99 | """
100 | X, Y = _check_array_pair(X, Y)
101 | Xbits, Ybits, XYbits = _get_bitcount_arrays(X, Y, return_XYbits=True)
102 | with np.errstate(divide="ignore"): # handle 0 in denominator
103 | return np.asarray(np.nan_to_num(2 * XYbits / (Xbits + Ybits.T)))
104 |
105 |
106 | def cosine(X, Y=None, assume_binary=False):
107 | """Compute the Cosine similarities between `X` and `Y`.
108 |
109 | Parameters
110 | ----------
111 | X : array_like or sparse matrix
112 | with shape (`n_fprints_X`, `n_bits`).
113 | Y : array_like or sparse matrix, optional
114 | with shape (`n_fprints_Y`, `n_bits`).
115 | assume_binary : bool, optional
116 | Assume data is binary (results in efficiency boost). If data is not
117 | binary, the result will be incorrect.
118 |
119 | Returns
120 | -------
121 | cosine : array of shape (`n_fprints_X`, `n_fprints_Y`)
122 |
123 | See Also
124 | --------
125 | dice, soergel, tanimoto
126 | """
127 | X, Y = _check_array_pair(X, Y)
128 | if not issparse(X):
129 | return 1.0 - scipy.spatial.distance.cdist(X, Y, metric="cosine")
130 | if assume_binary:
131 | Xbits, Ybits, XYbits = _get_bitcount_arrays(X, Y, return_XYbits=True)
132 | with np.errstate(divide="ignore"): # handle 0 in denominator
133 | return np.asarray(np.nan_to_num(XYbits / np.sqrt(Xbits * Ybits.T)))
134 | else:
135 | return _sparse_cosine(X, Y)
136 |
137 |
138 | def pearson(X, Y=None):
139 | """Compute the Pearson correlation between `X` and `Y`.
140 |
141 | Parameters
142 | ----------
143 | X : array_like or sparse matrix
144 | with shape (`n_fprints_X`, `n_bits`).
145 | Y : array_like or sparse matrix, optional
146 | with shape (`n_fprints_Y`, `n_bits`).
147 |
148 | Returns
149 | -------
150 | pearson : array of shape (`n_fprints_X`, `n_fprints_Y`)
151 |
152 |
153 | See Also
154 | --------
155 | soergel: Soergel similarity for non-binary data
156 | cosine, dice, tanimoto
157 | """
158 | X, Y = _check_array_pair(X, Y)
159 | Xlen = X.shape[0]
160 | if issparse(X):
161 | X = vstack((X, Y), format="csr")
162 | X = X - X.mean(axis=1)
163 | cov = (X * X.T) / (X.shape[1] - 1.0)
164 | d = np.sqrt(np.diag(cov))
165 | with np.errstate(divide="ignore"): # handle 0 in denominator
166 | pearson = cov / np.outer(d, d)
167 | else:
168 | with np.errstate(divide="ignore"): # handle 0 in denominator
169 | pearson = np.corrcoef(X, Y)
170 | return np.asarray(np.nan_to_num(pearson[:Xlen, Xlen:]))
171 |
172 |
173 | def _check_array(arr, dtype=float, force_sparse=False):
174 | if force_sparse or issparse(arr):
175 | return csr_matrix(arr, copy=False, dtype=dtype)
176 | else:
177 | return arr.astype(dtype, copy=False)
178 |
179 |
180 | def _check_array_pair(X, Y=None, dtype=float, force_sparse=False):
181 | if Y is not None and X.shape[1] != Y.shape[1]:
182 | raise ValueError("Arrays must have same width.")
183 | if force_sparse or issparse(X) or issparse(Y):
184 | force_sparse = True # ensure if one is sparse, all are sparse.
185 | X = _check_array(X, dtype=dtype, force_sparse=force_sparse)
186 | if Y is None or Y is X:
187 | Y = X
188 | else:
189 | Y = _check_array(Y, dtype=dtype, force_sparse=force_sparse)
190 | return X, Y
191 |
192 |
193 | def _get_bitcount_arrays(X, Y, return_XYbits=False):
194 | if issparse(X):
195 | Xbits = np.sum(X, axis=1)
196 | if Y is X:
197 | Ybits = Xbits
198 | else:
199 | Ybits = np.sum(Y, axis=1)
200 | if return_XYbits:
201 | XYbits = (X * Y.T).toarray()
202 | return Xbits, Ybits, XYbits
203 | else:
204 | Xbits = np.sum(X, axis=1, keepdims=True)
205 | if Y is X:
206 | Ybits = Xbits
207 | else:
208 | Ybits = np.sum(Y, axis=1, keepdims=True)
209 | if return_XYbits:
210 | XYbits = np.dot(X, Y.T)
211 | return Xbits, Ybits, XYbits
212 | return Xbits, Ybits
213 |
214 |
215 | def _sparse_cosine(X, Y):
216 | Xnorm = scipy.sparse.linalg.norm(X, axis=1)
217 | if Y is X:
218 | Ynorm = Xnorm
219 | else:
220 | Ynorm = scipy.sparse.linalg.norm(Y, axis=1)
221 | XY = (X * Y.T).toarray()
222 | with np.errstate(divide="ignore"): # handle 0 in denominator
223 | return np.nan_to_num(XY / np.outer(Xnorm, Ynorm))
224 |
225 | @maybe_jit(nopython=True, nogil=True, cache=True)
226 | def _dense_soergel(X, Y, S):
227 | for ix in range(S.shape[0]):
228 | for iy in range(S.shape[1]):
229 | sum_abs_diff = 0
230 | sum_max = 0
231 | for j in range(X.shape[1]):
232 | diff = X[ix, j] - Y[iy, j]
233 | if diff > 0:
234 | sum_abs_diff += diff
235 | sum_max += X[ix, j]
236 | else:
237 | sum_abs_diff -= diff
238 | sum_max += Y[iy, j]
239 |
240 | if sum_max == 0:
241 | S[ix, iy] = 0
242 | continue
243 | S[ix, iy] = 1 - sum_abs_diff / sum_max
244 | return S
245 |
246 | @maybe_jit(nopython=True, nogil=True, cache=True)
247 | def _sparse_soergel(Xdata, Xindices, Xindptr, Ydata, Yindices, Yindptr, S):
248 | for ix in range(S.shape[0]):
249 | if Xindptr[ix] == Xindptr[ix + 1]:
250 | for iy in range(S.shape[1]): # no X values in row
251 | S[ix, iy] = 0
252 | continue
253 | jxindmax = Xindptr[ix + 1] - 1
254 | for iy in range(S.shape[1]):
255 | if Yindptr[iy] == Yindptr[iy + 1]: # no Y values in row
256 | S[ix, iy] = 0
257 | continue
258 |
259 | sum_abs_diff = 0
260 | sum_max = 0
261 | # Implementation of the final step of merge sort
262 | jyindmax = Yindptr[iy + 1] - 1
263 | jx = Xindptr[ix]
264 | jy = Yindptr[iy]
265 | while jx <= jxindmax and jy <= jyindmax:
266 | jxind = Xindices[jx]
267 | jyind = Yindices[jy]
268 | if jxind < jyind:
269 | sum_max += Xdata[jx]
270 | sum_abs_diff += Xdata[jx]
271 | jx += 1
272 | elif jyind < jxind:
273 | sum_max += Ydata[jy]
274 | sum_abs_diff += Ydata[jy]
275 | jy += 1
276 | else:
277 | diff = Xdata[jx] - Ydata[jy]
278 | if diff > 0:
279 | sum_abs_diff += diff
280 | sum_max += Xdata[jx]
281 | else:
282 | sum_abs_diff -= diff
283 | sum_max += Ydata[jy]
284 | jx += 1
285 | jy += 1
286 |
287 | while jx <= jxindmax:
288 | sum_max += Xdata[jx]
289 | sum_abs_diff += Xdata[jx]
290 | jx += 1
291 |
292 | while jy <= jyindmax:
293 | sum_max += Ydata[jy]
294 | sum_abs_diff += Ydata[jy]
295 | jy += 1
296 |
297 | if sum_max == 0:
298 | S[ix, iy] = 0
299 | continue
300 | S[ix, iy] = 1 - sum_abs_diff / sum_max
301 | return S
302 |
--------------------------------------------------------------------------------
/src/e3fp/fingerprint/array_ops.py:
--------------------------------------------------------------------------------
1 | """Various array operations.
2 |
3 | Author: Seth Axen
4 | E-mail: seth.axen@gmail.com
5 | """
6 | import numpy as np
7 | from scipy.spatial.distance import pdist, squareform
8 |
9 | QUATERNION_DTYPE = float
10 | X_AXIS, Y_AXIS, Z_AXIS = np.identity(3, dtype=float)
11 | EPS = 1e-12 # epsilon, a number close to 0
12 |
13 |
14 | # Vector Algebra Methods
15 | def as_unit(v, axis=1):
16 | """Return array of unit vectors parallel to vectors in `v`.
17 |
18 | Parameters
19 | ----------
20 | v : ndarray of float
21 | axis : int, optional
22 | Axis along which to normalize length.
23 |
24 | Returns
25 | -------
26 | ndarray of float : Unit vector of `v`, i.e. `v` divided by its
27 | magnitude along `axis`.
28 | """
29 | u = np.array(v, dtype=float, copy=True)
30 | if u.ndim == 1:
31 | sqmag = u.dot(u)
32 | if sqmag >= EPS:
33 | u /= sqmag ** 0.5
34 | else:
35 | if axis == 1:
36 | sqmag = np.einsum("...ij,...ij->...i", u, u)
37 | else:
38 | sqmag = np.einsum("...ij,...ij->...j", u, u)
39 |
40 | sqmag[sqmag < EPS] = 1.0
41 | u /= np.expand_dims(np.sqrt(sqmag), axis)
42 | return u
43 |
44 |
45 | def make_distance_matrix(coords):
46 | """Build pairwise distance matrix from coordinates.
47 |
48 | Parameters
49 | ----------
50 | coords : ndarray of float
51 | an Mx3 array of cartesian coordinates.
52 |
53 | Returns
54 | -------
55 | ndarray of float : square symmetrical distance matrix
56 | """
57 | return squareform(pdist(coords))
58 |
59 |
60 | def make_transform_matrix(center, y=None, z=None):
61 | """Make 4x4 homogenous transformation matrix.
62 |
63 | Given an Nx4 array A where A[:, 3] = 1., the transform matrix M should be
64 | used with dot(M, A.T).T. Order of operations is 1. translation, 2. align
65 | `y` x `z` plane to yz-plane, 3. align `y` to y-axis.
66 |
67 | Parameters
68 | ----------
69 | center : 1x3 array of float
70 | Coordinate that should be centered after transformation.
71 | y : None or 1x3 array of float
72 | Vector that should lie on the y-axis after transformation
73 | z : None or 1x3 array of float
74 | Vector that after transformation should lie on yz-plane in direction
75 | of z-axis.
76 |
77 | Returns
78 | -------
79 | 4x4 array of float
80 | 4x4 homogenous transformation matrix.
81 | """
82 | translate = np.identity(4, dtype=float)
83 | translate[:3, 3] = -np.asarray(center, dtype=float)
84 | if y is not None:
85 | y = np.atleast_2d(y)
86 | if z is None:
87 | rotate = np.identity(4, dtype=float)
88 | rotate[:3, :3] = make_rotation_matrix(y, Y_AXIS)
89 | else:
90 | z = np.atleast_2d(z)
91 | rotate_norm = np.identity(4, dtype=float)
92 | x_unit = as_unit(np.cross(y, z))
93 | rotate_norm[:3, :3] = make_rotation_matrix(x_unit, X_AXIS)
94 | new_y = np.dot(rotate_norm[:3, :3], y.flatten())
95 | rotate_y = np.identity(4, dtype=float)
96 | rotate_y[:3, :3] = make_rotation_matrix(new_y.flatten(), Y_AXIS)
97 | rotate = np.dot(rotate_y, rotate_norm)
98 | transform = np.dot(rotate, translate)
99 | else:
100 | transform = translate
101 | return transform
102 |
103 |
104 | def make_rotation_matrix(v0, v1):
105 | """Create 3x3 matrix of rotation from `v0` onto `v1`.
106 |
107 | Should be used by dot(R, v0.T).T.
108 |
109 | Parameters
110 | ----------
111 | v0 : 1x3 array of float
112 | Initial vector before alignment.
113 | v1 : 1x3 array of float
114 | Vector to which to align `v0`.
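
Notes
-----
This applies Rodrigues' rotation formula: with unit vectors u0 and u1,
sin_ang = ||u0 x u1||, cos_ang = u0 . u1, and unit axis
u = (u0 x u1) / sin_ang, the returned matrix is
R = cos_ang * I + sin_ang * [u]_x + (1 - cos_ang) * outer(u, u),
where [u]_x is the skew-symmetric cross-product matrix of u. If the cross
product of `v0` and `v1` is zero, the identity matrix is returned.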
115 | """
116 | v0 = as_unit(v0)
117 | v1 = as_unit(v1)
118 | u = np.cross(v0.ravel(), v1.ravel())
119 | if np.all(u == 0.0):
120 | return np.identity(3, dtype=float)
121 | sin_ang = u.dot(u) ** 0.5
122 | u /= sin_ang
123 | cos_ang = np.dot(v0, v1.T)
124 | # fmt: off
125 | ux = np.array([[ 0., -u[2], u[1]],
126 | [ u[2], 0., -u[0]],
127 | [-u[1], u[0], 0.]], dtype=float)
128 | # fmt: on
129 | rot = (
130 | cos_ang * np.identity(3, dtype=float)
131 | + sin_ang * ux
132 | + (1 - cos_ang) * np.outer(u, u)
133 | )
134 | return rot
135 |
136 |
137 | def transform_array(transform_matrix, a):
138 | """Pad an array with 1s, transform, and return with original dimensions.
139 |
140 | Parameters
141 | ----------
142 | transform_matrix : 4x4 array of float
143 | 4x4 homogenous transformation matrix
144 | a : Nx3 array of float
145 | Array of 3-D coordinates.
146 |
147 | Returns
148 | -------
149 | Nx3 array of float : Transformed array
150 | """
151 | return unpad_array(np.dot(transform_matrix, pad_array(a).T).T)
152 |
153 |
154 | def pad_array(a, n=1.0, axis=1):
155 | """Return `a` with row of `n` appended to `axis`.
156 |
157 | Parameters
158 | ----------
159 | a : ndarray
160 | Array to pad
161 | n : float or int, optional
162 | Value to pad `a` with
163 | axis : int, optional
164 | Axis of `a` to pad with `n`.
165 |
166 | Returns
167 | -------
168 | ndarray
169 | Padded array.
170 | """
171 | if a.ndim == 1:
172 | pad = np.ones(a.shape[0] + 1, dtype=a.dtype) * n
173 | pad[: a.shape[0]] = a
174 | else:
175 | shape = list(a.shape)
176 | shape[axis] += 1
177 | pad = np.ones(shape, dtype=a.dtype)
178 | pad[: a.shape[0], : a.shape[1]] = a
179 | return pad
180 |
181 |
182 | def unpad_array(a, axis=1):
183 | """Return `a` with row removed along `axis`.
184 |
185 | Parameters
186 | ----------
187 | a : ndarray
188 | Array from which to remove row
189 | axis : int, optional
190 | Axis from which to remove row
191 |
192 | Returns
193 | -------
194 | ndarray
195 | Unpadded array.
196 | """
197 | if a.ndim == 1:
198 | return a[:-1]
199 | else:
200 | shape = list(a.shape)
201 | shape[axis] -= 1
202 | return a[: shape[0], : shape[1]]
203 |
204 |
205 | def project_to_plane(vec_arr, norm):
206 | """Project array of vectors to plane with normal `norm`.
207 |
208 | Parameters
209 | ----------
210 | vec_arr : Nx3 array
211 | Array of N 3D vectors.
212 | norm : 1x3 array
213 | Normal vector to plane.
214 |
215 | Returns
216 | -------
217 | Nx3 array
218 | Array of vectors projected onto plane.
219 | """
220 | unit_norm = as_unit(norm).flatten()
221 | mag_on_norm = np.dot(vec_arr, unit_norm)
222 | if vec_arr.ndim == 1:
223 | vec_on_norm = np.array(unit_norm, copy=True)
224 | vec_on_norm *= mag_on_norm
225 | else:
226 | vec_on_norm = np.tile(unit_norm, (vec_arr.shape[0], 1))
227 | vec_on_norm *= mag_on_norm[:, None]
228 | return vec_arr - vec_on_norm
229 |
230 |
231 | def calculate_angles(vec_arr, ref, ref_norm=None):
232 | """Calculate angles between vectors in `vec_arr` and `ref` vector.
233 |
234 | If `ref_norm` is not provided, angle ranges between 0 and pi. If it is
235 | provided, angle ranges between 0 and 2pi. Note that if `ref_norm` is
236 | orthogonal to `vec_arr` and `ref`, then the angle is rotation around the
237 | axis, but if a non-orthogonal axis is provided, this may not be the case.
238 |
239 | Parameters
240 | ----------
241 | vec_arr : Nx3 array of float
242 | Array of N 3D vectors.
243 | ref : 1x3 array of float
244 | Reference vector
245 | ref_norm : 1x3 array of float
246 | Normal vector.
247 |
248 | Returns
249 | -------
250 | 1-D array
251 | Array of N angles
252 | """
253 | unit_vec_arr = as_unit(vec_arr)
254 | unit_ref = as_unit(ref).flatten()
255 | ang = np.arccos(np.clip(np.dot(unit_vec_arr, unit_ref), -1.0, 1.0))
256 | # handle cases where a vector is the origin
257 | ang[np.all(unit_vec_arr == np.zeros(3), axis=1)] = 0.0
258 | if ref_norm is not None:
259 | sign = np.sign(
260 | np.dot(ref_norm, np.cross(unit_vec_arr, unit_ref).T)
261 | ).flatten()
262 | sign[sign == 0] = 1
263 | ang = rotate_angles(sign * ang, 2 * np.pi)
264 | return ang
265 |
266 |
267 | def rotate_angles(angles, amount):
268 | """Rotate angles by `amount`, keeping in 0 to 2pi range.
269 |
270 | Parameters
271 | ----------
272 | angles : 1-D array of float
273 | Angles in radians
274 | amount : float
275 | Amount to rotate angles by
276 |
277 | Returns
278 | -------
279 | 1-D array of float : Rotated angles
280 | """
281 | return (angles + amount) % (2 * np.pi)
282 |
283 |
284 | def quaternion_to_transform_matrix(quaternion, translation=np.zeros(3)):
285 | """Convert quaternion to homogenous 4x4 transform matrix.
286 |
287 | Parameters
288 | ----------
289 | quaternion : 4x1 array of float
290 | Quaternion describing rotation after translation.
291 | translation : 3x1 array of float, optional
292 | Translation to be performed before rotation.
293 | """
294 | q = np.array(quaternion, dtype=float, copy=True)
295 | n = np.linalg.norm(q)
296 | if n < 1e-12:
297 | return np.identity(4, dtype=float)
298 | q /= n
299 | q = 2 * np.outer(q, q)
300 | # fmt: off
301 | transform_mat = np.array(
302 | [[1.-q[2, 2]-q[3, 3], q[1, 2]-q[3, 0], q[1, 3]+q[2, 0], 0.],
303 | [ q[1, 2]+q[3, 0], 1.-q[1, 1]-q[3, 3], q[2, 3]-q[1, 0], 0.],
304 | [ q[1, 3]-q[2, 0], q[2, 3]+q[1, 0], 1.-q[1, 1]-q[2, 2], 0.],
305 | [ 0., 0., 0., 1.]],
306 | dtype=float
307 | )
308 | # fmt: on
309 | transform_mat[:3, 3] = translation
310 | return transform_mat
311 |
312 |
313 | def transform_matrix_to_quaternion(transform_matrix, dtype=QUATERNION_DTYPE):
314 | """Convert homogenous 4x4 transform matrix to quaternion.
315 |
316 | Parameters
317 | ----------
318 | transform_matrix : 4x4 array of float
319 | Homogenous transformation matrix.
320 | dtype : numpy dtype, optional
321 | Datatype for returned quaternion.
322 | """
323 | T = np.array(transform_matrix, dtype=float)
324 | R = T[:3, :3]
325 | q = np.zeros(4, dtype=dtype)
326 | q[0] = np.sqrt(1.0 + R.trace()) / 2.0
327 | q[1] = R[2, 1] - R[1, 2]
328 | q[2] = R[0, 2] - R[2, 0]
329 | q[3] = R[1, 0] - R[0, 1]
330 | q[1:4] /= 4.0 * q[0]
331 | return q
332 |
--------------------------------------------------------------------------------
/tests/test_struct.py:
--------------------------------------------------------------------------------
1 | """Tests for Shell and Substruct objects.
2 |
3 | Author: Seth Axen
4 | E-mail: seth.axen@gmail.com
5 | """
6 | import os
7 | import pytest
8 |
9 | DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
10 | PLANAR_SDF_FILE = os.path.join(DATA_DIR, "caffeine_planar.sdf.bz2")
11 |
12 |
13 | class TestShellCreation:
14 | def test_error_when_center_not_atom(self):
15 | from e3fp.fingerprint.structs import Shell
16 |
17 | with pytest.raises(TypeError):
18 | Shell(None)
19 |
20 | def test_error_when_shells_has_non_shell(self):
21 | from e3fp.fingerprint.structs import Shell
22 |
23 | atom = 0
24 | shells = [None]
25 | with pytest.raises(TypeError):
26 | Shell(atom, shells)
27 |
28 | def test_creation_with_atoms_or_ids_equivalent(self):
29 | from e3fp.fingerprint.structs import Shell
30 | from e3fp.conformer.util import mol_from_sdf
31 |
32 | mol = mol_from_sdf(PLANAR_SDF_FILE)
33 | atoms = list(mol.GetAtoms())
34 | atom_ids = [x.GetIdx() for x in atoms]
35 | assert Shell(atoms[0], atoms[1:]) == Shell(atom_ids[0], atom_ids[1:])
36 |
37 | def test_create_shell_no_shell(self):
38 | from e3fp.fingerprint.structs import Shell
39 | from e3fp.conformer.util import mol_from_sdf
40 |
41 | mol = mol_from_sdf(PLANAR_SDF_FILE)
42 | atoms = list(mol.GetAtoms())
43 | center_atom = atoms[0]
44 | Shell(center_atom)
45 |
46 | def test_create_shell_with_same_center_fails(self):
47 | from e3fp.fingerprint.structs import Shell, FormatError
48 | from e3fp.conformer.util import mol_from_sdf
49 |
50 | mol = mol_from_sdf(PLANAR_SDF_FILE)
51 | atoms = list(mol.GetAtoms())
52 | center_atom = atoms[0]
53 | with pytest.raises(FormatError):
54 | Shell(center_atom, atoms)
55 |
56 | def test_atoms_converted_to_shells(self):
57 | from e3fp.fingerprint.structs import Shell
58 | from e3fp.conformer.util import mol_from_sdf
59 |
60 | mol = mol_from_sdf(PLANAR_SDF_FILE)
61 | atoms = list(mol.GetAtoms())
62 | center_atom = atoms[0]
63 | shell = Shell(center_atom, atoms[1:])
64 | for s in shell.shells:
65 | assert isinstance(s, Shell)
66 |
67 | def test_creation_with_atoms_or_shells_equal(self):
68 | from e3fp.fingerprint.structs import Shell
69 | from e3fp.conformer.util import mol_from_sdf
70 |
71 | mol = mol_from_sdf(PLANAR_SDF_FILE)
72 | atoms = list(mol.GetAtoms())
73 | shells = list(map(Shell, atoms))
74 | center_atom = atoms[0]
75 | shell1 = Shell(center_atom, atoms[1:])
76 | shell2 = Shell(center_atom, shells[1:])
77 | assert shell1 == shell2
78 |
79 | def test_recursive_atom_shells_correct(self):
80 | from e3fp.fingerprint.structs import Shell
81 | from e3fp.conformer.util import mol_from_sdf
82 |
83 | mol = mol_from_sdf(PLANAR_SDF_FILE)
84 | atoms = list(mol.GetAtoms())
85 | shell1 = Shell(atoms[5], atoms[6:8])
86 | shell2 = Shell(atoms[2], atoms[3:5])
87 | shell = Shell(atoms[0], (shell1, shell2))
88 | assert shell.atoms == {x.GetIdx() for x in (atoms[0], atoms[2], atoms[5])}
89 |
90 |
91 | class TestShellComparison:
92 | def test_shells_same_center_same_atoms_equal(self):
93 | from e3fp.fingerprint.structs import Shell
94 | from e3fp.conformer.util import mol_from_sdf
95 |
96 | mol = mol_from_sdf(PLANAR_SDF_FILE)
97 | atoms = list(mol.GetAtoms())
98 | center_atom = atoms[0]
99 | shell1 = Shell(center_atom, atoms[1:])
100 | shell2 = Shell(center_atom, atoms[1:])
101 | assert shell1 == shell2
102 |
103 | def test_shells_diff_center_same_atoms_nonequal(self):
104 | from e3fp.fingerprint.structs import Shell
105 | from e3fp.conformer.util import mol_from_sdf
106 |
107 | mol = mol_from_sdf(PLANAR_SDF_FILE)
108 | atoms = list(mol.GetAtoms())
109 | shell1 = Shell(atoms[0], atoms[2:])
110 | shell2 = Shell(atoms[1], atoms[2:])
111 | assert shell1 != shell2
112 |
113 | def test_shells_same_center_diff_atoms_nonequal(self):
114 | from e3fp.fingerprint.structs import Shell
115 | from e3fp.conformer.util import mol_from_sdf
116 |
117 | mol = mol_from_sdf(PLANAR_SDF_FILE)
118 | atoms = list(mol.GetAtoms())
119 | center_atom = atoms[0]
120 | shell1 = Shell(center_atom, atoms[1:])
121 | shell2 = Shell(center_atom, atoms[2:])
122 | assert shell1 != shell2
123 |
124 | def test_equal_shells_hash_to_same_value(self):
125 | from e3fp.fingerprint.structs import Shell
126 | from e3fp.conformer.util import mol_from_sdf
127 |
128 | mol = mol_from_sdf(PLANAR_SDF_FILE)
129 | atoms = list(mol.GetAtoms())
130 | center_atom = atoms[0]
131 | shell1 = Shell(center_atom, atoms[1:])
132 | shell2 = Shell(center_atom, atoms[1:])
133 | assert hash(shell1) == hash(shell2)
134 |
135 | def test_same_shell_hashes_to_same_value(self):
136 | from e3fp.fingerprint.structs import Shell
137 | from e3fp.conformer.util import mol_from_sdf
138 |
139 | mol = mol_from_sdf(PLANAR_SDF_FILE)
140 | atoms = list(mol.GetAtoms())
141 | center_atom = atoms[0]
142 | shell = Shell(center_atom, atoms[1:])
143 | assert hash(shell) == hash(shell)
144 |
145 |
146 | class TestShellSubstructInterface:
147 | def test_recursive_shell_substruct_correct1(self):
148 | from e3fp.fingerprint.structs import Shell
149 | from e3fp.conformer.util import mol_from_sdf
150 |
151 | mol = mol_from_sdf(PLANAR_SDF_FILE)
152 | atoms = list(mol.GetAtoms())
153 | shell1 = Shell(atoms[5], atoms[6:8])
154 | shell2 = Shell(atoms[1], atoms[2:5])
155 | shell = Shell(atoms[0], (shell1, shell2))
156 | assert shell.substruct.atoms == {x.GetIdx() for x in atoms[:8]}
157 |
158 | def test_recursive_shell_substruct_correct2(self):
159 | from e3fp.fingerprint.structs import Shell
160 | from e3fp.conformer.util import mol_from_sdf
161 |
162 | mol = mol_from_sdf(PLANAR_SDF_FILE)
163 | atoms = list(mol.GetAtoms())
164 | shell1 = Shell(atoms[1], atoms[2:5])
165 | shell2 = Shell(atoms[5], {shell1})
166 | shell3 = Shell(atoms[6], atoms[7:10])
167 | shell4 = Shell(atoms[10], {shell3})
168 | shell = Shell(atoms[0], (shell2, shell4))
169 | assert shell.substruct.atoms == {x.GetIdx() for x in atoms[:11]}
170 |
171 | def test_shell_creation_from_substruct_without_center_fails(self):
172 | from e3fp.fingerprint.structs import Shell, Substruct, FormatError
173 | from e3fp.conformer.util import mol_from_sdf
174 |
175 | mol = mol_from_sdf(PLANAR_SDF_FILE)
176 | atoms = list(mol.GetAtoms())
177 | substruct = Substruct(None, atoms[:2])
178 | with pytest.raises(FormatError):
179 | Shell.from_substruct(substruct)
180 |
181 | def test_shell_creation_from_substruct(self):
182 | from e3fp.fingerprint.structs import Shell, Substruct
183 | from e3fp.conformer.util import mol_from_sdf
184 |
185 | mol = mol_from_sdf(PLANAR_SDF_FILE)
186 | atoms = list(mol.GetAtoms())
187 | substruct = Substruct(atoms[0], atoms[:2])
188 | shell = Shell.from_substruct(substruct)
189 | assert shell.atoms == substruct.atoms
190 |
191 | def test_substruct_creation_from_shell(self):
192 | from e3fp.fingerprint.structs import Shell, Substruct
193 | from e3fp.conformer.util import mol_from_sdf
194 |
195 | mol = mol_from_sdf(PLANAR_SDF_FILE)
196 | atoms = list(mol.GetAtoms())
197 | shell = Shell(atoms[0], atoms[1:])
198 | substruct = Substruct.from_shell(shell)
199 | assert shell.substruct == substruct
200 |
201 |
202 | class TestSubstructCreation:
203 | def test_error_when_center_not_atom(self):
204 | from e3fp.fingerprint.structs import Substruct
205 |
206 | with pytest.raises(TypeError):
207 | Substruct("foo")
208 |
209 | def test_error_when_atoms_has_non_atom(self):
210 | from e3fp.fingerprint.structs import Substruct
211 |
212 | atoms = [None]
213 | with pytest.raises(TypeError):
214 | Substruct(atoms=atoms)
215 |
216 | def test_center_atom_auto_added_to_atoms(self):
217 | from e3fp.fingerprint.structs import Substruct
218 | from e3fp.conformer.util import mol_from_sdf
219 |
220 | mol = mol_from_sdf(PLANAR_SDF_FILE)
221 | atoms = list(mol.GetAtoms())
222 | center_atom = atoms[0]
223 | substruct = Substruct(center_atom, atoms[1:])
224 | assert center_atom.GetIdx() in substruct.atoms
225 |
226 |
227 | class TestSubstructCreationComparison:
228 | def test_substructs_same_center_same_atoms_equal(self):
229 | from e3fp.fingerprint.structs import Substruct
230 | from e3fp.conformer.util import mol_from_sdf
231 |
232 | mol = mol_from_sdf(PLANAR_SDF_FILE)
233 | atoms = list(mol.GetAtoms())
234 | center_atom = atoms[0]
235 | substruct1 = Substruct(center_atom, atoms)
236 | substruct2 = Substruct(center_atom, atoms)
237 | assert substruct1 == substruct2
238 |
239 | def test_substructs_diff_center_same_atoms_equal(self):
240 | from e3fp.fingerprint.structs import Substruct
241 | from e3fp.conformer.util import mol_from_sdf
242 |
243 | mol = mol_from_sdf(PLANAR_SDF_FILE)
244 | atoms = list(mol.GetAtoms())
245 | substruct1 = Substruct(atoms[0], atoms)
246 | substruct2 = Substruct(atoms[1], atoms)
247 | assert substruct1 == substruct2
248 |
249 | def test_substructs_same_center_diff_atoms_nonequal(self):
250 | from e3fp.fingerprint.structs import Substruct
251 | from e3fp.conformer.util import mol_from_sdf
252 |
253 | mol = mol_from_sdf(PLANAR_SDF_FILE)
254 | atoms = list(mol.GetAtoms())
255 | substruct1 = Substruct(atoms[0], atoms[1:])
256 | substruct2 = Substruct(atoms[0], atoms[2:])
257 | assert substruct1 != substruct2
258 |
259 | def test_equal_substructs_hash_to_same_value(self):
260 | from e3fp.fingerprint.structs import Substruct
261 | from e3fp.conformer.util import mol_from_sdf
262 |
263 | mol = mol_from_sdf(PLANAR_SDF_FILE)
264 | atoms = list(mol.GetAtoms())
265 | center_atom = atoms[0]
266 | substruct1 = Substruct(center_atom, atoms[1:])
267 | substruct2 = Substruct(center_atom, atoms[1:])
268 | assert hash(substruct1) == hash(substruct2)
269 |
270 | def test_same_substruct_hashes_to_same_value(self):
271 | from e3fp.fingerprint.structs import Substruct
272 | from e3fp.conformer.util import mol_from_sdf
273 |
274 | mol = mol_from_sdf(PLANAR_SDF_FILE)
275 | atoms = list(mol.GetAtoms())
276 | center_atom = atoms[0]
277 | substruct = Substruct(center_atom, atoms[1:])
278 | assert hash(substruct) == hash(substruct)
279 |
--------------------------------------------------------------------------------
/src/e3fp/fingerprint/structs.py:
--------------------------------------------------------------------------------
1 | """Class for defining 3D atom environments.
2 |
3 | Author: Seth Axen
4 | E-mail: seth.axen@gmail.com
5 | """
6 | from __future__ import division, print_function
7 | from functools import reduce
8 |
9 | import numpy as np
10 | import rdkit.Chem
11 |
12 | import smart_open
13 | from e3fp.fingerprint import array_ops
14 |
15 |
16 | PDB_LINE = (
17 | "HETATM{atom_id:>5d} {name:<4s} LIG A 1 "
18 | "{coord[0]:>8.3f}{coord[1]:>8.3f}{coord[2]:>8.3f}"
19 | "{occupancy:>6.2f}{temp:>6.2f} {elem:>2s}{charge:>2s}"
20 | )
21 |
22 |
23 | class Shell(object):
24 | """A container for other Shells centered on an atom.
25 |
26 | Shells represent all atoms explicitly within a container. Atoms are
27 | represented by their ids. If atoms are provided instead of shells, they
28 | are converted to single-atom shells. A Substruct is generated from a Shell
29 | on the fly by recursion through member shells. An optional identifier may
30 | be set.
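
For example, a shell may be built directly from atom ids::

    shell = Shell(0, [1, 2])  # center atom 0 with member atoms 1 and 2
    shell.atoms               # {0, 1, 2}
    shell.substruct.atoms     # frozenset({0, 1, 2})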
31 | """
32 |
33 | def __init__(
34 | self,
35 | center_atom,
36 | shells=set(),
37 | radius=None,
38 | last_shell=None,
39 | identifier=None,
40 | ):
41 | if isinstance(center_atom, rdkit.Chem.Atom):
42 | center_atom = center_atom.GetIdx()
43 | elif not isinstance(center_atom, (int, np.integer)):
44 | raise TypeError("center_atom must be Atom or atom id")
45 | self._center_atom = center_atom
46 |
47 | self._shells = set()
48 | for shell in shells:
49 | if isinstance(shell, int):
50 | shell = Shell(shell)
51 | elif isinstance(shell, rdkit.Chem.Atom):
52 | shell = Shell(shell.GetIdx())
53 | elif not isinstance(shell, Shell):
54 | raise TypeError("shells must be Shells, Atoms, or atom ids")
55 | if shell.center_atom == self.center_atom:
56 | raise FormatError(
57 | "member shells cannot be centered on same "
58 | "center_atom as new shell"
59 | )
60 | self._shells.add(shell)
61 | self._shells = frozenset(self._shells)
62 |
63 | self.radius = radius
64 | self.last_shell = last_shell
65 | self.atoms = None
66 | self.substruct = None
67 | self.identifier = identifier
68 | self.is_duplicate = False
69 | self.duplicate = None
70 |
71 | @classmethod
72 | def from_substruct(cls, substruct):
73 | """Create shell with one shell for each atom in the substruct."""
74 | if substruct.center_atom is None:
75 | raise FormatError(
76 | "Can only create Shell from Substruct if "
77 | "center_atom is defined"
78 | )
79 | atoms = substruct.atoms ^ {substruct.center_atom}
80 | return cls(substruct.center_atom, [Shell(x) for x in atoms])
81 |
82 | @property
83 | def center_atom(self):
84 | return self._center_atom
85 |
86 | @property
87 | def shells(self):
88 | return self._shells
89 |
90 | @property
91 | def atoms(self):
92 | """Get all atoms explicitly within the shell."""
93 | if self._atoms is None:
94 | self._atoms = set([self.center_atom,])
95 | self._atoms.update([x.center_atom for x in self.shells])
96 | return self._atoms
97 |
98 | @atoms.setter
99 | def atoms(self, atoms):
100 | self._atoms = atoms
101 |
102 | @property
103 | def substruct(self):
104 | """Get substruct with all atoms implicitly within the shell."""
105 | if self._substruct is None:
106 | atom_sets = [set(x.substruct.atoms) for x in self.shells]
107 | if len(atom_sets) > 0:
108 | atoms = reduce(set.union, atom_sets)
109 | else:
110 | atoms = set()
111 | self._substruct = Substruct(
112 | center_atom=self.center_atom, atoms=atoms
113 | )
114 | self._substruct.shell = self
115 | return self._substruct
116 |
117 | @substruct.setter
118 | def substruct(self, substruct):
119 | if not isinstance(substruct, Substruct) and substruct is not None:
120 | raise TypeError("substruct must be of type Substruct")
121 | self._substruct = substruct
122 |
123 | def __repr__(self):
124 | return (
125 | "Shell(center_atom={!r}, shells={!r}, radius={!r}, "
126 | "last_shell={!r}, identifier={!r})"
127 | ).format(
128 | self.center_atom,
129 | tuple(self.shells),
130 | self.radius,
131 | self.last_shell,
132 | self.identifier,
133 | )
134 |
135 | def __str__(self):
136 | return (
137 | "Shell(center_atom={!r}, atoms={!r}, radius={!r}, "
138 | "identifier={!r})"
139 | ).format(
140 | self.center_atom, tuple(self.atoms), self.radius, self.identifier
141 | )
142 |
143 | def __hash__(self):
144 | return hash((self.center_atom, self.shells))
145 |
146 | def __eq__(self, other):
147 | return (self.center_atom == other.center_atom) and (
148 | self.shells == other.shells
149 | )
150 |
151 | def __ne__(self, other):
152 | return not self.__eq__(other)
153 |
154 | def __len__(self):
155 | return 1 + len(self.shells)
156 |
157 | def __contains__(self, key):
158 | if isinstance(key, (int, rdkit.Chem.Atom)):
159 | key = Shell(key)
160 | return key in self.shells or key == self
161 |
162 |
163 | class Substruct(object):
164 | """A container for atoms optionally centered on an atom.
165 |
166 | A Substruct represents all atoms implicitly within a Shell. Two Substructs
167 | are equal if they contain the same atoms.
168 | """
169 |
170 | def __init__(self, center_atom=None, atoms=set()):
171 | self.center_atom = center_atom
172 | self.shell = None
173 | self._atoms = set()
174 | for atom in atoms:
175 | if isinstance(atom, rdkit.Chem.Atom):
176 | atom = atom.GetIdx()
177 | elif not isinstance(atom, (int, np.integer)):
178 | raise TypeError("atoms must be Atom or atom id")
179 | self._atoms.add(atom)
180 | if self.center_atom is not None:
181 | self._atoms.add(self.center_atom)
182 | self._atoms = frozenset(self._atoms)
183 | self.transform_matrix = np.identity(4, dtype=float)
184 |
185 | @classmethod
186 | def from_shell(cls, shell):
187 | return shell.substruct
188 |
189 | @property
190 | def center_atom(self):
191 | return self._center_atom
192 |
193 | @center_atom.setter
194 | def center_atom(self, center_atom):
195 | if isinstance(center_atom, rdkit.Chem.Atom):
196 | center_atom = center_atom.GetIdx()
197 | elif (
198 | not isinstance(center_atom, (int, np.integer))
199 | and center_atom is not None
200 | ):
201 | raise TypeError("center_atom must be Atom or atom id")
202 | self._center_atom = center_atom
203 |
204 | @property
205 | def atoms(self):
206 | return self._atoms
207 |
208 | def __repr__(self):
209 | return "Substruct(center_atom={!r}, atoms={!r})".format(
210 | self.center_atom, tuple(self.atoms)
211 | )
212 |
213 | def __str__(self):
214 | return self.__repr__()
215 |
216 | def __hash__(self):
217 | return hash(self.atoms)
218 |
219 | def __eq__(self, other):
220 | return self.atoms == other.atoms
221 |
222 | def __ne__(self, other):
223 | return not self.__eq__(other)
224 |
225 | def __len__(self):
226 | return len(self.atoms)
227 |
228 | def __contains__(self, key):
229 | if isinstance(key, rdkit.Chem.Atom):
230 | key = key.GetIdx()
231 | return key in self.atoms
232 |
233 |
234 | class FormatError(Exception):
235 | pass
236 |
237 |
238 | # methods/classes for shell i/o
239 | def shell_to_pdb(
240 | mol, shell, atom_coords, bound_atoms_dict, out_file=None, reorient=True
241 | ):
242 | """Append substructure within shell to PDB.
243 |
244 | Parameters
245 | ----------
246 | mol : RDKit Mol
247 | Input mol
248 | shell : Shell
249 | A shell
250 | atom_coords : dict
251 | Dict matching atom id to coordinates.
252 | bound_atoms_dict : dict
253 | Dict matching atom id to id of bound atoms.
254 | out_file : str or None, optional
255 | File to which to append coordinates.
256 | reorient : bool, optional
257 | Use the transformation matrix in the shell to align by the stereo
258 |         quadrants. If no transformation matrix is present, coordinates are
259 |         centered on the center atom.
260 |
261 | Returns
262 | -------
263 | list of str: list of PDB file lines, if `out_file` not specified
264 | """
265 | remark = "REMARK 400"
266 | name = mol.GetProp("_Name") if mol.HasProp("_Name") else "molecule"
267 | header_lines = [remark + " COMPOUND", remark + " " + name]
268 | lines = header_lines + [
269 | "MODEL",
270 | ]
271 | atom_ids = sorted(shell.substruct.atoms)
272 | atoms = [mol.GetAtomWithIdx(int(x)) for x in atom_ids]
273 | coords = np.asarray(list(map(atom_coords.get, atom_ids)), dtype=float)
274 | if reorient:
275 | try:
276 | coords = array_ops.transform_array(shell.transform_matrix, coords)
277 | except AttributeError:
278 | coords -= atom_coords[shell.center_atom]
279 |
280 | for i, atom_id in enumerate(atom_ids):
281 | elem = atoms[i].GetSymbol()
282 | name = "{}{:d}".format(elem, atom_id + 1)
283 | charge = atoms[i].GetFormalCharge()
284 | if charge > 0:
285 | charge = "{:d}+".format(charge)
286 | elif charge < 0:
287 | charge = "{:d}-".format(abs(charge))
288 | else:
289 | charge = ""
290 | if atom_id == shell.center_atom:
291 | temp = 1.0
292 | elif atom_id in shell.atoms:
293 | temp = 0.5
294 | else:
295 | temp = 0.0
296 | pdb_entries = {
297 | "atom_id": atom_id,
298 | "name": name,
299 | "coord": coords[i, :].flatten(),
300 | "occupancy": 0.0,
301 | "temp": temp,
302 | "elem": elem,
303 | "charge": charge,
304 | }
305 | lines.append(PDB_LINE.format(**pdb_entries))
306 |
307 | # PLACEHOLDER FOR WRITING BONDS TO PDB
308 | # used_bonds = set()
309 | # write_bonds = []
310 | # for atom_id in atom_ids:
311 | # write_bonds.append(atom_id)
312 | # bound_atom_ids = bound_atoms_dict.get(atom_id, set())
313 | # for bound_atom_id in bound_atom_ids:
314 | # if (atom_id, bound_atom_id) in used_bonds:
315 | # continue
316 | # if len(write_bonds) > 3:
317 | # lines.append("CONECT "+" ".join(map(str, write_bonds)))
318 | # write_bonds = [atom_id,]
319 | # write_bonds.append(bound_atom_id)
320 | # used_bonds.add((atom_id, bound_atom_id))
321 | # used_bonds.add((bound_atom_id, atom_id))
322 |
323 | # lines.append("CONECT "+" ".join(map(str, write_bonds)))
324 | # write_bonds = []
325 |
326 | lines.append("ENDMDL")
327 |
328 | if out_file is not None:
329 | with smart_open.open(out_file, "a") as f:
330 | for line in lines:
331 | f.write(line + "\n")
332 | else:
333 | return lines
334 |
--------------------------------------------------------------------------------
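
The Shell/Substruct distinction documented above (explicit member atoms vs. atoms implicitly contained through recursion) can be seen with bare atom ids. The sketch below is illustrative only and not part of the package; the atom ids 0-3 are made up, and only the classes defined in structs.py are assumed.

    from e3fp.fingerprint.structs import Shell, Substruct

    inner = Shell(1, [3])          # a shell around atom 1 containing atom 3
    outer = Shell(0, [inner, 2])   # ints/Atoms are wrapped as single-atom Shells

    outer.atoms              # explicit atoms only: {0, 1, 2}
    outer.substruct.atoms    # implicit atoms via recursion: frozenset({0, 1, 2, 3})

    # Substructs compare and hash by their atom sets alone; the center atom is
    # folded into the set, so these two are equal.
    assert Substruct(0, {1, 2}) == Substruct(None, {0, 1, 2})
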
/src/e3fp/conformer/util.py:
--------------------------------------------------------------------------------
1 | """Utilities for handling SMILES strings and RDKit mols and conformers.
2 |
3 | Author: Seth Axen
4 | E-mail: seth.axen@gmail.com
5 | """
6 | import os
7 | import re
8 | import copy
9 | import logging
10 | from collections import namedtuple
11 |
12 | import rdkit
13 | import rdkit.Chem
14 | import rdkit.Chem.PropertyMol
15 | from rdkit.Chem.PropertyMol import PropertyMol
16 | from python_utilities.io_tools import touch_dir
17 | import smart_open
18 |
19 | PROTO_NAME_DELIM = "-"
20 | CONF_NAME_DELIM = "_"
21 | MOL_ITEM_REGEX = re.compile(
22 | r"(?P<{0}>.+?)(?:{1}(?P<{2}>\d+))?(?:{3}(?P<{4}>\d+))?$".format(
23 | "mol_name",
24 | PROTO_NAME_DELIM,
25 | "proto_state_num",
26 | CONF_NAME_DELIM,
27 | "conf_num",
28 | )
29 | )
30 | MOL_ITEM_FIELDS = ("mol_name", "proto_state_num", "conf_num")
31 | CONF_ENERGIES_PROPNAME = "_ConfEnergies"
32 | CONF_ENERGIES_DELIM = "|"
33 | CONF_ENERGY_PROPNAME = "Energy"
34 |
35 | MolItemTuple = namedtuple(
36 | "MolItemTuple", ["mol_name", "proto_state_num", "conf_num"]
37 | )
38 |
39 |
40 | class MolItemName(object):
41 | """Class for parsing mol item names and converting to various formats."""
42 |
43 | def __init__(
44 | self,
45 | mol_name=None,
46 | proto_state_num=None,
47 | conf_num=None,
48 | proto_delim=PROTO_NAME_DELIM,
49 | conf_delim=CONF_NAME_DELIM,
50 | ):
51 | self.mol_name = mol_name
52 | self.proto_state_num = proto_state_num
53 | self.conf_num = conf_num
54 | self.proto_delim = proto_delim
55 | self.conf_delim = conf_delim
56 |
57 | @classmethod
58 | def from_str(
59 | cls,
60 | mol_item_name,
61 | mol_item_regex=MOL_ITEM_REGEX,
62 | mol_item_fields=MOL_ITEM_FIELDS,
63 | **kwargs
64 | ):
65 | fields = cls.mol_item_name_to_dict(
66 | mol_item_name,
67 | mol_item_regex=mol_item_regex,
68 | mol_item_fields=mol_item_fields,
69 | )
70 | return cls(
71 | fields["mol_name"],
72 | fields["proto_state_num"],
73 | fields["conf_num"],
74 | **kwargs
75 | )
76 |
77 | def to_str(self):
78 | return self.mol_item_name
79 |
80 | @classmethod
81 | def from_tuple(cls, fields_tuple):
82 | return cls(*fields_tuple)
83 |
84 | def to_tuple(self):
85 | return MolItemTuple(self.mol_name, self.proto_state_num, self.conf_num)
86 |
87 | @property
88 | def mol_name(self):
89 | return self._mol_name
90 |
91 | @mol_name.setter
92 | def mol_name(self, mol_name):
93 | self._mol_name = mol_name
94 |
95 | def to_mol_name(self, as_proto=False):
96 | if as_proto:
97 | return self.proto_name
98 | else:
99 | return self.mol_name
100 |
101 | @property
102 | def proto_name(self):
103 | return self.to_proto_name(self.proto_state_num)
104 |
105 | def to_proto_name(
106 | self, proto_state_num=None, proto_delim=PROTO_NAME_DELIM
107 | ):
108 | if proto_state_num is not None:
109 | return "{}{}{:d}".format(
110 | self.mol_name, proto_delim, proto_state_num
111 | )
112 | else:
113 | return self.mol_name
114 |
115 | @property
116 | def conf_name(self):
117 | return self.to_conf_name(conf_num=self.conf_num)
118 |
119 | def to_conf_name(self, conf_num=None, conf_delim=CONF_NAME_DELIM):
120 | if conf_num is not None:
121 | return "{}{}{:d}".format(self.proto_name, conf_delim, conf_num)
122 | else:
123 | return self.proto_name
124 |
125 | @property
126 | def mol_item_name(self):
127 | return self.conf_name
128 |
129 | @staticmethod
130 | def mol_item_name_to_dict(
131 | mol_item_name,
132 | mol_item_regex=MOL_ITEM_REGEX,
133 | mol_item_fields=MOL_ITEM_FIELDS,
134 | ):
135 | match = re.match(mol_item_regex, mol_item_name)
136 | groups = match.groups()
137 | fields = dict(zip(mol_item_fields, groups))
138 | proto_state_num = fields.get("proto_state_num")
139 | if proto_state_num is not None:
140 | fields["proto_state_num"] = int(proto_state_num)
141 | conf_num = fields.get("conf_num")
142 | if conf_num is not None:
143 | fields["conf_num"] = int(conf_num)
144 | return fields
145 |
146 | def copy(self):
147 | return copy.copy(self)
148 |
149 | def __repr__(self):
150 | return (
151 | "MolItemName(mol_name={}, proto_state_num={}, "
152 | "conf_num={})".format(
153 | self.mol_name, self.proto_state_num, self.conf_num
154 | )
155 | )
156 |
157 | def __str__(self):
158 | return self.conf_name
159 |
160 | def __eq__(self, other):
161 | return self.to_tuple() == other.to_tuple()
162 |
163 | def __ne__(self, other):
164 | return not self.__eq__(other)
165 |
166 | def __gt__(self, other):
167 | return self.to_tuple().__gt__(other.to_tuple())
168 |
169 | def __lt__(self, other):
170 | return self.to_tuple().__lt__(other.to_tuple())
171 |
172 | def __hash__(self):
173 | return hash(self.to_tuple())
174 |
175 |
176 | def smiles_generator(*filenames):
177 |     """Parse SMILES file(s) and yield (SMILES, name) tuples.
178 |
179 | Parameters
180 | ----------
181 |     *filenames : str
182 |         Paths to files containing SMILES. Each file must contain one SMILES
183 |         per line, followed by a space and then the molecule name.
184 |
185 | Yields
186 | ------
187 | tuple:
188 |         `tuple` of the format (SMILES, name).
189 | """
190 | for filename in filenames:
191 | with smart_open.open(filename, "r") as f:
192 | for i, line in enumerate(f):
193 | values = line.rstrip("\r\n").split()
194 | if len(values) >= 2:
195 | yield tuple(values[:2])
196 | else:
197 | logging.warning(
198 | (
199 | "Line {:d} of {} has {:d} entries. Expected at least"
200 | " 2.".format(i + 1, filename, len(values))
201 | ),
202 | exc_info=True,
203 | )
204 |
205 |
206 | def smiles_to_dict(smiles_file, unique=False, has_header=False):
207 | """Read SMILES file to dict."""
208 | smiles_gen = smiles_generator(smiles_file)
209 | if has_header:
210 | header = next(smiles_gen)
211 | logging.info("Skipping first (header) values: {!r}".format(header))
212 | if unique:
213 | used_smiles = set()
214 | smiles_dict = {}
215 | for smiles, name in smiles_gen:
216 | if name not in smiles_dict and smiles not in used_smiles:
217 | smiles_dict[name] = smiles
218 | used_smiles.add(smiles)
219 | else:
220 | smiles_dict = {name: smiles for smiles, name in smiles_gen}
221 | return smiles_dict
222 |
223 |
224 | def dict_to_smiles(smiles_file, smiles_dict):
225 | """Write SMILES dict to file."""
226 | iter_to_smiles(smiles_file, sorted(smiles_dict.items()))
227 |
228 |
229 | def iter_to_smiles(smiles_file, smiles_iter):
230 | """Write iterator of (mol_name, SMILES) to file."""
231 | with smart_open.open(smiles_file, "w") as f:
232 | for mol_name, smiles in smiles_iter:
233 | f.write("{} {}\n".format(smiles, mol_name))
234 |
235 |
236 | def mol2_generator(*filenames):
237 | """Parse name from mol2 filename and return generator.
238 |
239 | Parameters
240 | ----------
241 |     *filenames : str
242 |         Paths to mol2 files, where each filename should be the molecule
243 |         name followed by ".mol2".
244 |
245 | Yields
246 | ------
247 | tuple:
248 | `tuple` of the format (file, name).
249 | """
250 | for filename in filenames:
251 | name = os.path.splitext(os.path.basename(filename))[0]
252 | yield (filename, name)
253 |
254 |
255 | def mol_from_smiles(smiles, name, standardise=False):
256 |     """Generate an RDKit `PropertyMol` from a SMILES string.
257 |
258 | Parameters
259 | ----------
260 |     smiles : str
261 | SMILES string
262 | name : str
263 | Name of molecule
264 | standardise : bool
265 | Clean Mol through standardisation
266 |
267 | Returns
268 | -------
269 | RDKit PropertyMol : Molecule.
270 | """
271 | mol = rdkit.Chem.MolFromSmiles(smiles)
272 | if mol is None:
273 | logging.error(
274 | "Mol creation failed from SMILES: {!r}".format((smiles, name))
275 | )
276 | return None
277 | if standardise:
278 | mol = mol_to_standardised_mol(mol, name)
279 | mol = PropertyMol(mol)
280 | mol.SetProp("_Name", name)
281 | mol.SetProp("_SMILES", smiles)
282 | return mol
283 |
284 |
285 | def mol_from_mol2(mol2_file, name=None, standardise=False):
286 | """Read a mol2 file into an RDKit `PropertyMol`.
287 |
288 | Parameters
289 | ----------
290 | mol2_file : str
291 | path to a mol2 file
292 | name : str, optional
293 | Name of molecule. If not provided, uses file basename as name
294 | standardise : bool
295 | Clean mol through standardisation
296 |
297 | Returns
298 | -------
299 | RDKit PropertyMol : Molecule.
300 | """
301 | if name is None:
302 | name = os.path.splitext(os.path.basename(mol2_file))[0]
303 | mol = rdkit.Chem.MolFromMol2File(mol2_file)
304 | if standardise:
305 | mol = mol_to_standardised_mol(mol, name)
306 | mol = PropertyMol(mol)
307 | mol.SetProp("_Name", name)
308 | return mol
309 |
310 |
311 | def mol_from_sdf(sdf_file, conf_num=None, standardise=False, mode="rb"):
312 | """Read SDF file into an RDKit `Mol` object.
313 |
314 | Parameters
315 | ----------
316 | sdf_file : str
317 | Path to an SDF file
318 | conf_num : int or None, optional
319 | Maximum number of conformers to read from file. Defaults to all.
320 | standardise : bool (default False)
321 | Clean mol through standardisation
322 | mode : str (default 'rb')
323 | Mode with which to open file
324 |
325 | Returns
326 | -------
327 | RDKit Mol : `Mol` object with each molecule in SDF file as a conformer
328 | """
329 | mol = None
330 | conf_energies = []
331 | with smart_open.open(sdf_file, mode) as f:
332 | supplier = rdkit.Chem.ForwardSDMolSupplier(f)
333 | i = 0
334 | while True:
335 | if i == conf_num:
336 | break
337 | try:
338 | new_mol = next(supplier)
339 | except StopIteration:
340 | logging.debug(
341 | "Read {:d} conformers from {}.".format(i, sdf_file)
342 | )
343 | break
344 |
345 | if new_mol.HasProp(CONF_ENERGY_PROPNAME):
346 | conf_energies.append(
347 | float(new_mol.GetProp(CONF_ENERGY_PROPNAME))
348 | )
349 |
350 | if mol is None:
351 | mol = rdkit.Chem.Mol(new_mol)
352 | mol.RemoveAllConformers()
353 | conf = new_mol.GetConformers()[0]
354 | mol.AddConformer(conf, assignId=True)
355 | i += 1
356 | if standardise:
357 | mol = mol_to_standardised_mol(mol)
358 | try:
359 | mol.GetProp("_Name")
360 | except KeyError:
361 | name = os.path.basename(sdf_file).split(".sdf")[0]
362 | mol.SetProp("_Name", name)
363 |
364 | if len(conf_energies) > 0:
365 | add_conformer_energies_to_mol(mol, conf_energies)
366 | mol.ClearProp(CONF_ENERGY_PROPNAME)
367 |
368 | return mol
369 |
370 |
371 | def mol_to_sdf(mol, out_file, conf_num=None):
372 |     """Write the conformers of an RDKit `Mol` to an SDF file.
373 |
374 | Parameters
375 | ----------
376 | mol : RDKit Mol
377 | A molecule containing 1 or more conformations to write to file.
378 | out_file : str
379 | Path to save SDF file.
380 | conf_num : int or None, optional
381 | Maximum number of conformers to save to file. Defaults to all.
382 | """
383 | touch_dir(os.path.dirname(out_file))
384 | with smart_open.open(out_file, "w") as fobj:
385 | writer = rdkit.Chem.SDWriter(fobj)
386 | conf_ids = [conf.GetId() for conf in mol.GetConformers()]
387 | conf_energies = get_conformer_energies_from_mol(mol)
388 | mol.ClearProp(CONF_ENERGIES_PROPNAME)
389 | for i in conf_ids:
390 | if conf_num not in {-1, None} and i >= conf_num:
391 | break
392 | try:
393 | conf_energy = conf_energies[i]
394 | mol.SetProp(CONF_ENERGY_PROPNAME, "{:.4f}".format(conf_energy))
395 | except (IndexError, TypeError):
396 | pass
397 | writer.write(mol, confId=i)
398 | writer.close()
399 | mol.ClearProp(CONF_ENERGY_PROPNAME)
400 | if conf_energies is not None:
401 | add_conformer_energies_to_mol(mol, conf_energies)
402 | logging.debug("Saved {:d} conformers to {}.".format(i + 1, out_file))
403 |
404 |
405 | def mol_to_standardised_mol(mol, name=None):
406 | """Standardise mol(s)."""
407 | try:
408 | from standardiser import standardise
409 | from standardiser.utils import StandardiseException
410 | except ImportError:
411 | logging.warning(
412 | "standardiser module unavailable. Using unstandardised mol."
413 | )
414 | return mol
415 |
416 | if name is None:
417 | try:
418 | name = mol.GetProp("_Name")
419 | except KeyError:
420 | name = repr(mol)
421 |
422 | if isinstance(mol, PropertyMol):
423 | mol_type = PropertyMol
424 | mol = rdkit.Chem.Mol(mol)
425 | else:
426 | mol_type = rdkit.Chem.Mol
427 |
428 | logging.debug("Standardising {}".format(name))
429 | try:
430 | std_mol = standardise.run(mol)
431 | except AttributeError: # backwards-compatible with old standardiser
432 | std_mol = standardise.apply(mol)
433 | except StandardiseException:
434 | logging.error(
435 | (
436 | "Standardisation of {} failed. Using unstandardised "
437 | "mol.".format(name)
438 | ),
439 | exc_info=True,
440 | )
441 | return mol_type(mol)
442 |
443 | std_mol = mol_type(std_mol)
444 | try:
445 | std_mol.SetProp("_Name", mol.GetProp("_Name"))
446 | except KeyError:
447 | pass
448 |
449 | return std_mol
450 |
451 |
452 | def add_conformer_energies_to_mol(mol, energies):
453 | """Add conformer energies as mol property.
454 |
455 | See discussion at https://sourceforge.net/p/rdkit/mailman/message/27547551/
456 | """
457 | energies_str = CONF_ENERGIES_DELIM.join(
458 | "{:.4f}".format(e) for e in energies
459 | )
460 | mol.SetProp(CONF_ENERGIES_PROPNAME, energies_str)
461 | return mol
462 |
463 |
464 | def get_conformer_energies_from_mol(mol):
465 | """Get conformer energies from mol."""
466 | if not mol.HasProp(CONF_ENERGIES_PROPNAME):
467 | return None
468 | energies_str = mol.GetProp(CONF_ENERGIES_PROPNAME)
469 | energies = [float(x) for x in energies_str.split(CONF_ENERGIES_DELIM)]
470 | return energies
471 |
--------------------------------------------------------------------------------
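
As a hedged usage sketch of the naming and I/O helpers above: the conformer name "CHEMBL25-1_0" is a hypothetical example of the "<mol_name>-<proto_state_num>_<conf_num>" convention encoded in MOL_ITEM_REGEX, and the aspirin SMILES is illustrative only.

    from e3fp.conformer.util import MolItemName, mol_from_smiles

    item = MolItemName.from_str("CHEMBL25-1_0")
    assert item.to_tuple() == ("CHEMBL25", 1, 0)
    assert item.proto_name == "CHEMBL25-1"   # "<mol_name>-<proto_state_num>"
    assert item.conf_name == "CHEMBL25-1_0"  # "<proto_name>_<conf_num>"

    # mol_from_smiles returns a PropertyMol carrying its name and source SMILES
    mol = mol_from_smiles("CC(=O)Oc1ccccc1C(=O)O", "aspirin")
    assert mol.GetProp("_Name") == "aspirin"
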
/src/e3fp/conformer/generator.py:
--------------------------------------------------------------------------------
1 | """Conformer generation.
2 |
3 | Author: Seth Axen
4 | E-mail: seth.axen@gmail.com
5 | """
6 | import logging
7 |
8 | import numpy as np
9 |
10 | from rdkit import Chem
11 | from rdkit.Chem import AllChem
12 | from rdkit.Chem import PropertyMol
13 | from .util import add_conformer_energies_to_mol
14 |
15 | # Heavily modified by Seth Axen from code under the following license
16 | __author__ = "Steven Kearnes"
17 | __copyright__ = "Copyright 2014, Stanford University"
18 | __license__ = "3-clause BSD"
19 |
20 | # options
21 | FORCEFIELD_CHOICES = ("uff", "mmff94", "mmff94s")
22 |
23 | # default values
24 | NUM_CONF_DEF = -1
25 | FIRST_DEF = -1
26 | POOL_MULTIPLIER_DEF = 1
27 | RMSD_CUTOFF_DEF = 0.5
28 | MAX_ENERGY_DIFF_DEF = -1.0
29 | FORCEFIELD_DEF = "uff"
30 | SEED_DEF = -1
31 |
32 |
33 | class ConformerGenerator(object):
34 | """Generate conformers using RDKit.
35 |
36 | Procedure
37 | ---------
38 | 1. Generate a pool of conformers.
39 | 2. Minimize conformers.
40 | 3. Filter conformers using an RMSD threshold and optional minimum energy
41 | difference.
42 |
43 | Note that pruning is done _after_ minimization, which differs from the
44 | protocol described in the references.
45 |
46 | References
47 | ----------
48 | * http://rdkit.org/docs/GettingStartedInPython.html
49 | #working-with-3d-molecules
50 | * http://pubs.acs.org/doi/full/10.1021/ci2004658
51 | * https://github.com/skearnes/rdkit-utils/blob/master/rdkit_utils/
52 | conformers.py
53 | """
54 |
55 | def __init__(
56 | self,
57 |         num_conf: int = NUM_CONF_DEF,
58 |         first: int = FIRST_DEF,
59 |         rmsd_cutoff: float = RMSD_CUTOFF_DEF,
60 |         max_energy_diff: float = MAX_ENERGY_DIFF_DEF,
61 |         forcefield: str = FORCEFIELD_DEF,
62 |         pool_multiplier: int = POOL_MULTIPLIER_DEF,
63 |         seed: int = SEED_DEF,
64 |         get_values: bool = False,
65 |         sparse_rmsd: bool = True,
66 |         store_energies: bool = True,
67 | ):
68 | """Initialize generator settings.
69 |
70 | Parameters
71 | ----------
72 | num_conf : int, optional
73 | Maximum number of conformers to generate (after pruning). -1
74 | results in auto selection of max_conformers.
75 | first : int, optional
76 | Terminate when this number of conformers has been accepted, and
77 | only return those conformers. -1 results in all conformers being
78 | returned.
79 | pool_multiplier : int, optional
80 | Factor to multiply by max_conformers to generate the initial
81 | conformer pool. Since conformers are filtered after energy
82 | minimization, increasing the size of the pool increases the chance
83 | of identifying max_conformers unique conformers.
84 | rmsd_cutoff : float, optional
85 | RMSD cutoff for pruning conformers. If None or negative, no
86 | pruning is performed.
87 | max_energy_diff : float, optional
88 | If set, conformers with energies this amount above the minimum
89 | energy conformer are not accepted.
90 | forcefield : {'uff', 'mmff94', 'mmff94s'}, optional
91 | Force field to use for conformer energy calculation and
92 | minimization.
93 | seed : int, optional
94 | Random seed for conformer generation. If -1, the random number
95 | generator is unseeded.
96 | get_values : boolean, optional
97 | Return tuple of key values, for storage.
98 | sparse_rmsd : bool, optional
99 | If `get_values` is True, instead of returning full symmetric RMSD
100 | matrix, only return flattened upper triangle.
101 | store_energies : bool, optional
102 | Store conformer energies as property in mol.
103 | """
104 | if not isinstance(num_conf, int) or num_conf < -1 or num_conf == 0:
105 | raise ValueError("num_conf must be either -1 or a positive integer")
106 | self.max_conformers = num_conf
107 | if not isinstance(first, int) or first < -1 or first == 0:
108 | raise ValueError("first must be either -1 or a positive integer")
109 | self.first_conformers = first
110 | if not rmsd_cutoff or rmsd_cutoff < 0:
111 | rmsd_cutoff = -1.0
112 | self.rmsd_cutoff = rmsd_cutoff
113 |
114 | if max_energy_diff is None or max_energy_diff < 0:
115 | max_energy_diff = -1.0
116 | self.max_energy_diff = max_energy_diff
117 |
118 | if forcefield not in FORCEFIELD_CHOICES:
119 | raise ValueError(
120 | "%s is not a valid option for forcefield" % forcefield
121 | )
122 | self.forcefield = forcefield
123 | if not isinstance(pool_multiplier, int) or pool_multiplier < 1:
124 | raise ValueError("pool_multiplier must be a positive integer")
125 | self.pool_multiplier = pool_multiplier
126 | self.seed = seed
127 | self.get_values = get_values
128 | self.sparse_rmsd = sparse_rmsd
129 | self.store_energies = store_energies
130 |
131 | def __call__(self, mol):
132 | """Generate conformers for a molecule.
133 |
134 | Parameters
135 | ----------
136 | mol : RDKit Mol
137 | Molecule.
138 |
139 | Returns
140 | -------
141 | RDKit Mol : copy of the input molecule with embedded conformers
142 | """
143 | return self.generate_conformers(mol)
144 |
145 | def generate_conformers(self, mol):
146 | """Generate conformers for a molecule.
147 |
148 | Parameters
149 | ----------
150 | mol : RDKit Mol
151 | Molecule.
152 |
153 | Returns
154 | -------
155 | RDKit Mol : copy of the input molecule with embedded conformers
156 | """
157 | # initial embedding
158 | mol = self.embed_molecule(mol)
159 | if not mol.GetNumConformers():
160 | msg = "No conformers generated for molecule"
161 | if mol.HasProp("_Name"):
162 | name = mol.GetProp("_Name")
163 | msg += ' "{}".'.format(name)
164 | else:
165 | msg += "."
166 | raise RuntimeError(msg)
167 |
168 | # minimization and filtering
169 | self.minimize_conformers(mol)
170 | mol, indices, energies, rmsds = self.filter_conformers(mol)
171 |
172 | if self.store_energies:
173 | add_conformer_energies_to_mol(mol, energies)
174 |
175 | if self.get_values is True:
176 | if self.sparse_rmsd:
177 | rmsds_mat = rmsds[np.triu_indices_from(rmsds, k=1)]
178 | else:
179 | rmsds_mat = rmsds
180 | return mol, (self.max_conformers, indices, energies, rmsds_mat)
181 | else:
182 | return mol
183 |
184 | @staticmethod
185 | def get_num_conformers(mol):
186 |         """Return target conformer count based on number of rotatable bonds.
187 |
188 | Parameters
189 | ----------
190 | mol : Mol
191 | RDKit `Mol` object for molecule
192 |
193 |         Returns
194 |         -------
195 | num_conf : int
196 | Target number of conformers to accept
197 | """
198 | num_rot = AllChem.CalcNumRotatableBonds(mol)
199 | if num_rot < 8:
200 | return 50
201 |         elif 8 <= num_rot <= 12:
202 | return 200
203 | elif num_rot > 12:
204 | return 300
205 | else:
206 | return 0
207 |
208 | def embed_molecule(self, mol):
209 | """Generate conformers, possibly with pruning.
210 |
211 | Parameters
212 | ----------
213 | mol : RDKit Mol
214 | Molecule.
215 | """
216 | log_name = mol.GetProp("_Name") if mol.HasProp("_Name") else "molecule"
217 | logging.debug("Adding hydrogens for %s" % log_name)
218 | mol = Chem.AddHs(mol) # add hydrogens
219 | logging.debug("Hydrogens added to %s" % log_name)
220 | logging.debug("Sanitizing mol for %s" % log_name)
221 | Chem.SanitizeMol(mol)
222 | logging.debug("Mol sanitized for %s" % log_name)
223 | if self.max_conformers == -1 or type(self.max_conformers) is not int:
224 | self.max_conformers = self.get_num_conformers(mol)
225 | n_confs = self.max_conformers * self.pool_multiplier
226 | if self.first_conformers == -1:
227 | self.first_conformers = self.max_conformers
228 | logging.debug("Embedding %d conformers for %s" % (n_confs, log_name))
229 | AllChem.EmbedMultipleConfs(
230 | mol,
231 | numConfs=n_confs,
232 | maxAttempts=10 * n_confs,
233 | pruneRmsThresh=-1.0,
234 | randomSeed=self.seed,
235 | ignoreSmoothingFailures=True,
236 | )
237 | logging.debug("Conformers embedded for %s" % log_name)
238 | return mol
239 |
240 | def get_molecule_force_field(self, mol, conf_id=None, **kwargs):
241 | """Get a force field for a molecule.
242 |
243 | Parameters
244 | ----------
245 | mol : RDKit Mol
246 | Molecule.
247 | conf_id : int, optional
248 | ID of the conformer to associate with the force field.
249 | **kwargs : dict, optional
250 | Keyword arguments for force field constructor.
251 | """
252 | if self.forcefield == "uff":
253 | ff = AllChem.UFFGetMoleculeForceField(
254 | mol, confId=conf_id, **kwargs
255 | )
256 | elif self.forcefield.startswith("mmff"):
257 | AllChem.MMFFSanitizeMolecule(mol)
258 | mmff_props = AllChem.MMFFGetMoleculeProperties(
259 | mol, mmffVariant=self.forcefield
260 | )
261 | ff = AllChem.MMFFGetMoleculeForceField(
262 | mol, mmff_props, confId=conf_id, **kwargs
263 | )
264 | else:
265 | raise ValueError(
266 |                 "Invalid forcefield '{}'.".format(self.forcefield)
267 | )
268 | return ff
269 |
270 | def minimize_conformers(self, mol):
271 | """Minimize molecule conformers.
272 |
273 | Parameters
274 | ----------
275 | mol : RDKit Mol
276 | Molecule.
277 | """
278 | log_name = mol.GetProp("_Name") if mol.HasProp("_Name") else "molecule"
279 | logging.debug("Minimizing conformers for %s" % log_name)
280 | for conf in mol.GetConformers():
281 | ff = self.get_molecule_force_field(mol, conf_id=conf.GetId())
282 | ff.Minimize()
283 | logging.debug("Conformers minimized for %s" % log_name)
284 |
285 | def get_conformer_energies(self, mol):
286 | """Calculate conformer energies.
287 |
288 | Parameters
289 | ----------
290 | mol : RDKit Mol
291 | Molecule.
292 |
293 | Returns
294 | -------
295 | energies : array_like
296 | Minimized conformer energies.
297 | """
298 | num_conf = mol.GetNumConformers()
299 | energies = np.empty((num_conf,), dtype=float)
300 | for i, conf in enumerate(mol.GetConformers()):
301 | ff = self.get_molecule_force_field(mol, conf_id=conf.GetId())
302 | energies[i] = ff.CalcEnergy()
303 | return energies
304 |
305 | def filter_conformers(self, mol):
306 | """Filter conformers which do not meet an RMSD threshold.
307 |
308 | Parameters
309 | ----------
310 | mol : RDKit Mol
311 | Molecule.
312 |
313 | Returns
314 | -------
315 | A new RDKit Mol containing the chosen conformers, sorted by
316 | increasing energy.
317 | """
318 | log_name = mol.GetProp("_Name") if mol.HasProp("_Name") else "molecule"
319 | logging.debug("Pruning conformers for %s" % log_name)
320 | energies = self.get_conformer_energies(mol)
321 | energy_below_threshold = np.ones_like(energies, dtype=np.bool_)
322 |
323 | sort = np.argsort(energies) # sort by increasing energy
324 | confs = np.array(mol.GetConformers())
325 |
326 | # remove hydrogens to speed up substruct match
327 | mol = Chem.RemoveHs(mol)
328 | accepted = [] # always accept lowest-energy conformer
329 | rejected = []
330 | rmsds = np.zeros((confs.shape[0], confs.shape[0]), dtype=float)
331 | for i, fit_ind in enumerate(sort):
332 | accepted_num = len(accepted)
333 |
334 | # always accept lowest-energy conformer
335 | if accepted_num == 0:
336 | accepted.append(fit_ind)
337 |
338 | # pre-compute if Es are in acceptable range of min E
339 | if self.max_energy_diff != -1.0:
340 | energy_below_threshold = (
341 | energies <= energies[fit_ind] + self.max_energy_diff
342 | )
343 |
344 | continue
345 |
346 | # reject conformers after first_conformers is reached
347 | if accepted_num >= self.first_conformers:
348 | rejected.append(fit_ind)
349 | continue
350 |
351 | # check if energy is too high
352 | if not energy_below_threshold[fit_ind]:
353 | rejected.append(fit_ind)
354 | continue
355 |
356 | # get RMSD to selected conformers
357 | these_rmsds = np.zeros((accepted_num,), dtype=float)
358 | # reverse so all confs aligned to lowest energy
359 | for j, accepted_ind in self.reverse_enumerate(accepted):
360 | this_rmsd = AllChem.GetBestRMS(
361 | mol,
362 | mol,
363 | confs[accepted_ind].GetId(),
364 | confs[fit_ind].GetId(),
365 | )
366 | # reject conformers within the RMSD threshold
367 | if this_rmsd < self.rmsd_cutoff:
368 | rejected.append(fit_ind)
369 | break
370 | else:
371 | these_rmsds[-j - 1] = this_rmsd
372 | else:
373 | rmsds[fit_ind, accepted] = these_rmsds
374 | rmsds[accepted, fit_ind] = these_rmsds
375 | accepted.append(fit_ind)
376 |
377 | # slice and order rmsds and energies to match accepted list
378 | rmsds = rmsds[np.ix_(accepted, accepted)]
379 | energies = energies[accepted]
380 |
381 | # create a new molecule with all conformers, sorted by energy
382 | new = PropertyMol.PropertyMol(mol)
383 | new.RemoveAllConformers()
384 | conf_ids = [conf.GetId() for conf in mol.GetConformers()]
385 | for i in accepted:
386 | conf = mol.GetConformer(conf_ids[i])
387 | new.AddConformer(conf, assignId=True)
388 |
389 | logging.debug("Conformers filtered for %s" % log_name)
390 | return new, np.asarray(accepted, dtype=int), energies, rmsds
391 |
392 | @staticmethod
393 | def reverse_enumerate(iterable):
394 |         """Enumerate an iterable in reverse order, keeping original indices.
395 |
396 | Parameters
397 | ----------
398 | iterable : some 1-D iterable
399 |
400 | Returns
401 | -------
402 |         iterable:
403 |             Pairs of (original index, item), yielded from last to first.
404 | """
405 | return zip(reversed(range(len(iterable))), reversed(iterable))
406 |
407 | # magic methods
408 | def __repr__(self):
409 | return """ConformerGenerator(num_conf=%r, first=%r,\
410 | \n pool_multiplier=%r, rmsd_cutoff=%r,\
411 | \n max_energy_diff=%r, forcefield=%r,\
412 | \n get_values=%r, sparse_rmsd=%r)""" % (
413 | self.max_conformers,
414 | self.first,
415 | self.pool_multiplier,
416 | self.rmsd_cutoff,
417 | self.max_energy_diff,
418 | self.forcefield,
419 | self.get_values,
420 | self.sparse_rmsd,
421 | )
422 |
423 | def __str__(self):
424 | return self.__repr__()
425 |
--------------------------------------------------------------------------------
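
A minimal end-to-end sketch of the generator above, assuming RDKit is installed; the propanol SMILES, the parameter values, and the output path are illustrative assumptions, not taken from the package docs.

    from rdkit import Chem
    from e3fp.conformer.generator import ConformerGenerator
    from e3fp.conformer.util import mol_to_sdf

    mol = Chem.MolFromSmiles("CCCO")
    mol.SetProp("_Name", "propanol")

    # embed, minimize, and filter conformers (equivalently: conf_gen(mol))
    conf_gen = ConformerGenerator(num_conf=10, rmsd_cutoff=0.5, forcefield="mmff94")
    mol = conf_gen.generate_conformers(mol)
    print(mol.GetNumConformers())

    # write accepted conformers (with stored energies) to a compressed SDF file
    mol_to_sdf(mol, "conformers/propanol.sdf.bz2")
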