├── tests ├── __init__.py ├── data │ ├── stereo1.sdf.bz2 │ ├── stereo2.sdf.bz2 │ ├── caffeine_planar.sdf.bz2 │ ├── ritalin_nonplanar.sdf.bz2 │ ├── rand_sdf_files │ │ ├── CHEMBL116226.sdf.bz2 │ │ ├── CHEMBL197946.sdf.bz2 │ │ ├── CHEMBL282186.sdf.bz2 │ │ ├── CHEMBL501745.sdf.bz2 │ │ └── CHEMBL2114064.sdf.bz2 │ └── ritalin_nonplanar.sdf ├── test_config.py ├── test_dependencies.py ├── test_util.py ├── test_conformer.py ├── test_fingerprint.py ├── test_metrics.py └── test_struct.py ├── src └── e3fp │ ├── config │ ├── __init__.py │ ├── defaults.cfg │ └── params.py │ ├── conformer │ ├── __init__.py │ ├── protonation.py │ ├── util.py │ └── generator.py │ ├── fingerprint │ ├── __init__.py │ ├── metrics │ │ ├── __pycache__ │ │ │ ├── array_metrics._dense_soergel-225.py312.nbi │ │ │ ├── array_metrics._sparse_soergel-246.py312.nbi │ │ │ ├── array_metrics._dense_soergel-225.py312.1.nbc │ │ │ └── array_metrics._sparse_soergel-246.py312.1.nbc │ │ ├── fprint_metrics.py │ │ ├── __init__.py │ │ └── array_metrics.py │ ├── util.py │ ├── array_ops.py │ └── structs.py │ ├── __init__.py │ ├── pipeline.py │ └── util.py ├── doc ├── source │ ├── examples │ │ └── data │ │ │ ├── caffeine.smi │ │ │ ├── new_params.cfg │ │ │ └── test_smiles.smi │ ├── api │ │ ├── index.rst │ │ ├── e3fp.util.rst │ │ ├── e3fp.pipeline.rst │ │ ├── e3fp.config.params.rst │ │ ├── e3fp.conformer.util.rst │ │ ├── e3fp.fingerprint.db.rst │ │ ├── e3fp.fingerprint.util.rst │ │ ├── e3fp.conformer.generate.rst │ │ ├── e3fp.conformer.generator.rst │ │ ├── e3fp.fingerprint.fprint.rst │ │ ├── e3fp.fingerprint.structs.rst │ │ ├── e3fp.fingerprint.fprinter.rst │ │ ├── e3fp.fingerprint.generate.rst │ │ ├── e3fp.conformer.protonation.rst │ │ ├── e3fp.fingerprint.array_ops.rst │ │ ├── e3fp.fingerprint.metrics.array_metrics.rst │ │ ├── e3fp.fingerprint.metrics.fprint_metrics.rst │ │ ├── e3fp.config.rst │ │ ├── e3fp.conformer.rst │ │ ├── e3fp.fingerprint.metrics.rst │ │ ├── e3fp.rst │ │ └── e3fp.fingerprint.rst │ ├── _static │ │ └── ritalin3d.png │ ├── index.rst │ ├── usage │ │ ├── fingerprints │ │ │ ├── index.rst │ │ │ ├── comparison.rst │ │ │ ├── storage.rst │ │ │ └── fprints.rst │ │ ├── index.rst │ │ ├── config.rst │ │ ├── pipeline.rst │ │ └── cli.rst │ ├── _templates │ │ └── layout.html │ ├── overview.rst │ ├── substitutions.rst │ ├── install.rst │ ├── conf.py │ └── dev │ │ └── index.rst └── Makefile ├── pytest.ini ├── .gitignore ├── .coveragerc ├── .readthedocs.yml ├── .github └── workflows │ ├── publish.yml │ └── ci.yml ├── pyproject.toml ├── README.rst └── LICENSE.txt /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/e3fp/config/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/e3fp/conformer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/e3fp/fingerprint/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /doc/source/examples/data/caffeine.smi: -------------------------------------------------------------------------------- 1 | CN1C=NC2=C1C(=O)N(C(=O)N2C)C caffeine 2 | 
-------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | addopts = --doctest-modules 3 | doctest_optionflags = ELLIPSIS 4 | -------------------------------------------------------------------------------- /tests/data/stereo1.sdf.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/keiserlab/e3fp/HEAD/tests/data/stereo1.sdf.bz2 -------------------------------------------------------------------------------- /tests/data/stereo2.sdf.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/keiserlab/e3fp/HEAD/tests/data/stereo2.sdf.bz2 -------------------------------------------------------------------------------- /doc/source/api/index.rst: -------------------------------------------------------------------------------- 1 | e3fp API 2 | ======== 3 | 4 | .. toctree:: 5 | :maxdepth: 5 6 | 7 | e3fp 8 | -------------------------------------------------------------------------------- /doc/source/_static/ritalin3d.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/keiserlab/e3fp/HEAD/doc/source/_static/ritalin3d.png -------------------------------------------------------------------------------- /tests/data/caffeine_planar.sdf.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/keiserlab/e3fp/HEAD/tests/data/caffeine_planar.sdf.bz2 -------------------------------------------------------------------------------- /tests/data/ritalin_nonplanar.sdf.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/keiserlab/e3fp/HEAD/tests/data/ritalin_nonplanar.sdf.bz2 -------------------------------------------------------------------------------- /doc/source/examples/data/new_params.cfg: -------------------------------------------------------------------------------- 1 | [conformer_generation] 2 | first = 10 3 | 4 | [fingerprinting] 5 | bits = 4096 6 | first = 10 -------------------------------------------------------------------------------- /tests/data/rand_sdf_files/CHEMBL116226.sdf.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/keiserlab/e3fp/HEAD/tests/data/rand_sdf_files/CHEMBL116226.sdf.bz2 -------------------------------------------------------------------------------- /tests/data/rand_sdf_files/CHEMBL197946.sdf.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/keiserlab/e3fp/HEAD/tests/data/rand_sdf_files/CHEMBL197946.sdf.bz2 -------------------------------------------------------------------------------- /tests/data/rand_sdf_files/CHEMBL282186.sdf.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/keiserlab/e3fp/HEAD/tests/data/rand_sdf_files/CHEMBL282186.sdf.bz2 -------------------------------------------------------------------------------- /tests/data/rand_sdf_files/CHEMBL501745.sdf.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/keiserlab/e3fp/HEAD/tests/data/rand_sdf_files/CHEMBL501745.sdf.bz2 
-------------------------------------------------------------------------------- /tests/data/rand_sdf_files/CHEMBL2114064.sdf.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/keiserlab/e3fp/HEAD/tests/data/rand_sdf_files/CHEMBL2114064.sdf.bz2 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build/* 2 | dist/* 3 | doc/_build/* 4 | .cache/* 5 | .coverage 6 | .DS_Store 7 | *egg* 8 | *.pyc 9 | *.so 10 | *.o 11 | uv.lock 12 | docs 13 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | # .coveragerc to control coverage.py 2 | [run] 3 | branch = True 4 | source = e3fp 5 | omit = 6 | */e3fp/test/* 7 | */setup.py 8 | */doc/* 9 | -------------------------------------------------------------------------------- /src/e3fp/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib.metadata 2 | from .util import E3FPWarning, E3FPDeprecationWarning 3 | 4 | __version__ = importlib.metadata.version("e3fp") 5 | -------------------------------------------------------------------------------- /doc/source/api/e3fp.util.rst: -------------------------------------------------------------------------------- 1 | e3fp\.util module 2 | ================= 3 | 4 | .. automodule:: e3fp.util 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /doc/source/api/e3fp.pipeline.rst: -------------------------------------------------------------------------------- 1 | e3fp\.pipeline module 2 | ===================== 3 | 4 | .. automodule:: e3fp.pipeline 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /doc/source/api/e3fp.config.params.rst: -------------------------------------------------------------------------------- 1 | e3fp\.config\.params module 2 | =========================== 3 | 4 | .. automodule:: e3fp.config.params 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /doc/source/api/e3fp.conformer.util.rst: -------------------------------------------------------------------------------- 1 | e3fp\.conformer\.util module 2 | ============================ 3 | 4 | .. automodule:: e3fp.conformer.util 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /doc/source/api/e3fp.fingerprint.db.rst: -------------------------------------------------------------------------------- 1 | e3fp\.fingerprint\.db module 2 | ============================ 3 | 4 | .. 
automodule:: e3fp.fingerprint.db 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /src/e3fp/fingerprint/metrics/__pycache__/array_metrics._dense_soergel-225.py312.nbi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/keiserlab/e3fp/HEAD/src/e3fp/fingerprint/metrics/__pycache__/array_metrics._dense_soergel-225.py312.nbi -------------------------------------------------------------------------------- /src/e3fp/fingerprint/metrics/__pycache__/array_metrics._sparse_soergel-246.py312.nbi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/keiserlab/e3fp/HEAD/src/e3fp/fingerprint/metrics/__pycache__/array_metrics._sparse_soergel-246.py312.nbi -------------------------------------------------------------------------------- /src/e3fp/fingerprint/metrics/__pycache__/array_metrics._dense_soergel-225.py312.1.nbc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/keiserlab/e3fp/HEAD/src/e3fp/fingerprint/metrics/__pycache__/array_metrics._dense_soergel-225.py312.1.nbc -------------------------------------------------------------------------------- /src/e3fp/fingerprint/metrics/__pycache__/array_metrics._sparse_soergel-246.py312.1.nbc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/keiserlab/e3fp/HEAD/src/e3fp/fingerprint/metrics/__pycache__/array_metrics._sparse_soergel-246.py312.1.nbc -------------------------------------------------------------------------------- /doc/source/api/e3fp.fingerprint.util.rst: -------------------------------------------------------------------------------- 1 | e3fp\.fingerprint\.util module 2 | ============================== 3 | 4 | .. automodule:: e3fp.fingerprint.util 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /doc/source/api/e3fp.conformer.generate.rst: -------------------------------------------------------------------------------- 1 | e3fp\.conformer\.generate module 2 | ================================ 3 | 4 | .. automodule:: e3fp.conformer.generate 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /doc/source/api/e3fp.conformer.generator.rst: -------------------------------------------------------------------------------- 1 | e3fp\.conformer\.generator module 2 | ================================= 3 | 4 | .. automodule:: e3fp.conformer.generator 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /doc/source/api/e3fp.fingerprint.fprint.rst: -------------------------------------------------------------------------------- 1 | e3fp\.fingerprint\.fprint module 2 | ================================ 3 | 4 | .. automodule:: e3fp.fingerprint.fprint 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /doc/source/api/e3fp.fingerprint.structs.rst: -------------------------------------------------------------------------------- 1 | e3fp\.fingerprint\.structs module 2 | ================================= 3 | 4 | .. 
automodule:: e3fp.fingerprint.structs 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /doc/source/api/e3fp.fingerprint.fprinter.rst: -------------------------------------------------------------------------------- 1 | e3fp\.fingerprint\.fprinter module 2 | ================================== 3 | 4 | .. automodule:: e3fp.fingerprint.fprinter 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /doc/source/api/e3fp.fingerprint.generate.rst: -------------------------------------------------------------------------------- 1 | e3fp\.fingerprint\.generate module 2 | ================================== 3 | 4 | .. automodule:: e3fp.fingerprint.generate 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /doc/source/api/e3fp.conformer.protonation.rst: -------------------------------------------------------------------------------- 1 | e3fp\.conformer\.protonation module 2 | =================================== 3 | 4 | .. automodule:: e3fp.conformer.protonation 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /doc/source/api/e3fp.fingerprint.array_ops.rst: -------------------------------------------------------------------------------- 1 | e3fp\.fingerprint\.array\_ops module 2 | ==================================== 3 | 4 | .. automodule:: e3fp.fingerprint.array_ops 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /doc/source/api/e3fp.fingerprint.metrics.array_metrics.rst: -------------------------------------------------------------------------------- 1 | e3fp\.fingerprint\.metrics\.array\_metrics module 2 | ================================================= 3 | 4 | .. automodule:: e3fp.fingerprint.metrics.array_metrics 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /doc/source/api/e3fp.fingerprint.metrics.fprint_metrics.rst: -------------------------------------------------------------------------------- 1 | e3fp\.fingerprint\.metrics\.fprint\_metrics module 2 | ================================================== 3 | 4 | .. automodule:: e3fp.fingerprint.metrics.fprint_metrics 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /doc/source/index.rst: -------------------------------------------------------------------------------- 1 | .. e3fp documentation master file 2 | 3 | e3fp 4 | ==== 5 | 6 | :Release: |version| 7 | :Date: |today| 8 | 9 | Contents 10 | ----------------- 11 | 12 | .. toctree:: 13 | :maxdepth: 2 14 | 15 | overview 16 | install 17 | usage/index 18 | dev/index 19 | api/index 20 | -------------------------------------------------------------------------------- /doc/source/api/e3fp.config.rst: -------------------------------------------------------------------------------- 1 | e3fp\.config package 2 | ==================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | .. toctree:: 8 | 9 | e3fp.config.params 10 | 11 | Module contents 12 | --------------- 13 | 14 | .. 
automodule:: e3fp.config 15 | :members: 16 | :undoc-members: 17 | :show-inheritance: 18 | -------------------------------------------------------------------------------- /tests/test_config.py: -------------------------------------------------------------------------------- 1 | """Tests for loading config files. 2 | 3 | Author: Seth Axen 4 | E-mail: seth.axen@gmail.com 5 | """ 6 | import os 7 | 8 | 9 | class TestConfig: 10 | def test_config_file_exists(self): 11 | from e3fp.config.params import DEF_PARAM_FILE 12 | 13 | assert os.path.isfile(DEF_PARAM_FILE) 14 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | build: 4 | os: "ubuntu-22.04" 5 | tools: 6 | python: "3.13" 7 | commands: 8 | - asdf plugin add uv 9 | - asdf install uv latest 10 | - asdf global uv latest 11 | - uv sync --extra docs 12 | - uv run -m sphinx -T -b html -d docs/_build/doctrees doc/source $READTHEDOCS_OUTPUT/html 13 | -------------------------------------------------------------------------------- /doc/source/usage/fingerprints/index.rst: -------------------------------------------------------------------------------- 1 | Using Fingerprints 2 | ================== 3 | 4 | While molecular fingerprints are widely used, few packages provide simple 5 | interfaces for working with them and interfacing with machine learning 6 | packages. E3FP provides a number of general utility classes and methods for 7 | doing precisely this. 8 | 9 | .. toctree:: 10 | fprints 11 | storage 12 | comparison 13 | -------------------------------------------------------------------------------- /doc/source/api/e3fp.conformer.rst: -------------------------------------------------------------------------------- 1 | e3fp\.conformer package 2 | ======================= 3 | 4 | Submodules 5 | ---------- 6 | 7 | .. toctree:: 8 | 9 | e3fp.conformer.generate 10 | e3fp.conformer.generator 11 | e3fp.conformer.protonation 12 | e3fp.conformer.util 13 | 14 | Module contents 15 | --------------- 16 | 17 | .. automodule:: e3fp.conformer 18 | :members: 19 | :undoc-members: 20 | :show-inheritance: 21 | -------------------------------------------------------------------------------- /doc/source/api/e3fp.fingerprint.metrics.rst: -------------------------------------------------------------------------------- 1 | e3fp\.fingerprint\.metrics package 2 | ================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | .. toctree:: 8 | 9 | e3fp.fingerprint.metrics.array_metrics 10 | e3fp.fingerprint.metrics.fprint_metrics 11 | 12 | Module contents 13 | --------------- 14 | 15 | .. automodule:: e3fp.fingerprint.metrics 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | -------------------------------------------------------------------------------- /doc/source/api/e3fp.rst: -------------------------------------------------------------------------------- 1 | e3fp package 2 | ============ 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | e3fp.config 10 | e3fp.conformer 11 | e3fp.fingerprint 12 | 13 | Submodules 14 | ---------- 15 | 16 | .. toctree:: 17 | 18 | e3fp.pipeline 19 | e3fp.util 20 | 21 | Module contents 22 | --------------- 23 | 24 | .. 
automodule:: e3fp 25 | :members: 26 | :undoc-members: 27 | :show-inheritance: 28 | -------------------------------------------------------------------------------- /doc/source/usage/index.rst: -------------------------------------------------------------------------------- 1 | Usage and Examples 2 | ================== 3 | 4 | To facilitate flexible use of the E3FP package, we provide multiple interfaces 5 | for performing the same tasks. We have organized these below in the order in 6 | which we expect them to be most of use to the average user. 7 | 8 | .. toctree:: 9 | :caption: Sections 10 | :maxdepth: 2 11 | 12 | config 13 | cli 14 | pipeline 15 | fingerprints/index 16 | -------------------------------------------------------------------------------- /doc/source/_templates/layout.html: -------------------------------------------------------------------------------- 1 | {% extends "!layout.html" %} 2 | 3 | {% block footer %} 4 | {{ super() }} 5 | 14 | {% endblock %} -------------------------------------------------------------------------------- /doc/source/api/e3fp.fingerprint.rst: -------------------------------------------------------------------------------- 1 | e3fp\.fingerprint package 2 | ========================= 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | e3fp.fingerprint.metrics 10 | 11 | Submodules 12 | ---------- 13 | 14 | .. toctree:: 15 | 16 | e3fp.fingerprint.array_ops 17 | e3fp.fingerprint.db 18 | e3fp.fingerprint.fprint 19 | e3fp.fingerprint.fprinter 20 | e3fp.fingerprint.generate 21 | e3fp.fingerprint.structs 22 | e3fp.fingerprint.util 23 | 24 | Module contents 25 | --------------- 26 | 27 | .. automodule:: e3fp.fingerprint 28 | :members: 29 | :undoc-members: 30 | :show-inheritance: 31 | -------------------------------------------------------------------------------- /src/e3fp/config/defaults.cfg: -------------------------------------------------------------------------------- 1 | [preprocessing] 2 | standardise = False 3 | protonate = False 4 | 5 | [conformer_generation] 6 | num_conf = -1 7 | first = -1 8 | pool_multiplier = 1 9 | rmsd_cutoff = 0.5 10 | max_energy_diff = None 11 | forcefield = uff 12 | out_dir = conformers 13 | compress = 2 14 | seed = -1 15 | 16 | ; Optimized parameters used in 17 | ; Axen et al. 2017 18 | [fingerprinting] 19 | bits = 1024 20 | level = 5 21 | first = 3 22 | radius_multiplier = 1.718 23 | stereo = True 24 | counts = False 25 | include_disconnected = True 26 | rdkit_invariants = False 27 | remove_duplicate_substructs = True 28 | exclude_floating = True 29 | -------------------------------------------------------------------------------- /src/e3fp/fingerprint/util.py: -------------------------------------------------------------------------------- 1 | """Utility methods and class for fingerprinting-related functions. 
2 | 3 | Author: Seth Axen 4 | E-mail: seth.axen@gmail.com 5 | """ 6 | from ..util import E3FPError 7 | 8 | 9 | class E3FPInvalidFingerprintError(E3FPError, TypeError): 10 | """Fingerprint is incorrectly formatted.""" 11 | 12 | 13 | class E3FPMolError(E3FPError, TypeError): 14 | """Mol is of incorrect type.""" 15 | 16 | 17 | class E3FPBitsValueError(E3FPError, ValueError): 18 | """Bits value is invalid.""" 19 | 20 | 21 | class E3FPCountsError(E3FPError, ValueError): 22 | """Index in counts is invalid.""" 23 | 24 | 25 | class E3FPOptionError(E3FPError, ValueError): 26 | """Option provided is invalid.""" 27 | -------------------------------------------------------------------------------- /doc/source/examples/data/test_smiles.smi: -------------------------------------------------------------------------------- 1 | CCCC[C@H](CN(O)C=O)C(=O)[C@@H](NC(=O)c1ccccc1)C(C)C CHEMBL1643865 2 | CCCC[C@H](CN(O)C=O)C(=O)[C@@H](NC(=O)C(C)C)C(C)C CHEMBL1643866 3 | CCCC[C@H](CN(O)C=O)C(=O)[C@@H](NC(=O)c1ccccn1)C(C)C CHEMBL1643867 4 | CCCC[C@H](CN(O)C=O)C(=O)[C@@H](NC(=O)c1cccc(OC)c1)C(C)C CHEMBL1643868 5 | CCCC[C@H](CN(O)C=O)C(=O)[C@@H](NC(=O)c1cccc(F)c1)C(C)C CHEMBL1643869 6 | CN1CCN(C(=O)c2ccc3n2Cc2ccccc2N(C(=O)c2ccc(NC(=O)c4ccccc4-c4ccccc4)cc2Cl)C3)CC1 CHEMBL164387 7 | CCCC[C@H](CN(O)C=O)C(=O)[C@@H](NC(=O)c1cccnc1)C(C)C CHEMBL1643870 8 | CCCC[C@H](CN(O)C=O)C(=O)[C@@H](NC(=O)c1ccncc1)C(C)C CHEMBL1643871 9 | CCCC[C@H](CN(O)C=O)C(=O)[C@@H](NC(=O)c1ccc2ccccc2n1)C(C)C CHEMBL1643872 10 | CCCC[C@H](CN(O)C=O)C(=O)[C@@H](NC(=O)c1nccc2ccccc21)C(C)C CHEMBL1643873 11 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = e3fp 8 | SOURCEDIR = source 9 | BUILDDIR = _build 10 | 11 | # Internal variables 12 | PAPEROPT_a4 = -D latex_paper_size=a4 13 | PAPEROPT_letter = -D latex_paper_size=letter 14 | ALLSPHINXOPTS = -d build/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source 15 | 16 | # Put it first so that "make" without argument is like "make help". 17 | help: 18 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 19 | 20 | .PHONY: help Makefile 21 | 22 | # Catch-all target: route all unknown targets to Sphinx using the new 23 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 24 | %: Makefile 25 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 26 | -------------------------------------------------------------------------------- /tests/test_dependencies.py: -------------------------------------------------------------------------------- 1 | """Integration tests for dependencies. 
2 | 3 | Author: Seth Axen 4 | E-mail: seth.axen@gmail.com 5 | """ 6 | 7 | 8 | class TestRequiredDependencies: 9 | def test_rdkit(self): 10 | import rdkit 11 | 12 | def test_numpy(self): 13 | import numpy 14 | 15 | def test_scipy(self): 16 | import scipy 17 | 18 | def test_murmurhash(self): 19 | import mmh3 20 | 21 | def test_python_utilities(self): 22 | import python_utilities 23 | 24 | 25 | class TestOptionalFeatureDependencies: 26 | def test_h5py(self): 27 | import h5py 28 | 29 | def test_standardiser(self): 30 | import standardiser 31 | 32 | 33 | class TestOptionalParallelDependencies: 34 | def test_mpi4py(self): 35 | import mpi4py 36 | 37 | def test_concurrent(self): 38 | import concurrent.futures 39 | 40 | def test_python_utilities(self): 41 | import python_utilities.parallel 42 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish 2 | 3 | on: 4 | release: 5 | types: 6 | - published 7 | 8 | jobs: 9 | build-test: 10 | runs-on: ubuntu-latest 11 | env: 12 | uv_version: "0.5.2" 13 | python_version: "3.13" 14 | steps: 15 | - uses: actions/checkout@v2 16 | - name: Setup MPI 17 | uses: mpi4py/setup-mpi@v1 18 | - name: Install uv 19 | uses: astral-sh/setup-uv@v3 20 | with: 21 | version: ${{ env.uv_version }} 22 | - name: Build the project 23 | run: uv build --no-sources --python ${{ env.python_version }} 24 | - name: Sync only the test dependencies 25 | run: uv sync --no-install-project --extra test 26 | - name: Install and test source distribution 27 | run: | 28 | uv pip install dist/*.tar.gz 29 | uv run --no-sync pytest 30 | uv pip uninstall e3fp 31 | - name: Install and test wheel 32 | run: | 33 | uv pip install dist/*.whl 34 | uv run --no-sync pytest 35 | - name: Publish to PyPI 36 | run: uv publish --token ${{ secrets.PYPI_API_TOKEN }} 37 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: 3 | push: 4 | branches: [master] 5 | pull_request: 6 | 7 | env: 8 | # Setting RDMAV_FORK_SAFE=1 to avoid libfabric EFA provider issues with 9 | # fork() on Python 3.9 and Ubuntu. 
10 | RDMAV_FORK_SAFE: 1 11 | 12 | jobs: 13 | test: 14 | name: Python ${{ matrix.python-version }} - ${{ matrix.os }} 15 | runs-on: ${{ matrix.os }} 16 | env: 17 | uv_version: "0.5.2" 18 | strategy: 19 | matrix: 20 | os: ["ubuntu-latest", "macos-latest"] 21 | python-version: ["3.9", "3.13"] 22 | fail-fast: false 23 | steps: 24 | - uses: actions/checkout@v2 25 | with: 26 | fetch-depth: 2 27 | - name: Setup MPI 28 | uses: mpi4py/setup-mpi@v1 29 | - name: Install uv 30 | uses: astral-sh/setup-uv@v3 31 | with: 32 | version: ${{ env.uv_version }} 33 | - name: Install the project 34 | run: uv sync --extra test --python ${{ matrix.python-version }} 35 | - name: Run tests 36 | run: uv run pytest --cov=e3fp --cov-report=xml 37 | - name: Upload coverage to Codecov 38 | uses: codecov/codecov-action@v5 39 | with: 40 | fail_ci_if_error: false 41 | -------------------------------------------------------------------------------- /doc/source/overview.rst: -------------------------------------------------------------------------------- 1 | Overview of E3FP 2 | ================ 3 | 4 | Introduction 5 | ------------ 6 | 7 | The Extended 3-Dimensional FingerPrint (E3FP) [1]_ is a 3D molecular 8 | fingerprinting method inspired by Extended Connectivity FingerPrints (ECFP) 9 | [2]_, integrating tightly with the RDKit_. It is developed by the 10 | `Keiser Lab`_ at UCSF_ and maintained primarily by `Seth Axen`_. 11 | 12 | For a thorough description of E3FP, please consult the original paper [1]_ and 13 | `paper repository`_ or :ref:`usage/index:Usage and Examples`. 14 | 15 | Documentation is hosted by ReadTheDocs_. 16 | 17 | Contributing 18 | ------------ 19 | 20 | Development occurs on GitHub_. 21 | Contributions, feature requests, and bug reports are greatly appreciated. 22 | Please consult the `issue tracker`_. 23 | 24 | License 25 | ------- 26 | E3FP is released under the |license_long| (|license|). 27 | 28 | Briefly, this means E3FP can be used in any manner without modification, 29 | with proper attribution. However, if the source code is modified for an 30 | application, this modified source must also be released under |license| so that 31 | the community may benefit. 32 | 33 | Citing E3FP 34 | ----------- 35 | 36 | To cite E3FP, please reference the original paper [1]_. 37 | 38 | .. rubric:: References 39 | 40 | .. [1] |axen2017| 41 | .. [2] |rogers2010| 42 | 43 | .. include:: substitutions.rst 44 | .. _GitHub: https://github.com/keiserlab/e3fp 45 | -------------------------------------------------------------------------------- /tests/test_util.py: -------------------------------------------------------------------------------- 1 | """Tests for util methods. 2 | 3 | Author: Seth Axen 4 | E-mail: seth.axen@gmail.com 5 | """ 6 | import pytest 7 | import warnings 8 | 9 | 10 | class TestUtil: 11 | def test_deprecated(self): 12 | from e3fp.util import deprecated, E3FPDeprecationWarning 13 | 14 | @deprecated("1.1", remove_version="1.3", msg="DEPRECATED!!!") 15 | def dep_method(): 16 | pass 17 | 18 | with warnings.catch_warnings(record=True) as w: 19 | warnings.simplefilter("always") 20 | dep_method() 21 | assert len(w) == 1 22 | assert issubclass(w[-1].category, E3FPDeprecationWarning) 23 | message = str(w[-1].message) 24 | assert "deprecated in 1.1" in message 25 | assert "removed in 1.3" in message 26 | assert "DEPRECATED!!!" in str(w[-1].message) 27 | 28 | assert "\t.. deprecated:: 1.1\n\t DEPRECATED!!!" 
in dep_method.__doc__ 29 | 30 | def test_efficiency_warning(self): 31 | from e3fp.util import E3FPEfficiencyWarning 32 | 33 | def test(warn=False): 34 | if warn: 35 | raise E3FPEfficiencyWarning("Inefficient!") 36 | 37 | with warnings.catch_warnings(record=True): 38 | warnings.simplefilter("error") 39 | test(warn=False) 40 | 41 | with pytest.raises(E3FPEfficiencyWarning): 42 | test(warn=True) 43 | -------------------------------------------------------------------------------- /doc/source/usage/fingerprints/comparison.rst: -------------------------------------------------------------------------------- 1 | Fingerprint Comparison 2 | ====================== 3 | 4 | The `e3fp.fingerprint.metrics` sub-package provides several useful methods for 5 | batch comparison of fingerprints in various representations. 6 | 7 | Fingerprint Metrics 8 | ------------------- 9 | 10 | These metrics operate directly on pairs of :py:class:`.Fingerprint` and 11 | :py:class:`.FingerprintDatabase` objects or on a combination of each. If 12 | only a single variable is specified, self-comparison is performed. The 13 | implemented methods are common functions for fingerprint similarity in the 14 | literature. 15 | 16 | .. todo:: 17 | 18 | Document examples 19 | 20 | Array Metrics 21 | ------------- 22 | 23 | To efficiently compare fingerprint databases above, we provide comparison 24 | metrics that can operate directly on the internal sparse matrix representation 25 | without the need to "densify it". We describe these here, as they have several 26 | additional features. 27 | 28 | The array metrics implemented in `e3fp.fingerprint.metrics.array_metrics` are 29 | implemented such that they may take any combination of dense and sparse inputs. 30 | Additionally, they are designed to function as 31 | `scikit-learn-compatible kernels `_ 32 | for machine learning tasks. For example, one might perform an analysis using a 33 | support vector machine (SVM) and Tanimoto kernel. 34 | 35 | .. code:: python 36 | 37 | >>> from sklearn.svm import SVC 38 | >>> from e3fp.fingerprint.metrics.array_metrics import tanimoto 39 | >>> clf = SVC(kernel=tanimoto) 40 | >>> clf.fit(X, y) 41 | ... 42 | >>> clf.predict(test) 43 | ... 44 | 45 | Most common fingerprint comparison metrics only apply to binary fingerprints. 46 | We include several that operate equally well on count- and float-based 47 | fingerprints. For example, to our knowledge, we provide the only open source 48 | implementation of Soergel similarity, the analog to the Tanimoto coefficient 49 | for non-binary fingerprints that can efficiently operate on sparse inputs. 50 | 51 | .. code:: python 52 | 53 | >>> from e3fp.fingerprint.metrics.array_metrics import soergel 54 | >>> clf = SVC(kernel=soergel) 55 | >>> clf.fit(X, y) 56 | ... 57 | >>> clf.predict(test) 58 | ... 59 | -------------------------------------------------------------------------------- /doc/source/substitutions.rst: -------------------------------------------------------------------------------- 1 | .. Common substitutions used throughout the documentation 2 | 3 | .. URLs 4 | .. _RDKit: http://www.rdkit.org 5 | .. _NumPy: https://www.numpy.org 6 | .. _SciPy: https://www.scipy.org 7 | .. _mmh3: https://pypi.python.org/pypi/mmh3 8 | .. _python_utilities: https://github.com/sdaxen/python_utilities 9 | .. _mpi4py: http://mpi4py.scipy.org 10 | .. _smart_open: https://github.com/RaRe-Technologies/smart_open 11 | .. _standardiser: https://wwwdev.ebi.ac.uk/chembl/extra/francis/standardiser 12 | .. 
_cxcalc: https://docs.chemaxon.com/display/CALCPLUGS/cxcalc+command+line+tool 13 | .. _h5py: http://www.h5py.org/ 14 | .. _numba: https://numba.pydata.org/ 15 | .. _Anaconda: https://anaconda.org/conda-forge/e3fp 16 | .. _uv: https://docs.astral.sh/uv/ 17 | .. _repository: https://github.com/keiserlab/e3fp 18 | .. _paper repository: https://github.com/keiserlab/e3fp-paper 19 | .. _issue tracker: https://github.com/keiserlab/e3fp/issues 20 | .. _ReadTheDocs: http://e3fp.readthedocs.io 21 | .. _Keiser Lab: http://www.keiserlab.org 22 | .. _UCSF: https://www.ucsf.edu 23 | .. _Seth Axen: http://sethaxen.com 24 | 25 | .. Badges 26 | .. |bioRxiv| image:: https://img.shields.io/badge/bioRxiv-136705-blue.svg 27 | :target: https://doi.org/10.1101/136705 28 | :alt: Access the preprint on bioRxiv 29 | 30 | .. References 31 | .. |axen2017_doi| image:: https://img.shields.io/badge/doi-10.1021/acs.jmedchem.7b00696-blue.svg 32 | :target: http://dx.doi.org/10.1021/acs.jmedchem.7b00696 33 | :alt: Access the paper 34 | .. |axen2017| replace:: Axen SD, Huang XP, Caceres EL, Gendelev L, Roth BL, Keiser MJ. A Simple Representation Of Three-Dimensional Molecular Structure. *J. Med. Chem.* **60** (17): 7393–7409 (2017). |axen2017_doi| |bioRxiv| 35 | .. |rogers2010_doi| image:: https://img.shields.io/badge/doi-10.1021/ci100050t-blue.svg 36 | :target: http://dx.doi.org/10.1021/ci100050t 37 | :alt: Access the paper 38 | .. |rogers2010| replace:: Rogers D & Hahn M. Extended-connectivity fingerprints. *J. Chem. Inf. Model.* **50**: 742-54 (2010). |rogers2010_doi| 39 | 40 | .. Misc 41 | .. |license_link| replace:: https://github.com/keiserlab/e3fp/blob/master/LICENSE.txt 42 | .. |license_long| replace:: `GNU Lesser General Public License version 3.0`_ 43 | .. _GNU Lesser General Public License version 3.0: https://github.com/keiserlab/e3fp/blob/master/LICENSE.txt 44 | .. |license| replace:: LGPLv3 45 | -------------------------------------------------------------------------------- /doc/source/install.rst: -------------------------------------------------------------------------------- 1 | Setup and Installation 2 | ====================== 3 | 4 | Dependencies 5 | ------------ 6 | 7 | E3FP is compatible with Python 3.x. It additionally has the 8 | following dependencies: 9 | 10 | Required 11 | ~~~~~~~~ 12 | 13 | - NumPy_ 14 | - SciPy_ 15 | - RDKit_ 16 | - mmh3_ 17 | - python_utilities_ 18 | - smart_open_ 19 | 20 | Optional 21 | ~~~~~~~~ 22 | 23 | The following packages are required for the specified features: 24 | 25 | - parallelization: 26 | 27 | + mpi4py_ 28 | 29 | - molecular standardisation: 30 | 31 | + standardiser_ 32 | 33 | - protonation states: 34 | 35 | + cxcalc_ 36 | 37 | - storing conformer energies: 38 | 39 | + h5py_ 40 | 41 | - faster fingerprint metric calculations: 42 | 43 | + numba_ 44 | 45 | 46 | Installation 47 | ------------ 48 | 49 | The following installation approaches are listed in order of recommendation. 50 | 51 | Option 1: Install with Pip 52 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ 53 | 54 | Basic installation: 55 | 56 | .. code:: bash 57 | 58 | $ pip install e3fp 59 | 60 | With optional dependencies: 61 | 62 | .. code:: bash 63 | 64 | $ pip install e3fp[optional] 65 | 66 | 67 | Option 2: Install from conda-forge 68 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 69 | 70 | E3FP is available on conda-forge. 71 | 72 | .. code:: bash 73 | 74 | $ conda create -n e3fp_env -c conda-forge e3fp 75 | $ conda activate e3fp_env 76 | 77 | To install optional dependencies: 78 | 79 | .. 
code:: bash 80 | 81 | $ conda install -c conda-forge mpi4py h5py standardiser 82 | 83 | Option 3: Install from source 84 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 85 | 86 | 1. Clone the repository: 87 | 88 | .. code:: bash 89 | 90 | $ git clone https://github.com/keiserlab/e3fp.git 91 | $ cd e3fp 92 | 93 | 2. Install for development in an already-activated environment. 94 | 95 | You can do this using pip: 96 | 97 | .. code:: bash 98 | 99 | $ pip install -e .[dev] 100 | 101 | Or use uv_ to set up a development environment: 102 | 103 | .. code:: bash 104 | 105 | $ uv sync --extra dev 106 | 107 | Testing 108 | ------- 109 | 110 | Run tests using pytest: 111 | 112 | .. code:: bash 113 | 114 | $ pip install pytest # if not already installed 115 | $ pytest e3fp 116 | 117 | 118 | .. include:: substitutions.rst 119 | -------------------------------------------------------------------------------- /doc/source/usage/config.rst: -------------------------------------------------------------------------------- 1 | Configuration 2 | ============= 3 | 4 | E3FP configurational parameters are stored in the widely used INI_ file 5 | format. These may be passed to :ref:`usage/cli:Command Line Interface` programs 6 | or parsed to Python dicts for :ref:`usage/pipeline:Pipeline Methods` or other 7 | lower-level functions. 8 | 9 | Loading Default Parameters 10 | -------------------------- 11 | 12 | The below example shows all default parameters, accessed via the 13 | :py:mod:`e3fp.config` module. 14 | 15 | .. literalinclude:: ../../../src/e3fp/config/defaults.cfg 16 | :caption: `defaults.cfg `_ 17 | 18 | :py:mod:`configparser` is used internally to parse and store these 19 | config parameters. 20 | 21 | >>> from e3fp.config.params import default_params 22 | >>> default_params 23 | 24 | >>> print(default_params.sections()) 25 | ['preprocessing', 'conformer_generation', 'fingerprinting'] 26 | >>> default_params.items('fingerprinting') 27 | [('bits', '1024'), ('level', '5'), ('first', '3'), ('radius_multiplier', '1.718'), ('stereo', 'True'), ('counts', 'False'), ('include_disconnected', 'True'), ('rdkit_invariants', 'False'), ('merge_duplicate_substructs', 'True'), ('exclude_floating', 'True')] 28 | 29 | Parsing User-Provided Parameters 30 | -------------------------------- 31 | 32 | A user may provide a custom config file. 33 | 34 | .. literalinclude:: ../examples/data/new_params.cfg 35 | :caption: new_params.cfg 36 | 37 | .. doctest:: 38 | 39 | >>> from e3fp.config.params import read_params 40 | >>> config = read_params("source/examples/data/new_params.cfg") 41 | >>> config.items('fingerprinting') 42 | [('bits', '4096'), ('first', '10')] 43 | 44 | When passing these parameters to any downstream methods, default options will 45 | be used except where these options are specified. 46 | 47 | Converting Parameters to Argument Dicts 48 | --------------------------------------- 49 | 50 | To pass the parameters to Python methods for fingerprinting and conformer 51 | generation, we need to convert them to Python dicts. 52 | 53 | >>> from e3fp.pipeline import params_to_dicts 54 | >>> confgen_params, fprint_params = params_to_dicts(config) 55 | >>> fprint_params 56 | {'bits': 4096, 'first': 10} 57 | 58 | .. 
_INI: https://en.wikipedia.org/wiki/INI_file 59 | 60 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["flit_core >=3.2,<4"] 3 | build-backend = "flit_core.buildapi" 4 | 5 | [project] 6 | name = "e3fp" 7 | version = "1.2.7" 8 | requires-python = ">=3.9, <3.14" 9 | description = "Molecular 3D fingerprinting" 10 | readme = "README.rst" 11 | authors = [ 12 | {name = "Seth Axen", email = "seth.axen@gmail.com"}, 13 | ] 14 | license = {file = "LICENSE.txt"} 15 | keywords = ["e3fp", "3d", "molecule", "fingerprint", "conformer"] 16 | classifiers = [ 17 | "Programming Language :: Python", 18 | "Programming Language :: Python :: 3.9", 19 | "Programming Language :: Python :: 3.10", 20 | "Programming Language :: Python :: 3.11", 21 | "Programming Language :: Python :: 3.12", 22 | "Programming Language :: Python :: 3.13", 23 | "License :: OSI Approved :: GNU Lesser General Public License v3 (LGPLv3)", 24 | "Operating System :: OS Independent", 25 | "Development Status :: 4 - Beta", 26 | "Intended Audience :: Science/Research", 27 | "Intended Audience :: Developers", 28 | "Topic :: Scientific/Engineering :: Chemistry", 29 | "Topic :: Software Development :: Libraries :: Python Modules", 30 | ] 31 | dependencies = [ 32 | "mmh3>=2.3.1", 33 | "numpy>=1.11.3", 34 | "rdkit>=2016.03.4", 35 | "scipy>=0.18.0", 36 | "sdaxen_python_utilities>=0.1.5", 37 | "smart_open>=1.8.3", 38 | ] 39 | 40 | [project.optional-dependencies] 41 | optional = [ 42 | "h5py", 43 | "mpi4py", 44 | "numba", 45 | "six", # needed by standardiser, but not listed as a dependency 46 | "standardiser", 47 | ] 48 | test = [ 49 | "mock", 50 | "pytest", 51 | "pytest-cov", 52 | "e3fp[optional]", 53 | ] 54 | docs = [ 55 | "sphinx", 56 | "sphinxcontrib-programoutput", 57 | "sphinx-rtd-theme", 58 | ] 59 | dev = [ 60 | "e3fp[docs]", 61 | "e3fp[test]", 62 | ] 63 | 64 | [project.urls] 65 | Homepage = "https://github.com/keiserlab/e3fp" 66 | Download = "https://github.com/keiserlab/e3fp/tarball/{version}" 67 | 68 | [project.scripts] 69 | e3fp-fingerprint = "e3fp.fingerprint.generate:main" 70 | e3fp-conformer = "e3fp.conformer.generate:main" 71 | 72 | [tool.pytest.ini_options] 73 | addopts = "-ra -q" 74 | testpaths = ["e3fp/test"] 75 | 76 | # https://github.com/astral-sh/uv/issues/6281 77 | [tool.uv] 78 | constraint-dependencies = ["numba>=0.60.0"] 79 | # Resolve dependencies separately for each Python version 80 | environments = [ 81 | "python_version>='3.13'", 82 | "python_version=='3.12'", 83 | "python_version=='3.11'", 84 | "python_version=='3.10'", 85 | "python_version=='3.9'", 86 | ] 87 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | E3FP: Extended 3-Dimensional FingerPrint 2 | ======================================== 3 | 4 | |Docs Status| |CI Status| |Codecov Status| |PyPi Version| |Conda Version| |License| 5 | 6 | E3FP [1]_ is a 3D molecular fingerprinting method inspired by Extended 7 | Connectivity FingerPrints (ECFP) [2]_, integrating tightly with the RDKit_. 8 | 9 | Documentation is hosted by ReadTheDocs_, and development occurs on GitHub_. 10 | 11 | Installation and Usage 12 | ---------------------- 13 | 14 | For installation and usage instructions, see the 15 | `documentation `__. 
16 | 17 | See the E3FP `paper repository`_ for an application of E3FP and all code used 18 | for the E3FP paper [1]_. 19 | 20 | License 21 | ------- 22 | 23 | E3FP is available under the `GNU Lesser General Public License version 3.0 24 | `_ (LGPLv3). See the 25 | `documentation `__ 26 | for more details. 27 | 28 | 29 | References 30 | ---------- 31 | 32 | .. [1] |axen2017| 33 | .. [2] |rogers2010| 34 | 35 | .. substitutions 36 | 37 | .. _RDKit: http://www.rdkit.org 38 | .. _GitHub: https://github.com/keiserlab/e3fp 39 | .. _paper repository: https://github.com/keiserlab/e3fp-paper 40 | .. _ReadTheDocs: http://e3fp.readthedocs.io 41 | .. |axen2017_doi| image:: https://img.shields.io/badge/doi-10.1021/acs.jmedchem.7b00696-blue.svg 42 | :target: http://dx.doi.org/10.1021/acs.jmedchem.7b00696 43 | :alt: Access the paper 44 | .. |axen2017| replace:: Axen SD, Huang XP, Caceres EL, Gendelev L, Roth BL, Keiser MJ. A Simple Representation Of Three-Dimensional Molecular Structure. *J. Med. Chem.* **60** (17): 7393–7409 (2017). |axen2017_doi| |bioRxiv| 45 | .. |rogers2010_doi| image:: https://img.shields.io/badge/doi-10.1021/ci100050t-blue.svg 46 | :target: http://dx.doi.org/10.1021/ci100050t 47 | :alt: Access the paper 48 | .. |rogers2010| replace:: Rogers D & Hahn M. Extended-connectivity fingerprints. *J. Chem. Inf. Model.* **50**: 742-54 (2010). |rogers2010_doi| 49 | .. |CI Status| image:: https://github.com/keiserlab/e3fp/actions/workflows/ci.yml/badge.svg 50 | :target: https://github.com/keiserlab/e3fp/actions?query=workflow%3ACI 51 | :alt: CI Status 52 | .. |Docs Status| image:: http://readthedocs.org/projects/e3fp/badge/?version=latest 53 | :target: http://e3fp.readthedocs.io/en/latest/?badge=latest 54 | :alt: Documentation Status 55 | .. |Codecov Status| image:: https://codecov.io/github/keiserlab/e3fp/coverage.svg?branch=master 56 | :target: https://codecov.io/github/keiserlab/e3fp?branch=master 57 | :alt: Code Coverage 58 | .. |PyPi Version| image:: https://img.shields.io/pypi/v/e3fp.svg 59 | :target: https://pypi.python.org/pypi/e3fp 60 | :alt: Package on PyPi 61 | .. |Conda Version| image:: https://img.shields.io/conda/v/conda-forge/e3fp.svg 62 | :target: https://anaconda.org/conda-forge/e3fp 63 | :alt: Package on Anaconda 64 | .. |License| image:: https://img.shields.io/badge/license-LGPLv3-blue.svg 65 | :target: https://github.com/keiserlab/e3fp/blob/master/LICENSE.txt 66 | .. |bioRxiv| image:: https://img.shields.io/badge/bioRxiv-136705-blue.svg 67 | :target: https://doi.org/10.1101/136705 68 | :alt: Access the preprint on bioRxiv 69 | -------------------------------------------------------------------------------- /src/e3fp/pipeline.py: -------------------------------------------------------------------------------- 1 | """Functions for various pipeline use cases. 
2 | 3 | Author: Seth Axen 4 | E-mail: seth.axen@gmail.com 5 | """ 6 | from .config.params import params_to_sections_dict 7 | from .conformer.util import mol_from_smiles, mol_from_sdf, mol_to_sdf 8 | from .conformer.generate import generate_conformers 9 | from .fingerprint.generate import fprints_dict_from_mol 10 | 11 | 12 | def params_to_dicts(params): 13 | """Get params dicts for pipeline functions from INI format params file.""" 14 | sections_dict = params_to_sections_dict(params, auto=True) 15 | 16 | # preproc_params will eventually be returned separately, when there's a 17 | # pipeline function for protonation 18 | preproc_params = sections_dict.get("preprocessing", {}) 19 | confgen_params = sections_dict.get("conformer_generation", {}) 20 | confgen_params.update(preproc_params) 21 | fprint_params = sections_dict.get("fingerprinting", {}) 22 | return confgen_params, fprint_params 23 | 24 | 25 | def confs_from_smiles(smiles, name, confgen_params={}, save=False): 26 | """Generate conformations of molecule from SMILES string.""" 27 | mol = mol_from_smiles(smiles, name) 28 | confgen_result = generate_conformers( 29 | mol, name, save=save, **confgen_params 30 | ) 31 | mol = confgen_result[0] 32 | return mol 33 | 34 | 35 | def sdf_from_smiles( 36 | smiles, name, confgen_params={}, out_file=None, out_ext=".sdf.bz2" 37 | ): 38 | """Generate conformations from SMILES string and save to SDF file.""" 39 | mol = confs_from_smiles( 40 | smiles, name, confgen_params=confgen_params, save=False 41 | ) 42 | if out_file is None: 43 | out_file = name + out_ext 44 | mol_to_sdf(mol, out_file) 45 | 46 | 47 | def fprints_from_fprints_dict(fprints_dict, level=-1): 48 | """Get fingerprint at `level` from dict of level to fingerprint.""" 49 | fprints_list = fprints_dict.get( 50 | level, fprints_dict[max(fprints_dict.keys())] 51 | ) 52 | return fprints_list 53 | 54 | 55 | def fprints_from_mol(mol, fprint_params={}, save=False): 56 | """Generate fingerprints for all `first` conformers in mol.""" 57 | fprints_dict = fprints_dict_from_mol(mol, save=save, **fprint_params) 58 | level = fprint_params.get("level", -1) 59 | fprints_list = fprints_from_fprints_dict(fprints_dict, level=level) 60 | return fprints_list 61 | 62 | 63 | def fprints_from_smiles( 64 | smiles, name, confgen_params={}, fprint_params={}, save=False 65 | ): 66 | """Generate conformers and fingerprints from a SMILES string.""" 67 | if save is False and "first" not in confgen_params: 68 | confgen_params["first"] = fprint_params.get("first", -1) 69 | mol = confs_from_smiles( 70 | smiles, name, confgen_params=confgen_params, save=save 71 | ) 72 | fprints_list = fprints_from_mol( 73 | mol, fprint_params=fprint_params, save=save 74 | ) 75 | return fprints_list 76 | 77 | 78 | def fprints_from_sdf(sdf_file, fprint_params={}, save=False): 79 | """Generate fingerprints from conformers in an SDF file.""" 80 | mol = mol_from_sdf(sdf_file) 81 | fprints_list = fprints_from_mol( 82 | mol, fprint_params=fprint_params, save=save 83 | ) 84 | return fprints_list 85 | -------------------------------------------------------------------------------- /tests/test_conformer.py: -------------------------------------------------------------------------------- 1 | """Tests for conformer generation. 
2 | 3 | Author: Seth Axen 4 | E-mail: seth.axen@gmail.com 5 | """ 6 | 7 | import os 8 | 9 | DATA_DIR = os.path.join(os.path.dirname(__file__), "data") 10 | SDF_FILE_COMPRESSED = os.path.join(DATA_DIR, "ritalin_nonplanar.sdf.bz2") 11 | SDF_FILE_UNCOMPRESSED = os.path.join(DATA_DIR, "ritalin_nonplanar.sdf") 12 | 13 | class TestConformer: 14 | def test_standardisation(self): 15 | import rdkit.Chem 16 | from e3fp.conformer.util import ( 17 | mol_from_smiles, 18 | mol_to_standardised_mol, 19 | ) 20 | 21 | smiles = "C[N-]c1cccc[n+]1C" 22 | mol = mol_from_smiles(smiles, "tmp") 23 | assert rdkit.Chem.MolToSmiles(mol) == smiles 24 | 25 | mol = mol_to_standardised_mol(mol) 26 | assert rdkit.Chem.MolToSmiles(mol) == "CN=c1ccccn1C" 27 | 28 | def test_default_is_unseeded(self): 29 | import rdkit.Chem 30 | from rdkit.Chem import AllChem 31 | from e3fp.conformer.util import ( 32 | mol_from_smiles, 33 | mol_to_standardised_mol, 34 | ) 35 | from e3fp.conformer.generate import generate_conformers 36 | 37 | ntrials = 10 38 | confgen_params = {"num_conf": 1} 39 | smiles = "C" * 20 # long flexible molecule 40 | mol = mol_from_smiles(smiles, "tmp") 41 | mols = [ 42 | generate_conformers(mol, **confgen_params)[0] 43 | for i in range(ntrials) 44 | ] 45 | 46 | fail = True 47 | for i in range(ntrials): 48 | for j in range(i + 1, ntrials): 49 | rms = AllChem.GetBestRMS(mols[i], mols[j]) 50 | if rms > 1e-2: 51 | fail = False 52 | break 53 | assert not fail 54 | 55 | def test_seed_produces_same_conformers(self): 56 | import rdkit.Chem 57 | from rdkit.Chem import AllChem 58 | from e3fp.conformer.util import ( 59 | mol_from_smiles, 60 | mol_to_standardised_mol, 61 | ) 62 | from e3fp.conformer.generate import generate_conformers 63 | 64 | ntrials = 10 65 | confgen_params = {"num_conf": 1, "seed": 42} 66 | smiles = "C" * 20 # long flexible molecule 67 | mol = mol_from_smiles(smiles, "tmp") 68 | mols = [ 69 | generate_conformers(mol, **confgen_params)[0] 70 | for i in range(ntrials) 71 | ] 72 | 73 | fail = False 74 | for i in range(ntrials): 75 | for j in range(i + 1, ntrials): 76 | rms = AllChem.GetBestRMS(mols[i], mols[j]) 77 | if rms > 1e-2: 78 | fail = True 79 | break 80 | assert not fail 81 | 82 | def test_compressed_sdf_reads_same_as_uncompressed(self): 83 | from rdkit import Chem 84 | from e3fp.conformer.util import mol_from_sdf 85 | 86 | sdf_files = [SDF_FILE_COMPRESSED, SDF_FILE_UNCOMPRESSED] 87 | smiles = [Chem.MolToSmiles(mol_from_sdf(f)) for f in sdf_files] 88 | assert smiles[0] == smiles[1] 89 | 90 | def test_conformer_generation_without_name(self): 91 | from e3fp.conformer.util import mol_from_smiles 92 | from e3fp.conformer.generate import generate_conformers 93 | 94 | confgen_params = {"num_conf": 1, "seed": 42} 95 | smiles = "C" * 20 # long flexible molecule 96 | mol = mol_from_smiles(smiles, "tmp") 97 | mol.ClearProp("_Name") 98 | assert not mol.HasProp("_Name") 99 | generate_conformers(mol, **confgen_params) 100 | -------------------------------------------------------------------------------- /doc/source/usage/pipeline.rst: -------------------------------------------------------------------------------- 1 | Pipeline Methods 2 | ================ 3 | 4 | E3FP can be easily plugged into an existing pipeline using the methods in the 5 | `e3fp.pipeline` module. Each of these methods wraps functionality in other 6 | modules for generating various outputs from inputs and specified options. 7 | 8 | .. 
note:: 9 | 10 | As fingerprinting many molecules is embarrassingly parallel, we highly 11 | recommend employing a parallelization strategy. We use our own 12 | python_utilities_ package. 13 | 14 | First we must choose configuration options. See :ref:`usage/config:Configuration` for 15 | detailed instructions. Here we will use defaults for all but a few options. 16 | 17 | .. testsetup:: * 18 | 19 | smiles_file = "source/examples/data/test_smiles.smi" 20 | 21 | .. doctest:: 22 | 23 | >>> fprint_params = {'bits': 4096, 'radius_multiplier': 1.5, 'rdkit_invariants': True} 24 | >>> confgen_params = {'max_energy_diff': 20.0, 'first': 3} 25 | >>> smiles = "COC(=O)C(C1CCCCN1)C2=CC=CC=C2" 26 | 27 | Generating Conformers from SMILES 28 | --------------------------------- 29 | 30 | The following code snippet generates a multi-conformer molecule: 31 | 32 | >>> from e3fp.pipeline import confs_from_smiles 33 | >>> mol = confs_from_smiles(smiles, "ritalin", confgen_params=confgen_params) 34 | >>> mol.GetNumConformers() 35 | 3 36 | 37 | This produces the following conformers: 38 | 39 | .. image:: ../_static/ritalin3d.png 40 | :width: 300px 41 | :height: 300px 42 | :alt: ritalin conformers 43 | 44 | Generating Fingerprints from Conformers 45 | --------------------------------------- 46 | 47 | >>> from e3fp.pipeline import fprints_from_mol 48 | >>> fprints = fprints_from_mol(mol, fprint_params=fprint_params) 49 | >>> len(fprints) 50 | 3 51 | >>> fprints[0] 52 | Fingerprint(indices=array([188, 224, ..., 3775, 4053]), level=5, bits=4096, name=ritalin_0) 53 | >>> fprints[1] 54 | Fingerprint(indices=array([125, 188, ..., 3693, 4053]), level=5, bits=4096, name=ritalin_1) 55 | >>> fprints[2] 56 | Fingerprint(indices=array([188, 206, ..., 3743, 4053]), level=5, bits=4096, name=ritalin_2) 57 | 58 | Generating Fingerprints from SMILES 59 | ----------------------------------- 60 | 61 | >>> from e3fp.pipeline import fprints_from_smiles 62 | >>> fprints = fprints_from_smiles(smiles, "ritalin", confgen_params=confgen_params, fprint_params=fprint_params) 63 | >>> fprints[0] 64 | Fingerprint(indices=array([188, 224, ..., 3775, 4053]), level=5, bits=4096, name=ritalin_0) 65 | 66 | Parallel Fingerprinting 67 | ----------------------- 68 | 69 | The following script demonstrates use of python_utilities_ for fingerprinting 70 | all SDF files in a directory in parallel. This essentially is the same as the 71 | :ref:`usage/cli:Command Line Interface`, albeit with a less convenient interface. 72 | 73 | >>> from glob import glob 74 | >>> from python_utilities.parallel import Parallelizer 75 | >>> from e3fp.conformer.util import smiles_to_dict 76 | >>> smiles_dict = smiles_to_dict(smiles_file) 77 | >>> print(smiles_dict) 78 | {'CHEMBL1643866': 'CCCC[C@H](CN(O)C=O)C(=O)[C@@H](NC(=O)C(C)C)C(C)C', ...} 79 | >>> len(smiles_dict) 80 | 10 81 | >>> smiles_iter = ((smiles, name) for name, smiles in smiles_dict.items()) 82 | >>> kwargs = {"confgen_params": confgen_params, "fprint_params": fprint_params} 83 | >>> parallelizer = Parallelizer(parallel_mode="processes") 84 | >>> fprints_list = parallelizer.run(fprints_from_smiles, smiles_iter, kwargs=kwargs) # doctest: +SKIP 85 | >>> len(fprints_list) # doctest: +SKIP 86 | 10 87 | 88 | For all pipeline methods, please see the `e3fp.pipeline` module API. 89 | 90 | .. 
include:: ../substitutions.rst 91 | -------------------------------------------------------------------------------- /tests/data/ritalin_nonplanar.sdf: -------------------------------------------------------------------------------- 1 | ZINC00896711 2 | -OEChem-11081520323D 3 | 4 | 37 38 0 1 0 0 0 0 0999 V2000 5 | -0.0173 1.4248 0.0099 C 0 0 0 0 0 0 0 0 0 0 0 0 6 | 0.0021 -0.0041 0.0020 O 0 0 0 0 0 0 0 0 0 0 0 0 7 | -1.1855 -0.6297 0.0100 C 0 0 0 0 0 0 0 0 0 0 0 0 8 | -2.2076 0.0145 0.0232 O 0 0 0 0 0 0 0 0 0 0 0 0 9 | -1.2439 -2.1355 0.0025 C 0 0 2 0 0 0 0 0 0 0 0 0 10 | -0.7531 -2.5137 -0.8943 H 0 0 0 0 0 0 0 0 0 0 0 0 11 | -2.6831 -2.5824 0.0138 C 0 0 0 0 0 0 0 0 0 0 0 0 12 | -3.5122 -2.2166 1.0577 C 0 0 0 0 0 0 0 0 0 0 0 0 13 | -4.8323 -2.6265 1.0681 C 0 0 0 0 0 0 0 0 0 0 0 0 14 | -5.3235 -3.4019 0.0344 C 0 0 0 0 0 0 0 0 0 0 0 0 15 | -4.4946 -3.7670 -1.0099 C 0 0 0 0 0 0 0 0 0 0 0 0 16 | -3.1756 -3.3535 -1.0225 C 0 0 0 0 0 0 0 0 0 0 0 0 17 | -0.5311 -2.6798 1.2421 C 0 0 1 0 0 0 0 0 0 0 0 0 18 | -1.0223 -2.3014 2.1385 H 0 0 0 0 0 0 0 0 0 0 0 0 19 | -0.5921 -4.2087 1.2346 C 0 0 0 0 0 0 0 0 0 0 0 0 20 | 0.1222 -4.7487 2.4770 C 0 0 0 0 0 0 0 0 0 0 0 0 21 | 1.5613 -4.2254 2.4942 C 0 0 0 0 0 0 0 0 0 0 0 0 22 | 1.5425 -2.6958 2.4549 C 0 0 0 0 0 0 0 0 0 0 0 0 23 | 0.8702 -2.2430 1.2312 N 0 3 0 0 0 0 0 0 0 0 0 0 24 | 1.0053 1.8021 0.0021 H 0 0 0 0 0 0 0 0 0 0 0 0 25 | -0.5445 1.7859 -0.8732 H 0 0 0 0 0 0 0 0 0 0 0 0 26 | -0.5275 1.7763 0.9067 H 0 0 0 0 0 0 0 0 0 0 0 0 27 | -3.1285 -1.6108 1.8652 H 0 0 0 0 0 0 0 0 0 0 0 0 28 | -5.4799 -2.3413 1.8840 H 0 0 0 0 0 0 0 0 0 0 0 0 29 | -6.3547 -3.7228 0.0429 H 0 0 0 0 0 0 0 0 0 0 0 0 30 | -4.8782 -4.3731 -1.8173 H 0 0 0 0 0 0 0 0 0 0 0 0 31 | -2.5290 -3.6356 -1.8402 H 0 0 0 0 0 0 0 0 0 0 0 0 32 | -1.6332 -4.5315 1.2442 H 0 0 0 0 0 0 0 0 0 0 0 0 33 | -0.1010 -4.5882 0.3386 H 0 0 0 0 0 0 0 0 0 0 0 0 34 | -0.3992 -4.4111 3.3727 H 0 0 0 0 0 0 0 0 0 0 0 0 35 | 0.1309 -5.8383 2.4477 H 0 0 0 0 0 0 0 0 0 0 0 0 36 | 2.0594 -4.5600 3.4041 H 0 0 0 0 0 0 0 0 0 0 0 0 37 | 2.0971 -4.6052 1.6243 H 0 0 0 0 0 0 0 0 0 0 0 0 38 | 1.0067 -2.3170 3.3253 H 0 0 0 0 0 0 0 0 0 0 0 0 39 | 2.5655 -2.3199 2.4673 H 0 0 0 0 0 0 0 0 0 0 0 0 40 | 1.3372 -2.6344 0.4270 H 0 0 0 0 0 0 0 0 0 0 0 0 41 | 0.9071 -1.2358 1.1819 H 0 0 0 0 0 0 0 0 0 0 0 0 42 | 1 2 1 0 0 0 0 43 | 1 20 1 0 0 0 0 44 | 1 21 1 0 0 0 0 45 | 1 22 1 0 0 0 0 46 | 2 3 1 0 0 0 0 47 | 3 4 2 0 0 0 0 48 | 3 5 1 0 0 0 0 49 | 5 6 1 0 0 0 0 50 | 5 7 1 0 0 0 0 51 | 5 13 1 0 0 0 0 52 | 7 12 2 0 0 0 0 53 | 7 8 1 0 0 0 0 54 | 8 9 2 0 0 0 0 55 | 8 23 1 0 0 0 0 56 | 9 10 1 0 0 0 0 57 | 9 24 1 0 0 0 0 58 | 10 11 2 0 0 0 0 59 | 10 25 1 0 0 0 0 60 | 11 12 1 0 0 0 0 61 | 11 26 1 0 0 0 0 62 | 12 27 1 0 0 0 0 63 | 13 14 1 0 0 0 0 64 | 13 19 1 0 0 0 0 65 | 13 15 1 0 0 0 0 66 | 15 16 1 0 0 0 0 67 | 15 28 1 0 0 0 0 68 | 15 29 1 0 0 0 0 69 | 16 17 1 0 0 0 0 70 | 16 30 1 0 0 0 0 71 | 16 31 1 0 0 0 0 72 | 17 18 1 0 0 0 0 73 | 17 32 1 0 0 0 0 74 | 17 33 1 0 0 0 0 75 | 18 19 1 0 0 0 0 76 | 18 34 1 0 0 0 0 77 | 18 35 1 0 0 0 0 78 | 19 36 1 0 0 0 0 79 | 19 37 1 0 0 0 0 80 | M CHG 1 19 1 81 | M END 82 | $$$$ 83 | -------------------------------------------------------------------------------- /doc/source/usage/cli.rst: -------------------------------------------------------------------------------- 1 | Command Line Interface 2 | ====================== 3 | 4 | Command line interfaces (CLI) are provided for the two most common tasks: 5 | conformer generation and fingerprinting. 
6 | When e3fp is installed, the CLI commands are available as ``e3fp-conformer`` and 7 | ``e3fp-fingerprint``. 8 | 9 | Conformer Generation CLI 10 | ------------------------ 11 | 12 | To see all available options, run 13 | 14 | .. command-output:: e3fp-conformer --help 15 | :shell: 16 | 17 | We will generate conformers for the molecule whose SMILES string is defined in 18 | ``caffeine.smi``. 19 | 20 | .. literalinclude:: ../examples/data/caffeine.smi 21 | :caption: caffeine.smi 22 | 23 | The below example generates at most 3 conformers for this molecule. 24 | 25 | .. code-block:: shell-session 26 | 27 | $ e3fp-conformer -s caffeine.smi --num_conf 3 -o ./ 28 | 2017-07-17 00:11:05,743|WARNING|Only 1 processes available. 'mpi' mode not available. 29 | 2017-07-17 00:11:05,748|INFO|num_proc is not specified. 'processes' mode will use all 8 processes 30 | 2017-07-17 00:11:05,748|INFO|Parallelizer initialized with mode 'processes' and 8 processors. 31 | 2017-07-17 00:11:05,748|INFO|Input type: Detected SMILES file(s) 32 | 2017-07-17 00:11:05,748|INFO|Input file number: 1 33 | 2017-07-17 00:11:05,748|INFO|Parallel Type: processes 34 | 2017-07-17 00:11:05,748|INFO|Out Directory: ./ 35 | 2017-07-17 00:11:05,749|INFO|Overwrite Existing Files: False 36 | 2017-07-17 00:11:05,749|INFO|Target Conformer Number: 3 37 | 2017-07-17 00:11:05,749|INFO|First Conformers Number: all 38 | 2017-07-17 00:11:05,749|INFO|Pool Multiplier: 1 39 | 2017-07-17 00:11:05,749|INFO|RMSD Cutoff: 0.5 40 | 2017-07-17 00:11:05,749|INFO|Maximum Energy Difference: None 41 | 2017-07-17 00:11:05,749|INFO|Forcefield: UFF 42 | 2017-07-17 00:11:05,749|INFO|Starting. 43 | 2017-07-17 00:11:05,779|INFO|Generating conformers for caffeine. 44 | 2017-07-17 00:11:05,823|INFO|Generated 1 conformers for caffeine. 45 | 2017-07-17 00:11:05,829|INFO|Saved conformers for caffeine to ./caffeine.sdf.bz2. 46 | 47 | The result is a multi-conformer SDF file called ``caffeine.sdf.bz2`` in the 48 | current directory. 49 | 50 | Fingerprinting CLI 51 | ------------------ 52 | 53 | To see all available options, run 54 | 55 | .. command-output:: e3fp-fingerprint --help 56 | :shell: 57 | 58 | To continue the above example, we will fingerprint our caffeine conformers. 59 | 60 | .. code-block:: shell-session 61 | 62 | $ e3fp-fingerprint caffeine.sdf.bz2 --bits 1024 63 | 2017-07-17 00:12:33,797|WARNING|Only 1 processes available. 'mpi' mode not available. 64 | 2017-07-17 00:12:33,801|INFO|num_proc is not specified. 'processes' mode will use all 8 processes 65 | 2017-07-17 00:12:33,801|INFO|Parallelizer initialized with mode 'processes' and 8 processors. 66 | 2017-07-17 00:12:33,801|INFO|Initializing E3FP generation. 67 | 2017-07-17 00:12:33,801|INFO|Getting SDF files 68 | 2017-07-17 00:12:33,801|INFO|SDF File Number: 1 69 | 2017-07-17 00:12:33,802|INFO|Database File: fingerprints.fpz 70 | 2017-07-17 00:12:33,802|INFO|Max First Conformers: 3 71 | 2017-07-17 00:12:33,802|INFO|Bits: 1024 72 | 2017-07-17 00:12:33,802|INFO|Level/Max Iterations: 5 73 | 2017-07-17 00:12:33,802|INFO|Shell Radius Multiplier: 1.718 74 | 2017-07-17 00:12:33,802|INFO|Stereo Mode: True 75 | 2017-07-17 00:12:33,802|INFO|Connected-only mode: on 76 | 2017-07-17 00:12:33,802|INFO|Invariant type: Daylight 77 | 2017-07-17 00:12:33,802|INFO|Parallel Mode: processes 78 | 2017-07-17 00:12:33,802|INFO|Starting 79 | 2017-07-17 00:12:33,829|INFO|Generating fingerprints for caffeine. 80 | 2017-07-17 00:12:33,935|INFO|Generated 1 fingerprints for caffeine. 
81 | 2017-07-17 00:12:34,011|INFO|Saved FingerprintDatabase with fingerprints to fingerprints.fpz 82 | 83 | The result is a file ``fingerprints.fpz`` containing a 84 | :py:class:`.FingerprintDatabase`. To use such a database, consult 85 | :ref:`usage/fingerprints/storage:Fingerprint Storage`. 86 | -------------------------------------------------------------------------------- /src/e3fp/util.py: -------------------------------------------------------------------------------- 1 | """Utility classes/methods. 2 | 3 | Author: Seth Axen 4 | E-mail: seth.axen@gmail.com 5 | """ 6 | import inspect 7 | import warnings 8 | 9 | 10 | class E3FPError(Exception): 11 | """Base class for E3FP-specific errors. 12 | 13 | This class is provided for future E3FP-specific functionality. 14 | """ 15 | 16 | 17 | class E3FPWarning(Warning): 18 | """Base E3FP warning class. 19 | 20 | Unlike normal warnings, these are by default always set to on. 21 | """ 22 | 23 | 24 | # Always show custom warnings for this package 25 | warnings.filterwarnings("always", category=E3FPWarning) 26 | 27 | 28 | class E3FPDeprecationWarning(E3FPWarning, DeprecationWarning): 29 | """A warning class for a deprecated method or class.""" 30 | 31 | 32 | class E3FPEfficiencyWarning(E3FPWarning, RuntimeWarning): 33 | """A warning class for a potentially inefficient process.""" 34 | 35 | 36 | def maybe_jit(*args, **kwargs): 37 | """Decorator to jit a function using Numba if available. 38 | 39 | Usage is identical to `numba.jit`. 40 | """ 41 | def wrapper(func): 42 | try: 43 | import numba 44 | has_numba = True 45 | except ImportError: 46 | has_numba = False 47 | 48 | if has_numba: 49 | return numba.jit(*args, **kwargs)(func) 50 | else: 51 | return func 52 | return wrapper 53 | 54 | 55 | class deprecated(object): 56 | """Decorator to mark a function as deprecated. 57 | 58 | Issue a deprecation warning when a function is called, and update the 59 | documentation. A deprecation version must be provided. 60 | 61 | Examples 62 | -------- 63 | >>> from e3fp.util import deprecated 64 | >>> @deprecated("1.1", remove_version="1.3", 65 | ... msg="Function no longer needed") 66 | ... def my_function(): 67 | ... pass 68 | 69 | Notes 70 | ----- 71 | Adapted from https://wiki.python.org/moin/PythonDecoratorLibrary 72 | """ 73 | 74 | def __init__(self, deprecated_version, remove_version=None, msg=None): 75 | """Constructor. 76 | 77 | Parameters 78 | ---------- 79 | deprecated_version : str 80 | Version in which object was deprecated (e.g. '1.1') 81 | remove_version : str, optional 82 | Version in which object will be removed (e.g. '1.2'). If not 83 | specified, it is assumed the object will be removed in the next 84 | release (e.g. '1.2' if `deprecated_version` is '1.1') 85 | msg : str, optional 86 | Message to include with deprecation warning, to explain deprecation 87 | or point to newer version. 
88 | """ 89 | self.deprecated_version = deprecated_version 90 | if remove_version is None: 91 | version_info = deprecated_version.split(".") 92 | version_info[1] = str(int(version_info[1]) + 1) 93 | for i in range(2, len(version_info)): 94 | version_info[i] = "0" 95 | remove_version = ".".join(version_info) 96 | self.remove_version = remove_version 97 | if msg is None: 98 | self.extra = "" 99 | else: 100 | self.extra = " {0}".format(msg) 101 | 102 | def __call__(self, obj): 103 | if inspect.isfunction(obj): 104 | return self.deprecate_function(obj) 105 | else: 106 | raise ValueError("Deprecated object is not a function.") 107 | 108 | def deprecate_function(self, f): 109 | """Return the decorated function.""" 110 | msg = ( 111 | "Function `{0}` was deprecated in {1} and will be removed " 112 | "in {2}.{3}" 113 | ).format( 114 | f.__name__, 115 | self.deprecated_version, 116 | self.remove_version, 117 | self.extra, 118 | ) 119 | 120 | def new_func(*args, **kwargs): 121 | warnings.warn(msg, category=E3FPDeprecationWarning, stacklevel=2) 122 | return f(*args, **kwargs) 123 | 124 | new_func.__name__ = f.__name__ 125 | new_func.__dict__ = f.__dict__ 126 | new_func.__doc__ = f.__doc__ 127 | self.update_docstring(new_func) 128 | return new_func 129 | 130 | def update_docstring(self, obj): 131 | """Add deprecation note to docstring.""" 132 | # print(obj.__doc__) 133 | msg = ( 134 | f"\t.. deprecated:: {self.deprecated_version}\n" 135 | f"\t {self.extra}" 136 | ) 137 | obj.__doc__ = f"{obj.__doc__}\n\n{msg}" 138 | return obj 139 | -------------------------------------------------------------------------------- /doc/source/usage/fingerprints/storage.rst: -------------------------------------------------------------------------------- 1 | Fingerprint Storage 2 | =================== 3 | 4 | The most efficient way to store and interact with fingerprints is through the 5 | :py:class:`.FingerprintDatabase` class. This class wraps a matrix with 6 | sparse rows (:py:class:`scipy.sparse.csr_matrix`), where each row is a 7 | fingerprint. This enables rapid I/O of the database while also minimizing the 8 | memory footprint. Accessing the underlying sparse representation with the 9 | :py:attr:`.FingerprintDatabase.array` attribute is convenient for machine learning 10 | purposes, while the database class itself provides several useful functions. 11 | 12 | .. note:: 13 | 14 | We strongly recommend upgrading to at least SciPy v1.0.0 when working with 15 | large fingerprint databases, as old versions are much slower and have 16 | several bugs for database loading. 17 | 18 | 19 | Database I/O and Indexing 20 | ------------------------- 21 | 22 | See the full :py:class:`.FingerprintDatabase` documentation for a 23 | description of basic database usage, attributes, and methods. Below, several 24 | additional use cases are documented. 25 | 26 | Batch Database Operations 27 | ------------------------- 28 | 29 | Due to the sparse representation of the underlying data structure, an un- 30 | folded database, a database with unfolded fingerprints does not use 31 | significantly more disk space than a database with folded fingerprints. However, 32 | it is usually necessary to fold fingerprints for machine learning tasks. The 33 | :py:class:`.FingerprintDatabase` does this very quickly. 34 | 35 | .. testsetup:: 36 | 37 | import numpy as np 38 | np.random.seed(3) 39 | 40 | .. 
doctest:: 41 | 42 | >>> from e3fp.fingerprint.db import FingerprintDatabase 43 | >>> from e3fp.fingerprint.fprint import Fingerprint 44 | >>> import numpy as np 45 | >>> db = FingerprintDatabase(fp_type=Fingerprint, name="TestDB") 46 | >>> print(db) 47 | FingerprintDatabase[name: TestDB, fp_type: Fingerprint, level: -1, bits: None, fp_num: 0] 48 | >>> on_inds = [np.random.uniform(0, 2**32, size=30) for i in range(5)] 49 | >>> fps = [Fingerprint(x, bits=2**32) for x in on_inds] 50 | >>> db.add_fingerprints(fps) 51 | >>> print(db) 52 | FingerprintDatabase[name: TestDB, fp_type: Fingerprint, level: -1, bits: 4294967296, fp_num: 5] 53 | >>> db.get_density() 54 | 6.984919309616089e-09 55 | >>> fold_db = db.fold(1024) 56 | >>> print(fold_db) 57 | FingerprintDatabase[name: TestDB, fp_type: Fingerprint, level: -1, bits: 1024, fp_num: 5] 58 | >>> fold_db.get_density() 59 | 0.0287109375 60 | 61 | A database can be converted to a different fingerprint type: 62 | 63 | >>> from e3fp.fingerprint.fprint import CountFingerprint 64 | >>> count_db = db.as_type(CountFingerprint) 65 | >>> print(count_db) 66 | FingerprintDatabase[name: TestDB, fp_type: CountFingerprint, level: -1, bits: 4294967296, fp_num: 5] 67 | >>> count_db[0] 68 | CountFingerprint(counts={2977004690: 1, ..., 3041471738: 1}, level=-1, bits=4294967296, name=None) 69 | 70 | The :py:func:`e3fp.fingerprint.db.concat` method allows efficient joining of multiple 71 | databases. 72 | 73 | >>> from e3fp.fingerprint.db import concat 74 | >>> dbs = [] 75 | >>> for i in range(10): 76 | ... db = FingerprintDatabase(fp_type=Fingerprint) 77 | ... on_inds = [np.random.uniform(0, 1024, size=30) for j in range(5)] 78 | ... fps = [Fingerprint(x, bits=2**32, name="Mol{}".format(i)) for x in on_inds] 79 | ... db.add_fingerprints(fps) 80 | ... dbs.append(db) 81 | >>> dbs[0][0] 82 | Fingerprint(indices=array([94, 97, ..., 988, 994]), level=-1, bits=4294967296, name=Mol0) 83 | >>> print(dbs[0]) 84 | FingerprintDatabase[name: None, fp_type: Fingerprint, level: -1, bits: 4294967296, fp_num: 5] 85 | >>> merge_db = concat(dbs) 86 | >>> print(merge_db) 87 | FingerprintDatabase[name: None, fp_type: Fingerprint, level: -1, bits: 4294967296, fp_num: 50] 88 | 89 | Database Comparison 90 | ------------------- 91 | 92 | Two databases may be compared using various metrics in 93 | :py:mod:`e3fp.fingerprint.metrics`. Additionally, all fingerprints in a database 94 | may be compared to each other simply by only providing a single database. 95 | See :ref:`usage/fingerprints/comparison:Fingerprint Comparison` for more details. 96 | 97 | Performing Machine Learning on the Database 98 | ------------------------------------------- 99 | 100 | The underlying sparse matrix may be passed directly to machine learning tools 101 | in any package that is compatible with SciPy sparse matrices, such as 102 | `scikit-learn `_. 103 | 104 | >>> from sklearn.naive_bayes import BernoulliNB 105 | >>> clf = BernoulliNB() 106 | >>> clf.fit(db.array, ypred) # doctest: +SKIP 107 | BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True) 108 | >>> clf.predict(db2.array) # doctest: +SKIP 109 | ... 110 | -------------------------------------------------------------------------------- /src/e3fp/fingerprint/metrics/fprint_metrics.py: -------------------------------------------------------------------------------- 1 | """Fingerprint comparison metrics. 
2 | 3 | Author: Seth Axen 4 | E-mail: seth.axen@gmail.com 5 | """ 6 | from __future__ import division 7 | 8 | import numpy as np 9 | from ..fprint import CountFingerprint, diff_counts_dict 10 | 11 | 12 | def tanimoto(fp1, fp2): 13 | """Calculate Tanimoto coefficient between fingerprints. 14 | 15 | Parameters 16 | ---------- 17 | fp1 : Fingerprint 18 | Fingerprint 1 19 | fp2 : Fingerprint 20 | Fingerprint 2 21 | 22 | Returns 23 | ------- 24 | float : Tanimoto coefficient. 25 | """ 26 | try: 27 | intersect = np.intersect1d( 28 | fp1.indices, fp2.indices, assume_unique=True 29 | ).shape[0] 30 | return intersect / (fp1.bit_count + fp2.bit_count - intersect) 31 | except ZeroDivisionError: 32 | return 0.0 33 | 34 | 35 | def soergel(fp1, fp2): 36 | """Calculate Soergel similarity between fingerprints. 37 | 38 | Soergel similarity is the complement of Soergel distance and can be 39 | thought of as the analog of the Tanimoto coefficient for count/float-based 40 | fingerprints. For `Fingerprint`, it is equivalent to the Tanimoto 41 | coefficient. 42 | 43 | Parameters 44 | ---------- 45 | fp1 : Fingerprint 46 | Fingerprint 1 47 | fp2 : Fingerprint 48 | Fingerprint 2 49 | 50 | Returns 51 | ------- 52 | float : Soergel similarity. 53 | 54 | Reference 55 | ------- 56 | 57 | """ 58 | if not ( 59 | isinstance(fp1, CountFingerprint) and isinstance(fp2, CountFingerprint) 60 | ): 61 | return tanimoto(fp1, fp2) 62 | 63 | counts_diff = diff_counts_dict(fp1, fp2) 64 | temp = np.asarray( 65 | [ 66 | (abs(counts_diff[x]), max(fp1.get_count(x), fp2.get_count(x))) 67 | for x in counts_diff.keys() 68 | ], 69 | dtype=float, 70 | ).T 71 | soergel = 1 - np.sum(temp[0, :]) / np.sum(temp[1, :]) 72 | 73 | return soergel 74 | 75 | 76 | def dice(fp1, fp2): 77 | """Calculate Dice coefficient between fingerprints. 78 | 79 | Parameters 80 | ---------- 81 | fp1 : Fingerprint 82 | Fingerprint 1 83 | fp2 : Fingerprint 84 | Fingerprint 2 85 | 86 | Returns 87 | ------- 88 | float : Dice coefficient. 89 | """ 90 | try: 91 | intersect = np.intersect1d( 92 | fp1.indices, fp2.indices, assume_unique=True 93 | ).shape[0] 94 | return 2 * intersect / (fp1.bit_count + fp2.bit_count) 95 | except ZeroDivisionError: 96 | return 0.0 97 | 98 | 99 | def cosine(fp1, fp2): 100 | """Calculate cosine similarity between fingerprints. 101 | 102 | Parameters 103 | ---------- 104 | fp1 : Fingerprint 105 | Fingerprint 1 106 | fp2 : Fingerprint 107 | Fingerprint 2 108 | 109 | Returns 110 | ------- 111 | float : Cosine similarity. 112 | """ 113 | try: 114 | dot = sum(v * fp2.get_count(k) for k, v in fp1.counts.items()) 115 | root_norm = ( 116 | sum(v ** 2 for v in fp1.counts.values()) 117 | * sum(v ** 2 for v in fp2.counts.values()) 118 | ) ** 0.5 119 | return dot / root_norm 120 | except ZeroDivisionError: 121 | return 0.0 122 | 123 | 124 | def pearson(fp1, fp2): 125 | """Calculate Pearson correlation between fingerprints. 126 | 127 | Parameters 128 | ---------- 129 | fp1 : Fingerprint 130 | Fingerprint 1 131 | fp2 : Fingerprint 132 | Fingerprint 2 133 | 134 | Returns 135 | ------- 136 | float : Pearson correlation. 
137 | """ 138 | try: 139 | dot = sum(v * fp2.get_count(k) for k, v in fp1.counts.items()) 140 | return (dot / fp1.bits - fp1.mean() * fp2.mean()) / ( 141 | fp1.std() * fp2.std() 142 | ) 143 | except ZeroDivisionError: 144 | return 0.0 145 | 146 | # intersect = np.intersect1d(fp1.indices, fp2.indices, 147 | # assume_unique=True).shape[0] 148 | # return ((intersect / fp1.bits) - 149 | # ((fp1.mean() * fp2.mean()) / (fp1.std() * fp2.std()))) 150 | 151 | 152 | def hamming(fp1, fp2): 153 | """Calculate Hamming distance between fingerprints. 154 | 155 | Parameters 156 | ---------- 157 | fp1 : Fingerprint 158 | Fingerprint 1 159 | fp2 : Fingerprint 160 | Fingerprint 2 161 | 162 | Returns 163 | ------- 164 | float : Hamming distance. 165 | """ 166 | intersect = np.intersect1d( 167 | fp1.indices, fp2.indices, assume_unique=True 168 | ).shape[0] 169 | return fp1.bit_count + fp2.bit_count - 2 * intersect 170 | 171 | 172 | def distance(fp1, fp2): 173 | """Calculate Euclidean distance between fingerprints. 174 | 175 | Parameters 176 | ---------- 177 | fp1 : Fingerprint 178 | Fingerprint 1 179 | fp2 : Fingerprint 180 | Fingerprint 2 181 | 182 | Returns 183 | ------- 184 | float : Euclidian distance. 185 | """ 186 | return hamming(fp1, fp2) ** 0.5 187 | -------------------------------------------------------------------------------- /tests/test_fingerprint.py: -------------------------------------------------------------------------------- 1 | """Tests for E3FP fingerprints. 2 | 3 | Author: Seth Axen 4 | E-mail: seth.axen@gmail.com 5 | """ 6 | import pytest 7 | 8 | class TestFingerprintIO: 9 | def test_fprint_from_indices(self): 10 | from e3fp.fingerprint.fprint import ( 11 | Fingerprint, 12 | CountFingerprint, 13 | FloatFingerprint, 14 | ) 15 | 16 | for fp_type in (Fingerprint, CountFingerprint, FloatFingerprint): 17 | in_indices = [3, 1, 4, 5] 18 | bits = 32 19 | fprint = fp_type.from_indices(in_indices, bits=bits) 20 | assert sorted(in_indices) == sorted(fprint.indices) 21 | 22 | def test_fprint_from_fprint(self): 23 | from e3fp.fingerprint.fprint import ( 24 | Fingerprint, 25 | CountFingerprint, 26 | FloatFingerprint, 27 | ) 28 | 29 | for fp_type in (Fingerprint, CountFingerprint, FloatFingerprint): 30 | in_indices = [3, 1, 4, 5, 1, 5, 9] 31 | bits = 32 32 | fprint1 = fp_type.from_indices(in_indices, bits=bits) 33 | fprint2 = fp_type.from_fingerprint(fprint1) 34 | assert fprint1 == fprint2 35 | 36 | def test_countfprint_from_counts(self): 37 | from e3fp.fingerprint.fprint import CountFingerprint 38 | 39 | in_counts = {3: 1, 1: 4, 5: 1} 40 | bits = 32 41 | fprint = CountFingerprint.from_counts(in_counts, bits=bits) 42 | out_counts = fprint.counts 43 | assert in_counts == out_counts 44 | 45 | def test_floatfprint_from_counts(self): 46 | from e3fp.fingerprint.fprint import FloatFingerprint 47 | 48 | in_counts = {3: 1.0, 1: 4.0, 5: 1.0} 49 | bits = 32 50 | fprint = FloatFingerprint.from_counts(in_counts, bits=bits) 51 | out_counts = fprint.counts 52 | assert in_counts == out_counts 53 | 54 | def test_unique_indices(self): 55 | from e3fp.fingerprint.fprint import ( 56 | Fingerprint, 57 | CountFingerprint, 58 | FloatFingerprint, 59 | ) 60 | 61 | for fp_type in (Fingerprint, CountFingerprint, FloatFingerprint): 62 | in_indices = [3, 1, 4, 5, 1, 5, 9] 63 | bits = 32 64 | fprint = fp_type.from_indices(in_indices, bits=bits) 65 | assert sorted(set(in_indices)) == sorted(fprint.indices) 66 | 67 | def test_bitstring_io(self): 68 | from e3fp.fingerprint.fprint import ( 69 | Fingerprint, 70 | CountFingerprint, 71 
| FloatFingerprint, 72 | ) 73 | 74 | for fp_type in (Fingerprint, CountFingerprint, FloatFingerprint): 75 | in_bitstring = "1001001111011000" 76 | fprint = fp_type.from_bitstring(in_bitstring) 77 | out_bitstring = fprint.to_bitstring() 78 | assert in_bitstring == out_bitstring 79 | 80 | def test_vector_io(self): 81 | from e3fp.fingerprint.fprint import ( 82 | Fingerprint, 83 | CountFingerprint, 84 | FloatFingerprint, 85 | ) 86 | import numpy as np 87 | 88 | for fp_type in (Fingerprint, CountFingerprint, FloatFingerprint): 89 | in_vector = np.array([0, 0, 1, 0, 1, 0, 1, 0, 0], dtype=np.bool_) 90 | fprint = fp_type.from_vector(in_vector) 91 | out_vector = fprint.to_vector(sparse=False) 92 | np.testing.assert_array_equal(in_vector, out_vector) 93 | 94 | def test_rdkit_io(self): 95 | from e3fp.fingerprint.fprint import ( 96 | Fingerprint, 97 | CountFingerprint, 98 | FloatFingerprint, 99 | ) 100 | 101 | for fp_type in (Fingerprint, CountFingerprint, FloatFingerprint): 102 | indices = [3, 1, 4, 5] 103 | bits = 32 104 | fprint1 = fp_type.from_indices(indices, bits=bits) 105 | rdkit_fprint1 = fprint1.to_rdkit() 106 | fprint2 = fp_type.from_rdkit(rdkit_fprint1) 107 | rdkit_fprint2 = fprint2.to_rdkit() 108 | assert rdkit_fprint1 == rdkit_fprint2 109 | 110 | def test_basic_properties(self): 111 | from e3fp.fingerprint.fprint import ( 112 | Fingerprint, 113 | CountFingerprint, 114 | FloatFingerprint, 115 | ) 116 | import numpy as np 117 | 118 | bits = 1024 119 | for i in range(10): 120 | indices = np.random.randint(0, bits, 30) 121 | unique_inds = np.unique(indices) 122 | level = int(np.random.randint(0, 10)) 123 | for fp_type in (Fingerprint, CountFingerprint, FloatFingerprint): 124 | fp = fp_type.from_indices(indices, bits=bits, level=level) 125 | assert fp.bits == bits 126 | assert len(fp) == bits 127 | assert fp.bit_count == unique_inds.size 128 | assert fp.density == pytest.approx(float(unique_inds.size) / bits) 129 | 130 | 131 | class TestFingerprintAlgebra: 132 | pass 133 | 134 | 135 | class TestFingerprintComparison: 136 | pass 137 | -------------------------------------------------------------------------------- /src/e3fp/fingerprint/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | """Efficient comparison metrics for fingerprints and their databases. 2 | 3 | Author: Seth Axen 4 | E-mail: seth.axen@gmail.com 5 | """ 6 | import logging 7 | 8 | from ..fprint import Fingerprint 9 | from ..util import E3FPBitsValueError 10 | from ..db import FingerprintDatabase 11 | from . import array_metrics 12 | from . import fprint_metrics 13 | 14 | 15 | def tanimoto(A, B=None): 16 | """Compute Tanimoto coefficients between fingerprints. 17 | 18 | Fingerprints must have same number of bits. If not bit-fingerprints, 19 | arrays will be cast to binary. For non-binary data, use `soergel`. If only 20 | one fingerprint/database is provided, it is compared to self. 21 | 22 | Parameters 23 | ---------- 24 | A, B : Fingerprint or FingerprintDatabase 25 | Fingerprint(s) to be compared 26 | 27 | Returns 28 | ------- 29 | tanimoto : float or ndarray [shape (num_fps_A, num_fps_B)] 30 | Pairwise tanimoto(s) between fingerprint(s) in `A` and `B`. 
31 | 32 | See Also 33 | -------- 34 | cosine, dice, pearson, soergel 35 | """ 36 | A, B = _check_item_pair(A, B, fp_type=Fingerprint) 37 | if isinstance(A, Fingerprint): 38 | return fprint_metrics.tanimoto(A, B) 39 | return array_metrics.tanimoto(A.array, B.array) 40 | 41 | 42 | def soergel(A, B=None): 43 | """Compute Soergel similarities between fingerprints. 44 | 45 | Soergel similarity is the complement of the Soergel distance and is 46 | analogous to the Tanimoto coefficient for count/float fingerprints. For 47 | binary data, it is equivalent to `tanimoto`. 48 | 49 | Parameters 50 | ---------- 51 | A, B : Fingerprint or FingerprintDatabase 52 | Fingerprint(s) to be compared 53 | 54 | Returns 55 | ------- 56 | soergel : float or ndarray [shape (num_fps_A, num_fps_B)] 57 | 58 | See Also 59 | -------- 60 | cosine, dice, pearson, tanimoto 61 | 62 | """ 63 | A, B = _check_item_pair(A, B) 64 | if isinstance(A, Fingerprint): 65 | return fprint_metrics.soergel(A, B) 66 | return array_metrics.soergel(A.array, B.array) 67 | 68 | 69 | def dice(A, B=None): 70 | """Compute Dice coefficients between fingerprints. 71 | 72 | Fingerprints must have same number of bits. If not bit-fingerprints, 73 | arrays will be cast to binary. If only one fingerprint/database is 74 | provided, it is compared to self. 75 | 76 | Parameters 77 | ---------- 78 | A, B : Fingerprint or FingerprintDatabase 79 | Fingerprint(s) to be compared 80 | 81 | Returns 82 | ------- 83 | dice : float or ndarray [shape (num_fps_A, num_fps_B)] 84 | 85 | See Also 86 | -------- 87 | cosine, pearson, soergel, tanimoto 88 | """ 89 | A, B = _check_item_pair(A, B, fp_type=Fingerprint) 90 | if isinstance(A, Fingerprint): 91 | return fprint_metrics.dice(A, B) 92 | return array_metrics.dice(A.array, B.array) 93 | 94 | 95 | def cosine(A, B=None): 96 | """Compute cosine similarities between fingerprints. 97 | 98 | Fingerprints must have same number of bits. If only one 99 | fingerprint/database is provided, it is compared to self. 100 | 101 | Parameters 102 | ---------- 103 | A, B : Fingerprint or FingerprintDatabase 104 | Fingerprint(s) to be compared 105 | 106 | Returns 107 | ------- 108 | cosine : float or ndarray [shape (num_fps_A, num_fps_B)] 109 | 110 | See Also 111 | -------- 112 | dice, pearson, soergel, tanimoto 113 | """ 114 | A, B = _check_item_pair(A, B) 115 | if isinstance(A, Fingerprint): 116 | return fprint_metrics.cosine(A, B) 117 | return array_metrics.cosine(A.array, B.array) 118 | 119 | 120 | def pearson(A, B=None): 121 | """Compute Pearson correlation between fingerprints. 122 | 123 | Fingerprints must have same number of bits. If only one 124 | fingerprint/database is provided, it is compared to self. 
125 | 126 | Parameters 127 | ---------- 128 | A, B : Fingerprint or FingerprintDatabase 129 | Fingerprint(s) to be compared 130 | 131 | Returns 132 | ------- 133 | pearson : float or ndarray [shape (num_fps_A, num_fps_B)] 134 | 135 | See Also 136 | -------- 137 | cosine, dice, soergel, tanimoto 138 | """ 139 | A, B = _check_item_pair(A, B) 140 | if isinstance(A, Fingerprint): 141 | return fprint_metrics.pearson(A, B) 142 | return array_metrics.pearson(A.array, B.array) 143 | 144 | 145 | def _check_item(item, fp_type=None, force_db=False): 146 | if force_db and isinstance(item, Fingerprint): 147 | if not fp_type: 148 | fp_type = item.__class__ 149 | db = FingerprintDatabase(fp_type=fp_type) 150 | db.add_fingerprints([item]) 151 | item = db 152 | elif fp_type and isinstance(item, FingerprintDatabase): 153 | logging.debug( 154 | "Casting database fingerprints to {}.".format(fp_type.__name__) 155 | ) 156 | item = item.as_type(fp_type, copy=False) 157 | return item 158 | 159 | 160 | def _check_item_pair(A, B, fp_type=None, force_db=False): 161 | try: 162 | if B is not None and A.bits != B.bits: 163 | raise E3FPBitsValueError( 164 | "Fingerprints must have same number of bits." 165 | ) 166 | except AttributeError: 167 | raise TypeError("Items must be Fingerprint or FingerprintDatabase.") 168 | if isinstance(A, FingerprintDatabase) or isinstance( 169 | B, FingerprintDatabase 170 | ): 171 | force_db = True 172 | A = _check_item(A, fp_type=fp_type, force_db=force_db) 173 | if B is None: 174 | B = A 175 | else: 176 | B = _check_item(B, fp_type=fp_type, force_db=force_db) 177 | return A, B 178 | -------------------------------------------------------------------------------- /src/e3fp/config/params.py: -------------------------------------------------------------------------------- 1 | """Get E3FP default parameters and read parameters from files. 2 | 3 | Author: Seth Axen 4 | E-mail: seth.axen@gmail.com 5 | """ 6 | import os 7 | import copy 8 | import ast 9 | 10 | from configparser import ( 11 | ConfigParser, 12 | NoSectionError, 13 | DuplicateSectionError, 14 | ) 15 | 16 | CONFIG_DIR = os.path.dirname(os.path.realpath(__file__)) 17 | DEF_PARAM_FILE = os.path.join(CONFIG_DIR, "defaults.cfg") 18 | 19 | 20 | def read_params(params=None, fill_defaults=False): 21 | """Get combination of provided parameters and default parameters. 22 | 23 | Parameters 24 | ---------- 25 | params : str or ConfigParser, optional 26 | User provided parameters as an INI file or `ConfigParser`. 27 | Any parameters provided will replace default parameters. 28 | fill_defaults : bool, optional 29 | Fill values that aren't provided with package defaults, if `params` 30 | is file. 31 | 32 | Returns 33 | ------- 34 | all_params : ConfigParser 35 | Combination of default and user-provided parameters. 36 | """ 37 | if isinstance(params, ConfigParser): 38 | return copy.copy(params) 39 | 40 | params_list = [] 41 | if fill_defaults: 42 | params_list.append(DEF_PARAM_FILE) 43 | if params is not None: 44 | params_list.append(params) 45 | 46 | all_params = ConfigParser() 47 | all_params.read(params_list) 48 | 49 | return all_params 50 | 51 | 52 | def write_params(params, params_file="params.cfg"): 53 | """Write params to file. 
54 | 55 | Parameters 56 | ---------- 57 | params : ConfigParser 58 | Params 59 | params_file : str 60 | Params file 61 | """ 62 | with open(params_file, "w") as f: 63 | params.write(f) 64 | 65 | 66 | def get_value( 67 | params, section_name, param_name, dtype=str, auto=False, fallback=None 68 | ): 69 | """Get value from params with fallback. 70 | 71 | Parameters 72 | ---------- 73 | params : ConfigParser 74 | Parameters 75 | section_name : str 76 | Name of section in `params` 77 | param_name : str 78 | Name of parameter in `section` 79 | dtype : type, optional 80 | Type to return data as. 81 | auto : bool, optional 82 | Auto-discover type of value. If provided, `dtype` is ignored. 83 | fallback : any, optional 84 | Value to return if getting value fails. 85 | 86 | Returns 87 | ------- 88 | value : any 89 | Value of parameter or `fallback`. 90 | """ 91 | if auto: 92 | try: 93 | value = params.get(section_name, param_name) 94 | except ValueError: 95 | return fallback 96 | 97 | try: 98 | return ast.literal_eval(value) 99 | except (ValueError, SyntaxError): 100 | return value 101 | else: 102 | get_function = params.get 103 | if dtype is int: 104 | get_function = params.getint 105 | elif dtype is float: 106 | get_function = params.getfloat 107 | elif dtype is bool: 108 | get_function = params.getboolean 109 | 110 | try: 111 | return get_function(section_name, param_name) 112 | except ValueError: 113 | return fallback 114 | 115 | 116 | def get_default_value(*args, **kwargs): 117 | global default_params 118 | return get_value(default_params, *args, **kwargs) 119 | 120 | 121 | def update_params( 122 | params_dict, params=None, section_name=None, fill_defaults=False 123 | ): 124 | """Set `ConfigParser` values from a sections dict. 125 | 126 | Sections dict key must be parameter sections, and value must be dict 127 | matching parameter name to value. If existing `ConfigParser` is 128 | provided, parameter values are updated. 129 | 130 | Parameters 131 | ---------- 132 | params_dict : dict 133 | If `section_name` is provided, dict must match parameter names to 134 | values. If `section_name` is not provided, dict key(s) must be 135 | parameter sections, and value(s) must be parameter dict. 136 | params : ConfigParser, optional 137 | Existing parameters. 138 | section_name : str, optional 139 | Name of section to which to add parameters in `params_dict` 140 | fill_defaults : bool, optional 141 | Fill values that aren't provided with package defaults, if `params` 142 | is file. 143 | """ 144 | if params is None: 145 | params = ConfigParser() 146 | else: 147 | params = read_params(params, fill_defaults=fill_defaults) 148 | 149 | if section_name is not None: 150 | try: 151 | params.add_section(section_name) 152 | except DuplicateSectionError: 153 | pass 154 | 155 | for param_name, param_value in params_dict.items(): 156 | params.set(section_name, param_name, str(param_value)) 157 | else: 158 | sections_dict = params_dict 159 | for section_name, params_dict in sections_dict.items(): 160 | for param_name, param_value in params_dict.items(): 161 | params.set(section_name, param_name, param_value) 162 | return params 163 | 164 | 165 | def params_to_sections_dict(params, auto=True): 166 | """Get dict of sections dicts in params, with optional type discovery. 167 | 168 | Parameters 169 | ---------- 170 | params : str or ConfigParser 171 | Params to read 172 | auto : bool, optional 173 | Auto typing of parameter values. 
174 | 175 | Returns 176 | ---------- 177 | dict : dict matching sections to parameters to values. 178 | """ 179 | params = read_params(params) 180 | sections = default_params.sections() 181 | params_dicts = {} 182 | for section in sections: 183 | try: 184 | params_dict = dict(params.items(section)) 185 | except NoSectionError: 186 | continue 187 | if auto: 188 | params_dict = { 189 | param_name: get_value(params, section, param_name, auto=True) 190 | for param_name in params_dict 191 | } 192 | params_dicts[section] = params_dict 193 | return params_dicts 194 | 195 | 196 | default_params = read_params(fill_defaults=True) 197 | -------------------------------------------------------------------------------- /src/e3fp/conformer/protonation.py: -------------------------------------------------------------------------------- 1 | """Functions for generating protonation states of molecules. 2 | 3 | Author: Seth Axen 4 | E-mail: seth.axen@gmail.com 5 | """ 6 | import os 7 | import tempfile 8 | import subprocess 9 | import itertools 10 | import logging 11 | 12 | from .util import iter_to_smiles, MolItemName 13 | 14 | 15 | def smiles_dict_to_proto_smiles_dict( 16 | in_smiles_dict, 17 | max_states=3, 18 | pka=7.4, 19 | dist_cutoff=20.0, 20 | add_missing=False, 21 | parallelizer=None, 22 | chunk_size=100, 23 | ): 24 | """Generate dict of SMILES for protonated states from SMILES dict.""" 25 | kwargs = {"max_states": max_states, "pka": pka, "dist_cutoff": dist_cutoff} 26 | in_smiles_iter = ( 27 | (smiles, mol_name) for mol_name, smiles in in_smiles_dict.items() 28 | ) 29 | if parallelizer is None: 30 | proto_smiles_iter = iter( 31 | smiles_list_to_proto_smiles_list(in_smiles_iter, **kwargs) 32 | ) 33 | else: 34 | smiles_chunks_iter = ( 35 | (chunk,) 36 | for chunk in _chunk_iter_to_lists( 37 | in_smiles_iter, chunk_size=chunk_size 38 | ) 39 | ) 40 | results_iter = ( 41 | result 42 | for result, data in parallelizer.run_gen( 43 | smiles_list_to_proto_smiles_list, 44 | smiles_chunks_iter, 45 | kwargs=kwargs, 46 | ) 47 | if result is not False 48 | ) 49 | proto_smiles_iter = itertools.chain.from_iterable(results_iter) 50 | 51 | proto_smiles_dict = { 52 | mol_name: smiles for smiles, mol_name in proto_smiles_iter 53 | } 54 | if add_missing: 55 | for mol_name, smiles in in_smiles_dict.items(): 56 | proto_name = MolItemName(mol_name, proto_state_num=0).proto_name 57 | if proto_name not in proto_smiles_dict: 58 | logging.debug( 59 | ( 60 | "Protonated SMILES for {} could not be generated. " 61 | "Returning input SMILES." 
62 | ).format(mol_name) 63 | ) 64 | proto_smiles_dict[mol_name] = smiles 65 | 66 | return proto_smiles_dict 67 | 68 | 69 | def smiles_list_to_proto_smiles_list( 70 | in_smiles_list, max_states=3, pka=7.4, dist_cutoff=20.0 71 | ): 72 | """Generate list of SMILES for protonated states from single SMILES.""" 73 | in_smiles_file = tempfile.mkstemp(suffix=".smi")[1] 74 | iter_to_smiles( 75 | in_smiles_file, 76 | ((mol_name, smiles) for smiles, mol_name in in_smiles_list), 77 | ) 78 | logging.debug("Protonating SMILES in %s" % (in_smiles_file)) 79 | proc = subprocess.Popen( 80 | ( 81 | "cxcalc %s --ignore-error dominanttautomerdistribution -H %g -C " 82 | 'false -t dist -f "smiles:n,T:dist"' 83 | ).format(in_smiles_file, pka), 84 | shell=True, 85 | stdout=subprocess.PIPE, 86 | ) 87 | 88 | proto_smiles_list = [] 89 | try: 90 | stdout_iter = iter(proc.stdout.readline, b"") 91 | next(stdout_iter) 92 | curr_mol_name = None 93 | curr_states_count = 0 94 | for line in stdout_iter: 95 | try: 96 | smiles, mol_name, dist = line.rstrip("\r\n").split() 97 | except ValueError: 98 | logging.warning("Error parsing line:\n%s" % line) 99 | continue 100 | if mol_name != curr_mol_name: 101 | curr_states_count = 0 102 | curr_mol_name = mol_name 103 | if curr_states_count >= max_states: 104 | continue 105 | if float(dist) > dist_cutoff: 106 | proto_name = MolItemName( 107 | mol_name, proto_state_num=curr_states_count 108 | ).proto_name 109 | curr_states_count += 1 110 | proto_smiles_list.append((smiles, proto_name)) 111 | logging.debug("Finished protonating SMILES in %s" % (in_smiles_file)) 112 | except Exception: 113 | logging.exception("Error running cxcalc", exc_info=True) 114 | 115 | proc.kill() 116 | os.remove(in_smiles_file) 117 | return proto_smiles_list 118 | 119 | 120 | def smiles_to_proto_smiles( 121 | smiles, mol_name, max_states=3, pka=7.4, dist_cutoff=20.0 122 | ): 123 | """Generate list of SMILES for protonated states from single SMILES. 124 | 125 | This is very inefficient in batch. 
126 | """ 127 | logging.debug("Protonating SMILES in %s" % (mol_name)) 128 | proc = subprocess.Popen( 129 | ( 130 | 'cxcalc "%s %s" --ignore-error dominanttautomerdistribution -H %g ' 131 | '-C false -t dist -f "smiles:n,T:dist"' 132 | ).format(smiles, mol_name, pka), 133 | shell=True, 134 | stdout=subprocess.PIPE, 135 | ) 136 | states_count = 0 137 | proto_smiles_list = [] 138 | try: 139 | stdout_iter = iter(proc.stdout.readline, b"") 140 | next(stdout_iter) 141 | for line in stdout_iter: 142 | try: 143 | this_smiles, this_name, dist = line.rstrip("\r\n").split() 144 | except ValueError: 145 | logging.warning("Error parsing line:\n%s" % line) 146 | continue 147 | if states_count >= max_states: 148 | break 149 | if float(dist) > dist_cutoff: 150 | proto_name = MolItemName( 151 | mol_name, proto_state_num=states_count 152 | ).proto_name 153 | states_count += 1 154 | proto_smiles_list.append((smiles, proto_name)) 155 | logging.debug("Finished protonating SMILES in %s" % (mol_name)) 156 | except OSError: 157 | logging.exception( 158 | "Error running cxcalc on %s" % (mol_name), exc_info=True 159 | ) 160 | 161 | proc.kill() 162 | return proto_smiles_list 163 | 164 | 165 | def _chunk_iter_to_lists(iterable, chunk_size=100): 166 | """Yield chunks of size `chunk_size` from iterator.""" 167 | i = 0 168 | chunk = [] 169 | for item in iterable: 170 | if i >= chunk_size: 171 | yield chunk 172 | chunk = [] 173 | i = 0 174 | chunk.append(item) 175 | i += 1 176 | if len(chunk) != 0: 177 | yield chunk 178 | -------------------------------------------------------------------------------- /doc/source/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # e3fp documentation build configuration file, created by 4 | # sphinx-quickstart on Sun Jun 25 01:13:34 2017. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | # If extensions (or modules to document with autodoc) are in another directory, 16 | # add these directories to sys.path here. If the directory is relative to the 17 | # documentation root, use os.path.abspath to make it absolute, like shown here. 18 | # 19 | import importlib.metadata 20 | import os 21 | import sys 22 | 23 | e3fp_version = importlib.metadata.version('e3fp') 24 | 25 | # Set-up environment variable for programoutput 26 | os.environ['E3FP_REPO'] = os.path.abspath("../..") 27 | 28 | # -- General configuration ------------------------------------------------ 29 | 30 | # If your documentation needs a minimal Sphinx version, state it here. 31 | # 32 | # needs_sphinx = '1.0' 33 | 34 | # Add any Sphinx extension module names here, as strings. They can be 35 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 36 | # ones. 
37 | extensions = [ 38 | 'sphinx.ext.autosectionlabel', 39 | 'sphinx.ext.autosummary', 40 | 'sphinx.ext.intersphinx', 41 | 'sphinx.ext.coverage', 42 | 'sphinx.ext.ifconfig', 43 | 'sphinx.ext.viewcode', 44 | 'sphinx.ext.githubpages', 45 | 'sphinx.ext.autodoc', 46 | 'sphinx.ext.napoleon', 47 | 'sphinx.ext.doctest', 48 | 'sphinx.ext.todo', 49 | 'sphinx.ext.imgconverter', 50 | 'sphinxcontrib.programoutput'] 51 | 52 | napoleon_google_docstring = False 53 | napoleon_numpy_docstring = True 54 | napoleon_use_param = False 55 | napoleon_use_ivar = True 56 | 57 | autosummary_generate = True 58 | 59 | add_module_names = False 60 | 61 | # Add any paths that contain templates here, relative to this directory. 62 | templates_path = ['_templates'] 63 | 64 | # The suffix(es) of source filenames. 65 | # You can specify multiple suffix as a list of string: 66 | # 67 | # source_suffix = ['.rst', '.md'] 68 | source_suffix = {'.rst': 'restructuredtext'} 69 | 70 | # The master toctree document. 71 | master_doc = 'index' 72 | 73 | # General information about the project. 74 | project = u'e3fp' 75 | copyright = u'2017, Seth Axen' 76 | author = u'Seth Axen' 77 | 78 | # The version info for the project you're documenting, acts as replacement for 79 | # |version| and |release|, also used in various other places throughout the 80 | # built documents. 81 | # 82 | # The short X.Y version. 83 | version = '%s' % (e3fp_version) 84 | # The full version, including alpha/beta/rc tags. 85 | release = version 86 | 87 | # The language for content autogenerated by Sphinx. Refer to documentation 88 | # for a list of supported languages. 89 | # 90 | # This is also used if you do content translation via gettext catalogs. 91 | # Usually you set "language" from the command line for these cases. 92 | language = "en" 93 | 94 | # The reST default role (used for this markup: `text`) to use for all 95 | # documents. The autolink role functions as :obj: when the name referred can 96 | # be resolved to a Python object 97 | default_role = "autolink" 98 | 99 | # List of patterns, relative to source directory, that match files and 100 | # directories to ignore when looking for source files. 101 | # This patterns also effect to html_static_path and html_extra_path 102 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 103 | 104 | # The name of the Pygments (syntax highlighting) style to use. 105 | pygments_style = 'sphinx' 106 | 107 | # If true, `todo` and `todoList` produce output, else they produce nothing. 108 | todo_include_todos = False 109 | 110 | # Add unique prefixes to autosectionlabel to avoid duplicate labels 111 | autosectionlabel_prefix_document = True 112 | 113 | 114 | # -- Options for HTML output ---------------------------------------------- 115 | 116 | # The theme to use for HTML and HTML Help pages. See the documentation for 117 | # a list of builtin themes. 118 | html_theme = 'sphinx_rtd_theme' 119 | 120 | # Theme options are theme-specific and customize the look and feel of a theme 121 | # further. For a list of options available for each theme, see the 122 | # documentation. 123 | # 124 | # html_theme_options = {} 125 | 126 | # Add any paths that contain custom static files (such as style sheets) here, 127 | # relative to this directory. They are copied after the builtin static files, 128 | # so a file named "default.css" will overwrite the builtin "default.css". 
129 | html_static_path = ['_static'] 130 | 131 | 132 | # -- Options for HTMLHelp output ------------------------------------------ 133 | 134 | # Output file base name for HTML help builder. 135 | htmlhelp_basename = 'e3fpdoc' 136 | 137 | 138 | # -- Options for LaTeX output --------------------------------------------- 139 | 140 | latex_elements = { 141 | # The paper size ('letterpaper' or 'a4paper'). 142 | # 143 | # 'papersize': 'letterpaper', 144 | 145 | # The font size ('10pt', '11pt' or '12pt'). 146 | # 147 | # 'pointsize': '10pt', 148 | 149 | # Additional stuff for the LaTeX preamble. 150 | # 151 | # 'preamble': '', 152 | 153 | # Latex figure (float) alignment 154 | # 155 | # 'figure_align': 'htbp', 156 | } 157 | 158 | # Grouping the document tree into LaTeX files. List of tuples 159 | # (source start file, target name, title, 160 | # author, documentclass [howto, manual, or own class]). 161 | latex_documents = [ 162 | (master_doc, 'e3fp.tex', u'e3fp Documentation', 163 | u'Seth Axen', 'manual'), 164 | ] 165 | 166 | 167 | # -- Options for manual page output --------------------------------------- 168 | 169 | # One entry per manual page. List of tuples 170 | # (source start file, name, description, authors, manual section). 171 | man_pages = [ 172 | (master_doc, 'e3fp', u'e3fp Documentation', 173 | [author], 1) 174 | ] 175 | 176 | 177 | # -- Options for Texinfo output ------------------------------------------- 178 | 179 | # Grouping the document tree into Texinfo files. List of tuples 180 | # (source start file, target name, title, author, 181 | # dir menu entry, description, category) 182 | texinfo_documents = [ 183 | (master_doc, 'e3fp', u'e3fp Documentation', 184 | author, 'e3fp', 'One line description of project.', 185 | 'Miscellaneous'), 186 | ] 187 | 188 | 189 | # Example configuration for intersphinx: refer to the Python standard library. 190 | intersphinx_mapping = { 191 | 'python': ('https://docs.python.org/3/', None), 192 | 'numpy': ('https://numpy.org/doc/stable/', None), 193 | 'scipy': ('https://docs.scipy.org/doc/scipy/', None), 194 | } 195 | -------------------------------------------------------------------------------- /doc/source/dev/index.rst: -------------------------------------------------------------------------------- 1 | Developer Notes 2 | =============== 3 | 4 | We welcome contributions to E3FP! These notes are designed to help developers 5 | contribute code 6 | 7 | Authoring Code 8 | -------------- 9 | 10 | Code Formatting 11 | ~~~~~~~~~~~~~~~ 12 | 13 | E3FP's code should be *readable*. To ensure this, we rigorously follow the 14 | PEP8_ style conventions and PEP257_ docstring conventions, which maximize 15 | readability of the code and ease of future development. You may check your 16 | code for conformation to these conventions with the pycodestyle_ and 17 | pydocstyle_ utilities, respectively. Where the code is necessarily 18 | complicated, inline comments should reorient the reader. 19 | 20 | Utility Methods and Classes 21 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~ 22 | 23 | Three sets of utility methods and classes are provided: `e3fp.util`, 24 | `e3fp.conformer.util`, and `e3fp.fingerprint.util`. These provide general and 25 | often-used functionality in their corresponding packages. Additionally, they 26 | provide E3FP-specific errors and exceptions. 27 | 28 | Warnings and Errors 29 | ~~~~~~~~~~~~~~~~~~~ 30 | 31 | By default, warnings in Python are silent. We therefore provide a warning base 32 | class `e3fp.util.E3FPWarning` that is not silent by default. 
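For example, because the package registers an ``always`` warnings filter for this
class, a warning issued with it is printed every time it is triggered rather than
only once (a minimal sketch; the message text is purely illustrative):

>>> import warnings
>>> from e3fp.util import E3FPWarning
>>> warnings.warn("example warning message", E3FPWarning)  # doctest: +SKIP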
We provide several
33 | general warnings:
34 |
35 | :py:class:`.E3FPDeprecationWarning`
36 |     warns when a deprecated method is called or class is instantiated.
37 |
38 |     .. seealso::
39 |
40 |         `Deprecation`_
41 |
42 | :py:class:`.E3FPEfficiencyWarning`
43 |     warns when a method, module version, or combination of parameters is known
44 |     to be inefficient.
45 |
46 |     .. note::
47 |
48 |         If possible, the warning message should advise on a more efficient
49 |         approach.
50 |
51 | E3FP-specific errors should inherit the `e3fp.util.E3FPError` base class. Several
52 | fingerprinting-specific errors are defined in `e3fp.fingerprint.util`.
53 |
54 | Deprecation
55 | ~~~~~~~~~~~
56 |
57 | Whenever changing the interface or behavior of a user-facing method or class,
58 | it is proper to deprecate it for at least one release, so that users have
59 | time to update their scripts accordingly. A deprecated method should issue
60 | an `e3fp.util.E3FPDeprecationWarning`, notify the user in which release to
61 | expect the method or class to be removed, and update the documentation
62 | accordingly. This functionality is automated with the `e3fp.util.deprecated`
63 | decorator, as shown in this example:
64 |
65 | >>> import sys
66 | >>> sys.stderr = sys.stdout
67 | >>> from e3fp.util import deprecated
68 | >>> @deprecated("1.1", remove_version="1.3", msg="Function no longer needed.")
69 | ... def deprecated_method():
70 | ...     """A method to demonstrate method deprecation."""
71 | ...     pass
72 | >>> deprecated_method()
73 | ...: E3FPDeprecationWarning: Function `deprecated_method` was deprecated in 1.1 and will be removed in 1.3. Function no longer needed.
74 |
75 | In the API documentation, the method will appear as:
76 |
77 | .. function:: deprecated_method()
78 |
79 |     .. note:: Deprecated in e3fp 1.1.
80 |        `deprecated_method` will be removed in e3fp 1.3. Function no longer needed.
81 |
82 |     A method to demonstrate method deprecation.
83 |
84 | .. note::
85 |     If no `remove_version` is specified, then the remove version defaults to the
86 |     next release after deprecation. For example, if the method was deprecated in
87 |     1.1, it is by default marked for removal in 1.2.
88 |
89 | Contributing Code
90 | ~~~~~~~~~~~~~~~~~
91 |
92 | Before contributing code to E3FP, for major modifications it is advisable to
93 | submit an issue to the
94 | `issue tracker`_ to enable other
95 | developers to contribute to the design of the code and to reduce the amount of
96 | work necessary to conform the code to E3FP's standards. After writing the code,
97 | create a `pull request`_. This is best even if you have push access to the
98 | E3FP repo, as it enables the test suite to be run on the new code prior to
99 | merging it with the remaining code base.
100 |
101 | Writing Tests
102 | ~~~~~~~~~~~~~
103 |
104 | The standard in E3FP is to commit a test for new functionality simultaneously
105 | with the new functionality or within the same pull request. While this slows
106 | development, it prevents building a large backlog of untested methods and
107 | classes.
108 |
109 | These should ideally be unit tests, though for some complicated
110 | functionalities, such as fingerprinting, integration tests are also
111 | necessary. For these complicated functions, specific units may still be
112 | tested using :py:mod:`unittest.mock`. For example,
113 | :py:meth:`unittest.mock.patch` may be used to force a high-level method to
114 | produce a specific output. For examples, see the `fingerprinting tests
115 | `_.
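A minimal sketch of this pattern is shown below; the wrapper function and the
patched return value are purely illustrative and are not part of E3FP:

.. code-block:: python

    from unittest import mock

    def count_fingerprints(mol):
        """Hypothetical wrapper around E3FP fingerprinting."""
        from e3fp.pipeline import fprints_from_mol
        return len(fprints_from_mol(mol))

    def test_count_fingerprints():
        # Patch the fingerprinter so the test exercises only the wrapper
        # logic and never runs conformer generation or fingerprinting.
        with mock.patch(
            "e3fp.pipeline.fprints_from_mol", return_value=["fp_a", "fp_b"]
        ):
            assert count_fingerprints(mol=None) == 2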
116 | 117 | Continuous Integration 118 | ~~~~~~~~~~~~~~~~~~~~~~ 119 | 120 | E3FP uses `GitHub Actions`_ for continuous integration. This ensures that each commit 121 | and pull request passes all tests on a variety of a systems and for all 122 | supported versions of Python. Additionally, GitHub Actions updates code coverage on 123 | Codecov_ and tests all usage examples in the documentation using `doctest`. 124 | 125 | Documentation 126 | ------------- 127 | 128 | In general, it is best to document the rationale and basic usage of a module, 129 | class, or method in its docstring instead of in a separate documentation file. 130 | See, for example, the docstring for `e3fp.fingerprint.db.FingerprintDatabase`. 131 | We use a variety of tools to ensure that our documentation is always 132 | up-to-date. The official documentation is hosted on ReadtheDocs_ and is 133 | automatically generated when new code is committed to the repository. 134 | 135 | Documenting Code 136 | ~~~~~~~~~~~~~~~~ 137 | 138 | E3FP uses NumPy's `docstring conventions`_ for all docstrings. These are 139 | parsed by Sphinx_ using Napoleon_. All usage examples must be fully 140 | functional, as these are tested using `doctest`. 141 | 142 | The purpose of a docstring is to explain the purpose of a class/method, any 143 | relevant implementation details, its parameters, its attributes, its outputs, 144 | and its usage. The goal is clarity. For self-evident methods with descriptive 145 | variables, a simple one- ine summary is all that is needed. For complicated use 146 | cases, often involving other methods/classes, it is better to document the 147 | usage elsewhere in the documentation. 148 | 149 | Documentation Usage 150 | ~~~~~~~~~~~~~~~~~~~ 151 | 152 | Coming soon. 153 | 154 | .. todo:: 155 | Write documentation usage 156 | 157 | Releasing Code 158 | -------------- 159 | 160 | .. todo:: 161 | Write release protocol 162 | 163 | .. _PEP8: https://www.python.org/dev/peps/pep-0008/ 164 | .. _PEP257: https://www.python.org/dev/peps/pep-0257/ 165 | .. _pycodestyle: http://pycodestyle.pycqa.org/en/latest/ 166 | .. _pydocstyle: http://pydocstyle.pycqa.org/en/latest/ 167 | .. _docstring conventions: https://github.com/numpy/numpy/blob/master/doc/HOWTO_DOCUMENT.rst.txt 168 | .. _Napoleon: http://www.sphinx-doc.org/en/stable/ext/napoleon.html 169 | .. _Sphinx: http://www.sphinx-doc.org/en/stable/index.html 170 | .. _doctest: https://docs.python.org/3/library/doctest.html 171 | .. _pull request: https://help.github.com/articles/creating-a-pull-request/ 172 | .. _GitHub Actions: https://github.com/keiserlab/e3fp/actions 173 | .. _Codecov: https://codecov.io/github/keiserlab/e3fp 174 | 175 | .. include:: ../substitutions.rst 176 | -------------------------------------------------------------------------------- /doc/source/usage/fingerprints/fprints.rst: -------------------------------------------------------------------------------- 1 | Fingerprints 2 | ============ 3 | 4 | The simplest interface for molecular fingerprints are through three classes in 5 | `e3fp.fingerprint.fprint`: 6 | 7 | :py:class:`.Fingerprint` 8 | a fingerprint with "on" bits 9 | 10 | :py:class:`.CountFingerprint` 11 | a fingerprint with counts for each "on" bit 12 | 13 | :py:class:`.FloatFingerprint` 14 | a fingerprint with float values for each "on" bit, generated for example by 15 | averaging conformer fingerprints. 
16 | 17 | In addition to storing "on" indices and, for the latter two, corresponding 18 | values, they store fingerprint properties, such as name, level, and any 19 | arbitrary property. They also provide simple interfaces for fingerprint 20 | comparison and some basic processing. 21 | 22 | .. note:: Many of these operations are more efficient when operating on a 23 | :py:class:`.FingerprintDatabase`. See 24 | :ref:`usage/fingerprints/storage:Fingerprint Storage` for more information. 25 | 26 | In the below examples, we will focus on :py:class:`.Fingerprint` and 27 | :py:class:`.CountFingerprint`. First, we execute the necessary imports. 28 | 29 | .. testsetup:: 30 | 31 | import numpy as np 32 | np.random.seed(0) 33 | 34 | .. doctest:: 35 | 36 | >>> from e3fp.fingerprint.fprint import Fingerprint, CountFingerprint 37 | >>> import numpy as np 38 | 39 | .. seealso:: 40 | 41 | :ref:`usage/fingerprints/storage:Fingerprint Storage`, 42 | :ref:`usage/fingerprints/comparison:Fingerprint Comparison` 43 | 44 | Creation and Conversion 45 | ----------------------- 46 | 47 | Here we create a bit-fingerprint with random "on" indices. 48 | 49 | >>> bits = 2**32 50 | >>> indices = np.sort(np.random.randint(0, bits, 30)) 51 | >>> indices 52 | array([ 243580376, 305097549, ..., 3975407269, 4138900056]) 53 | >>> fp1 = Fingerprint(indices, bits=bits, level=0) 54 | >>> fp1 55 | Fingerprint(indices=array([243580376, ..., 4138900056]), level=0, bits=4294967296, name=None) 56 | 57 | This fingerprint is extremely sparse. 58 | 59 | >>> fp1.bit_count 60 | 30 61 | >>> fp1.density 62 | 6.984919309616089e-09 63 | 64 | We can therefore "fold" the fingerprint through a series of bitwise "OR" 65 | operations on halves of the sparse vector until it is of a specified length, 66 | with minimal collision of bits. 67 | 68 | >>> fp_folded = fp1.fold(1024) 69 | >>> fp_folded 70 | Fingerprint(indices=array([9, 70, ..., 845, 849]), level=0, bits=1024, name=None) 71 | >>> fp_folded.bit_count 72 | 29 73 | >>> fp_folded.density 74 | 0.0283203125 75 | 76 | A :py:class:`.CountFingerprint` may be created by additionally providing a dictionary 77 | mapping indices with nonzero counts to those counts. 78 | 79 | >>> indices2 = np.sort(np.random.randint(0, bits, 60)) 80 | >>> counts = dict(zip(indices2, np.random.randint(1, 10, indices2.size))) 81 | >>> counts 82 | {80701568: 8, 580757632: 7, ..., 800291326: 5, 4057322111: 7} 83 | >>> cfp1 = CountFingerprint(counts=counts, bits=bits, level=0) 84 | >>> cfp1 85 | CountFingerprint(counts={80701568: 8, 580757632: 7, ..., 3342157822: 2, 4057322111: 7}, level=0, bits=4294967296, name=None) 86 | 87 | Unlike folding a bit fingerprint, folding a count fingerprint by default 88 | performs a "SUM" operation on colliding counts. 89 | 90 | >>> cfp1.bit_count 91 | 60 92 | >>> cfp_folded = cfp1.fold(1024) 93 | >>> cfp_folded 94 | CountFingerprint(counts={128: 15, 257: 4, ..., 1022: 2, 639: 7}, level=0, bits=1024, name=None) 95 | >>> cfp_folded.bit_count 96 | 57 97 | 98 | It is trivial to interconvert the fingerprints. 99 | 100 | >>> cfp_folded2 = CountFingerprint.from_fingerprint(fp_folded) 101 | >>> cfp_folded2 102 | CountFingerprint(counts={9: 1, 87: 1, ..., 629: 1, 763: 1}, level=0, bits=1024, name=None) 103 | >>> cfp_folded2.indices[:5] 104 | array([ 9, 70, 72, 87, 174]) 105 | >>> fp_folded.indices[:5] 106 | array([ 9, 70, 72, 87, 174]) 107 | 108 | RDKit Morgan fingerprints (analogous to ECFP) may easily be converted to a 109 | :py:class:`.Fingerprint`.
110 | 111 | >>> from rdkit import Chem 112 | >>> from rdkit.Chem import AllChem 113 | >>> mol = Chem.MolFromSmiles('Cc1ccccc1') 114 | >>> mfp = AllChem.GetMorganFingerprintAsBitVect(mol, 2) 115 | >>> mfp 116 | 117 | >>> Fingerprint.from_rdkit(mfp) 118 | Fingerprint(indices=array([389, 1055, ..., 1873, 1920]), level=-1, bits=2048, name=None) 119 | 120 | Likewise, :py:class:`.Fingerprint` can be easily converted to a NumPy ndarray or 121 | SciPy sparse matrix. 122 | 123 | >>> fp_folded.to_vector() 124 | <1x1024 sparse matrix of type '' 125 | ...with 29 stored elements in Compressed Sparse Row format> 126 | >>> fp_folded.to_vector(sparse=False) 127 | array([False, False, False, ..., False, False, False], dtype=bool) 128 | >>> np.where(fp_folded.to_vector(sparse=False))[0] 129 | array([ 9, 70, 72, 87, ...]) 130 | >>> cfp_folded.to_vector(sparse=False) 131 | array([0, 0, 0, ..., 0, 2, 0], dtype=uint16) 132 | >>> cfp_folded.to_vector(sparse=False).sum() 133 | 252 134 | 135 | Algebra 136 | ------- 137 | 138 | Basic algebraic functions may be performed on fingerprints. If either 139 | fingerprint is a bit fingerprint, all algebraic functions are bit-wise. 140 | The following bit-wise operations are supported: 141 | 142 | Equality 143 | >>> fp1 = Fingerprint([0, 1, 6, 8, 12], bits=16) 144 | >>> fp2 = Fingerprint([1, 2, 4, 8, 11, 12], bits=16) 145 | >>> fp1 == fp2 146 | False 147 | >>> fp1_copy = Fingerprint.from_fingerprint(fp1) 148 | >>> fp1 == fp1_copy 149 | True 150 | >>> fp1_copy.level = 5 151 | >>> fp1 == fp1_copy 152 | False 153 | 154 | Union/OR 155 | >>> fp1 + fp2 156 | Fingerprint(indices=array([0, 1, 2, 4, 6, 8, 11, 12]), level=-1, bits=16, name=None) 157 | >>> fp1 | fp2 158 | Fingerprint(indices=array([0, 1, 2, 4, 6, 8, 11, 12]), level=-1, bits=16, name=None) 159 | 160 | Intersection/AND 161 | >>> fp1 & fp2 162 | Fingerprint(indices=array([1, 8, 12]), level=-1, bits=16, name=None) 163 | 164 | Difference/AND NOT 165 | >>> fp1 - fp2 166 | Fingerprint(indices=array([0, 6]), level=-1, bits=16, name=None) 167 | >>> fp2 - fp1 168 | Fingerprint(indices=array([2, 4, 11]), level=-1, bits=16, name=None) 169 | 170 | XOR 171 | >>> fp1 ^ fp2 172 | Fingerprint(indices=array([0, 2, 4, 6, 11]), level=-1, bits=16, name=None) 173 | 174 | With count or float fingerprints, bit-wise operations are still possible, but 175 | algebraic operations are applied to counts. 176 | 177 | >>> fp1 = CountFingerprint(counts={0: 3, 1: 2, 5: 1, 9: 3}, bits=16) 178 | >>> fp2 = CountFingerprint(counts={1: 2, 5: 2, 7: 3, 10: 7}, bits=16) 179 | >>> fp1 + fp2 180 | CountFingerprint(counts={0: 3, 1: 4, 5: 3, 7: 3, 9: 3, 10: 7}, level=-1, bits=16, name=None) 181 | >>> fp1 - fp2 182 | CountFingerprint(counts={0: 3, 1: 0, 5: -1, 7: -3, 9: 3, 10: -7}, level=-1, bits=16, name=None) 183 | >>> fp1 * 3 184 | CountFingerprint(counts={0: 9, 1: 6, 5: 3, 9: 9}, level=-1, bits=16, name=None) 185 | >>> fp1 / 2 186 | FloatFingerprint(counts={0: 1.5, 1: 1.0, 5: 0.5, 9: 1.5}, level=-1, bits=16, name=None) 187 | 188 | Finally, fingerprints may be batch added and averaged, producing either a count 189 | or float fingerprint when sensible. 
190 | 191 | >>> from e3fp.fingerprint.fprint import add, mean 192 | >>> fps = [Fingerprint(np.random.randint(0, 32, 8), bits=32) for i in range(100)] 193 | >>> add(fps) 194 | CountFingerprint(counts={0: 23, 1: 23, ..., 30: 20, 31: 14}, level=-1, bits=32, name=None) 195 | >>> mean(fps) 196 | FloatFingerprint(counts={0: 0.23, 1: 0.23, ..., 30: 0.2, 31: 0.14}, level=-1, bits=32, name=None) 197 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | 9 | This version of the GNU Lesser General Public License incorporates 10 | the terms and conditions of version 3 of the GNU General Public 11 | License, supplemented by the additional permissions listed below. 12 | 13 | 0. Additional Definitions. 14 | 15 | As used herein, "this License" refers to version 3 of the GNU Lesser 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 17 | General Public License. 18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions. 48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 
64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 
129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 160 | 161 | If the Library as you received it specifies that a proxy can decide 162 | whether future versions of the GNU Lesser General Public License shall 163 | apply, that proxy's public statement of acceptance of any version is 164 | permanent authorization for you to choose that version for the 165 | Library. 166 | -------------------------------------------------------------------------------- /tests/test_metrics.py: -------------------------------------------------------------------------------- 1 | """Tests for fingerprint comparison metrics. 
2 | 3 | Author: Seth Axen 4 | E-mail: seth.axen@gmail.com 5 | """ 6 | import pytest 7 | 8 | import numpy as np 9 | from scipy.sparse import csr_matrix 10 | from scipy.spatial.distance import cdist 11 | from e3fp.fingerprint import metrics, fprint, db 12 | from e3fp.fingerprint.metrics import array_metrics, fprint_metrics 13 | 14 | 15 | def _create_random_sparse(nrows, nbits=1024, perc_pos=0.1, counts=False): 16 | arr = csr_matrix( 17 | np.random.uniform(0, 1, (nrows, nbits)) > (1 - perc_pos), 18 | dtype=np.double, 19 | ) 20 | if counts: 21 | arr.data = np.random.randint(1, 30, arr.data.shape[0]).astype( 22 | np.double 23 | ) 24 | return arr 25 | 26 | def soergeldist(x, y): 27 | return np.abs(x - y).sum() / np.maximum(x, y).sum() 28 | 29 | 30 | class TestArrayMetrics: 31 | 32 | """Tests for array comparison metrics""" 33 | 34 | @staticmethod 35 | def _eval(func, X, Y=None, dense=False, **kwargs): 36 | if dense: 37 | X = X.toarray() 38 | if Y is not None: 39 | Y = Y.toarray() 40 | return func(X, Y, **kwargs) 41 | 42 | @pytest.mark.parametrize("dense", [True, False]) 43 | @pytest.mark.parametrize( 44 | "func,cdist_metric,counts", 45 | [ 46 | (array_metrics.tanimoto, "jaccard", False), 47 | (array_metrics.dice, "dice", False), 48 | (array_metrics.cosine, "cosine", False), 49 | (array_metrics.cosine, "cosine", True), 50 | (array_metrics.pearson, "correlation", False), 51 | (array_metrics.pearson, "correlation", True), 52 | (array_metrics.soergel, soergeldist, False), 53 | (array_metrics.soergel, soergeldist, True), 54 | ], 55 | ) 56 | def test_metrics_vs_cdist(self, func, cdist_metric, counts, dense): 57 | X = _create_random_sparse(10, counts=counts) 58 | Y = _create_random_sparse(8, counts=counts) 59 | expect_score = 1.0 - cdist(X.toarray(), Y.toarray(), metric=cdist_metric) 60 | score = self._eval(func, X, Y, dense=dense) 61 | assert type(score) is np.ndarray 62 | np.testing.assert_allclose(score, expect_score) 63 | # test self-comparison 64 | expect_score = 1.0 - cdist(X.toarray(), X.toarray(), metric=cdist_metric) 65 | score = self._eval(func, X, dense=dense) 66 | np.testing.assert_allclose(score, expect_score) 67 | 68 | @pytest.mark.parametrize("dense", [True, False]) 69 | def test_tanimoto_soergel_equal_for_binary(self, dense): 70 | X = _create_random_sparse(10, counts=False) 71 | Y = _create_random_sparse(8, counts=False) 72 | tscore = self._eval(array_metrics.tanimoto, X, Y, dense=dense) 73 | sscore = self._eval(array_metrics.soergel, X, Y, dense=dense) 74 | np.testing.assert_allclose(tscore, sscore) 75 | 76 | 77 | class TestFlexibleMetrics: 78 | 79 | """Tests for flexible comparison metrics""" 80 | 81 | metric_names = ["tanimoto", "soergel", "dice", "cosine", "pearson"] 82 | count_metric_names = ["soergel", "cosine", "pearson"] 83 | 84 | def test_binary_fprint_vs_fprint(self): 85 | fp1 = fprint.Fingerprint.from_vector( 86 | _create_random_sparse(1, counts=False, perc_pos=0.5) 87 | ) 88 | fp2 = fprint.Fingerprint.from_vector( 89 | _create_random_sparse(1, counts=False, perc_pos=0.5) 90 | ) 91 | for metric_name in self.metric_names: 92 | gen_score = getattr(metrics, metric_name)(fp1, fp2) 93 | fp_score = getattr(fprint_metrics, metric_name)(fp1, fp2) 94 | assert gen_score == pytest.approx(fp_score) 95 | array_score = getattr(array_metrics, metric_name)( 96 | fp1.to_vector(sparse=True), fp2.to_vector(sparse=True) 97 | ) 98 | assert gen_score == pytest.approx(array_score[0][0]) 99 | 100 | def test_count_fprint_vs_fprint(self): 101 | fp1 = fprint.CountFingerprint.from_vector( 102 | 
_create_random_sparse(1, nbits=32, counts=True, perc_pos=0.5) 103 | ) 104 | fp2 = fprint.CountFingerprint.from_vector( 105 | _create_random_sparse(1, nbits=32, counts=True, perc_pos=0.5) 106 | ) 107 | for metric_name in self.count_metric_names: 108 | gen_score = getattr(metrics, metric_name)(fp1, fp2) 109 | fp_score = getattr(fprint_metrics, metric_name)(fp1, fp2) 110 | assert gen_score == pytest.approx(fp_score) 111 | array_score = getattr(array_metrics, metric_name)( 112 | fp1.to_vector(sparse=True), fp2.to_vector(sparse=True) 113 | ) 114 | assert gen_score == pytest.approx(array_score[0][0]) 115 | 116 | def test_binary_fprint_vs_db(self): 117 | fp_array = _create_random_sparse(1, counts=False, perc_pos=0.5) 118 | fp = fprint.Fingerprint.from_vector(fp_array) 119 | db_array = _create_random_sparse(10, counts=False, perc_pos=0.5) 120 | fp_names = [str(i) for i in range(db_array.shape[0])] 121 | fdb = db.FingerprintDatabase.from_array( 122 | db_array, fp_names, fp_type=fprint.Fingerprint 123 | ) 124 | for metric_name in self.metric_names: 125 | gen_score = getattr(metrics, metric_name)(fp, fdb) 126 | array_score = getattr(array_metrics, metric_name)( 127 | fp_array, db_array 128 | ) 129 | np.testing.assert_allclose(gen_score, array_score) 130 | gen_score = getattr(metrics, metric_name)(fdb, fp) 131 | np.testing.assert_allclose(gen_score.T, array_score) 132 | 133 | def test_count_fprint_vs_db(self): 134 | fp_array = _create_random_sparse(1, counts=True, perc_pos=0.5) 135 | fp = fprint.CountFingerprint.from_vector(fp_array) 136 | db_array = _create_random_sparse(10, counts=True, perc_pos=0.5) 137 | fp_names = [str(i) for i in range(db_array.shape[0])] 138 | fdb = db.FingerprintDatabase.from_array( 139 | db_array, fp_names, fp_type=fprint.CountFingerprint 140 | ) 141 | for metric_name in self.count_metric_names: 142 | gen_score = getattr(metrics, metric_name)(fp, fdb) 143 | array_score = getattr(array_metrics, metric_name)( 144 | fp_array, db_array 145 | ) 146 | np.testing.assert_allclose(gen_score, array_score) 147 | # Check if reverse order produces transpose 148 | gen_score = getattr(metrics, metric_name)(fdb, fp) 149 | np.testing.assert_allclose(gen_score.T, array_score) 150 | 151 | def test_binary_db_vs_db(self): 152 | db_array1 = _create_random_sparse(1, counts=False, perc_pos=0.5) 153 | fp_names = [str(i) for i in range(db_array1.shape[0])] 154 | db1 = db.FingerprintDatabase.from_array( 155 | db_array1, fp_names, fp_type=fprint.Fingerprint 156 | ) 157 | db_array2 = _create_random_sparse(1, counts=False, perc_pos=0.5) 158 | fp_names = [str(i) for i in range(db_array2.shape[0])] 159 | db2 = db.FingerprintDatabase.from_array( 160 | db_array2, fp_names, fp_type=fprint.Fingerprint 161 | ) 162 | for metric_name in self.metric_names: 163 | gen_score = getattr(metrics, metric_name)(db1, db2) 164 | array_score = getattr(array_metrics, metric_name)( 165 | db_array1, db_array2 166 | ) 167 | np.testing.assert_allclose(gen_score, array_score) 168 | 169 | def test_count_db_vs_db(self): 170 | db_array1 = _create_random_sparse(1, counts=True, perc_pos=0.5) 171 | fp_names = [str(i) for i in range(db_array1.shape[0])] 172 | db1 = db.FingerprintDatabase.from_array( 173 | db_array1, fp_names, fp_type=fprint.CountFingerprint 174 | ) 175 | db_array2 = _create_random_sparse(1, counts=True, perc_pos=0.5) 176 | fp_names = [str(i) for i in range(db_array2.shape[0])] 177 | db2 = db.FingerprintDatabase.from_array( 178 | db_array2, fp_names, fp_type=fprint.CountFingerprint 179 | ) 180 | for metric_name in 
self.count_metric_names: 181 | gen_score = getattr(metrics, metric_name)(db1, db2) 182 | array_score = getattr(array_metrics, metric_name)( 183 | db_array1, db_array2 184 | ) 185 | np.testing.assert_allclose(gen_score, array_score) 186 | -------------------------------------------------------------------------------- /src/e3fp/fingerprint/metrics/array_metrics.py: -------------------------------------------------------------------------------- 1 | """Fingerprint array comparison metrics. 2 | 3 | Each is fully compatible with both dense and sparse inputs. 4 | 5 | Author: Seth Axen 6 | E-mail: seth.axen@gmail.com 7 | """ 8 | from __future__ import division 9 | 10 | import numpy as np 11 | import scipy 12 | from scipy.sparse import csr_matrix, issparse, vstack 13 | import scipy.sparse.linalg 14 | import scipy.spatial 15 | from e3fp.util import maybe_jit 16 | 17 | 18 | def tanimoto(X, Y=None): 19 | """Compute the Tanimoto coefficients between `X` and `Y`. 20 | 21 | Data must be binary. This is not checked. 22 | 23 | Parameters 24 | ---------- 25 | X : array_like or sparse matrix 26 | with shape (`n_fprints_X`, `n_bits`). 27 | Y : array_like or sparse matrix, optional 28 | with shape (`n_fprints_Y`, `n_bits`). 29 | 30 | Returns 31 | ------- 32 | tanimoto : array of shape (`n_fprints_X`, `n_fprints_Y`) 33 | 34 | See Also 35 | -------- 36 | soergel: Analog to Tanimoto for non-binary data. 37 | cosine, dice, pearson 38 | """ 39 | X, Y = _check_array_pair(X, Y) 40 | Xbits, Ybits, XYbits = _get_bitcount_arrays(X, Y, return_XYbits=True) 41 | with np.errstate(divide="ignore"): # handle 0 in denominator 42 | return np.asarray(np.nan_to_num(XYbits / (Xbits + Ybits.T - XYbits))) 43 | 44 | 45 | def soergel(X, Y=None): 46 | """Compute the Soergel similarities between `X` and `Y`. 47 | 48 | Soergel similarity is the complement of Soergel distance and can be 49 | thought of as the analog of the Tanimoto coefficient for count/float-based 50 | data. For binary data, it is equivalent to the Tanimoto coefficient. 51 | 52 | Parameters 53 | ---------- 54 | X : array_like or sparse matrix 55 | with shape (`n_fprints_X`, `n_bits`). 56 | Y : array_like or sparse matrix, optional 57 | with shape (`n_fprints_Y`, `n_bits`). 58 | 59 | Returns 60 | ------- 61 | soergel : array of shape (`n_fprints_X`, `n_fprints_Y`) 62 | 63 | Notes 64 | -------- 65 | If Numba is available, this function is jit-compiled and much more efficient. 66 | 67 | See Also 68 | -------- 69 | tanimoto: A fast version of this function for binary data. 70 | pearson: Pearson correlation, also appropriate for non-binary data. 71 | cosine, dice 72 | """ 73 | X, Y = _check_array_pair(X, Y) 74 | S = np.empty((X.shape[0], Y.shape[0]), dtype=float) 75 | if issparse(X): 76 | return _sparse_soergel(X.data, X.indices, X.indptr, 77 | Y.data, Y.indices, Y.indptr, S) 78 | return _dense_soergel(X, Y, S) 79 | 80 | def dice(X, Y=None): 81 | """Compute the Dice coefficients between `X` and `Y`. 82 | 83 | Data must be binary. This is not checked. 84 | 85 | Parameters 86 | ---------- 87 | X : array_like or sparse matrix 88 | with shape (`n_fprints_X`, `n_bits`). 89 | Y : array_like or sparse matrix, optional 90 | with shape (`n_fprints_Y`, `n_bits`). 
91 | 92 | Returns 93 | ------- 94 | dice : array of shape (`n_fprints_X`, `n_fprints_Y`) 95 | 96 | See Also 97 | -------- 98 | cosine, soergel, tanimoto, pearson 99 | """ 100 | X, Y = _check_array_pair(X, Y) 101 | Xbits, Ybits, XYbits = _get_bitcount_arrays(X, Y, return_XYbits=True) 102 | with np.errstate(divide="ignore"): # handle 0 in denominator 103 | return np.asarray(np.nan_to_num(2 * XYbits / (Xbits + Ybits.T))) 104 | 105 | 106 | def cosine(X, Y=None, assume_binary=False): 107 | """Compute the Cosine similarities between `X` and `Y`. 108 | 109 | Parameters 110 | ---------- 111 | X : array_like or sparse matrix 112 | with shape (`n_fprints_X`, `n_bits`). 113 | Y : array_like or sparse matrix, optional 114 | with shape (`n_fprints_Y`, `n_bits`). 115 | assume_binary : bool, optional 116 | Assume data is binary (results in efficiency boost). If data is not 117 | binary, the result will be incorrect. 118 | 119 | Returns 120 | ------- 121 | cosine : array of shape (`n_fprints_X`, `n_fprints_Y`) 122 | 123 | See Also 124 | -------- 125 | dice, soergel, tanimoto 126 | """ 127 | X, Y = _check_array_pair(X, Y) 128 | if not issparse(X): 129 | return 1.0 - scipy.spatial.distance.cdist(X, Y, metric="cosine") 130 | if assume_binary: 131 | Xbits, Ybits, XYbits = _get_bitcount_arrays(X, Y, return_XYbits=True) 132 | with np.errstate(divide="ignore"): # handle 0 in denominator 133 | return np.asarray(np.nan_to_num(XYbits / np.sqrt(Xbits * Ybits.T))) 134 | else: 135 | return _sparse_cosine(X, Y) 136 | 137 | 138 | def pearson(X, Y=None): 139 | """Compute the Pearson correlation between `X` and `Y`. 140 | 141 | Parameters 142 | ---------- 143 | X : array_like or sparse matrix 144 | with shape (`n_fprints_X`, `n_bits`). 145 | Y : array_like or sparse matrix, optional 146 | with shape (`n_fprints_Y`, `n_bits`). 147 | 148 | Returns 149 | ------- 150 | pearson : array of shape (`n_fprints_X`, `n_fprints_Y`) 151 | 152 | 153 | See Also 154 | -------- 155 | soergel: Soergel similarity for non-binary data 156 | cosine, dice, tanimoto 157 | """ 158 | X, Y = _check_array_pair(X, Y) 159 | Xlen = X.shape[0] 160 | if issparse(X): 161 | X = vstack((X, Y), format="csr") 162 | X = X - X.mean(axis=1) 163 | cov = (X * X.T) / (X.shape[1] - 1.0) 164 | d = np.sqrt(np.diag(cov)) 165 | with np.errstate(divide="ignore"): # handle 0 in denominator 166 | pearson = cov / np.outer(d, d) 167 | else: 168 | with np.errstate(divide="ignore"): # handle 0 in denominator 169 | pearson = np.corrcoef(X, Y) 170 | return np.asarray(np.nan_to_num(pearson[:Xlen, Xlen:])) 171 | 172 | 173 | def _check_array(arr, dtype=float, force_sparse=False): 174 | if force_sparse or issparse(arr): 175 | return csr_matrix(arr, copy=False, dtype=dtype) 176 | else: 177 | return arr.astype(dtype, copy=False) 178 | 179 | 180 | def _check_array_pair(X, Y=None, dtype=float, force_sparse=False): 181 | if Y is not None and X.shape[1] != Y.shape[1]: 182 | raise ValueError("Arrays must have same width.") 183 | if force_sparse or issparse(X) or issparse(Y): 184 | force_sparse = True # ensure if one is sparse, all are sparse. 
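    # Coerce X (and Y, if given) to a shared dtype and sparse/dense format;
    # when Y is omitted, reusing X lets downstream helpers detect
    # self-comparison with a simple identity check (``Y is X``).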
185 | X = _check_array(X, dtype=dtype, force_sparse=force_sparse) 186 | if Y is None or Y is X: 187 | Y = X 188 | else: 189 | Y = _check_array(Y, dtype=dtype, force_sparse=force_sparse) 190 | return X, Y 191 | 192 | 193 | def _get_bitcount_arrays(X, Y, return_XYbits=False): 194 | if issparse(X): 195 | Xbits = np.sum(X, axis=1) 196 | if Y is X: 197 | Ybits = Xbits 198 | else: 199 | Ybits = np.sum(Y, axis=1) 200 | if return_XYbits: 201 | XYbits = (X * Y.T).toarray() 202 | return Xbits, Ybits, XYbits 203 | else: 204 | Xbits = np.sum(X, axis=1, keepdims=True) 205 | if Y is X: 206 | Ybits = Xbits 207 | else: 208 | Ybits = np.sum(Y, axis=1, keepdims=True) 209 | if return_XYbits: 210 | XYbits = np.dot(X, Y.T) 211 | return Xbits, Ybits, XYbits 212 | return Xbits, Ybits 213 | 214 | 215 | def _sparse_cosine(X, Y): 216 | Xnorm = scipy.sparse.linalg.norm(X, axis=1) 217 | if Y is X: 218 | Ynorm = Xnorm 219 | else: 220 | Ynorm = scipy.sparse.linalg.norm(Y, axis=1) 221 | XY = (X * Y.T).toarray() 222 | with np.errstate(divide="ignore"): # handle 0 in denominator 223 | return np.nan_to_num(XY / np.outer(Xnorm, Ynorm)) 224 | 225 | @maybe_jit(nopython=True, nogil=True, cache=True) 226 | def _dense_soergel(X, Y, S): 227 | for ix in range(S.shape[0]): 228 | for iy in range(S.shape[1]): 229 | sum_abs_diff = 0 230 | sum_max = 0 231 | for j in range(X.shape[1]): 232 | diff = X[ix, j] - Y[iy, j] 233 | if diff > 0: 234 | sum_abs_diff += diff 235 | sum_max += X[ix, j] 236 | else: 237 | sum_abs_diff -= diff 238 | sum_max += Y[iy, j] 239 | 240 | if sum_max == 0: 241 | S[ix, iy] = 0 242 | continue 243 | S[ix, iy] = 1 - sum_abs_diff / sum_max 244 | return S 245 | 246 | @maybe_jit(nopython=True, nogil=True, cache=True) 247 | def _sparse_soergel(Xdata, Xindices, Xindptr, Ydata, Yindices, Yindptr, S): 248 | for ix in range(S.shape[0]): 249 | if Xindptr[ix] == Xindptr[ix + 1]: 250 | for iy in range(S.shape[1]): # no X values in row 251 | S[ix, iy] = 0 252 | continue 253 | jxindmax = Xindptr[ix + 1] - 1 254 | for iy in range(S.shape[1]): 255 | if Yindptr[iy] == Yindptr[iy + 1]: # no Y values in row 256 | S[ix, iy] = 0 257 | continue 258 | 259 | sum_abs_diff = 0 260 | sum_max = 0 261 | # Implementation of the final step of merge sort 262 | jyindmax = Yindptr[iy + 1] - 1 263 | jx = Xindptr[ix] 264 | jy = Yindptr[iy] 265 | while jx <= jxindmax and jy <= jyindmax: 266 | jxind = Xindices[jx] 267 | jyind = Yindices[jy] 268 | if jxind < jyind: 269 | sum_max += Xdata[jx] 270 | sum_abs_diff += Xdata[jx] 271 | jx += 1 272 | elif jyind < jxind: 273 | sum_max += Ydata[jy] 274 | sum_abs_diff += Ydata[jy] 275 | jy += 1 276 | else: 277 | diff = Xdata[jx] - Ydata[jy] 278 | if diff > 0: 279 | sum_abs_diff += diff 280 | sum_max += Xdata[jx] 281 | else: 282 | sum_abs_diff -= diff 283 | sum_max += Ydata[jy] 284 | jx += 1 285 | jy += 1 286 | 287 | while jx <= jxindmax: 288 | sum_max += Xdata[jx] 289 | sum_abs_diff += Xdata[jx] 290 | jx += 1 291 | 292 | while jy <= jyindmax: 293 | sum_max += Ydata[jy] 294 | sum_abs_diff += Ydata[jy] 295 | jy += 1 296 | 297 | if sum_max == 0: 298 | S[ix, iy] = 0 299 | continue 300 | S[ix, iy] = 1 - sum_abs_diff / sum_max 301 | return S 302 | -------------------------------------------------------------------------------- /src/e3fp/fingerprint/array_ops.py: -------------------------------------------------------------------------------- 1 | """Various array operations. 
2 | 3 | Author: Seth Axen 4 | E-mail: seth.axen@gmail.com 5 | """ 6 | import numpy as np 7 | from scipy.spatial.distance import pdist, squareform 8 | 9 | QUATERNION_DTYPE = float 10 | X_AXIS, Y_AXIS, Z_AXIS = np.identity(3, dtype=float) 11 | EPS = 1e-12 # epsilon, a number close to 0 12 | 13 | 14 | # Vector Algebra Methods 15 | def as_unit(v, axis=1): 16 | """Return array of unit vectors parallel to vectors in `v`. 17 | 18 | Parameters 19 | ---------- 20 | v : ndarray of float 21 | axis : int, optional 22 | Axis along which to normalize length. 23 | 24 | Returns 25 | ------- 26 | ndarray of float : Unit vector of `v`, i.e. `v` divided by its 27 | magnitude along `axis`. 28 | """ 29 | u = np.array(v, dtype=float, copy=True) 30 | if u.ndim == 1: 31 | sqmag = u.dot(u) 32 | if sqmag >= EPS: 33 | u /= sqmag ** 0.5 34 | else: 35 | if axis == 1: 36 | sqmag = np.einsum("...ij,...ij->...i", u, u) 37 | else: 38 | sqmag = np.einsum("...ij,...ij->...j", u, u) 39 | 40 | sqmag[sqmag < EPS] = 1.0 41 | u /= np.expand_dims(np.sqrt(sqmag), axis) 42 | return u 43 | 44 | 45 | def make_distance_matrix(coords): 46 | """Build pairwise distance matrix from coordinates. 47 | 48 | Parameters 49 | ---------- 50 | coords : ndarray of float 51 | an Mx3 array of cartesian coordinates. 52 | 53 | Returns 54 | ------- 55 | ndarray of float : square symmetrical distance matrix 56 | """ 57 | return squareform(pdist(coords)) 58 | 59 | 60 | def make_transform_matrix(center, y=None, z=None): 61 | """Make 4x4 homogenous transformation matrix. 62 | 63 | Given Nx4 array A where A[:, 4] = 1., the transform matrix M should be 64 | used with dot(M, A.T).T. Order of operations is 1. translation, 2. align 65 | `y` x `z` plane to yz-plane 3. align `y` to y-axis. 66 | 67 | Parameters 68 | ---------- 69 | center : 1x3 array of float 70 | Coordinate that should be centered after transformation. 71 | y : None or 1x3 array of float 72 | Vector that should lie on the y-axis after transformation 73 | z : None or 1x3 array of float 74 | Vector that after transformation should lie on yz-plane in direction 75 | of z-axis. 76 | 77 | Returns 78 | ------- 79 | 4x4 array of float 80 | 4x4 homogenous transformation matrix. 81 | """ 82 | translate = np.identity(4, dtype=float) 83 | translate[:3, 3] = -np.asarray(center, dtype=float) 84 | if y is not None: 85 | y = np.atleast_2d(y) 86 | if z is None: 87 | rotate = np.identity(4, dtype=float) 88 | rotate[:3, :3] = make_rotation_matrix(y, Y_AXIS) 89 | else: 90 | z = np.atleast_2d(z) 91 | rotate_norm = np.identity(4, dtype=float) 92 | x_unit = as_unit(np.cross(y, z)) 93 | rotate_norm[:3, :3] = make_rotation_matrix(x_unit, X_AXIS) 94 | new_y = np.dot(rotate_norm[:3, :3], y.flatten()) 95 | rotate_y = np.identity(4, dtype=float) 96 | rotate_y[:3, :3] = make_rotation_matrix(new_y.flatten(), Y_AXIS) 97 | rotate = np.dot(rotate_y, rotate_norm) 98 | transform = np.dot(rotate, translate) 99 | else: 100 | transform = translate 101 | return transform 102 | 103 | 104 | def make_rotation_matrix(v0, v1): 105 | """Create 3x3 matrix of rotation from `v0` onto `v1`. 106 | 107 | Should be used by dot(R, v0.T).T. 108 | 109 | Parameters 110 | ---------- 111 | v0 : 1x3 array of float 112 | Initial vector before alignment. 113 | v1 : 1x3 array of float 114 | Vector to which to align `v0`. 
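
    Returns
    -------
    3x3 array of float
        Rotation matrix `R` such that ``np.dot(R, v0.T).T`` is parallel to `v1`.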
115 | """ 116 | v0 = as_unit(v0) 117 | v1 = as_unit(v1) 118 | u = np.cross(v0.ravel(), v1.ravel()) 119 | if np.all(u == 0.0): 120 | return np.identity(3, dtype=float) 121 | sin_ang = u.dot(u) ** 0.5 122 | u /= sin_ang 123 | cos_ang = np.dot(v0, v1.T) 124 | # fmt: off 125 | ux = np.array([[ 0., -u[2], u[1]], 126 | [ u[2], 0., -u[0]], 127 | [-u[1], u[0], 0.]], dtype=float) 128 | # fmt: on 129 | rot = ( 130 | cos_ang * np.identity(3, dtype=float) 131 | + sin_ang * ux 132 | + (1 - cos_ang) * np.outer(u, u) 133 | ) 134 | return rot 135 | 136 | 137 | def transform_array(transform_matrix, a): 138 | """Pad an array with 1s, transform, and return with original dimensions. 139 | 140 | Parameters 141 | ---------- 142 | transform_matrix : 4x4 array of float 143 | 4x4 homogenous transformation matrix 144 | a : Nx3 array of float 145 | Array of 3-D coordinates. 146 | 147 | Returns 148 | ------- 149 | Nx3 array of float : Transformed array 150 | """ 151 | return unpad_array(np.dot(transform_matrix, pad_array(a).T).T) 152 | 153 | 154 | def pad_array(a, n=1.0, axis=1): 155 | """Return `a` with row of `n` appended to `axis`. 156 | 157 | Parameters 158 | ---------- 159 | a : ndarray 160 | Array to pad 161 | n : float or int, optional 162 | Value to pad `a` with 163 | axis : int, optional 164 | Axis of `a` to pad with `n`. 165 | 166 | Returns 167 | ------- 168 | ndarray 169 | Padded array. 170 | """ 171 | if a.ndim == 1: 172 | pad = np.ones(a.shape[0] + 1, dtype=a.dtype) * n 173 | pad[: a.shape[0]] = a 174 | else: 175 | shape = list(a.shape) 176 | shape[axis] += 1 177 | pad = np.ones(shape, dtype=a.dtype) 178 | pad[: a.shape[0], : a.shape[1]] = a 179 | return pad 180 | 181 | 182 | def unpad_array(a, axis=1): 183 | """Return `a` with row removed along `axis`. 184 | 185 | Parameters 186 | ---------- 187 | a : ndarray 188 | Array from which to remove row 189 | axis : int, optional 190 | Axis from which to remove row 191 | 192 | Returns 193 | ------- 194 | ndarray 195 | Unpadded array. 196 | """ 197 | if a.ndim == 1: 198 | return a[:-1] 199 | else: 200 | shape = list(a.shape) 201 | shape[axis] -= 1 202 | return a[: shape[0], : shape[1]] 203 | 204 | 205 | def project_to_plane(vec_arr, norm): 206 | """Project array of vectors to plane with normal `norm`. 207 | 208 | Parameters 209 | ---------- 210 | vec_arr : Nx3 array 211 | Array of N 3D vectors. 212 | norm : 1x3 array 213 | Normal vector to plane. 214 | 215 | Returns 216 | ------- 217 | Nx3 array 218 | Array of vectors projected onto plane. 219 | """ 220 | unit_norm = as_unit(norm).flatten() 221 | mag_on_norm = np.dot(vec_arr, unit_norm) 222 | if vec_arr.ndim == 1: 223 | vec_on_norm = np.array(unit_norm, copy=True) 224 | vec_on_norm *= mag_on_norm 225 | else: 226 | vec_on_norm = np.tile(unit_norm, (vec_arr.shape[0], 1)) 227 | vec_on_norm *= mag_on_norm[:, None] 228 | return vec_arr - vec_on_norm 229 | 230 | 231 | def calculate_angles(vec_arr, ref, ref_norm=None): 232 | """Calculate angles between vectors in `vec_arr` and `ref` vector. 233 | 234 | If `ref_norm` is not provided, angle ranges between 0 and pi. If it is 235 | provided, angle ranges between 0 and 2pi. Note that if `ref_norm` is 236 | orthogonal to `vec_arr` and `ref`, then the angle is rotation around the 237 | axis, but if a non-orthogonal axis is provided, this may not be the case. 238 | 239 | Parameters 240 | ---------- 241 | vec_arr : Nx3 array of float 242 | Array of N 3D vectors. 243 | ref : 1x3 array of float 244 | Reference vector 245 | ref_norm : 1x3 array of float 246 | Normal vector. 
247 | 248 | Returns 249 | ------- 250 | 1-D array 251 | Array of N angles 252 | """ 253 | unit_vec_arr = as_unit(vec_arr) 254 | unit_ref = as_unit(ref).flatten() 255 | ang = np.arccos(np.clip(np.dot(unit_vec_arr, unit_ref), -1.0, 1.0)) 256 | # handle cases where a vector is the origin 257 | ang[np.all(unit_vec_arr == np.zeros(3), axis=1)] = 0.0 258 | if ref_norm is not None: 259 | sign = np.sign( 260 | np.dot(ref_norm, np.cross(unit_vec_arr, unit_ref).T) 261 | ).flatten() 262 | sign[sign == 0] = 1 263 | ang = rotate_angles(sign * ang, 2 * np.pi) 264 | return ang 265 | 266 | 267 | def rotate_angles(angles, amount): 268 | """Rotate angles by `amount`, keeping in 0 to 2pi range. 269 | 270 | Parameters 271 | ---------- 272 | angles : 1-D array of float 273 | Angles in radians 274 | amount : float 275 | Amount to rotate angles by 276 | 277 | Returns 278 | ------- 279 | 1-D array of float : Rotated angles 280 | """ 281 | return (angles + amount) % (2 * np.pi) 282 | 283 | 284 | def quaternion_to_transform_matrix(quaternion, translation=np.zeros(3)): 285 | """Convert quaternion to homogenous 4x4 transform matrix. 286 | 287 | Parameters 288 | ---------- 289 | quaternion : 4x1 array of float 290 | Quaternion describing rotation after translation. 291 | translation : 3x1 array of float, optional 292 | Translation to be performed before rotation. 293 | """ 294 | q = np.array(quaternion, dtype=float, copy=True) 295 | n = np.linalg.norm(q) 296 | if n < 1e-12: 297 | return np.identity(4, dtype=float) 298 | q /= n 299 | q = 2 * np.outer(q, q) 300 | # fmt: off 301 | transform_mat = np.array( 302 | [[1.-q[2, 2]-q[3, 3], q[1, 2]-q[3, 0], q[1, 3]+q[2, 0], 0.], 303 | [ q[1, 2]+q[3, 0], 1.-q[1, 1]-q[3, 3], q[2, 3]-q[1, 0], 0.], 304 | [ q[1, 3]-q[2, 0], q[2, 3]+q[1, 0], 1.-q[1, 1]-q[2, 2], 0.], 305 | [ 0., 0., 0., 1.]], 306 | dtype=float 307 | ) 308 | # fmt: on 309 | transform_mat[:3, 3] = translation 310 | return transform_mat 311 | 312 | 313 | def transform_matrix_to_quaternion(transform_matrix, dtype=QUATERNION_DTYPE): 314 | """Convert homogenous 4x4 transform matrix to quaternion. 315 | 316 | Parameters 317 | ---------- 318 | transform_matrix : 4x4 array of float 319 | Homogenous transformation matrix. 320 | dtype : numpy dtype, optional 321 | Datatype for returned quaternion. 322 | """ 323 | T = np.array(transform_matrix, dtype=float) 324 | R = T[:3, :3] 325 | q = np.zeros(4, dtype=dtype) 326 | q[0] = np.sqrt(1.0 + R.trace()) / 2.0 327 | q[1] = R[2, 1] - R[1, 2] 328 | q[2] = R[0, 2] - R[2, 0] 329 | q[3] = R[1, 0] - R[0, 1] 330 | q[1:4] /= 4.0 * q[0] 331 | return q 332 | -------------------------------------------------------------------------------- /tests/test_struct.py: -------------------------------------------------------------------------------- 1 | """Tests for Shell and Substruct objects. 
2 | 3 | Author: Seth Axen 4 | E-mail: seth.axen@gmail.com 5 | """ 6 | import os 7 | import pytest 8 | 9 | DATA_DIR = os.path.join(os.path.dirname(__file__), "data") 10 | PLANAR_SDF_FILE = os.path.join(DATA_DIR, "caffeine_planar.sdf.bz2") 11 | 12 | 13 | class TestShellCreation: 14 | def test_error_when_center_not_atom(self): 15 | from e3fp.fingerprint.structs import Shell 16 | 17 | with pytest.raises(TypeError): 18 | Shell(None) 19 | 20 | def test_error_when_shells_has_non_shell(self): 21 | from e3fp.fingerprint.structs import Shell 22 | 23 | atom = 0 24 | shells = [None] 25 | with pytest.raises(TypeError): 26 | Shell(atom, shells) 27 | 28 | def test_creation_with_atoms_or_ids_equivalent(self): 29 | from e3fp.fingerprint.structs import Shell 30 | from e3fp.conformer.util import mol_from_sdf 31 | 32 | mol = mol_from_sdf(PLANAR_SDF_FILE) 33 | atoms = list(mol.GetAtoms()) 34 | atom_ids = [x.GetIdx() for x in atoms] 35 | assert Shell(atoms[0], atoms[1:]) == Shell(atom_ids[0], atom_ids[1:]) 36 | 37 | def test_create_shell_no_shell(self): 38 | from e3fp.fingerprint.structs import Shell 39 | from e3fp.conformer.util import mol_from_sdf 40 | 41 | mol = mol_from_sdf(PLANAR_SDF_FILE) 42 | atoms = list(mol.GetAtoms()) 43 | center_atom = atoms[0] 44 | Shell(center_atom) 45 | 46 | def test_create_shell_with_same_center_fails(self): 47 | from e3fp.fingerprint.structs import Shell, FormatError 48 | from e3fp.conformer.util import mol_from_sdf 49 | 50 | mol = mol_from_sdf(PLANAR_SDF_FILE) 51 | atoms = list(mol.GetAtoms()) 52 | center_atom = atoms[0] 53 | with pytest.raises(FormatError): 54 | Shell(center_atom, atoms) 55 | 56 | def test_atoms_converted_to_shells(self): 57 | from e3fp.fingerprint.structs import Shell 58 | from e3fp.conformer.util import mol_from_sdf 59 | 60 | mol = mol_from_sdf(PLANAR_SDF_FILE) 61 | atoms = list(mol.GetAtoms()) 62 | center_atom = atoms[0] 63 | shell = Shell(center_atom, atoms[1:]) 64 | for s in shell.shells: 65 | assert isinstance(s, Shell) 66 | 67 | def test_creation_with_atoms_or_shells_equal(self): 68 | from e3fp.fingerprint.structs import Shell 69 | from e3fp.conformer.util import mol_from_sdf 70 | 71 | mol = mol_from_sdf(PLANAR_SDF_FILE) 72 | atoms = list(mol.GetAtoms()) 73 | shells = list(map(Shell, atoms)) 74 | center_atom = atoms[0] 75 | shell1 = Shell(center_atom, atoms[1:]) 76 | shell2 = Shell(center_atom, shells[1:]) 77 | assert shell1 == shell2 78 | 79 | def test_recursive_atom_shells_correct(self): 80 | from e3fp.fingerprint.structs import Shell 81 | from e3fp.conformer.util import mol_from_sdf 82 | 83 | mol = mol_from_sdf(PLANAR_SDF_FILE) 84 | atoms = list(mol.GetAtoms()) 85 | shell1 = Shell(atoms[5], atoms[6:8]) 86 | shell2 = Shell(atoms[2], atoms[3:5]) 87 | shell = Shell(atoms[0], (shell1, shell2)) 88 | assert shell.atoms == {x.GetIdx() for x in (atoms[0], atoms[2], atoms[5])} 89 | 90 | 91 | class TestShellComparison: 92 | def test_shells_same_center_same_atoms_equal(self): 93 | from e3fp.fingerprint.structs import Shell 94 | from e3fp.conformer.util import mol_from_sdf 95 | 96 | mol = mol_from_sdf(PLANAR_SDF_FILE) 97 | atoms = list(mol.GetAtoms()) 98 | center_atom = atoms[0] 99 | shell1 = Shell(center_atom, atoms[1:]) 100 | shell2 = Shell(center_atom, atoms[1:]) 101 | assert shell1 == shell2 102 | 103 | def test_shells_diff_center_same_atoms_nonequal(self): 104 | from e3fp.fingerprint.structs import Shell 105 | from e3fp.conformer.util import mol_from_sdf 106 | 107 | mol = mol_from_sdf(PLANAR_SDF_FILE) 108 | atoms = list(mol.GetAtoms()) 109 | shell1 = 
Shell(atoms[0], atoms[2:]) 110 | shell2 = Shell(atoms[1], atoms[2:]) 111 | assert shell1 != shell2 112 | 113 | def test_shells_same_center_diff_atoms_nonequal(self): 114 | from e3fp.fingerprint.structs import Shell 115 | from e3fp.conformer.util import mol_from_sdf 116 | 117 | mol = mol_from_sdf(PLANAR_SDF_FILE) 118 | atoms = list(mol.GetAtoms()) 119 | center_atom = atoms[0] 120 | shell1 = Shell(center_atom, atoms[1:]) 121 | shell2 = Shell(center_atom, atoms[2:]) 122 | assert shell1 != shell2 123 | 124 | def test_equal_shells_hash_to_same_value(self): 125 | from e3fp.fingerprint.structs import Shell 126 | from e3fp.conformer.util import mol_from_sdf 127 | 128 | mol = mol_from_sdf(PLANAR_SDF_FILE) 129 | atoms = list(mol.GetAtoms()) 130 | center_atom = atoms[0] 131 | shell1 = Shell(center_atom, atoms[1:]) 132 | shell2 = Shell(center_atom, atoms[1:]) 133 | assert hash(shell1) == hash(shell2) 134 | 135 | def test_same_shell_hashes_to_same_value(self): 136 | from e3fp.fingerprint.structs import Shell 137 | from e3fp.conformer.util import mol_from_sdf 138 | 139 | mol = mol_from_sdf(PLANAR_SDF_FILE) 140 | atoms = list(mol.GetAtoms()) 141 | center_atom = atoms[0] 142 | shell = Shell(center_atom, atoms[1:]) 143 | assert hash(shell) == hash(shell) 144 | 145 | 146 | class TestShellSubstructInterface: 147 | def test_recursive_shell_substruct_correct1(self): 148 | from e3fp.fingerprint.structs import Shell 149 | from e3fp.conformer.util import mol_from_sdf 150 | 151 | mol = mol_from_sdf(PLANAR_SDF_FILE) 152 | atoms = list(mol.GetAtoms()) 153 | shell1 = Shell(atoms[5], atoms[6:8]) 154 | shell2 = Shell(atoms[1], atoms[2:5]) 155 | shell = Shell(atoms[0], (shell1, shell2)) 156 | assert shell.substruct.atoms == {x.GetIdx() for x in atoms[:8]} 157 | 158 | def test_recursive_shell_substruct_correct2(self): 159 | from e3fp.fingerprint.structs import Shell 160 | from e3fp.conformer.util import mol_from_sdf 161 | 162 | mol = mol_from_sdf(PLANAR_SDF_FILE) 163 | atoms = list(mol.GetAtoms()) 164 | shell1 = Shell(atoms[1], atoms[2:5]) 165 | shell2 = Shell(atoms[5], {shell1}) 166 | shell3 = Shell(atoms[6], atoms[7:10]) 167 | shell4 = Shell(atoms[10], {shell3}) 168 | shell = Shell(atoms[0], (shell2, shell4)) 169 | assert shell.substruct.atoms == {x.GetIdx() for x in atoms[:11]} 170 | 171 | def test_shell_creation_from_substruct_without_center_fails(self): 172 | from e3fp.fingerprint.structs import Shell, Substruct, FormatError 173 | from e3fp.conformer.util import mol_from_sdf 174 | 175 | mol = mol_from_sdf(PLANAR_SDF_FILE) 176 | atoms = list(mol.GetAtoms()) 177 | substruct = Substruct(None, atoms[:2]) 178 | with pytest.raises(FormatError): 179 | Shell.from_substruct(substruct) 180 | 181 | def test_shell_creation_from_substruct(self): 182 | from e3fp.fingerprint.structs import Shell, Substruct 183 | from e3fp.conformer.util import mol_from_sdf 184 | 185 | mol = mol_from_sdf(PLANAR_SDF_FILE) 186 | atoms = list(mol.GetAtoms()) 187 | substruct = Substruct(atoms[0], atoms[:2]) 188 | shell = Shell.from_substruct(substruct) 189 | assert shell.atoms == substruct.atoms 190 | 191 | def test_substruct_creation_from_shell(self): 192 | from e3fp.fingerprint.structs import Shell, Substruct 193 | from e3fp.conformer.util import mol_from_sdf 194 | 195 | mol = mol_from_sdf(PLANAR_SDF_FILE) 196 | atoms = list(mol.GetAtoms()) 197 | shell = Shell(atoms[0], atoms[1:]) 198 | substruct = Substruct.from_shell(shell) 199 | assert shell.substruct == substruct 200 | 201 | 202 | class TestSubstructCreation: 203 | def 
test_error_when_center_not_atom(self): 204 | from e3fp.fingerprint.structs import Substruct 205 | 206 | with pytest.raises(TypeError): 207 | Substruct("foo") 208 | 209 | def test_error_when_atoms_has_non_atom(self): 210 | from e3fp.fingerprint.structs import Substruct 211 | 212 | atoms = [None] 213 | with pytest.raises(TypeError): 214 | Substruct(atoms=atoms) 215 | 216 | def test_center_atom_auto_added_to_atoms(self): 217 | from e3fp.fingerprint.structs import Substruct 218 | from e3fp.conformer.util import mol_from_sdf 219 | 220 | mol = mol_from_sdf(PLANAR_SDF_FILE) 221 | atoms = list(mol.GetAtoms()) 222 | center_atom = atoms[0] 223 | substruct = Substruct(center_atom, atoms[1:]) 224 | assert center_atom.GetIdx() in substruct.atoms 225 | 226 | 227 | class TestSubstructCreationComparison: 228 | def test_substructs_same_center_same_atoms_equal(self): 229 | from e3fp.fingerprint.structs import Substruct 230 | from e3fp.conformer.util import mol_from_sdf 231 | 232 | mol = mol_from_sdf(PLANAR_SDF_FILE) 233 | atoms = list(mol.GetAtoms()) 234 | center_atom = atoms[0] 235 | substruct1 = Substruct(center_atom, atoms) 236 | substruct2 = Substruct(center_atom, atoms) 237 | assert substruct1 == substruct2 238 | 239 | def test_substructs_diff_center_same_atoms_equal(self): 240 | from e3fp.fingerprint.structs import Substruct 241 | from e3fp.conformer.util import mol_from_sdf 242 | 243 | mol = mol_from_sdf(PLANAR_SDF_FILE) 244 | atoms = list(mol.GetAtoms()) 245 | substruct1 = Substruct(atoms[0], atoms) 246 | substruct2 = Substruct(atoms[1], atoms) 247 | assert substruct1 == substruct2 248 | 249 | def test_substructs_same_center_diff_atoms_nonequal(self): 250 | from e3fp.fingerprint.structs import Substruct 251 | from e3fp.conformer.util import mol_from_sdf 252 | 253 | mol = mol_from_sdf(PLANAR_SDF_FILE) 254 | atoms = list(mol.GetAtoms()) 255 | substruct1 = Substruct(atoms[0], atoms[1:]) 256 | substruct2 = Substruct(atoms[0], atoms[2:]) 257 | assert substruct1 != substruct2 258 | 259 | def test_equal_shells_hash_to_same_value(self): 260 | from e3fp.fingerprint.structs import Substruct 261 | from e3fp.conformer.util import mol_from_sdf 262 | 263 | mol = mol_from_sdf(PLANAR_SDF_FILE) 264 | atoms = list(mol.GetAtoms()) 265 | center_atom = atoms[0] 266 | substruct1 = Substruct(center_atom, atoms[1:]) 267 | substruct2 = Substruct(center_atom, atoms[1:]) 268 | assert hash(substruct1) == hash(substruct2) 269 | 270 | def test_same_shells_hash_to_same_value(self): 271 | from e3fp.fingerprint.structs import Substruct 272 | from e3fp.conformer.util import mol_from_sdf 273 | 274 | mol = mol_from_sdf(PLANAR_SDF_FILE) 275 | atoms = list(mol.GetAtoms()) 276 | center_atom = atoms[0] 277 | substruct = Substruct(center_atom, atoms[1:]) 278 | assert hash(substruct) == hash(substruct) 279 | -------------------------------------------------------------------------------- /src/e3fp/fingerprint/structs.py: -------------------------------------------------------------------------------- 1 | """Class for defining 3D atom environments. 
2 | 3 | Author: Seth Axen 4 | E-mail: seth.axen@gmail.com 5 | """ 6 | from __future__ import division, print_function 7 | from functools import reduce 8 | 9 | import numpy as np 10 | import rdkit.Chem 11 | 12 | import smart_open 13 | from e3fp.fingerprint import array_ops 14 | 15 | 16 | PDB_LINE = ( 17 | "HETATM{atom_id:>5d} {name:<4s} LIG A 1 " 18 | "{coord[0]:>8.3f}{coord[1]:>8.3f}{coord[2]:>8.3f}" 19 | "{occupancy:>6.2f}{temp:>6.2f} {elem:>2s}{charge:>2s}" 20 | ) 21 | 22 | 23 | class Shell(object): 24 | """A container for other Shells centered on an atom. 25 | 26 | Shells represent all atoms explicitly within a container. Atoms are 27 | represented by their ids. If atoms are provided instead of shells, they 28 | are converted to single-atom shells. A Substruct is generated from a Shell 29 | on the fly by recursion through member shells. An optional identifier may 30 | be set. 31 | """ 32 | 33 | def __init__( 34 | self, 35 | center_atom, 36 | shells=set(), 37 | radius=None, 38 | last_shell=None, 39 | identifier=None, 40 | ): 41 | if isinstance(center_atom, rdkit.Chem.Atom): 42 | center_atom = center_atom.GetIdx() 43 | elif not isinstance(center_atom, (int, np.integer)): 44 | raise TypeError("center_atom must be Atom or atom id") 45 | self._center_atom = center_atom 46 | 47 | self._shells = set() 48 | for shell in shells: 49 | if isinstance(shell, int): 50 | shell = Shell(shell) 51 | elif isinstance(shell, rdkit.Chem.Atom): 52 | shell = Shell(shell.GetIdx()) 53 | elif not isinstance(shell, Shell): 54 | raise TypeError("shells must be Shells, Atoms, or atom ids") 55 | if shell.center_atom == self.center_atom: 56 | raise FormatError( 57 | "member shells cannot be centered on same " 58 | "center_atom as new shell" 59 | ) 60 | self._shells.add(shell) 61 | self._shells = frozenset(self._shells) 62 | 63 | self.radius = radius 64 | self.last_shell = last_shell 65 | self.atoms = None 66 | self.substruct = None 67 | self.identifier = identifier 68 | self.is_duplicate = False 69 | self.duplicate = None 70 | 71 | @classmethod 72 | def from_substruct(cls, substruct): 73 | """Create shell with one shell for each atom in the substruct.""" 74 | if substruct.center_atom is None: 75 | raise FormatError( 76 | "Can only create Shell from Substruct if " 77 | "center_atom is defined" 78 | ) 79 | atoms = substruct.atoms ^ {substruct.center_atom} 80 | return cls(substruct.center_atom, [Shell(x) for x in atoms]) 81 | 82 | @property 83 | def center_atom(self): 84 | return self._center_atom 85 | 86 | @property 87 | def shells(self): 88 | return self._shells 89 | 90 | @property 91 | def atoms(self): 92 | """Get all atoms explicitly within the shell.""" 93 | if self._atoms is None: 94 | self._atoms = set([self.center_atom,]) 95 | self._atoms.update([x.center_atom for x in self.shells]) 96 | return self._atoms 97 | 98 | @atoms.setter 99 | def atoms(self, atoms): 100 | self._atoms = atoms 101 | 102 | @property 103 | def substruct(self): 104 | """Get substruct with all atoms implicitly within the shell.""" 105 | if self._substruct is None: 106 | atom_sets = [set(x.substruct.atoms) for x in self.shells] 107 | if len(atom_sets) > 0: 108 | atoms = reduce(set.union, atom_sets) 109 | else: 110 | atoms = set() 111 | self._substruct = Substruct( 112 | center_atom=self.center_atom, atoms=atoms 113 | ) 114 | self._substruct.shell = self 115 | return self._substruct 116 | 117 | @substruct.setter 118 | def substruct(self, substruct): 119 | if not isinstance(substruct, Substruct) and substruct is not None: 120 | raise 
TypeError("substruct must be of type Substruct") 121 | self._substruct = substruct 122 | 123 | def __repr__(self): 124 | return ( 125 | "Shell(center_atom={!r}, shells={!r}, radius={!r}, " 126 | "last_shell={!r}, identifier={!r})" 127 | ).format( 128 | self.center_atom, 129 | tuple(self.shells), 130 | self.radius, 131 | self.last_shell, 132 | self.identifier, 133 | ) 134 | 135 | def __str__(self): 136 | return ( 137 | "Shell(center_atom={!r}, atoms={!r}, radius={!r}, " 138 | "identifier={!r})" 139 | ).format( 140 | self.center_atom, tuple(self.atoms), self.radius, self.identifier 141 | ) 142 | 143 | def __hash__(self): 144 | return hash((self.center_atom, self.shells)) 145 | 146 | def __eq__(self, other): 147 | return (self.center_atom == other.center_atom) and ( 148 | self.shells == other.shells 149 | ) 150 | 151 | def __ne__(self, other): 152 | return not self.__eq__(other) 153 | 154 | def __len__(self): 155 | return 1 + len(self.shells) 156 | 157 | def __contains__(self, key): 158 | if isinstance(key, (int, rdkit.Chem.Atom)): 159 | key = Shell(key) 160 | return key in self.shells or key == self 161 | 162 | 163 | class Substruct(object): 164 | """A container for atoms optionally centered on an atom. 165 | 166 | A Substruct represents all atoms implicitly within a Shell. Two Substructs 167 | are equal if they contain the same atoms. 168 | """ 169 | 170 | def __init__(self, center_atom=None, atoms=set()): 171 | self.center_atom = center_atom 172 | self.shell = None 173 | self._atoms = set() 174 | for atom in atoms: 175 | if isinstance(atom, rdkit.Chem.Atom): 176 | atom = atom.GetIdx() 177 | elif not isinstance(atom, (int, np.integer)): 178 | raise TypeError("atoms must be Atom or atom id") 179 | self._atoms.add(atom) 180 | if self.center_atom is not None: 181 | self._atoms.add(self.center_atom) 182 | self._atoms = frozenset(self._atoms) 183 | self.transform_matrix = np.identity(4, dtype=float) 184 | 185 | @classmethod 186 | def from_shell(cls, shell): 187 | return shell.substruct 188 | 189 | @property 190 | def center_atom(self): 191 | return self._center_atom 192 | 193 | @center_atom.setter 194 | def center_atom(self, center_atom): 195 | if isinstance(center_atom, rdkit.Chem.Atom): 196 | center_atom = center_atom.GetIdx() 197 | elif ( 198 | not isinstance(center_atom, (int, np.integer)) 199 | and center_atom is not None 200 | ): 201 | raise TypeError("center_atom must be Atom or atom id") 202 | self._center_atom = center_atom 203 | 204 | @property 205 | def atoms(self): 206 | return self._atoms 207 | 208 | def __repr__(self): 209 | return "Substruct(center_atom={!r}, atoms={!r})".format( 210 | self.center_atom, tuple(self.atoms) 211 | ) 212 | 213 | def __str__(self): 214 | return self.__repr__() 215 | 216 | def __hash__(self): 217 | return hash(self.atoms) 218 | 219 | def __eq__(self, other): 220 | return self.atoms == other.atoms 221 | 222 | def __ne__(self, other): 223 | return not self.__eq__(other) 224 | 225 | def __len__(self): 226 | return len(self.atoms) 227 | 228 | def __contains__(self, key): 229 | if isinstance(key, rdkit.Chem.Atom): 230 | key = key.GetIdx() 231 | return key in self.atoms 232 | 233 | 234 | class FormatError(Exception): 235 | pass 236 | 237 | 238 | # methods/classes for shell i/o 239 | def shell_to_pdb( 240 | mol, shell, atom_coords, bound_atoms_dict, out_file=None, reorient=True 241 | ): 242 | """Append substructure within shell to PDB. 
243 | 244 | Parameters 245 | ---------- 246 | mol : RDKit Mol 247 | Input mol 248 | shell : Shell 249 | A shell 250 | atom_coords : dict 251 | Dict matching atom id to coordinates. 252 | bound_atoms_dict : dict 253 | Dict matching atom id to id of bound atoms. 254 | out_file : str or None, optional 255 | File to which to append coordinates. 256 | reorient : bool, optional 257 | Use the transformation matrix in the shell to align by the stereo 258 | quadrants. If no transformation matrix present, centers the center 259 | atom. 260 | 261 | Returns 262 | ------- 263 | list of str: list of PDB file lines, if `out_file` not specified 264 | """ 265 | remark = "REMARK 400" 266 | name = mol.GetProp("_Name") if mol.HasProp("_Name") else "molecule" 267 | header_lines = [remark + " COMPOUND", remark + " " + name] 268 | lines = header_lines + [ 269 | "MODEL", 270 | ] 271 | atom_ids = sorted(shell.substruct.atoms) 272 | atoms = [mol.GetAtomWithIdx(int(x)) for x in atom_ids] 273 | coords = np.asarray(list(map(atom_coords.get, atom_ids)), dtype=float) 274 | if reorient: 275 | try: 276 | coords = array_ops.transform_array(shell.transform_matrix, coords) 277 | except AttributeError: 278 | coords -= atom_coords[shell.center_atom] 279 | 280 | for i, atom_id in enumerate(atom_ids): 281 | elem = atoms[i].GetSymbol() 282 | name = "{}{:d}".format(elem, atom_id + 1) 283 | charge = atoms[i].GetFormalCharge() 284 | if charge > 0: 285 | charge = "{:d}+".format(charge) 286 | elif charge < 0: 287 | charge = "{:d}-".format(abs(charge)) 288 | else: 289 | charge = "" 290 | if atom_id == shell.center_atom: 291 | temp = 1.0 292 | elif atom_id in shell.atoms: 293 | temp = 0.5 294 | else: 295 | temp = 0.0 296 | pdb_entries = { 297 | "atom_id": atom_id, 298 | "name": name, 299 | "coord": coords[i, :].flatten(), 300 | "occupancy": 0.0, 301 | "temp": temp, 302 | "elem": elem, 303 | "charge": charge, 304 | } 305 | lines.append(PDB_LINE.format(**pdb_entries)) 306 | 307 | # PLACEHOLDER FOR WRITING BONDS TO PDB 308 | # used_bonds = set() 309 | # write_bonds = [] 310 | # for atom_id in atom_ids: 311 | # write_bonds.append(atom_id) 312 | # bound_atom_ids = bound_atoms_dict.get(atom_id, set()) 313 | # for bound_atom_id in bound_atom_ids: 314 | # if (atom_id, bound_atom_id) in used_bonds: 315 | # continue 316 | # if len(write_bonds) > 3: 317 | # lines.append("CONECT "+" ".join(map(str, write_bonds))) 318 | # write_bonds = [atom_id,] 319 | # write_bonds.append(bound_atom_id) 320 | # used_bonds.add((atom_id, bound_atom_id)) 321 | # used_bonds.add((bound_atom_id, atom_id)) 322 | 323 | # lines.append("CONECT "+" ".join(map(str, write_bonds))) 324 | # write_bonds = [] 325 | 326 | lines.append("ENDMDL") 327 | 328 | if out_file is not None: 329 | with smart_open.open(out_file, "a") as f: 330 | for line in lines: 331 | f.write(line + "\n") 332 | else: 333 | return lines 334 | -------------------------------------------------------------------------------- /src/e3fp/conformer/util.py: -------------------------------------------------------------------------------- 1 | """Utilities for handling SMILES strings and RDKit mols and conformers. 
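Provides helpers for parsing SMILES files, composing and parsing conformer
names, and reading/writing multi-conformer SDF files.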
2 | 3 | Author: Seth Axen 4 | E-mail: seth.axen@gmail.com 5 | """ 6 | import os 7 | import re 8 | import copy 9 | import logging 10 | from collections import namedtuple 11 | 12 | import rdkit 13 | import rdkit.Chem 14 | import rdkit.Chem.PropertyMol 15 | from rdkit.Chem.PropertyMol import PropertyMol 16 | from python_utilities.io_tools import touch_dir 17 | import smart_open 18 | 19 | PROTO_NAME_DELIM = "-" 20 | CONF_NAME_DELIM = "_" 21 | MOL_ITEM_REGEX = re.compile( 22 | r"(?P<{0}>.+?)(?:{1}(?P<{2}>\d+))?(?:{3}(?P<{4}>\d+))?$".format( 23 | "mol_name", 24 | PROTO_NAME_DELIM, 25 | "proto_state_num", 26 | CONF_NAME_DELIM, 27 | "conf_num", 28 | ) 29 | ) 30 | MOL_ITEM_FIELDS = ("mol_name", "proto_state_num", "conf_num") 31 | CONF_ENERGIES_PROPNAME = "_ConfEnergies" 32 | CONF_ENERGIES_DELIM = "|" 33 | CONF_ENERGY_PROPNAME = "Energy" 34 | 35 | MolItemTuple = namedtuple( 36 | "MolItemTuple", ["mol_name", "proto_state_num", "conf_num"] 37 | ) 38 | 39 | 40 | class MolItemName(object): 41 | """Class for parsing mol item names and converting to various formats.""" 42 | 43 | def __init__( 44 | self, 45 | mol_name=None, 46 | proto_state_num=None, 47 | conf_num=None, 48 | proto_delim=PROTO_NAME_DELIM, 49 | conf_delim=CONF_NAME_DELIM, 50 | ): 51 | self.mol_name = mol_name 52 | self.proto_state_num = proto_state_num 53 | self.conf_num = conf_num 54 | self.proto_delim = proto_delim 55 | self.conf_delim = conf_delim 56 | 57 | @classmethod 58 | def from_str( 59 | cls, 60 | mol_item_name, 61 | mol_item_regex=MOL_ITEM_REGEX, 62 | mol_item_fields=MOL_ITEM_FIELDS, 63 | **kwargs 64 | ): 65 | fields = cls.mol_item_name_to_dict( 66 | mol_item_name, 67 | mol_item_regex=mol_item_regex, 68 | mol_item_fields=mol_item_fields, 69 | ) 70 | return cls( 71 | fields["mol_name"], 72 | fields["proto_state_num"], 73 | fields["conf_num"], 74 | **kwargs 75 | ) 76 | 77 | def to_str(self): 78 | return self.mol_item_name 79 | 80 | @classmethod 81 | def from_tuple(cls, fields_tuple): 82 | return cls(*fields_tuple) 83 | 84 | def to_tuple(self): 85 | return MolItemTuple(self.mol_name, self.proto_state_num, self.conf_num) 86 | 87 | @property 88 | def mol_name(self): 89 | return self._mol_name 90 | 91 | @mol_name.setter 92 | def mol_name(self, mol_name): 93 | self._mol_name = mol_name 94 | 95 | def to_mol_name(self, as_proto=False): 96 | if as_proto: 97 | return self.proto_name 98 | else: 99 | return self.mol_name 100 | 101 | @property 102 | def proto_name(self): 103 | return self.to_proto_name(self.proto_state_num) 104 | 105 | def to_proto_name( 106 | self, proto_state_num=None, proto_delim=PROTO_NAME_DELIM 107 | ): 108 | if proto_state_num is not None: 109 | return "{}{}{:d}".format( 110 | self.mol_name, proto_delim, proto_state_num 111 | ) 112 | else: 113 | return self.mol_name 114 | 115 | @property 116 | def conf_name(self): 117 | return self.to_conf_name(conf_num=self.conf_num) 118 | 119 | def to_conf_name(self, conf_num=None, conf_delim=CONF_NAME_DELIM): 120 | if conf_num is not None: 121 | return "{}{}{:d}".format(self.proto_name, conf_delim, conf_num) 122 | else: 123 | return self.proto_name 124 | 125 | @property 126 | def mol_item_name(self): 127 | return self.conf_name 128 | 129 | @staticmethod 130 | def mol_item_name_to_dict( 131 | mol_item_name, 132 | mol_item_regex=MOL_ITEM_REGEX, 133 | mol_item_fields=MOL_ITEM_FIELDS, 134 | ): 135 | match = re.match(mol_item_regex, mol_item_name) 136 | groups = match.groups() 137 | fields = dict(zip(mol_item_fields, groups)) 138 | proto_state_num = fields.get("proto_state_num") 139 | if 
proto_state_num is not None: 140 | fields["proto_state_num"] = int(proto_state_num) 141 | conf_num = fields.get("conf_num") 142 | if conf_num is not None: 143 | fields["conf_num"] = int(conf_num) 144 | return fields 145 | 146 | def copy(self): 147 | return copy.copy(self) 148 | 149 | def __repr__(self): 150 | return ( 151 | "MolItemName(mol_name={}, proto_state_num={}, " 152 | "conf_num={})".format( 153 | self.mol_name, self.proto_state_num, self.conf_num 154 | ) 155 | ) 156 | 157 | def __str__(self): 158 | return self.conf_name 159 | 160 | def __eq__(self, other): 161 | return self.to_tuple() == other.to_tuple() 162 | 163 | def __ne__(self, other): 164 | return not self.__eq__(other) 165 | 166 | def __gt__(self, other): 167 | return self.to_tuple().__gt__(other.to_tuple()) 168 | 169 | def __lt__(self, other): 170 | return self.to_tuple().__lt__(other.to_tuple()) 171 | 172 | def __hash__(self): 173 | return hash(self.to_tuple()) 174 | 175 | 176 | def smiles_generator(*filenames): 177 | """Parse SMILES file(s) and yield (name, smile). 178 | 179 | Parameters 180 | ---------- 181 | files : iterable object 182 | List of files containing smiles. File must contain one smile per 183 | line, followed by a space and then the molecule name. 184 | 185 | Yields 186 | ------ 187 | tuple: 188 | `tuple` of the format (smile, name). 189 | """ 190 | for filename in filenames: 191 | with smart_open.open(filename, "r") as f: 192 | for i, line in enumerate(f): 193 | values = line.rstrip("\r\n").split() 194 | if len(values) >= 2: 195 | yield tuple(values[:2]) 196 | else: 197 | logging.warning( 198 | ( 199 | "Line {:d} of {} has {:d} entries. Expected at least" 200 | " 2.".format(i + 1, filename, len(values)) 201 | ), 202 | exc_info=True, 203 | ) 204 | 205 | 206 | def smiles_to_dict(smiles_file, unique=False, has_header=False): 207 | """Read SMILES file to dict.""" 208 | smiles_gen = smiles_generator(smiles_file) 209 | if has_header: 210 | header = next(smiles_gen) 211 | logging.info("Skipping first (header) values: {!r}".format(header)) 212 | if unique: 213 | used_smiles = set() 214 | smiles_dict = {} 215 | for smiles, name in smiles_gen: 216 | if name not in smiles_dict and smiles not in used_smiles: 217 | smiles_dict[name] = smiles 218 | used_smiles.add(smiles) 219 | else: 220 | smiles_dict = {name: smiles for smiles, name in smiles_gen} 221 | return smiles_dict 222 | 223 | 224 | def dict_to_smiles(smiles_file, smiles_dict): 225 | """Write SMILES dict to file.""" 226 | iter_to_smiles(smiles_file, sorted(smiles_dict.items())) 227 | 228 | 229 | def iter_to_smiles(smiles_file, smiles_iter): 230 | """Write iterator of (mol_name, SMILES) to file.""" 231 | with smart_open.open(smiles_file, "w") as f: 232 | for mol_name, smiles in smiles_iter: 233 | f.write("{} {}\n".format(smiles, mol_name)) 234 | 235 | 236 | def mol2_generator(*filenames): 237 | """Parse name from mol2 filename and return generator. 238 | 239 | Parameters 240 | ---------- 241 | files : iterable object 242 | List of mol2 files, where filename should be molecule name followed by 243 | ".mol2" 244 | 245 | Yields 246 | ------ 247 | tuple: 248 | `tuple` of the format (file, name). 249 | """ 250 | for filename in filenames: 251 | name = os.path.splitext(os.path.basename(filename))[0] 252 | yield (filename, name) 253 | 254 | 255 | def mol_from_smiles(smiles, name, standardise=False): 256 | """Generate a n RDKit `PropertyMol` from SMILES string. 
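    The molecule name and the input SMILES are stored on the returned mol
    as the ``_Name`` and ``_SMILES`` properties.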
257 | 258 | Parameters 259 | ---------- 260 | smile : str 261 | SMILES string 262 | name : str 263 | Name of molecule 264 | standardise : bool 265 | Clean Mol through standardisation 266 | 267 | Returns 268 | ------- 269 | RDKit PropertyMol : Molecule. 270 | """ 271 | mol = rdkit.Chem.MolFromSmiles(smiles) 272 | if mol is None: 273 | logging.error( 274 | "Mol creation failed from SMILES: {!r}".format((smiles, name)) 275 | ) 276 | return None 277 | if standardise: 278 | mol = mol_to_standardised_mol(mol, name) 279 | mol = PropertyMol(mol) 280 | mol.SetProp("_Name", name) 281 | mol.SetProp("_SMILES", smiles) 282 | return mol 283 | 284 | 285 | def mol_from_mol2(mol2_file, name=None, standardise=False): 286 | """Read a mol2 file into an RDKit `PropertyMol`. 287 | 288 | Parameters 289 | ---------- 290 | mol2_file : str 291 | path to a mol2 file 292 | name : str, optional 293 | Name of molecule. If not provided, uses file basename as name 294 | standardise : bool 295 | Clean mol through standardisation 296 | 297 | Returns 298 | ------- 299 | RDKit PropertyMol : Molecule. 300 | """ 301 | if name is None: 302 | name = os.path.splitext(os.path.basename(mol2_file))[0] 303 | mol = rdkit.Chem.MolFromMol2File(mol2_file) 304 | if standardise: 305 | mol = mol_to_standardised_mol(mol, name) 306 | mol = PropertyMol(mol) 307 | mol.SetProp("_Name", name) 308 | return mol 309 | 310 | 311 | def mol_from_sdf(sdf_file, conf_num=None, standardise=False, mode="rb"): 312 | """Read SDF file into an RDKit `Mol` object. 313 | 314 | Parameters 315 | ---------- 316 | sdf_file : str 317 | Path to an SDF file 318 | conf_num : int or None, optional 319 | Maximum number of conformers to read from file. Defaults to all. 320 | standardise : bool (default False) 321 | Clean mol through standardisation 322 | mode : str (default 'rb') 323 | Mode with which to open file 324 | 325 | Returns 326 | ------- 327 | RDKit Mol : `Mol` object with each molecule in SDF file as a conformer 328 | """ 329 | mol = None 330 | conf_energies = [] 331 | with smart_open.open(sdf_file, mode) as f: 332 | supplier = rdkit.Chem.ForwardSDMolSupplier(f) 333 | i = 0 334 | while True: 335 | if i == conf_num: 336 | break 337 | try: 338 | new_mol = next(supplier) 339 | except StopIteration: 340 | logging.debug( 341 | "Read {:d} conformers from {}.".format(i, sdf_file) 342 | ) 343 | break 344 | 345 | if new_mol.HasProp(CONF_ENERGY_PROPNAME): 346 | conf_energies.append( 347 | float(new_mol.GetProp(CONF_ENERGY_PROPNAME)) 348 | ) 349 | 350 | if mol is None: 351 | mol = rdkit.Chem.Mol(new_mol) 352 | mol.RemoveAllConformers() 353 | conf = new_mol.GetConformers()[0] 354 | mol.AddConformer(conf, assignId=True) 355 | i += 1 356 | if standardise: 357 | mol = mol_to_standardised_mol(mol) 358 | try: 359 | mol.GetProp("_Name") 360 | except KeyError: 361 | name = os.path.basename(sdf_file).split(".sdf")[0] 362 | mol.SetProp("_Name", name) 363 | 364 | if len(conf_energies) > 0: 365 | add_conformer_energies_to_mol(mol, conf_energies) 366 | mol.ClearProp(CONF_ENERGY_PROPNAME) 367 | 368 | return mol 369 | 370 | 371 | def mol_to_sdf(mol, out_file, conf_num=None): 372 | """Write RDKit `Mol` objects to an SDF file. 373 | 374 | Parameters 375 | ---------- 376 | mol : RDKit Mol 377 | A molecule containing 1 or more conformations to write to file. 378 | out_file : str 379 | Path to save SDF file. 380 | conf_num : int or None, optional 381 | Maximum number of conformers to save to file. Defaults to all. 
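
    Notes
    -----
    A minimal round trip, assuming ``mol.sdf.bz2`` exists (sketch)::

        mol = mol_from_sdf("mol.sdf.bz2")
        mol_to_sdf(mol, "out/mol_subset.sdf.bz2", conf_num=5)

    Conformer energies attached with ``add_conformer_energies_to_mol`` are
    written to each conformer record's ``Energy`` property.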
382 | """ 383 | touch_dir(os.path.dirname(out_file)) 384 | with smart_open.open(out_file, "w") as fobj: 385 | writer = rdkit.Chem.SDWriter(fobj) 386 | conf_ids = [conf.GetId() for conf in mol.GetConformers()] 387 | conf_energies = get_conformer_energies_from_mol(mol) 388 | mol.ClearProp(CONF_ENERGIES_PROPNAME) 389 | for i in conf_ids: 390 | if conf_num not in {-1, None} and i >= conf_num: 391 | break 392 | try: 393 | conf_energy = conf_energies[i] 394 | mol.SetProp(CONF_ENERGY_PROPNAME, "{:.4f}".format(conf_energy)) 395 | except (IndexError, TypeError): 396 | pass 397 | writer.write(mol, confId=i) 398 | writer.close() 399 | mol.ClearProp(CONF_ENERGY_PROPNAME) 400 | if conf_energies is not None: 401 | add_conformer_energies_to_mol(mol, conf_energies) 402 | logging.debug("Saved {:d} conformers to {}.".format(i + 1, out_file)) 403 | 404 | 405 | def mol_to_standardised_mol(mol, name=None): 406 | """Standardise mol(s).""" 407 | try: 408 | from standardiser import standardise 409 | from standardiser.utils import StandardiseException 410 | except ImportError: 411 | logging.warning( 412 | "standardiser module unavailable. Using unstandardised mol." 413 | ) 414 | return mol 415 | 416 | if name is None: 417 | try: 418 | name = mol.GetProp("_Name") 419 | except KeyError: 420 | name = repr(mol) 421 | 422 | if isinstance(mol, PropertyMol): 423 | mol_type = PropertyMol 424 | mol = rdkit.Chem.Mol(mol) 425 | else: 426 | mol_type = rdkit.Chem.Mol 427 | 428 | logging.debug("Standardising {}".format(name)) 429 | try: 430 | std_mol = standardise.run(mol) 431 | except AttributeError: # backwards-compatible with old standardiser 432 | std_mol = standardise.apply(mol) 433 | except StandardiseException: 434 | logging.error( 435 | ( 436 | "Standardisation of {} failed. Using unstandardised " 437 | "mol.".format(name) 438 | ), 439 | exc_info=True, 440 | ) 441 | return mol_type(mol) 442 | 443 | std_mol = mol_type(std_mol) 444 | try: 445 | std_mol.SetProp("_Name", mol.GetProp("_Name")) 446 | except KeyError: 447 | pass 448 | 449 | return std_mol 450 | 451 | 452 | def add_conformer_energies_to_mol(mol, energies): 453 | """Add conformer energies as mol property. 454 | 455 | See discussion at https://sourceforge.net/p/rdkit/mailman/message/27547551/ 456 | """ 457 | energies_str = CONF_ENERGIES_DELIM.join( 458 | "{:.4f}".format(e) for e in energies 459 | ) 460 | mol.SetProp(CONF_ENERGIES_PROPNAME, energies_str) 461 | return mol 462 | 463 | 464 | def get_conformer_energies_from_mol(mol): 465 | """Get conformer energies from mol.""" 466 | if not mol.HasProp(CONF_ENERGIES_PROPNAME): 467 | return None 468 | energies_str = mol.GetProp(CONF_ENERGIES_PROPNAME) 469 | energies = [float(x) for x in energies_str.split(CONF_ENERGIES_DELIM)] 470 | return energies 471 | -------------------------------------------------------------------------------- /src/e3fp/conformer/generator.py: -------------------------------------------------------------------------------- 1 | """Conformer generation. 
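Provides the ``ConformerGenerator`` class, which embeds a pool of RDKit
conformers, minimizes them with a chosen force field, and filters them by
RMSD and energy.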
2 | 3 | Author: Seth Axen 4 | E-mail: seth.axen@gmail.com 5 | """ 6 | import logging 7 | 8 | import numpy as np 9 | 10 | from rdkit import Chem 11 | from rdkit.Chem import AllChem 12 | from rdkit.Chem import PropertyMol 13 | from .util import add_conformer_energies_to_mol 14 | 15 | # Heavily modified by Seth Axen from code under the following license 16 | __author__ = "Steven Kearnes" 17 | __copyright__ = "Copyright 2014, Stanford University" 18 | __license__ = "3-clause BSD" 19 | 20 | # options 21 | FORCEFIELD_CHOICES = ("uff", "mmff94", "mmff94s") 22 | 23 | # default values 24 | NUM_CONF_DEF = -1 25 | FIRST_DEF = -1 26 | POOL_MULTIPLIER_DEF = 1 27 | RMSD_CUTOFF_DEF = 0.5 28 | MAX_ENERGY_DIFF_DEF = -1.0 29 | FORCEFIELD_DEF = "uff" 30 | SEED_DEF = -1 31 | 32 | 33 | class ConformerGenerator(object): 34 | """Generate conformers using RDKit. 35 | 36 | Procedure 37 | --------- 38 | 1. Generate a pool of conformers. 39 | 2. Minimize conformers. 40 | 3. Filter conformers using an RMSD threshold and optional minimum energy 41 | difference. 42 | 43 | Note that pruning is done _after_ minimization, which differs from the 44 | protocol described in the references. 45 | 46 | References 47 | ---------- 48 | * http://rdkit.org/docs/GettingStartedInPython.html 49 | #working-with-3d-molecules 50 | * http://pubs.acs.org/doi/full/10.1021/ci2004658 51 | * https://github.com/skearnes/rdkit-utils/blob/master/rdkit_utils/ 52 | conformers.py 53 | """ 54 | 55 | def __init__( 56 | self, 57 | num_conf: int=NUM_CONF_DEF, 58 | first: int=FIRST_DEF, 59 | rmsd_cutoff: float=RMSD_CUTOFF_DEF, 60 | max_energy_diff: float=MAX_ENERGY_DIFF_DEF, 61 | forcefield: str=FORCEFIELD_DEF, 62 | pool_multiplier: int=POOL_MULTIPLIER_DEF, 63 | seed: int=SEED_DEF, 64 | get_values: bool=False, 65 | sparse_rmsd: bool=True, 66 | store_energies: bool=True, 67 | ): 68 | """Initialize generator settings. 69 | 70 | Parameters 71 | ---------- 72 | num_conf : int, optional 73 | Maximum number of conformers to generate (after pruning). -1 74 | results in auto selection of max_conformers. 75 | first : int, optional 76 | Terminate when this number of conformers has been accepted, and 77 | only return those conformers. -1 results in all conformers being 78 | returned. 79 | pool_multiplier : int, optional 80 | Factor to multiply by max_conformers to generate the initial 81 | conformer pool. Since conformers are filtered after energy 82 | minimization, increasing the size of the pool increases the chance 83 | of identifying max_conformers unique conformers. 84 | rmsd_cutoff : float, optional 85 | RMSD cutoff for pruning conformers. If None or negative, no 86 | pruning is performed. 87 | max_energy_diff : float, optional 88 | If set, conformers with energies this amount above the minimum 89 | energy conformer are not accepted. 90 | forcefield : {'uff', 'mmff94', 'mmff94s'}, optional 91 | Force field to use for conformer energy calculation and 92 | minimization. 93 | seed : int, optional 94 | Random seed for conformer generation. If -1, the random number 95 | generator is unseeded. 96 | get_values : boolean, optional 97 | Return tuple of key values, for storage. 98 | sparse_rmsd : bool, optional 99 | If `get_values` is True, instead of returning full symmetric RMSD 100 | matrix, only return flattened upper triangle. 101 | store_energies : bool, optional 102 | Store conformer energies as property in mol. 
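
        Examples
        --------
        A generator that accepts at most 10 conformers, pruned at a 0.5
        RMSD cutoff and embedded with a fixed seed (all other settings are
        left at their defaults):

        >>> cg = ConformerGenerator(num_conf=10, rmsd_cutoff=0.5, seed=42)
        >>> cg.max_conformers
        10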
103 | """ 104 | if not isinstance(num_conf, int) or num_conf < -1 or num_conf == 0: 105 | raise ValueError("num_conf must be either -1 or a positive integer") 106 | self.max_conformers = num_conf 107 | if not isinstance(first, int) or first < -1 or first == 0: 108 | raise ValueError("first must be either -1 or a positive integer") 109 | self.first_conformers = first 110 | if not rmsd_cutoff or rmsd_cutoff < 0: 111 | rmsd_cutoff = -1.0 112 | self.rmsd_cutoff = rmsd_cutoff 113 | 114 | if max_energy_diff is None or max_energy_diff < 0: 115 | max_energy_diff = -1.0 116 | self.max_energy_diff = max_energy_diff 117 | 118 | if forcefield not in FORCEFIELD_CHOICES: 119 | raise ValueError( 120 | "%s is not a valid option for forcefield" % forcefield 121 | ) 122 | self.forcefield = forcefield 123 | if not isinstance(pool_multiplier, int) or pool_multiplier < 1: 124 | raise ValueError("pool_multiplier must be a positive integer") 125 | self.pool_multiplier = pool_multiplier 126 | self.seed = seed 127 | self.get_values = get_values 128 | self.sparse_rmsd = sparse_rmsd 129 | self.store_energies = store_energies 130 | 131 | def __call__(self, mol): 132 | """Generate conformers for a molecule. 133 | 134 | Parameters 135 | ---------- 136 | mol : RDKit Mol 137 | Molecule. 138 | 139 | Returns 140 | ------- 141 | RDKit Mol : copy of the input molecule with embedded conformers 142 | """ 143 | return self.generate_conformers(mol) 144 | 145 | def generate_conformers(self, mol): 146 | """Generate conformers for a molecule. 147 | 148 | Parameters 149 | ---------- 150 | mol : RDKit Mol 151 | Molecule. 152 | 153 | Returns 154 | ------- 155 | RDKit Mol : copy of the input molecule with embedded conformers 156 | """ 157 | # initial embedding 158 | mol = self.embed_molecule(mol) 159 | if not mol.GetNumConformers(): 160 | msg = "No conformers generated for molecule" 161 | if mol.HasProp("_Name"): 162 | name = mol.GetProp("_Name") 163 | msg += ' "{}".'.format(name) 164 | else: 165 | msg += "." 166 | raise RuntimeError(msg) 167 | 168 | # minimization and filtering 169 | self.minimize_conformers(mol) 170 | mol, indices, energies, rmsds = self.filter_conformers(mol) 171 | 172 | if self.store_energies: 173 | add_conformer_energies_to_mol(mol, energies) 174 | 175 | if self.get_values is True: 176 | if self.sparse_rmsd: 177 | rmsds_mat = rmsds[np.triu_indices_from(rmsds, k=1)] 178 | else: 179 | rmsds_mat = rmsds 180 | return mol, (self.max_conformers, indices, energies, rmsds_mat) 181 | else: 182 | return mol 183 | 184 | @staticmethod 185 | def get_num_conformers(mol): 186 | """Return ideal number of conformers from rotatable bond number in model. 187 | 188 | Parameters 189 | ---------- 190 | mol : Mol 191 | RDKit `Mol` object for molecule 192 | 193 | Yields 194 | ------ 195 | num_conf : int 196 | Target number of conformers to accept 197 | """ 198 | num_rot = AllChem.CalcNumRotatableBonds(mol) 199 | if num_rot < 8: 200 | return 50 201 | elif num_rot >= 8 and num_rot <= 12: 202 | return 200 203 | elif num_rot > 12: 204 | return 300 205 | else: 206 | return 0 207 | 208 | def embed_molecule(self, mol): 209 | """Generate conformers, possibly with pruning. 210 | 211 | Parameters 212 | ---------- 213 | mol : RDKit Mol 214 | Molecule. 
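
        Returns
        -------
        RDKit Mol : copy of the molecule with explicit hydrogens added and
            the initial pool of embedded conformers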
215 | """ 216 | log_name = mol.GetProp("_Name") if mol.HasProp("_Name") else "molecule" 217 | logging.debug("Adding hydrogens for %s" % log_name) 218 | mol = Chem.AddHs(mol) # add hydrogens 219 | logging.debug("Hydrogens added to %s" % log_name) 220 | logging.debug("Sanitizing mol for %s" % log_name) 221 | Chem.SanitizeMol(mol) 222 | logging.debug("Mol sanitized for %s" % log_name) 223 | if self.max_conformers == -1 or type(self.max_conformers) is not int: 224 | self.max_conformers = self.get_num_conformers(mol) 225 | n_confs = self.max_conformers * self.pool_multiplier 226 | if self.first_conformers == -1: 227 | self.first_conformers = self.max_conformers 228 | logging.debug("Embedding %d conformers for %s" % (n_confs, log_name)) 229 | AllChem.EmbedMultipleConfs( 230 | mol, 231 | numConfs=n_confs, 232 | maxAttempts=10 * n_confs, 233 | pruneRmsThresh=-1.0, 234 | randomSeed=self.seed, 235 | ignoreSmoothingFailures=True, 236 | ) 237 | logging.debug("Conformers embedded for %s" % log_name) 238 | return mol 239 | 240 | def get_molecule_force_field(self, mol, conf_id=None, **kwargs): 241 | """Get a force field for a molecule. 242 | 243 | Parameters 244 | ---------- 245 | mol : RDKit Mol 246 | Molecule. 247 | conf_id : int, optional 248 | ID of the conformer to associate with the force field. 249 | **kwargs : dict, optional 250 | Keyword arguments for force field constructor. 251 | """ 252 | if self.forcefield == "uff": 253 | ff = AllChem.UFFGetMoleculeForceField( 254 | mol, confId=conf_id, **kwargs 255 | ) 256 | elif self.forcefield.startswith("mmff"): 257 | AllChem.MMFFSanitizeMolecule(mol) 258 | mmff_props = AllChem.MMFFGetMoleculeProperties( 259 | mol, mmffVariant=self.forcefield 260 | ) 261 | ff = AllChem.MMFFGetMoleculeForceField( 262 | mol, mmff_props, confId=conf_id, **kwargs 263 | ) 264 | else: 265 | raise ValueError( 266 | "Invalid forcefield " + "'{}'.".format(self.forcefield) 267 | ) 268 | return ff 269 | 270 | def minimize_conformers(self, mol): 271 | """Minimize molecule conformers. 272 | 273 | Parameters 274 | ---------- 275 | mol : RDKit Mol 276 | Molecule. 277 | """ 278 | log_name = mol.GetProp("_Name") if mol.HasProp("_Name") else "molecule" 279 | logging.debug("Minimizing conformers for %s" % log_name) 280 | for conf in mol.GetConformers(): 281 | ff = self.get_molecule_force_field(mol, conf_id=conf.GetId()) 282 | ff.Minimize() 283 | logging.debug("Conformers minimized for %s" % log_name) 284 | 285 | def get_conformer_energies(self, mol): 286 | """Calculate conformer energies. 287 | 288 | Parameters 289 | ---------- 290 | mol : RDKit Mol 291 | Molecule. 292 | 293 | Returns 294 | ------- 295 | energies : array_like 296 | Minimized conformer energies. 297 | """ 298 | num_conf = mol.GetNumConformers() 299 | energies = np.empty((num_conf,), dtype=float) 300 | for i, conf in enumerate(mol.GetConformers()): 301 | ff = self.get_molecule_force_field(mol, conf_id=conf.GetId()) 302 | energies[i] = ff.CalcEnergy() 303 | return energies 304 | 305 | def filter_conformers(self, mol): 306 | """Filter conformers which do not meet an RMSD threshold. 307 | 308 | Parameters 309 | ---------- 310 | mol : RDKit Mol 311 | Molecule. 312 | 313 | Returns 314 | ------- 315 | A new RDKit Mol containing the chosen conformers, sorted by 316 | increasing energy. 
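
        Notes
        -----
        Conformers are visited in order of increasing energy. The
        lowest-energy conformer is always accepted; each later conformer is
        rejected once ``first`` conformers have been accepted, if its energy
        exceeds the minimum energy by more than ``max_energy_diff``, or if
        its best RMSD to an already accepted conformer is below
        ``rmsd_cutoff``. RMSDs are computed on the hydrogen-stripped
        molecule.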
317 | """ 318 | log_name = mol.GetProp("_Name") if mol.HasProp("_Name") else "molecule" 319 | logging.debug("Pruning conformers for %s" % log_name) 320 | energies = self.get_conformer_energies(mol) 321 | energy_below_threshold = np.ones_like(energies, dtype=np.bool_) 322 | 323 | sort = np.argsort(energies) # sort by increasing energy 324 | confs = np.array(mol.GetConformers()) 325 | 326 | # remove hydrogens to speed up substruct match 327 | mol = Chem.RemoveHs(mol) 328 | accepted = [] # always accept lowest-energy conformer 329 | rejected = [] 330 | rmsds = np.zeros((confs.shape[0], confs.shape[0]), dtype=float) 331 | for i, fit_ind in enumerate(sort): 332 | accepted_num = len(accepted) 333 | 334 | # always accept lowest-energy conformer 335 | if accepted_num == 0: 336 | accepted.append(fit_ind) 337 | 338 | # pre-compute if Es are in acceptable range of min E 339 | if self.max_energy_diff != -1.0: 340 | energy_below_threshold = ( 341 | energies <= energies[fit_ind] + self.max_energy_diff 342 | ) 343 | 344 | continue 345 | 346 | # reject conformers after first_conformers is reached 347 | if accepted_num >= self.first_conformers: 348 | rejected.append(fit_ind) 349 | continue 350 | 351 | # check if energy is too high 352 | if not energy_below_threshold[fit_ind]: 353 | rejected.append(fit_ind) 354 | continue 355 | 356 | # get RMSD to selected conformers 357 | these_rmsds = np.zeros((accepted_num,), dtype=float) 358 | # reverse so all confs aligned to lowest energy 359 | for j, accepted_ind in self.reverse_enumerate(accepted): 360 | this_rmsd = AllChem.GetBestRMS( 361 | mol, 362 | mol, 363 | confs[accepted_ind].GetId(), 364 | confs[fit_ind].GetId(), 365 | ) 366 | # reject conformers within the RMSD threshold 367 | if this_rmsd < self.rmsd_cutoff: 368 | rejected.append(fit_ind) 369 | break 370 | else: 371 | these_rmsds[-j - 1] = this_rmsd 372 | else: 373 | rmsds[fit_ind, accepted] = these_rmsds 374 | rmsds[accepted, fit_ind] = these_rmsds 375 | accepted.append(fit_ind) 376 | 377 | # slice and order rmsds and energies to match accepted list 378 | rmsds = rmsds[np.ix_(accepted, accepted)] 379 | energies = energies[accepted] 380 | 381 | # create a new molecule with all conformers, sorted by energy 382 | new = PropertyMol.PropertyMol(mol) 383 | new.RemoveAllConformers() 384 | conf_ids = [conf.GetId() for conf in mol.GetConformers()] 385 | for i in accepted: 386 | conf = mol.GetConformer(conf_ids[i]) 387 | new.AddConformer(conf, assignId=True) 388 | 389 | logging.debug("Conformers filtered for %s" % log_name) 390 | return new, np.asarray(accepted, dtype=int), energies, rmsds 391 | 392 | @staticmethod 393 | def reverse_enumerate(iterable): 394 | """Enumerate, but with the last result first but still numbered last. 
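        Equivalent to iterating over ``enumerate(iterable)`` in reverse:
        items are yielded last-to-first, but each keeps its forward index.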
395 | 396 | Parameters 397 | ---------- 398 | iterable : 1-D iterable 399 | 400 | Returns 401 | ------- 402 | iterable : 403 | ``(index, item)`` pairs in reverse order; each index matches the forward enumeration 404 | """ 405 | return zip(reversed(range(len(iterable))), reversed(iterable)) 406 | 407 | # magic methods 408 | def __repr__(self): 409 | return """ConformerGenerator(num_conf=%r, first=%r,\ 410 | \n pool_multiplier=%r, rmsd_cutoff=%r,\ 411 | \n max_energy_diff=%r, forcefield=%r,\ 412 | \n get_values=%r, sparse_rmsd=%r)""" % ( 413 | self.max_conformers, 414 | self.first_conformers, 415 | self.pool_multiplier, 416 | self.rmsd_cutoff, 417 | self.max_energy_diff, 418 | self.forcefield, 419 | self.get_values, 420 | self.sparse_rmsd, 421 | ) 422 | 423 | def __str__(self): 424 | return self.__repr__() 425 | --------------------------------------------------------------------------------
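A typical end-to-end use of the two conformer modules above builds a mol from SMILES, generates and filters conformers, and writes the result to SDF. The sketch below is illustrative only: the aspirin SMILES, the seed, and the output path are placeholder choices, not part of the library.

from e3fp.conformer.util import mol_from_smiles, mol_to_sdf
from e3fp.conformer.generator import ConformerGenerator

# Build an RDKit PropertyMol from a SMILES string (placeholder molecule).
mol = mol_from_smiles("CC(=O)OC1=CC=CC=C1C(=O)O", "aspirin")

# Accept up to 10 conformers, pruning any pair closer than 0.5 RMSD.
generator = ConformerGenerator(num_conf=10, rmsd_cutoff=0.5, seed=42)
mol = generator.generate_conformers(mol)

# Write the accepted conformers (and their energies) to a compressed SDF.
mol_to_sdf(mol, "out/aspirin_conformers.sdf.bz2")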