├── tests ├── __init__.py ├── data │ ├── stereo1.sdf.bz2 │ ├── stereo2.sdf.bz2 │ ├── caffeine_planar.sdf.bz2 │ ├── ritalin_nonplanar.sdf.bz2 │ ├── rand_sdf_files │ │ ├── CHEMBL116226.sdf.bz2 │ │ ├── CHEMBL197946.sdf.bz2 │ │ ├── CHEMBL282186.sdf.bz2 │ │ ├── CHEMBL501745.sdf.bz2 │ │ └── CHEMBL2114064.sdf.bz2 │ └── ritalin_nonplanar.sdf ├── test_config.py ├── test_dependencies.py ├── test_util.py ├── test_conformer.py ├── test_fingerprint.py ├── test_metrics.py └── test_struct.py ├── src └── e3fp │ ├── config │ ├── __init__.py │ ├── defaults.cfg │ └── params.py │ ├── conformer │ ├── __init__.py │ ├── protonation.py │ ├── util.py │ └── generator.py │ ├── fingerprint │ ├── __init__.py │ ├── metrics │ │ ├── __pycache__ │ │ │ ├── array_metrics._dense_soergel-225.py312.nbi │ │ │ ├── array_metrics._sparse_soergel-246.py312.nbi │ │ │ ├── array_metrics._dense_soergel-225.py312.1.nbc │ │ │ └── array_metrics._sparse_soergel-246.py312.1.nbc │ │ ├── fprint_metrics.py │ │ ├── __init__.py │ │ └── array_metrics.py │ ├── util.py │ ├── array_ops.py │ └── structs.py │ ├── __init__.py │ ├── pipeline.py │ └── util.py ├── doc ├── source │ ├── examples │ │ └── data │ │ │ ├── caffeine.smi │ │ │ ├── new_params.cfg │ │ │ └── test_smiles.smi │ ├── api │ │ ├── index.rst │ │ ├── e3fp.util.rst │ │ ├── e3fp.pipeline.rst │ │ ├── e3fp.config.params.rst │ │ ├── e3fp.conformer.util.rst │ │ ├── e3fp.fingerprint.db.rst │ │ ├── e3fp.fingerprint.util.rst │ │ ├── e3fp.conformer.generate.rst │ │ ├── e3fp.conformer.generator.rst │ │ ├── e3fp.fingerprint.fprint.rst │ │ ├── e3fp.fingerprint.structs.rst │ │ ├── e3fp.fingerprint.fprinter.rst │ │ ├── e3fp.fingerprint.generate.rst │ │ ├── e3fp.conformer.protonation.rst │ │ ├── e3fp.fingerprint.array_ops.rst │ │ ├── e3fp.fingerprint.metrics.array_metrics.rst │ │ ├── e3fp.fingerprint.metrics.fprint_metrics.rst │ │ ├── e3fp.config.rst │ │ ├── e3fp.conformer.rst │ │ ├── e3fp.fingerprint.metrics.rst │ │ ├── e3fp.rst │ │ └── e3fp.fingerprint.rst │ ├── _static │ │ └── ritalin3d.png │ ├── index.rst │ ├── usage │ │ ├── fingerprints │ │ │ ├── index.rst │ │ │ ├── comparison.rst │ │ │ ├── storage.rst │ │ │ └── fprints.rst │ │ ├── index.rst │ │ ├── config.rst │ │ ├── pipeline.rst │ │ └── cli.rst │ ├── _templates │ │ └── layout.html │ ├── overview.rst │ ├── substitutions.rst │ ├── install.rst │ ├── conf.py │ └── dev │ │ └── index.rst └── Makefile ├── pytest.ini ├── .gitignore ├── .coveragerc ├── .readthedocs.yml ├── .github └── workflows │ ├── publish.yml │ └── ci.yml ├── pyproject.toml ├── README.rst └── LICENSE.txt /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/e3fp/config/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/e3fp/conformer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/e3fp/fingerprint/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /doc/source/examples/data/caffeine.smi: -------------------------------------------------------------------------------- 1 | CN1C=NC2=C1C(=O)N(C(=O)N2C)C caffeine 2 | 
-------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | addopts = --doctest-modules 3 | doctest_optionflags = ELLIPSIS 4 | -------------------------------------------------------------------------------- /tests/data/stereo1.sdf.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/keiserlab/e3fp/HEAD/tests/data/stereo1.sdf.bz2 -------------------------------------------------------------------------------- /tests/data/stereo2.sdf.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/keiserlab/e3fp/HEAD/tests/data/stereo2.sdf.bz2 -------------------------------------------------------------------------------- /doc/source/api/index.rst: -------------------------------------------------------------------------------- 1 | e3fp API 2 | ======== 3 | 4 | .. toctree:: 5 | :maxdepth: 5 6 | 7 | e3fp 8 | -------------------------------------------------------------------------------- /doc/source/_static/ritalin3d.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/keiserlab/e3fp/HEAD/doc/source/_static/ritalin3d.png -------------------------------------------------------------------------------- /tests/data/caffeine_planar.sdf.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/keiserlab/e3fp/HEAD/tests/data/caffeine_planar.sdf.bz2 -------------------------------------------------------------------------------- /tests/data/ritalin_nonplanar.sdf.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/keiserlab/e3fp/HEAD/tests/data/ritalin_nonplanar.sdf.bz2 -------------------------------------------------------------------------------- /doc/source/examples/data/new_params.cfg: -------------------------------------------------------------------------------- 1 | [conformer_generation] 2 | first = 10 3 | 4 | [fingerprinting] 5 | bits = 4096 6 | first = 10 -------------------------------------------------------------------------------- /tests/data/rand_sdf_files/CHEMBL116226.sdf.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/keiserlab/e3fp/HEAD/tests/data/rand_sdf_files/CHEMBL116226.sdf.bz2 -------------------------------------------------------------------------------- /tests/data/rand_sdf_files/CHEMBL197946.sdf.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/keiserlab/e3fp/HEAD/tests/data/rand_sdf_files/CHEMBL197946.sdf.bz2 -------------------------------------------------------------------------------- /tests/data/rand_sdf_files/CHEMBL282186.sdf.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/keiserlab/e3fp/HEAD/tests/data/rand_sdf_files/CHEMBL282186.sdf.bz2 -------------------------------------------------------------------------------- /tests/data/rand_sdf_files/CHEMBL501745.sdf.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/keiserlab/e3fp/HEAD/tests/data/rand_sdf_files/CHEMBL501745.sdf.bz2 
-------------------------------------------------------------------------------- /tests/data/rand_sdf_files/CHEMBL2114064.sdf.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/keiserlab/e3fp/HEAD/tests/data/rand_sdf_files/CHEMBL2114064.sdf.bz2 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build/* 2 | dist/* 3 | doc/_build/* 4 | .cache/* 5 | .coverage 6 | .DS_Store 7 | *egg* 8 | *.pyc 9 | *.so 10 | *.o 11 | uv.lock 12 | docs 13 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | # .coveragerc to control coverage.py 2 | [run] 3 | branch = True 4 | source = e3fp 5 | omit = 6 | */e3fp/test/* 7 | */setup.py 8 | */doc/* 9 | -------------------------------------------------------------------------------- /src/e3fp/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib.metadata 2 | from .util import E3FPWarning, E3FPDeprecationWarning 3 | 4 | __version__ = importlib.metadata.version("e3fp") 5 | -------------------------------------------------------------------------------- /doc/source/api/e3fp.util.rst: -------------------------------------------------------------------------------- 1 | e3fp\.util module 2 | ================= 3 | 4 | .. automodule:: e3fp.util 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /doc/source/api/e3fp.pipeline.rst: -------------------------------------------------------------------------------- 1 | e3fp\.pipeline module 2 | ===================== 3 | 4 | .. automodule:: e3fp.pipeline 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /doc/source/api/e3fp.config.params.rst: -------------------------------------------------------------------------------- 1 | e3fp\.config\.params module 2 | =========================== 3 | 4 | .. automodule:: e3fp.config.params 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /doc/source/api/e3fp.conformer.util.rst: -------------------------------------------------------------------------------- 1 | e3fp\.conformer\.util module 2 | ============================ 3 | 4 | .. automodule:: e3fp.conformer.util 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /doc/source/api/e3fp.fingerprint.db.rst: -------------------------------------------------------------------------------- 1 | e3fp\.fingerprint\.db module 2 | ============================ 3 | 4 | .. 
automodule:: e3fp.fingerprint.db 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /src/e3fp/fingerprint/metrics/__pycache__/array_metrics._dense_soergel-225.py312.nbi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/keiserlab/e3fp/HEAD/src/e3fp/fingerprint/metrics/__pycache__/array_metrics._dense_soergel-225.py312.nbi -------------------------------------------------------------------------------- /src/e3fp/fingerprint/metrics/__pycache__/array_metrics._sparse_soergel-246.py312.nbi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/keiserlab/e3fp/HEAD/src/e3fp/fingerprint/metrics/__pycache__/array_metrics._sparse_soergel-246.py312.nbi -------------------------------------------------------------------------------- /src/e3fp/fingerprint/metrics/__pycache__/array_metrics._dense_soergel-225.py312.1.nbc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/keiserlab/e3fp/HEAD/src/e3fp/fingerprint/metrics/__pycache__/array_metrics._dense_soergel-225.py312.1.nbc -------------------------------------------------------------------------------- /src/e3fp/fingerprint/metrics/__pycache__/array_metrics._sparse_soergel-246.py312.1.nbc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/keiserlab/e3fp/HEAD/src/e3fp/fingerprint/metrics/__pycache__/array_metrics._sparse_soergel-246.py312.1.nbc -------------------------------------------------------------------------------- /doc/source/api/e3fp.fingerprint.util.rst: -------------------------------------------------------------------------------- 1 | e3fp\.fingerprint\.util module 2 | ============================== 3 | 4 | .. automodule:: e3fp.fingerprint.util 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /doc/source/api/e3fp.conformer.generate.rst: -------------------------------------------------------------------------------- 1 | e3fp\.conformer\.generate module 2 | ================================ 3 | 4 | .. automodule:: e3fp.conformer.generate 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /doc/source/api/e3fp.conformer.generator.rst: -------------------------------------------------------------------------------- 1 | e3fp\.conformer\.generator module 2 | ================================= 3 | 4 | .. automodule:: e3fp.conformer.generator 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /doc/source/api/e3fp.fingerprint.fprint.rst: -------------------------------------------------------------------------------- 1 | e3fp\.fingerprint\.fprint module 2 | ================================ 3 | 4 | .. automodule:: e3fp.fingerprint.fprint 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /doc/source/api/e3fp.fingerprint.structs.rst: -------------------------------------------------------------------------------- 1 | e3fp\.fingerprint\.structs module 2 | ================================= 3 | 4 | .. 
automodule:: e3fp.fingerprint.structs 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /doc/source/api/e3fp.fingerprint.fprinter.rst: -------------------------------------------------------------------------------- 1 | e3fp\.fingerprint\.fprinter module 2 | ================================== 3 | 4 | .. automodule:: e3fp.fingerprint.fprinter 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /doc/source/api/e3fp.fingerprint.generate.rst: -------------------------------------------------------------------------------- 1 | e3fp\.fingerprint\.generate module 2 | ================================== 3 | 4 | .. automodule:: e3fp.fingerprint.generate 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /doc/source/api/e3fp.conformer.protonation.rst: -------------------------------------------------------------------------------- 1 | e3fp\.conformer\.protonation module 2 | =================================== 3 | 4 | .. automodule:: e3fp.conformer.protonation 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /doc/source/api/e3fp.fingerprint.array_ops.rst: -------------------------------------------------------------------------------- 1 | e3fp\.fingerprint\.array\_ops module 2 | ==================================== 3 | 4 | .. automodule:: e3fp.fingerprint.array_ops 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /doc/source/api/e3fp.fingerprint.metrics.array_metrics.rst: -------------------------------------------------------------------------------- 1 | e3fp\.fingerprint\.metrics\.array\_metrics module 2 | ================================================= 3 | 4 | .. automodule:: e3fp.fingerprint.metrics.array_metrics 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /doc/source/api/e3fp.fingerprint.metrics.fprint_metrics.rst: -------------------------------------------------------------------------------- 1 | e3fp\.fingerprint\.metrics\.fprint\_metrics module 2 | ================================================== 3 | 4 | .. automodule:: e3fp.fingerprint.metrics.fprint_metrics 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /doc/source/index.rst: -------------------------------------------------------------------------------- 1 | .. e3fp documentation master file 2 | 3 | e3fp 4 | ==== 5 | 6 | :Release: |version| 7 | :Date: |today| 8 | 9 | Contents 10 | ----------------- 11 | 12 | .. toctree:: 13 | :maxdepth: 2 14 | 15 | overview 16 | install 17 | usage/index 18 | dev/index 19 | api/index 20 | -------------------------------------------------------------------------------- /doc/source/api/e3fp.config.rst: -------------------------------------------------------------------------------- 1 | e3fp\.config package 2 | ==================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | .. toctree:: 8 | 9 | e3fp.config.params 10 | 11 | Module contents 12 | --------------- 13 | 14 | .. 
automodule:: e3fp.config 15 | :members: 16 | :undoc-members: 17 | :show-inheritance: 18 | -------------------------------------------------------------------------------- /tests/test_config.py: -------------------------------------------------------------------------------- 1 | """Tests for loading config files. 2 | 3 | Author: Seth Axen 4 | E-mail: seth.axen@gmail.com 5 | """ 6 | import os 7 | 8 | 9 | class TestConfig: 10 | def test_config_file_exists(self): 11 | from e3fp.config.params import DEF_PARAM_FILE 12 | 13 | assert os.path.isfile(DEF_PARAM_FILE) 14 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | build: 4 | os: "ubuntu-22.04" 5 | tools: 6 | python: "3.13" 7 | commands: 8 | - asdf plugin add uv 9 | - asdf install uv latest 10 | - asdf global uv latest 11 | - uv sync --extra docs 12 | - uv run -m sphinx -T -b html -d docs/_build/doctrees doc/source $READTHEDOCS_OUTPUT/html 13 | -------------------------------------------------------------------------------- /doc/source/usage/fingerprints/index.rst: -------------------------------------------------------------------------------- 1 | Using Fingerprints 2 | ================== 3 | 4 | While molecular fingerprints are widely used, few packages provide simple 5 | interfaces for working with them and interfacing with machine learning 6 | packages. E3FP provides a number of general utility classes and methods for 7 | doing precisely this. 8 | 9 | .. toctree:: 10 | fprints 11 | storage 12 | comparison 13 | -------------------------------------------------------------------------------- /doc/source/api/e3fp.conformer.rst: -------------------------------------------------------------------------------- 1 | e3fp\.conformer package 2 | ======================= 3 | 4 | Submodules 5 | ---------- 6 | 7 | .. toctree:: 8 | 9 | e3fp.conformer.generate 10 | e3fp.conformer.generator 11 | e3fp.conformer.protonation 12 | e3fp.conformer.util 13 | 14 | Module contents 15 | --------------- 16 | 17 | .. automodule:: e3fp.conformer 18 | :members: 19 | :undoc-members: 20 | :show-inheritance: 21 | -------------------------------------------------------------------------------- /doc/source/api/e3fp.fingerprint.metrics.rst: -------------------------------------------------------------------------------- 1 | e3fp\.fingerprint\.metrics package 2 | ================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | .. toctree:: 8 | 9 | e3fp.fingerprint.metrics.array_metrics 10 | e3fp.fingerprint.metrics.fprint_metrics 11 | 12 | Module contents 13 | --------------- 14 | 15 | .. automodule:: e3fp.fingerprint.metrics 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | -------------------------------------------------------------------------------- /doc/source/api/e3fp.rst: -------------------------------------------------------------------------------- 1 | e3fp package 2 | ============ 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | e3fp.config 10 | e3fp.conformer 11 | e3fp.fingerprint 12 | 13 | Submodules 14 | ---------- 15 | 16 | .. toctree:: 17 | 18 | e3fp.pipeline 19 | e3fp.util 20 | 21 | Module contents 22 | --------------- 23 | 24 | .. 
automodule:: e3fp 25 | :members: 26 | :undoc-members: 27 | :show-inheritance: 28 | -------------------------------------------------------------------------------- /doc/source/usage/index.rst: -------------------------------------------------------------------------------- 1 | Usage and Examples 2 | ================== 3 | 4 | To facilitate flexible use of the E3FP package, we provide multiple interfaces 5 | for performing the same tasks. We have organized these below in the order in 6 | which we expect them to be most of use to the average user. 7 | 8 | .. toctree:: 9 | :caption: Sections 10 | :maxdepth: 2 11 | 12 | config 13 | cli 14 | pipeline 15 | fingerprints/index 16 | -------------------------------------------------------------------------------- /doc/source/_templates/layout.html: -------------------------------------------------------------------------------- 1 | {% extends "!layout.html" %} 2 | 3 | {% block footer %} 4 | {{ super() }} 5 | 14 | {% endblock %} -------------------------------------------------------------------------------- /doc/source/api/e3fp.fingerprint.rst: -------------------------------------------------------------------------------- 1 | e3fp\.fingerprint package 2 | ========================= 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | e3fp.fingerprint.metrics 10 | 11 | Submodules 12 | ---------- 13 | 14 | .. toctree:: 15 | 16 | e3fp.fingerprint.array_ops 17 | e3fp.fingerprint.db 18 | e3fp.fingerprint.fprint 19 | e3fp.fingerprint.fprinter 20 | e3fp.fingerprint.generate 21 | e3fp.fingerprint.structs 22 | e3fp.fingerprint.util 23 | 24 | Module contents 25 | --------------- 26 | 27 | .. automodule:: e3fp.fingerprint 28 | :members: 29 | :undoc-members: 30 | :show-inheritance: 31 | -------------------------------------------------------------------------------- /src/e3fp/config/defaults.cfg: -------------------------------------------------------------------------------- 1 | [preprocessing] 2 | standardise = False 3 | protonate = False 4 | 5 | [conformer_generation] 6 | num_conf = -1 7 | first = -1 8 | pool_multiplier = 1 9 | rmsd_cutoff = 0.5 10 | max_energy_diff = None 11 | forcefield = uff 12 | out_dir = conformers 13 | compress = 2 14 | seed = -1 15 | 16 | ; Optimized parameters used in 17 | ; Axen et al. 2017 18 | [fingerprinting] 19 | bits = 1024 20 | level = 5 21 | first = 3 22 | radius_multiplier = 1.718 23 | stereo = True 24 | counts = False 25 | include_disconnected = True 26 | rdkit_invariants = False 27 | remove_duplicate_substructs = True 28 | exclude_floating = True 29 | -------------------------------------------------------------------------------- /src/e3fp/fingerprint/util.py: -------------------------------------------------------------------------------- 1 | """Utility methods and class for fingerprinting-related functions. 
2 | 3 | Author: Seth Axen 4 | E-mail: seth.axen@gmail.com 5 | """ 6 | from ..util import E3FPError 7 | 8 | 9 | class E3FPInvalidFingerprintError(E3FPError, TypeError): 10 | """Fingerprint is incorrectly formatted.""" 11 | 12 | 13 | class E3FPMolError(E3FPError, TypeError): 14 | """Mol is of incorrect type.""" 15 | 16 | 17 | class E3FPBitsValueError(E3FPError, ValueError): 18 | """Bits value is invalid.""" 19 | 20 | 21 | class E3FPCountsError(E3FPError, ValueError): 22 | """Index in counts is invalid.""" 23 | 24 | 25 | class E3FPOptionError(E3FPError, ValueError): 26 | """Option provided is invalid.""" 27 | -------------------------------------------------------------------------------- /doc/source/examples/data/test_smiles.smi: -------------------------------------------------------------------------------- 1 | CCCC[C@H](CN(O)C=O)C(=O)[C@@H](NC(=O)c1ccccc1)C(C)C CHEMBL1643865 2 | CCCC[C@H](CN(O)C=O)C(=O)[C@@H](NC(=O)C(C)C)C(C)C CHEMBL1643866 3 | CCCC[C@H](CN(O)C=O)C(=O)[C@@H](NC(=O)c1ccccn1)C(C)C CHEMBL1643867 4 | CCCC[C@H](CN(O)C=O)C(=O)[C@@H](NC(=O)c1cccc(OC)c1)C(C)C CHEMBL1643868 5 | CCCC[C@H](CN(O)C=O)C(=O)[C@@H](NC(=O)c1cccc(F)c1)C(C)C CHEMBL1643869 6 | CN1CCN(C(=O)c2ccc3n2Cc2ccccc2N(C(=O)c2ccc(NC(=O)c4ccccc4-c4ccccc4)cc2Cl)C3)CC1 CHEMBL164387 7 | CCCC[C@H](CN(O)C=O)C(=O)[C@@H](NC(=O)c1cccnc1)C(C)C CHEMBL1643870 8 | CCCC[C@H](CN(O)C=O)C(=O)[C@@H](NC(=O)c1ccncc1)C(C)C CHEMBL1643871 9 | CCCC[C@H](CN(O)C=O)C(=O)[C@@H](NC(=O)c1ccc2ccccc2n1)C(C)C CHEMBL1643872 10 | CCCC[C@H](CN(O)C=O)C(=O)[C@@H](NC(=O)c1nccc2ccccc21)C(C)C CHEMBL1643873 11 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = e3fp 8 | SOURCEDIR = source 9 | BUILDDIR = _build 10 | 11 | # Internal variables 12 | PAPEROPT_a4 = -D latex_paper_size=a4 13 | PAPEROPT_letter = -D latex_paper_size=letter 14 | ALLSPHINXOPTS = -d build/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source 15 | 16 | # Put it first so that "make" without argument is like "make help". 17 | help: 18 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 19 | 20 | .PHONY: help Makefile 21 | 22 | # Catch-all target: route all unknown targets to Sphinx using the new 23 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 24 | %: Makefile 25 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 26 | -------------------------------------------------------------------------------- /tests/test_dependencies.py: -------------------------------------------------------------------------------- 1 | """Integration tests for dependencies. 
2 | 3 | Author: Seth Axen 4 | E-mail: seth.axen@gmail.com 5 | """ 6 | 7 | 8 | class TestRequiredDependencies: 9 | def test_rdkit(self): 10 | import rdkit 11 | 12 | def test_numpy(self): 13 | import numpy 14 | 15 | def test_scipy(self): 16 | import scipy 17 | 18 | def test_murmurhash(self): 19 | import mmh3 20 | 21 | def test_python_utilities(self): 22 | import python_utilities 23 | 24 | 25 | class TestOptionalFeatureDependencies: 26 | def test_h5py(self): 27 | import h5py 28 | 29 | def test_standardiser(self): 30 | import standardiser 31 | 32 | 33 | class TestOptionalParallelDependencies: 34 | def test_mpi4py(self): 35 | import mpi4py 36 | 37 | def test_concurrent(self): 38 | import concurrent.futures 39 | 40 | def test_python_utilities(self): 41 | import python_utilities.parallel 42 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish 2 | 3 | on: 4 | release: 5 | types: 6 | - published 7 | 8 | jobs: 9 | build-test: 10 | runs-on: ubuntu-latest 11 | env: 12 | uv_version: "0.5.2" 13 | python_version: "3.13" 14 | steps: 15 | - uses: actions/checkout@v2 16 | - name: Setup MPI 17 | uses: mpi4py/setup-mpi@v1 18 | - name: Install uv 19 | uses: astral-sh/setup-uv@v3 20 | with: 21 | version: ${{ env.uv_version }} 22 | - name: Build the project 23 | run: uv build --no-sources --python ${{ env.python_version }} 24 | - name: Sync only the test dependencies 25 | run: uv sync --no-install-project --extra test 26 | - name: Install and test source distribution 27 | run: | 28 | uv pip install dist/*.tar.gz 29 | uv run --no-sync pytest 30 | uv pip uninstall e3fp 31 | - name: Install and test wheel 32 | run: | 33 | uv pip install dist/*.whl 34 | uv run --no-sync pytest 35 | - name: Publish to PyPI 36 | run: uv publish --token ${{ secrets.PYPI_API_TOKEN }} 37 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: 3 | push: 4 | branches: [master] 5 | pull_request: 6 | 7 | env: 8 | # Setting RDMAV_FORK_SAFE=1 to avoid libfabric EFA provider issues with 9 | # fork() on Python 3.9 and Ubuntu. 
10 | RDMAV_FORK_SAFE: 1 11 | 12 | jobs: 13 | test: 14 | name: Python ${{ matrix.python-version }} - ${{ matrix.os }} 15 | runs-on: ${{ matrix.os }} 16 | env: 17 | uv_version: "0.5.2" 18 | strategy: 19 | matrix: 20 | os: ["ubuntu-latest", "macos-latest"] 21 | python-version: ["3.9", "3.13"] 22 | fail-fast: false 23 | steps: 24 | - uses: actions/checkout@v2 25 | with: 26 | fetch-depth: 2 27 | - name: Setup MPI 28 | uses: mpi4py/setup-mpi@v1 29 | - name: Install uv 30 | uses: astral-sh/setup-uv@v3 31 | with: 32 | version: ${{ env.uv_version }} 33 | - name: Install the project 34 | run: uv sync --extra test --python ${{ matrix.python-version }} 35 | - name: Run tests 36 | run: uv run pytest --cov=e3fp --cov-report=xml 37 | - name: Upload coverage to Codecov 38 | uses: codecov/codecov-action@v5 39 | with: 40 | fail_ci_if_error: false 41 | -------------------------------------------------------------------------------- /doc/source/overview.rst: -------------------------------------------------------------------------------- 1 | Overview of E3FP 2 | ================ 3 | 4 | Introduction 5 | ------------ 6 | 7 | The Extended 3-Dimensional FingerPrint (E3FP) [1]_ is a 3D molecular 8 | fingerprinting method inspired by Extended Connectivity FingerPrints (ECFP) 9 | [2]_, integrating tightly with the RDKit_. It is developed by the 10 | `Keiser Lab`_ at UCSF_ and maintained primarily by `Seth Axen`_. 11 | 12 | For a thorough description of E3FP, please consult the original paper [1]_ and 13 | `paper repository`_ or :ref:`usage/index:Usage and Examples`. 14 | 15 | Documentation is hosted by ReadTheDocs_. 16 | 17 | Contributing 18 | ------------ 19 | 20 | Development occurs on GitHub_. 21 | Contributions, feature requests, and bug reports are greatly appreciated. 22 | Please consult the `issue tracker`_. 23 | 24 | License 25 | ------- 26 | E3FP is released under the |license_long| (|license|). 27 | 28 | Briefly, this means E3FP can be used in any manner without modification, 29 | with proper attribution. However, if the source code is modified for an 30 | application, this modified source must also be released under |license| so that 31 | the community may benefit. 32 | 33 | Citing E3FP 34 | ----------- 35 | 36 | To cite E3FP, please reference the original paper [1]_. 37 | 38 | .. rubric:: References 39 | 40 | .. [1] |axen2017| 41 | .. [2] |rogers2010| 42 | 43 | .. include:: substitutions.rst 44 | .. _GitHub: https://github.com/keiserlab/e3fp 45 | -------------------------------------------------------------------------------- /tests/test_util.py: -------------------------------------------------------------------------------- 1 | """Tests for util methods. 2 | 3 | Author: Seth Axen 4 | E-mail: seth.axen@gmail.com 5 | """ 6 | import pytest 7 | import warnings 8 | 9 | 10 | class TestUtil: 11 | def test_deprecated(self): 12 | from e3fp.util import deprecated, E3FPDeprecationWarning 13 | 14 | @deprecated("1.1", remove_version="1.3", msg="DEPRECATED!!!") 15 | def dep_method(): 16 | pass 17 | 18 | with warnings.catch_warnings(record=True) as w: 19 | warnings.simplefilter("always") 20 | dep_method() 21 | assert len(w) == 1 22 | assert issubclass(w[-1].category, E3FPDeprecationWarning) 23 | message = str(w[-1].message) 24 | assert "deprecated in 1.1" in message 25 | assert "removed in 1.3" in message 26 | assert "DEPRECATED!!!" in str(w[-1].message) 27 | 28 | assert "\t.. deprecated:: 1.1\n\t DEPRECATED!!!" 
in dep_method.__doc__ 29 | 30 | def test_efficiency_warning(self): 31 | from e3fp.util import E3FPEfficiencyWarning 32 | 33 | def test(warn=False): 34 | if warn: 35 | raise E3FPEfficiencyWarning("Inefficient!") 36 | 37 | with warnings.catch_warnings(record=True): 38 | warnings.simplefilter("error") 39 | test(warn=False) 40 | 41 | with pytest.raises(E3FPEfficiencyWarning): 42 | test(warn=True) 43 | -------------------------------------------------------------------------------- /doc/source/usage/fingerprints/comparison.rst: -------------------------------------------------------------------------------- 1 | Fingerprint Comparison 2 | ====================== 3 | 4 | The `e3fp.fingerprint.metrics` sub-package provides several useful methods for 5 | batch comparison of fingerprints in various representations. 6 | 7 | Fingerprint Metrics 8 | ------------------- 9 | 10 | These metrics operate directly on pairs of :py:class:`.Fingerprint` and 11 | :py:class:`.FingerprintDatabase` objects or on a combination of each. If 12 | only a single variable is specified, self-comparison is performed. The 13 | implemented methods are common functions for fingerprint similarity in the 14 | literature. 15 | 16 | .. todo:: 17 | 18 | Document examples 19 | 20 | Array Metrics 21 | ------------- 22 | 23 | To efficiently compare fingerprint databases above, we provide comparison 24 | metrics that can operate directly on the internal sparse matrix representation 25 | without the need to "densify it". We describe these here, as they have several 26 | additional features. 27 | 28 | The array metrics implemented in `e3fp.fingerprint.metrics.array_metrics` are 29 | implemented such that they may take any combination of dense and sparse inputs. 30 | Additionally, they are designed to function as 31 | `scikit-learn-compatible kernels `_ 32 | for machine learning tasks. For example, one might perform an analysis using a 33 | support vector machine (SVM) and Tanimoto kernel. 34 | 35 | .. code:: python 36 | 37 | >>> from sklearn.svm import SVC 38 | >>> from e3fp.fingerprint.metrics.array_metrics import tanimoto 39 | >>> clf = SVC(kernel=tanimoto) 40 | >>> clf.fit(X, y) 41 | ... 42 | >>> clf.predict(test) 43 | ... 44 | 45 | Most common fingerprint comparison metrics only apply to binary fingerprints. 46 | We include several that operate equally well on count- and float-based 47 | fingerprints. For example, to our knowledge, we provide the only open source 48 | implementation of Soergel similarity, the analog to the Tanimoto coefficient 49 | for non-binary fingerprints that can efficiently operate on sparse inputs. 50 | 51 | .. code:: python 52 | 53 | >>> from e3fp.fingerprint.metrics.array_metrics import soergel 54 | >>> clf = SVC(kernel=soergel) 55 | >>> clf.fit(X, y) 56 | ... 57 | >>> clf.predict(test) 58 | ... 59 | -------------------------------------------------------------------------------- /doc/source/substitutions.rst: -------------------------------------------------------------------------------- 1 | .. Common substitutions used throughout the documentation 2 | 3 | .. URLs 4 | .. _RDKit: http://www.rdkit.org 5 | .. _NumPy: https://www.numpy.org 6 | .. _SciPy: https://www.scipy.org 7 | .. _mmh3: https://pypi.python.org/pypi/mmh3 8 | .. _python_utilities: https://github.com/sdaxen/python_utilities 9 | .. _mpi4py: http://mpi4py.scipy.org 10 | .. _smart_open: https://github.com/RaRe-Technologies/smart_open 11 | .. _standardiser: https://wwwdev.ebi.ac.uk/chembl/extra/francis/standardiser 12 | .. 
_cxcalc: https://docs.chemaxon.com/display/CALCPLUGS/cxcalc+command+line+tool 13 | .. _h5py: http://www.h5py.org/ 14 | .. _numba: https://numba.pydata.org/ 15 | .. _Anaconda: https://anaconda.org/conda-forge/e3fp 16 | .. _uv: https://docs.astral.sh/uv/ 17 | .. _repository: https://github.com/keiserlab/e3fp 18 | .. _paper repository: https://github.com/keiserlab/e3fp-paper 19 | .. _issue tracker: https://github.com/keiserlab/e3fp/issues 20 | .. _ReadTheDocs: http://e3fp.readthedocs.io 21 | .. _Keiser Lab: http://www.keiserlab.org 22 | .. _UCSF: https://www.ucsf.edu 23 | .. _Seth Axen: http://sethaxen.com 24 | 25 | .. Badges 26 | .. |bioRxiv| image:: https://img.shields.io/badge/bioRxiv-136705-blue.svg 27 | :target: https://doi.org/10.1101/136705 28 | :alt: Access the preprint on bioRxiv 29 | 30 | .. References 31 | .. |axen2017_doi| image:: https://img.shields.io/badge/doi-10.1021/acs.jmedchem.7b00696-blue.svg 32 | :target: http://dx.doi.org/10.1021/acs.jmedchem.7b00696 33 | :alt: Access the paper 34 | .. |axen2017| replace:: Axen SD, Huang XP, Caceres EL, Gendelev L, Roth BL, Keiser MJ. A Simple Representation Of Three-Dimensional Molecular Structure. *J. Med. Chem.* **60** (17): 7393–7409 (2017). |axen2017_doi| |bioRxiv| 35 | .. |rogers2010_doi| image:: https://img.shields.io/badge/doi-10.1021/ci100050t-blue.svg 36 | :target: http://dx.doi.org/10.1021/ci100050t 37 | :alt: Access the paper 38 | .. |rogers2010| replace:: Rogers D & Hahn M. Extended-connectivity fingerprints. *J. Chem. Inf. Model.* **50**: 742-54 (2010). |rogers2010_doi| 39 | 40 | .. Misc 41 | .. |license_link| replace:: https://github.com/keiserlab/e3fp/blob/master/LICENSE.txt 42 | .. |license_long| replace:: `GNU Lesser General Public License version 3.0`_ 43 | .. _GNU Lesser General Public License version 3.0: https://github.com/keiserlab/e3fp/blob/master/LICENSE.txt 44 | .. |license| replace:: LGPLv3 45 | -------------------------------------------------------------------------------- /doc/source/install.rst: -------------------------------------------------------------------------------- 1 | Setup and Installation 2 | ====================== 3 | 4 | Dependencies 5 | ------------ 6 | 7 | E3FP is compatible with Python 3.x. It additionally has the 8 | following dependencies: 9 | 10 | Required 11 | ~~~~~~~~ 12 | 13 | - NumPy_ 14 | - SciPy_ 15 | - RDKit_ 16 | - mmh3_ 17 | - python_utilities_ 18 | - smart_open_ 19 | 20 | Optional 21 | ~~~~~~~~ 22 | 23 | The following packages are required for the specified features: 24 | 25 | - parallelization: 26 | 27 | + mpi4py_ 28 | 29 | - molecular standardisation: 30 | 31 | + standardiser_ 32 | 33 | - protonation states: 34 | 35 | + cxcalc_ 36 | 37 | - storing conformer energies: 38 | 39 | + h5py_ 40 | 41 | - faster fingerprint metric calculations: 42 | 43 | + numba_ 44 | 45 | 46 | Installation 47 | ------------ 48 | 49 | The following installation approaches are listed in order of recommendation. 50 | 51 | Option 1: Install with Pip 52 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ 53 | 54 | Basic installation: 55 | 56 | .. code:: bash 57 | 58 | $ pip install e3fp 59 | 60 | With optional dependencies: 61 | 62 | .. code:: bash 63 | 64 | $ pip install e3fp[optional] 65 | 66 | 67 | Option 2: Install from conda-forge 68 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 69 | 70 | E3FP is available on conda-forge. 71 | 72 | .. code:: bash 73 | 74 | $ conda create -n e3fp_env -c conda-forge e3fp 75 | $ conda activate e3fp_env 76 | 77 | To install optional dependencies: 78 | 79 | .. 
code:: bash 80 | 81 | $ conda install -c conda-forge mpi4py h5py standardiser 82 | 83 | Option 3: Install from source 84 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 85 | 86 | 1. Clone the repository: 87 | 88 | .. code:: bash 89 | 90 | $ git clone https://github.com/keiserlab/e3fp.git 91 | $ cd e3fp 92 | 93 | 2. Install for development in an already-activated environment. 94 | 95 | You can do this using pip: 96 | 97 | .. code:: bash 98 | 99 | $ pip install -e .[dev] 100 | 101 | Or use uv_ to set up a development environment: 102 | 103 | .. code:: bash 104 | 105 | $ uv sync --extra dev 106 | 107 | Testing 108 | ------- 109 | 110 | Run tests using pytest: 111 | 112 | .. code:: bash 113 | 114 | $ pip install pytest # if not already installed 115 | $ pytest e3fp 116 | 117 | 118 | .. include:: substitutions.rst 119 | -------------------------------------------------------------------------------- /doc/source/usage/config.rst: -------------------------------------------------------------------------------- 1 | Configuration 2 | ============= 3 | 4 | E3FP configurational parameters are stored in the widely used INI_ file 5 | format. These may be passed to :ref:`usage/cli:Command Line Interface` programs 6 | or parsed to Python dicts for :ref:`usage/pipeline:Pipeline Methods` or other 7 | lower-level functions. 8 | 9 | Loading Default Parameters 10 | -------------------------- 11 | 12 | The below example shows all default parameters, accessed via the 13 | :py:mod:`e3fp.config` module. 14 | 15 | .. literalinclude:: ../../../src/e3fp/config/defaults.cfg 16 | :caption: `defaults.cfg `_ 17 | 18 | :py:mod:`configparser` is used internally to parse and store these 19 | config parameters. 20 | 21 | >>> from e3fp.config.params import default_params 22 | >>> default_params 23 | 24 | >>> print(default_params.sections()) 25 | ['preprocessing', 'conformer_generation', 'fingerprinting'] 26 | >>> default_params.items('fingerprinting') 27 | [('bits', '1024'), ('level', '5'), ('first', '3'), ('radius_multiplier', '1.718'), ('stereo', 'True'), ('counts', 'False'), ('include_disconnected', 'True'), ('rdkit_invariants', 'False'), ('merge_duplicate_substructs', 'True'), ('exclude_floating', 'True')] 28 | 29 | Parsing User-Provided Parameters 30 | -------------------------------- 31 | 32 | A user may provide a custom config file. 33 | 34 | .. literalinclude:: ../examples/data/new_params.cfg 35 | :caption: new_params.cfg 36 | 37 | .. doctest:: 38 | 39 | >>> from e3fp.config.params import read_params 40 | >>> config = read_params("source/examples/data/new_params.cfg") 41 | >>> config.items('fingerprinting') 42 | [('bits', '4096'), ('first', '10')] 43 | 44 | When passing these parameters to any downstream methods, default options will 45 | be used except where these options are specified. 46 | 47 | Converting Parameters to Argument Dicts 48 | --------------------------------------- 49 | 50 | To pass the parameters to Python methods for fingerprinting and conformer 51 | generation, we need to convert them to Python dicts. 52 | 53 | >>> from e3fp.pipeline import params_to_dicts 54 | >>> confgen_params, fprint_params = params_to_dicts(config) 55 | >>> fprint_params 56 | {'bits': 4096, 'first': 10} 57 | 58 | .. 
_INI: https://en.wikipedia.org/wiki/INI_file 59 | 60 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["flit_core >=3.2,<4"] 3 | build-backend = "flit_core.buildapi" 4 | 5 | [project] 6 | name = "e3fp" 7 | version = "1.2.7" 8 | requires-python = ">=3.9, <3.14" 9 | description = "Molecular 3D fingerprinting" 10 | readme = "README.rst" 11 | authors = [ 12 | {name = "Seth Axen", email = "seth.axen@gmail.com"}, 13 | ] 14 | license = {file = "LICENSE.txt"} 15 | keywords = ["e3fp", "3d", "molecule", "fingerprint", "conformer"] 16 | classifiers = [ 17 | "Programming Language :: Python", 18 | "Programming Language :: Python :: 3.9", 19 | "Programming Language :: Python :: 3.10", 20 | "Programming Language :: Python :: 3.11", 21 | "Programming Language :: Python :: 3.12", 22 | "Programming Language :: Python :: 3.13", 23 | "License :: OSI Approved :: GNU Lesser General Public License v3 (LGPLv3)", 24 | "Operating System :: OS Independent", 25 | "Development Status :: 4 - Beta", 26 | "Intended Audience :: Science/Research", 27 | "Intended Audience :: Developers", 28 | "Topic :: Scientific/Engineering :: Chemistry", 29 | "Topic :: Software Development :: Libraries :: Python Modules", 30 | ] 31 | dependencies = [ 32 | "mmh3>=2.3.1", 33 | "numpy>=1.11.3", 34 | "rdkit>=2016.03.4", 35 | "scipy>=0.18.0", 36 | "sdaxen_python_utilities>=0.1.5", 37 | "smart_open>=1.8.3", 38 | ] 39 | 40 | [project.optional-dependencies] 41 | optional = [ 42 | "h5py", 43 | "mpi4py", 44 | "numba", 45 | "six", # needed by standardiser, but not listed as a dependency 46 | "standardiser", 47 | ] 48 | test = [ 49 | "mock", 50 | "pytest", 51 | "pytest-cov", 52 | "e3fp[optional]", 53 | ] 54 | docs = [ 55 | "sphinx", 56 | "sphinxcontrib-programoutput", 57 | "sphinx-rtd-theme", 58 | ] 59 | dev = [ 60 | "e3fp[docs]", 61 | "e3fp[test]", 62 | ] 63 | 64 | [project.urls] 65 | Homepage = "https://github.com/keiserlab/e3fp" 66 | Download = "https://github.com/keiserlab/e3fp/tarball/{version}" 67 | 68 | [project.scripts] 69 | e3fp-fingerprint = "e3fp.fingerprint.generate:main" 70 | e3fp-conformer = "e3fp.conformer.generate:main" 71 | 72 | [tool.pytest.ini_options] 73 | addopts = "-ra -q" 74 | testpaths = ["e3fp/test"] 75 | 76 | # https://github.com/astral-sh/uv/issues/6281 77 | [tool.uv] 78 | constraint-dependencies = ["numba>=0.60.0"] 79 | # Resolve dependencies separately for each Python version 80 | environments = [ 81 | "python_version>='3.13'", 82 | "python_version=='3.12'", 83 | "python_version=='3.11'", 84 | "python_version=='3.10'", 85 | "python_version=='3.9'", 86 | ] 87 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | E3FP: Extended 3-Dimensional FingerPrint 2 | ======================================== 3 | 4 | |Docs Status| |CI Status| |Codecov Status| |PyPi Version| |Conda Version| |License| 5 | 6 | E3FP [1]_ is a 3D molecular fingerprinting method inspired by Extended 7 | Connectivity FingerPrints (ECFP) [2]_, integrating tightly with the RDKit_. 8 | 9 | Documentation is hosted by ReadTheDocs_, and development occurs on GitHub_. 10 | 11 | Installation and Usage 12 | ---------------------- 13 | 14 | For installation and usage instructions, see the 15 | `documentation `__. 
16 | 17 | See the E3FP `paper repository`_ for an application of E3FP and all code used 18 | for the E3FP paper [1]_. 19 | 20 | License 21 | ------- 22 | 23 | E3FP is available under the `GNU Lesser General Public License version 3.0 24 | `_ (LGPLv3). See the 25 | `documentation `__ 26 | for more details. 27 | 28 | 29 | References 30 | ---------- 31 | 32 | .. [1] |axen2017| 33 | .. [2] |rogers2010| 34 | 35 | .. substitutions 36 | 37 | .. _RDKit: http://www.rdkit.org 38 | .. _GitHub: https://github.com/keiserlab/e3fp 39 | .. _paper repository: https://github.com/keiserlab/e3fp-paper 40 | .. _ReadTheDocs: http://e3fp.readthedocs.io 41 | .. |axen2017_doi| image:: https://img.shields.io/badge/doi-10.1021/acs.jmedchem.7b00696-blue.svg 42 | :target: http://dx.doi.org/10.1021/acs.jmedchem.7b00696 43 | :alt: Access the paper 44 | .. |axen2017| replace:: Axen SD, Huang XP, Caceres EL, Gendelev L, Roth BL, Keiser MJ. A Simple Representation Of Three-Dimensional Molecular Structure. *J. Med. Chem.* **60** (17): 7393–7409 (2017). |axen2017_doi| |bioRxiv| 45 | .. |rogers2010_doi| image:: https://img.shields.io/badge/doi-10.1021/ci100050t-blue.svg 46 | :target: http://dx.doi.org/10.1021/ci100050t 47 | :alt: Access the paper 48 | .. |rogers2010| replace:: Rogers D & Hahn M. Extended-connectivity fingerprints. *J. Chem. Inf. Model.* **50**: 742-54 (2010). |rogers2010_doi| 49 | .. |CI Status| image:: https://github.com/keiserlab/e3fp/actions/workflows/ci.yml/badge.svg 50 | :target: https://github.com/keiserlab/e3fp/actions?query=workflow%3ACI 51 | :alt: CI Status 52 | .. |Docs Status| image:: http://readthedocs.org/projects/e3fp/badge/?version=latest 53 | :target: http://e3fp.readthedocs.io/en/latest/?badge=latest 54 | :alt: Documentation Status 55 | .. |Codecov Status| image:: https://codecov.io/github/keiserlab/e3fp/coverage.svg?branch=master 56 | :target: https://codecov.io/github/keiserlab/e3fp?branch=master 57 | :alt: Code Coverage 58 | .. |PyPi Version| image:: https://img.shields.io/pypi/v/e3fp.svg 59 | :target: https://pypi.python.org/pypi/e3fp 60 | :alt: Package on PyPi 61 | .. |Conda Version| image:: https://img.shields.io/conda/v/conda-forge/e3fp.svg 62 | :target: https://anaconda.org/conda-forge/e3fp 63 | :alt: Package on Anaconda 64 | .. |License| image:: https://img.shields.io/badge/license-LGPLv3-blue.svg 65 | :target: https://github.com/keiserlab/e3fp/blob/master/LICENSE.txt 66 | .. |bioRxiv| image:: https://img.shields.io/badge/bioRxiv-136705-blue.svg 67 | :target: https://doi.org/10.1101/136705 68 | :alt: Access the preprint on bioRxiv 69 | -------------------------------------------------------------------------------- /src/e3fp/pipeline.py: -------------------------------------------------------------------------------- 1 | """Functions for various pipeline use cases. 
2 | 3 | Author: Seth Axen 4 | E-mail: seth.axen@gmail.com 5 | """ 6 | from .config.params import params_to_sections_dict 7 | from .conformer.util import mol_from_smiles, mol_from_sdf, mol_to_sdf 8 | from .conformer.generate import generate_conformers 9 | from .fingerprint.generate import fprints_dict_from_mol 10 | 11 | 12 | def params_to_dicts(params): 13 | """Get params dicts for pipeline functions from INI format params file.""" 14 | sections_dict = params_to_sections_dict(params, auto=True) 15 | 16 | # preproc_params will eventually be returned separately, when there's a 17 | # pipeline function for protonation 18 | preproc_params = sections_dict.get("preprocessing", {}) 19 | confgen_params = sections_dict.get("conformer_generation", {}) 20 | confgen_params.update(preproc_params) 21 | fprint_params = sections_dict.get("fingerprinting", {}) 22 | return confgen_params, fprint_params 23 | 24 | 25 | def confs_from_smiles(smiles, name, confgen_params={}, save=False): 26 | """Generate conformations of molecule from SMILES string.""" 27 | mol = mol_from_smiles(smiles, name) 28 | confgen_result = generate_conformers( 29 | mol, name, save=save, **confgen_params 30 | ) 31 | mol = confgen_result[0] 32 | return mol 33 | 34 | 35 | def sdf_from_smiles( 36 | smiles, name, confgen_params={}, out_file=None, out_ext=".sdf.bz2" 37 | ): 38 | """Generate conformations from SMILES string and save to SDF file.""" 39 | mol = confs_from_smiles( 40 | smiles, name, confgen_params=confgen_params, save=False 41 | ) 42 | if out_file is None: 43 | out_file = name + out_ext 44 | mol_to_sdf(mol, out_file) 45 | 46 | 47 | def fprints_from_fprints_dict(fprints_dict, level=-1): 48 | """Get fingerprint at `level` from dict of level to fingerprint.""" 49 | fprints_list = fprints_dict.get( 50 | level, fprints_dict[max(fprints_dict.keys())] 51 | ) 52 | return fprints_list 53 | 54 | 55 | def fprints_from_mol(mol, fprint_params={}, save=False): 56 | """Generate fingerprints for all `first` conformers in mol.""" 57 | fprints_dict = fprints_dict_from_mol(mol, save=save, **fprint_params) 58 | level = fprint_params.get("level", -1) 59 | fprints_list = fprints_from_fprints_dict(fprints_dict, level=level) 60 | return fprints_list 61 | 62 | 63 | def fprints_from_smiles( 64 | smiles, name, confgen_params={}, fprint_params={}, save=False 65 | ): 66 | """Generate conformers and fingerprints from a SMILES string.""" 67 | if save is False and "first" not in confgen_params: 68 | confgen_params["first"] = fprint_params.get("first", -1) 69 | mol = confs_from_smiles( 70 | smiles, name, confgen_params=confgen_params, save=save 71 | ) 72 | fprints_list = fprints_from_mol( 73 | mol, fprint_params=fprint_params, save=save 74 | ) 75 | return fprints_list 76 | 77 | 78 | def fprints_from_sdf(sdf_file, fprint_params={}, save=False): 79 | """Generate fingerprints from conformers in an SDF file.""" 80 | mol = mol_from_sdf(sdf_file) 81 | fprints_list = fprints_from_mol( 82 | mol, fprint_params=fprint_params, save=save 83 | ) 84 | return fprints_list 85 | -------------------------------------------------------------------------------- /tests/test_conformer.py: -------------------------------------------------------------------------------- 1 | """Tests for conformer generation. 
2 | 3 | Author: Seth Axen 4 | E-mail: seth.axen@gmail.com 5 | """ 6 | 7 | import os 8 | 9 | DATA_DIR = os.path.join(os.path.dirname(__file__), "data") 10 | SDF_FILE_COMPRESSED = os.path.join(DATA_DIR, "ritalin_nonplanar.sdf.bz2") 11 | SDF_FILE_UNCOMPRESSED = os.path.join(DATA_DIR, "ritalin_nonplanar.sdf") 12 | 13 | class TestConformer: 14 | def test_standardisation(self): 15 | import rdkit.Chem 16 | from e3fp.conformer.util import ( 17 | mol_from_smiles, 18 | mol_to_standardised_mol, 19 | ) 20 | 21 | smiles = "C[N-]c1cccc[n+]1C" 22 | mol = mol_from_smiles(smiles, "tmp") 23 | assert rdkit.Chem.MolToSmiles(mol) == smiles 24 | 25 | mol = mol_to_standardised_mol(mol) 26 | assert rdkit.Chem.MolToSmiles(mol) == "CN=c1ccccn1C" 27 | 28 | def test_default_is_unseeded(self): 29 | import rdkit.Chem 30 | from rdkit.Chem import AllChem 31 | from e3fp.conformer.util import ( 32 | mol_from_smiles, 33 | mol_to_standardised_mol, 34 | ) 35 | from e3fp.conformer.generate import generate_conformers 36 | 37 | ntrials = 10 38 | confgen_params = {"num_conf": 1} 39 | smiles = "C" * 20 # long flexible molecule 40 | mol = mol_from_smiles(smiles, "tmp") 41 | mols = [ 42 | generate_conformers(mol, **confgen_params)[0] 43 | for i in range(ntrials) 44 | ] 45 | 46 | fail = True 47 | for i in range(ntrials): 48 | for j in range(i + 1, ntrials): 49 | rms = AllChem.GetBestRMS(mols[i], mols[j]) 50 | if rms > 1e-2: 51 | fail = False 52 | break 53 | assert not fail 54 | 55 | def test_seed_produces_same_conformers(self): 56 | import rdkit.Chem 57 | from rdkit.Chem import AllChem 58 | from e3fp.conformer.util import ( 59 | mol_from_smiles, 60 | mol_to_standardised_mol, 61 | ) 62 | from e3fp.conformer.generate import generate_conformers 63 | 64 | ntrials = 10 65 | confgen_params = {"num_conf": 1, "seed": 42} 66 | smiles = "C" * 20 # long flexible molecule 67 | mol = mol_from_smiles(smiles, "tmp") 68 | mols = [ 69 | generate_conformers(mol, **confgen_params)[0] 70 | for i in range(ntrials) 71 | ] 72 | 73 | fail = False 74 | for i in range(ntrials): 75 | for j in range(i + 1, ntrials): 76 | rms = AllChem.GetBestRMS(mols[i], mols[j]) 77 | if rms > 1e-2: 78 | fail = True 79 | break 80 | assert not fail 81 | 82 | def test_compressed_sdf_reads_same_as_uncompressed(self): 83 | from rdkit import Chem 84 | from e3fp.conformer.util import mol_from_sdf 85 | 86 | sdf_files = [SDF_FILE_COMPRESSED, SDF_FILE_UNCOMPRESSED] 87 | smiles = [Chem.MolToSmiles(mol_from_sdf(f)) for f in sdf_files] 88 | assert smiles[0] == smiles[1] 89 | 90 | def test_conformer_generation_without_name(self): 91 | from e3fp.conformer.util import mol_from_smiles 92 | from e3fp.conformer.generate import generate_conformers 93 | 94 | confgen_params = {"num_conf": 1, "seed": 42} 95 | smiles = "C" * 20 # long flexible molecule 96 | mol = mol_from_smiles(smiles, "tmp") 97 | mol.ClearProp("_Name") 98 | assert not mol.HasProp("_Name") 99 | generate_conformers(mol, **confgen_params) 100 | -------------------------------------------------------------------------------- /doc/source/usage/pipeline.rst: -------------------------------------------------------------------------------- 1 | Pipeline Methods 2 | ================ 3 | 4 | E3FP can be easily plugged into an existing pipeline using the methods in the 5 | `e3fp.pipeline` module. Each of these methods wraps functionality in other 6 | modules for generating various outputs from inputs and specified options. 7 | 8 | .. 
note:: 9 | 10 | As fingerprinting many molecules is embarrassingly parallel, we highly 11 | recommend employing a parallelization strategy. We use our own 12 | python_utilities_ package. 13 | 14 | First we must choose configuration options. See :ref:`usage/config:Configuration` for 15 | detailed instructions. Here we will use defaults for all but a few options. 16 | 17 | .. testsetup:: * 18 | 19 | smiles_file = "source/examples/data/test_smiles.smi" 20 | 21 | .. doctest:: 22 | 23 | >>> fprint_params = {'bits': 4096, 'radius_multiplier': 1.5, 'rdkit_invariants': True} 24 | >>> confgen_params = {'max_energy_diff': 20.0, 'first': 3} 25 | >>> smiles = "COC(=O)C(C1CCCCN1)C2=CC=CC=C2" 26 | 27 | Generating Conformers from SMILES 28 | --------------------------------- 29 | 30 | The following code snippet generates a multi-conformer molecule: 31 | 32 | >>> from e3fp.pipeline import confs_from_smiles 33 | >>> mol = confs_from_smiles(smiles, "ritalin", confgen_params=confgen_params) 34 | >>> mol.GetNumConformers() 35 | 3 36 | 37 | This produces the following conformers: 38 | 39 | .. image:: ../_static/ritalin3d.png 40 | :width: 300px 41 | :height: 300px 42 | :alt: ritalin conformers 43 | 44 | Generating Fingerprints from Conformers 45 | --------------------------------------- 46 | 47 | >>> from e3fp.pipeline import fprints_from_mol 48 | >>> fprints = fprints_from_mol(mol, fprint_params=fprint_params) 49 | >>> len(fprints) 50 | 3 51 | >>> fprints[0] 52 | Fingerprint(indices=array([188, 224, ..., 3775, 4053]), level=5, bits=4096, name=ritalin_0) 53 | >>> fprints[1] 54 | Fingerprint(indices=array([125, 188, ..., 3693, 4053]), level=5, bits=4096, name=ritalin_1) 55 | >>> fprints[2] 56 | Fingerprint(indices=array([188, 206, ..., 3743, 4053]), level=5, bits=4096, name=ritalin_2) 57 | 58 | Generating Fingerprints from SMILES 59 | ----------------------------------- 60 | 61 | >>> from e3fp.pipeline import fprints_from_smiles 62 | >>> fprints = fprints_from_smiles(smiles, "ritalin", confgen_params=confgen_params, fprint_params=fprint_params) 63 | >>> fprints[0] 64 | Fingerprint(indices=array([188, 224, ..., 3775, 4053]), level=5, bits=4096, name=ritalin_0) 65 | 66 | Parallel Fingerprinting 67 | ----------------------- 68 | 69 | The following script demonstrates use of python_utilities_ for fingerprinting 70 | all SDF files in a directory in parallel. This essentially is the same as the 71 | :ref:`usage/cli:Command Line Interface`, albeit with a less convenient interface. 72 | 73 | >>> from glob import glob 74 | >>> from python_utilities.parallel import Parallelizer 75 | >>> from e3fp.conformer.util import smiles_to_dict 76 | >>> smiles_dict = smiles_to_dict(smiles_file) 77 | >>> print(smiles_dict) 78 | {'CHEMBL1643866': 'CCCC[C@H](CN(O)C=O)C(=O)[C@@H](NC(=O)C(C)C)C(C)C', ...} 79 | >>> len(smiles_dict) 80 | 10 81 | >>> smiles_iter = ((smiles, name) for name, smiles in smiles_dict.items()) 82 | >>> kwargs = {"confgen_params": confgen_params, "fprint_params": fprint_params} 83 | >>> parallelizer = Parallelizer(parallel_mode="processes") 84 | >>> fprints_list = parallelizer.run(fprints_from_smiles, smiles_iter, kwargs=kwargs) # doctest: +SKIP 85 | >>> len(fprints_list) # doctest: +SKIP 86 | 10 87 | 88 | For all pipeline methods, please see the `e3fp.pipeline` module API. 89 | 90 | .. 
include:: ../substitutions.rst 91 | -------------------------------------------------------------------------------- /tests/data/ritalin_nonplanar.sdf: -------------------------------------------------------------------------------- 1 | ZINC00896711 2 | -OEChem-11081520323D 3 | 4 | 37 38 0 1 0 0 0 0 0999 V2000 5 | -0.0173 1.4248 0.0099 C 0 0 0 0 0 0 0 0 0 0 0 0 6 | 0.0021 -0.0041 0.0020 O 0 0 0 0 0 0 0 0 0 0 0 0 7 | -1.1855 -0.6297 0.0100 C 0 0 0 0 0 0 0 0 0 0 0 0 8 | -2.2076 0.0145 0.0232 O 0 0 0 0 0 0 0 0 0 0 0 0 9 | -1.2439 -2.1355 0.0025 C 0 0 2 0 0 0 0 0 0 0 0 0 10 | -0.7531 -2.5137 -0.8943 H 0 0 0 0 0 0 0 0 0 0 0 0 11 | -2.6831 -2.5824 0.0138 C 0 0 0 0 0 0 0 0 0 0 0 0 12 | -3.5122 -2.2166 1.0577 C 0 0 0 0 0 0 0 0 0 0 0 0 13 | -4.8323 -2.6265 1.0681 C 0 0 0 0 0 0 0 0 0 0 0 0 14 | -5.3235 -3.4019 0.0344 C 0 0 0 0 0 0 0 0 0 0 0 0 15 | -4.4946 -3.7670 -1.0099 C 0 0 0 0 0 0 0 0 0 0 0 0 16 | -3.1756 -3.3535 -1.0225 C 0 0 0 0 0 0 0 0 0 0 0 0 17 | -0.5311 -2.6798 1.2421 C 0 0 1 0 0 0 0 0 0 0 0 0 18 | -1.0223 -2.3014 2.1385 H 0 0 0 0 0 0 0 0 0 0 0 0 19 | -0.5921 -4.2087 1.2346 C 0 0 0 0 0 0 0 0 0 0 0 0 20 | 0.1222 -4.7487 2.4770 C 0 0 0 0 0 0 0 0 0 0 0 0 21 | 1.5613 -4.2254 2.4942 C 0 0 0 0 0 0 0 0 0 0 0 0 22 | 1.5425 -2.6958 2.4549 C 0 0 0 0 0 0 0 0 0 0 0 0 23 | 0.8702 -2.2430 1.2312 N 0 3 0 0 0 0 0 0 0 0 0 0 24 | 1.0053 1.8021 0.0021 H 0 0 0 0 0 0 0 0 0 0 0 0 25 | -0.5445 1.7859 -0.8732 H 0 0 0 0 0 0 0 0 0 0 0 0 26 | -0.5275 1.7763 0.9067 H 0 0 0 0 0 0 0 0 0 0 0 0 27 | -3.1285 -1.6108 1.8652 H 0 0 0 0 0 0 0 0 0 0 0 0 28 | -5.4799 -2.3413 1.8840 H 0 0 0 0 0 0 0 0 0 0 0 0 29 | -6.3547 -3.7228 0.0429 H 0 0 0 0 0 0 0 0 0 0 0 0 30 | -4.8782 -4.3731 -1.8173 H 0 0 0 0 0 0 0 0 0 0 0 0 31 | -2.5290 -3.6356 -1.8402 H 0 0 0 0 0 0 0 0 0 0 0 0 32 | -1.6332 -4.5315 1.2442 H 0 0 0 0 0 0 0 0 0 0 0 0 33 | -0.1010 -4.5882 0.3386 H 0 0 0 0 0 0 0 0 0 0 0 0 34 | -0.3992 -4.4111 3.3727 H 0 0 0 0 0 0 0 0 0 0 0 0 35 | 0.1309 -5.8383 2.4477 H 0 0 0 0 0 0 0 0 0 0 0 0 36 | 2.0594 -4.5600 3.4041 H 0 0 0 0 0 0 0 0 0 0 0 0 37 | 2.0971 -4.6052 1.6243 H 0 0 0 0 0 0 0 0 0 0 0 0 38 | 1.0067 -2.3170 3.3253 H 0 0 0 0 0 0 0 0 0 0 0 0 39 | 2.5655 -2.3199 2.4673 H 0 0 0 0 0 0 0 0 0 0 0 0 40 | 1.3372 -2.6344 0.4270 H 0 0 0 0 0 0 0 0 0 0 0 0 41 | 0.9071 -1.2358 1.1819 H 0 0 0 0 0 0 0 0 0 0 0 0 42 | 1 2 1 0 0 0 0 43 | 1 20 1 0 0 0 0 44 | 1 21 1 0 0 0 0 45 | 1 22 1 0 0 0 0 46 | 2 3 1 0 0 0 0 47 | 3 4 2 0 0 0 0 48 | 3 5 1 0 0 0 0 49 | 5 6 1 0 0 0 0 50 | 5 7 1 0 0 0 0 51 | 5 13 1 0 0 0 0 52 | 7 12 2 0 0 0 0 53 | 7 8 1 0 0 0 0 54 | 8 9 2 0 0 0 0 55 | 8 23 1 0 0 0 0 56 | 9 10 1 0 0 0 0 57 | 9 24 1 0 0 0 0 58 | 10 11 2 0 0 0 0 59 | 10 25 1 0 0 0 0 60 | 11 12 1 0 0 0 0 61 | 11 26 1 0 0 0 0 62 | 12 27 1 0 0 0 0 63 | 13 14 1 0 0 0 0 64 | 13 19 1 0 0 0 0 65 | 13 15 1 0 0 0 0 66 | 15 16 1 0 0 0 0 67 | 15 28 1 0 0 0 0 68 | 15 29 1 0 0 0 0 69 | 16 17 1 0 0 0 0 70 | 16 30 1 0 0 0 0 71 | 16 31 1 0 0 0 0 72 | 17 18 1 0 0 0 0 73 | 17 32 1 0 0 0 0 74 | 17 33 1 0 0 0 0 75 | 18 19 1 0 0 0 0 76 | 18 34 1 0 0 0 0 77 | 18 35 1 0 0 0 0 78 | 19 36 1 0 0 0 0 79 | 19 37 1 0 0 0 0 80 | M CHG 1 19 1 81 | M END 82 | $$$$ 83 | -------------------------------------------------------------------------------- /doc/source/usage/cli.rst: -------------------------------------------------------------------------------- 1 | Command Line Interface 2 | ====================== 3 | 4 | Command line interfaces (CLI) are provided for the two most common tasks: 5 | conformer generation and fingerprinting. 
6 | When e3fp is installed, the CLI commands are available as ``e3fp-conformer`` and 7 | ``e3fp-fingerprint``. 8 | 9 | Conformer Generation CLI 10 | ------------------------ 11 | 12 | To see all available options, run 13 | 14 | .. command-output:: e3fp-conformer --help 15 | :shell: 16 | 17 | We will generate conformers for the molecule whose SMILES string is defined in 18 | ``caffeine.smi``. 19 | 20 | .. literalinclude:: ../examples/data/caffeine.smi 21 | :caption: caffeine.smi 22 | 23 | The below example generates at most 3 conformers for this molecule. 24 | 25 | .. code-block:: shell-session 26 | 27 | $ e3fp-conformer -s caffeine.smi --num_conf 3 -o ./ 28 | 2017-07-17 00:11:05,743|WARNING|Only 1 processes available. 'mpi' mode not available. 29 | 2017-07-17 00:11:05,748|INFO|num_proc is not specified. 'processes' mode will use all 8 processes 30 | 2017-07-17 00:11:05,748|INFO|Parallelizer initialized with mode 'processes' and 8 processors. 31 | 2017-07-17 00:11:05,748|INFO|Input type: Detected SMILES file(s) 32 | 2017-07-17 00:11:05,748|INFO|Input file number: 1 33 | 2017-07-17 00:11:05,748|INFO|Parallel Type: processes 34 | 2017-07-17 00:11:05,748|INFO|Out Directory: ./ 35 | 2017-07-17 00:11:05,749|INFO|Overwrite Existing Files: False 36 | 2017-07-17 00:11:05,749|INFO|Target Conformer Number: 3 37 | 2017-07-17 00:11:05,749|INFO|First Conformers Number: all 38 | 2017-07-17 00:11:05,749|INFO|Pool Multiplier: 1 39 | 2017-07-17 00:11:05,749|INFO|RMSD Cutoff: 0.5 40 | 2017-07-17 00:11:05,749|INFO|Maximum Energy Difference: None 41 | 2017-07-17 00:11:05,749|INFO|Forcefield: UFF 42 | 2017-07-17 00:11:05,749|INFO|Starting. 43 | 2017-07-17 00:11:05,779|INFO|Generating conformers for caffeine. 44 | 2017-07-17 00:11:05,823|INFO|Generated 1 conformers for caffeine. 45 | 2017-07-17 00:11:05,829|INFO|Saved conformers for caffeine to ./caffeine.sdf.bz2. 46 | 47 | The result is a multi-conformer SDF file called ``caffeine.sdf.bz2`` in the 48 | current directory. 49 | 50 | Fingerprinting CLI 51 | ------------------ 52 | 53 | To see all available options, run 54 | 55 | .. command-output:: e3fp-fingerprint --help 56 | :shell: 57 | 58 | To continue the above example, we will fingerprint our caffeine conformers. 59 | 60 | .. code-block:: shell-session 61 | 62 | $ e3fp-fingerprint caffeine.sdf.bz2 --bits 1024 63 | 2017-07-17 00:12:33,797|WARNING|Only 1 processes available. 'mpi' mode not available. 64 | 2017-07-17 00:12:33,801|INFO|num_proc is not specified. 'processes' mode will use all 8 processes 65 | 2017-07-17 00:12:33,801|INFO|Parallelizer initialized with mode 'processes' and 8 processors. 66 | 2017-07-17 00:12:33,801|INFO|Initializing E3FP generation. 67 | 2017-07-17 00:12:33,801|INFO|Getting SDF files 68 | 2017-07-17 00:12:33,801|INFO|SDF File Number: 1 69 | 2017-07-17 00:12:33,802|INFO|Database File: fingerprints.fpz 70 | 2017-07-17 00:12:33,802|INFO|Max First Conformers: 3 71 | 2017-07-17 00:12:33,802|INFO|Bits: 1024 72 | 2017-07-17 00:12:33,802|INFO|Level/Max Iterations: 5 73 | 2017-07-17 00:12:33,802|INFO|Shell Radius Multiplier: 1.718 74 | 2017-07-17 00:12:33,802|INFO|Stereo Mode: True 75 | 2017-07-17 00:12:33,802|INFO|Connected-only mode: on 76 | 2017-07-17 00:12:33,802|INFO|Invariant type: Daylight 77 | 2017-07-17 00:12:33,802|INFO|Parallel Mode: processes 78 | 2017-07-17 00:12:33,802|INFO|Starting 79 | 2017-07-17 00:12:33,829|INFO|Generating fingerprints for caffeine. 80 | 2017-07-17 00:12:33,935|INFO|Generated 1 fingerprints for caffeine. 
81 | 2017-07-17 00:12:34,011|INFO|Saved FingerprintDatabase with fingerprints to fingerprints.fpz 82 | 83 | The result is a file ``fingerprints.fpz`` containing a 84 | :py:class:`.FingerprintDatabase`. To use such a database, consult 85 | :ref:`usage/fingerprints/storage:Fingerprint Storage`. 86 | -------------------------------------------------------------------------------- /src/e3fp/util.py: -------------------------------------------------------------------------------- 1 | """Utility classes/methods. 2 | 3 | Author: Seth Axen 4 | E-mail: seth.axen@gmail.com 5 | """ 6 | import inspect 7 | import warnings 8 | 9 | 10 | class E3FPError(Exception): 11 | """Base class for E3FP-specific errors. 12 | 13 | This class is provided for future E3FP-specific functionality. 14 | """ 15 | 16 | 17 | class E3FPWarning(Warning): 18 | """Base E3FP warning class. 19 | 20 | Unlike normal warnings, these are by default always set to on. 21 | """ 22 | 23 | 24 | # Always show custom warnings for this package 25 | warnings.filterwarnings("always", category=E3FPWarning) 26 | 27 | 28 | class E3FPDeprecationWarning(E3FPWarning, DeprecationWarning): 29 | """A warning class for a deprecated method or class.""" 30 | 31 | 32 | class E3FPEfficiencyWarning(E3FPWarning, RuntimeWarning): 33 | """A warning class for a potentially inefficient process.""" 34 | 35 | 36 | def maybe_jit(*args, **kwargs): 37 | """Decorator to jit a function using Numba if available. 38 | 39 | Usage is identical to `numba.jit`. 40 | """ 41 | def wrapper(func): 42 | try: 43 | import numba 44 | has_numba = True 45 | except ImportError: 46 | has_numba = False 47 | 48 | if has_numba: 49 | return numba.jit(*args, **kwargs)(func) 50 | else: 51 | return func 52 | return wrapper 53 | 54 | 55 | class deprecated(object): 56 | """Decorator to mark a function as deprecated. 57 | 58 | Issue a deprecation warning when a function is called, and update the 59 | documentation. A deprecation version must be provided. 60 | 61 | Examples 62 | -------- 63 | >>> from e3fp.util import deprecated 64 | >>> @deprecated("1.1", remove_version="1.3", 65 | ... msg="Function no longer needed") 66 | ... def my_function(): 67 | ... pass 68 | 69 | Notes 70 | ----- 71 | Adapted from https://wiki.python.org/moin/PythonDecoratorLibrary 72 | """ 73 | 74 | def __init__(self, deprecated_version, remove_version=None, msg=None): 75 | """Constructor. 76 | 77 | Parameters 78 | ---------- 79 | deprecated_version : str 80 | Version in which object was deprecated (e.g. '1.1') 81 | remove_version : str, optional 82 | Version in which object will be removed (e.g. '1.2'). If not 83 | specified, it is assumed the object will be removed in the next 84 | release (e.g. '1.2' if `deprecated_version` is '1.1') 85 | msg : str, optional 86 | Message to include with deprecation warning, to explain deprecation 87 | or point to newer version. 
88 | """ 89 | self.deprecated_version = deprecated_version 90 | if remove_version is None: 91 | version_info = deprecated_version.split(".") 92 | version_info[1] = str(int(version_info[1]) + 1) 93 | for i in range(2, len(version_info)): 94 | version_info[i] = "0" 95 | remove_version = ".".join(version_info) 96 | self.remove_version = remove_version 97 | if msg is None: 98 | self.extra = "" 99 | else: 100 | self.extra = " {0}".format(msg) 101 | 102 | def __call__(self, obj): 103 | if inspect.isfunction(obj): 104 | return self.deprecate_function(obj) 105 | else: 106 | raise ValueError("Deprecated object is not a function.") 107 | 108 | def deprecate_function(self, f): 109 | """Return the decorated function.""" 110 | msg = ( 111 | "Function `{0}` was deprecated in {1} and will be removed " 112 | "in {2}.{3}" 113 | ).format( 114 | f.__name__, 115 | self.deprecated_version, 116 | self.remove_version, 117 | self.extra, 118 | ) 119 | 120 | def new_func(*args, **kwargs): 121 | warnings.warn(msg, category=E3FPDeprecationWarning, stacklevel=2) 122 | return f(*args, **kwargs) 123 | 124 | new_func.__name__ = f.__name__ 125 | new_func.__dict__ = f.__dict__ 126 | new_func.__doc__ = f.__doc__ 127 | self.update_docstring(new_func) 128 | return new_func 129 | 130 | def update_docstring(self, obj): 131 | """Add deprecation note to docstring.""" 132 | # print(obj.__doc__) 133 | msg = ( 134 | f"\t.. deprecated:: {self.deprecated_version}\n" 135 | f"\t {self.extra}" 136 | ) 137 | obj.__doc__ = f"{obj.__doc__}\n\n{msg}" 138 | return obj 139 | -------------------------------------------------------------------------------- /doc/source/usage/fingerprints/storage.rst: -------------------------------------------------------------------------------- 1 | Fingerprint Storage 2 | =================== 3 | 4 | The most efficient way to store and interact with fingerprints is through the 5 | :py:class:`.FingerprintDatabase` class. This class wraps a matrix with 6 | sparse rows (:py:class:`scipy.sparse.csr_matrix`), where each row is a 7 | fingerprint. This enables rapid I/O of the database while also minimizing the 8 | memory footprint. Accessing the underlying sparse representation with the 9 | :py:attr:`.FingerprintDatabase.array` attribute is convenient for machine learning 10 | purposes, while the database class itself provides several useful functions. 11 | 12 | .. note:: 13 | 14 | We strongly recommend upgrading to at least SciPy v1.0.0 when working with 15 | large fingerprint databases, as old versions are much slower and have 16 | several bugs for database loading. 17 | 18 | 19 | Database I/O and Indexing 20 | ------------------------- 21 | 22 | See the full :py:class:`.FingerprintDatabase` documentation for a 23 | description of basic database usage, attributes, and methods. Below, several 24 | additional use cases are documented. 25 | 26 | Batch Database Operations 27 | ------------------------- 28 | 29 | Due to the sparse representation of the underlying data structure, an un- 30 | folded database, a database with unfolded fingerprints does not use 31 | significantly more disk space than a database with folded fingerprints. However, 32 | it is usually necessary to fold fingerprints for machine learning tasks. The 33 | :py:class:`.FingerprintDatabase` does this very quickly. 34 | 35 | .. testsetup:: 36 | 37 | import numpy as np 38 | np.random.seed(3) 39 | 40 | .. 
doctest:: 41 | 42 | >>> from e3fp.fingerprint.db import FingerprintDatabase 43 | >>> from e3fp.fingerprint.fprint import Fingerprint 44 | >>> import numpy as np 45 | >>> db = FingerprintDatabase(fp_type=Fingerprint, name="TestDB") 46 | >>> print(db) 47 | FingerprintDatabase[name: TestDB, fp_type: Fingerprint, level: -1, bits: None, fp_num: 0] 48 | >>> on_inds = [np.random.uniform(0, 2**32, size=30) for i in range(5)] 49 | >>> fps = [Fingerprint(x, bits=2**32) for x in on_inds] 50 | >>> db.add_fingerprints(fps) 51 | >>> print(db) 52 | FingerprintDatabase[name: TestDB, fp_type: Fingerprint, level: -1, bits: 4294967296, fp_num: 5] 53 | >>> db.get_density() 54 | 6.984919309616089e-09 55 | >>> fold_db = db.fold(1024) 56 | >>> print(fold_db) 57 | FingerprintDatabase[name: TestDB, fp_type: Fingerprint, level: -1, bits: 1024, fp_num: 5] 58 | >>> fold_db.get_density() 59 | 0.0287109375 60 | 61 | A database can be converted to a different fingerprint type: 62 | 63 | >>> from e3fp.fingerprint.fprint import CountFingerprint 64 | >>> count_db = db.as_type(CountFingerprint) 65 | >>> print(count_db) 66 | FingerprintDatabase[name: TestDB, fp_type: CountFingerprint, level: -1, bits: 4294967296, fp_num: 5] 67 | >>> count_db[0] 68 | CountFingerprint(counts={2977004690: 1, ..., 3041471738: 1}, level=-1, bits=4294967296, name=None) 69 | 70 | The :py:func:`e3fp.fingerprint.db.concat` method allows efficient joining of multiple 71 | databases. 72 | 73 | >>> from e3fp.fingerprint.db import concat 74 | >>> dbs = [] 75 | >>> for i in range(10): 76 | ... db = FingerprintDatabase(fp_type=Fingerprint) 77 | ... on_inds = [np.random.uniform(0, 1024, size=30) for j in range(5)] 78 | ... fps = [Fingerprint(x, bits=2**32, name="Mol{}".format(i)) for x in on_inds] 79 | ... db.add_fingerprints(fps) 80 | ... dbs.append(db) 81 | >>> dbs[0][0] 82 | Fingerprint(indices=array([94, 97, ..., 988, 994]), level=-1, bits=4294967296, name=Mol0) 83 | >>> print(dbs[0]) 84 | FingerprintDatabase[name: None, fp_type: Fingerprint, level: -1, bits: 4294967296, fp_num: 5] 85 | >>> merge_db = concat(dbs) 86 | >>> print(merge_db) 87 | FingerprintDatabase[name: None, fp_type: Fingerprint, level: -1, bits: 4294967296, fp_num: 50] 88 | 89 | Database Comparison 90 | ------------------- 91 | 92 | Two databases may be compared using various metrics in 93 | :py:mod:`e3fp.fingerprint.metrics`. Additionally, all fingerprints in a database 94 | may be compared to each other simply by only providing a single database. 95 | See :ref:`usage/fingerprints/comparison:Fingerprint Comparison` for more details. 96 | 97 | Performing Machine Learning on the Database 98 | ------------------------------------------- 99 | 100 | The underlying sparse matrix may be passed directly to machine learning tools 101 | in any package that is compatible with SciPy sparse matrices, such as 102 | `scikit-learn `_. 103 | 104 | >>> from sklearn.naive_bayes import BernoulliNB 105 | >>> clf = BernoulliNB() 106 | >>> clf.fit(db.array, ypred) # doctest: +SKIP 107 | BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True) 108 | >>> clf.predict(db2.array) # doctest: +SKIP 109 | ... 110 | -------------------------------------------------------------------------------- /src/e3fp/fingerprint/metrics/fprint_metrics.py: -------------------------------------------------------------------------------- 1 | """Fingerprint comparison metrics. 
2 | 3 | Author: Seth Axen 4 | E-mail: seth.axen@gmail.com 5 | """ 6 | from __future__ import division 7 | 8 | import numpy as np 9 | from ..fprint import CountFingerprint, diff_counts_dict 10 | 11 | 12 | def tanimoto(fp1, fp2): 13 | """Calculate Tanimoto coefficient between fingerprints. 14 | 15 | Parameters 16 | ---------- 17 | fp1 : Fingerprint 18 | Fingerprint 1 19 | fp2 : Fingerprint 20 | Fingerprint 2 21 | 22 | Returns 23 | ------- 24 | float : Tanimoto coefficient. 25 | """ 26 | try: 27 | intersect = np.intersect1d( 28 | fp1.indices, fp2.indices, assume_unique=True 29 | ).shape[0] 30 | return intersect / (fp1.bit_count + fp2.bit_count - intersect) 31 | except ZeroDivisionError: 32 | return 0.0 33 | 34 | 35 | def soergel(fp1, fp2): 36 | """Calculate Soergel similarity between fingerprints. 37 | 38 | Soergel similarity is the complement of Soergel distance and can be 39 | thought of as the analog of the Tanimoto coefficient for count/float-based 40 | fingerprints. For `Fingerprint`, it is equivalent to the Tanimoto 41 | coefficient. 42 | 43 | Parameters 44 | ---------- 45 | fp1 : Fingerprint 46 | Fingerprint 1 47 | fp2 : Fingerprint 48 | Fingerprint 2 49 | 50 | Returns 51 | ------- 52 | float : Soergel similarity. 53 | 54 | Reference 55 | ------- 56 | 57 | """ 58 | if not ( 59 | isinstance(fp1, CountFingerprint) and isinstance(fp2, CountFingerprint) 60 | ): 61 | return tanimoto(fp1, fp2) 62 | 63 | counts_diff = diff_counts_dict(fp1, fp2) 64 | temp = np.asarray( 65 | [ 66 | (abs(counts_diff[x]), max(fp1.get_count(x), fp2.get_count(x))) 67 | for x in counts_diff.keys() 68 | ], 69 | dtype=float, 70 | ).T 71 | soergel = 1 - np.sum(temp[0, :]) / np.sum(temp[1, :]) 72 | 73 | return soergel 74 | 75 | 76 | def dice(fp1, fp2): 77 | """Calculate Dice coefficient between fingerprints. 78 | 79 | Parameters 80 | ---------- 81 | fp1 : Fingerprint 82 | Fingerprint 1 83 | fp2 : Fingerprint 84 | Fingerprint 2 85 | 86 | Returns 87 | ------- 88 | float : Dice coefficient. 89 | """ 90 | try: 91 | intersect = np.intersect1d( 92 | fp1.indices, fp2.indices, assume_unique=True 93 | ).shape[0] 94 | return 2 * intersect / (fp1.bit_count + fp2.bit_count) 95 | except ZeroDivisionError: 96 | return 0.0 97 | 98 | 99 | def cosine(fp1, fp2): 100 | """Calculate cosine similarity between fingerprints. 101 | 102 | Parameters 103 | ---------- 104 | fp1 : Fingerprint 105 | Fingerprint 1 106 | fp2 : Fingerprint 107 | Fingerprint 2 108 | 109 | Returns 110 | ------- 111 | float : Cosine similarity. 112 | """ 113 | try: 114 | dot = sum(v * fp2.get_count(k) for k, v in fp1.counts.items()) 115 | root_norm = ( 116 | sum(v ** 2 for v in fp1.counts.values()) 117 | * sum(v ** 2 for v in fp2.counts.values()) 118 | ) ** 0.5 119 | return dot / root_norm 120 | except ZeroDivisionError: 121 | return 0.0 122 | 123 | 124 | def pearson(fp1, fp2): 125 | """Calculate Pearson correlation between fingerprints. 126 | 127 | Parameters 128 | ---------- 129 | fp1 : Fingerprint 130 | Fingerprint 1 131 | fp2 : Fingerprint 132 | Fingerprint 2 133 | 134 | Returns 135 | ------- 136 | float : Pearson correlation. 
137 | """ 138 | try: 139 | dot = sum(v * fp2.get_count(k) for k, v in fp1.counts.items()) 140 | return (dot / fp1.bits - fp1.mean() * fp2.mean()) / ( 141 | fp1.std() * fp2.std() 142 | ) 143 | except ZeroDivisionError: 144 | return 0.0 145 | 146 | # intersect = np.intersect1d(fp1.indices, fp2.indices, 147 | # assume_unique=True).shape[0] 148 | # return ((intersect / fp1.bits) - 149 | # ((fp1.mean() * fp2.mean()) / (fp1.std() * fp2.std()))) 150 | 151 | 152 | def hamming(fp1, fp2): 153 | """Calculate Hamming distance between fingerprints. 154 | 155 | Parameters 156 | ---------- 157 | fp1 : Fingerprint 158 | Fingerprint 1 159 | fp2 : Fingerprint 160 | Fingerprint 2 161 | 162 | Returns 163 | ------- 164 | float : Hamming distance. 165 | """ 166 | intersect = np.intersect1d( 167 | fp1.indices, fp2.indices, assume_unique=True 168 | ).shape[0] 169 | return fp1.bit_count + fp2.bit_count - 2 * intersect 170 | 171 | 172 | def distance(fp1, fp2): 173 | """Calculate Euclidean distance between fingerprints. 174 | 175 | Parameters 176 | ---------- 177 | fp1 : Fingerprint 178 | Fingerprint 1 179 | fp2 : Fingerprint 180 | Fingerprint 2 181 | 182 | Returns 183 | ------- 184 | float : Euclidian distance. 185 | """ 186 | return hamming(fp1, fp2) ** 0.5 187 | -------------------------------------------------------------------------------- /tests/test_fingerprint.py: -------------------------------------------------------------------------------- 1 | """Tests for E3FP fingerprints. 2 | 3 | Author: Seth Axen 4 | E-mail: seth.axen@gmail.com 5 | """ 6 | import pytest 7 | 8 | class TestFingerprintIO: 9 | def test_fprint_from_indices(self): 10 | from e3fp.fingerprint.fprint import ( 11 | Fingerprint, 12 | CountFingerprint, 13 | FloatFingerprint, 14 | ) 15 | 16 | for fp_type in (Fingerprint, CountFingerprint, FloatFingerprint): 17 | in_indices = [3, 1, 4, 5] 18 | bits = 32 19 | fprint = fp_type.from_indices(in_indices, bits=bits) 20 | assert sorted(in_indices) == sorted(fprint.indices) 21 | 22 | def test_fprint_from_fprint(self): 23 | from e3fp.fingerprint.fprint import ( 24 | Fingerprint, 25 | CountFingerprint, 26 | FloatFingerprint, 27 | ) 28 | 29 | for fp_type in (Fingerprint, CountFingerprint, FloatFingerprint): 30 | in_indices = [3, 1, 4, 5, 1, 5, 9] 31 | bits = 32 32 | fprint1 = fp_type.from_indices(in_indices, bits=bits) 33 | fprint2 = fp_type.from_fingerprint(fprint1) 34 | assert fprint1 == fprint2 35 | 36 | def test_countfprint_from_counts(self): 37 | from e3fp.fingerprint.fprint import CountFingerprint 38 | 39 | in_counts = {3: 1, 1: 4, 5: 1} 40 | bits = 32 41 | fprint = CountFingerprint.from_counts(in_counts, bits=bits) 42 | out_counts = fprint.counts 43 | assert in_counts == out_counts 44 | 45 | def test_floatfprint_from_counts(self): 46 | from e3fp.fingerprint.fprint import FloatFingerprint 47 | 48 | in_counts = {3: 1.0, 1: 4.0, 5: 1.0} 49 | bits = 32 50 | fprint = FloatFingerprint.from_counts(in_counts, bits=bits) 51 | out_counts = fprint.counts 52 | assert in_counts == out_counts 53 | 54 | def test_unique_indices(self): 55 | from e3fp.fingerprint.fprint import ( 56 | Fingerprint, 57 | CountFingerprint, 58 | FloatFingerprint, 59 | ) 60 | 61 | for fp_type in (Fingerprint, CountFingerprint, FloatFingerprint): 62 | in_indices = [3, 1, 4, 5, 1, 5, 9] 63 | bits = 32 64 | fprint = fp_type.from_indices(in_indices, bits=bits) 65 | assert sorted(set(in_indices)) == sorted(fprint.indices) 66 | 67 | def test_bitstring_io(self): 68 | from e3fp.fingerprint.fprint import ( 69 | Fingerprint, 70 | CountFingerprint, 71 
| FloatFingerprint, 72 | ) 73 | 74 | for fp_type in (Fingerprint, CountFingerprint, FloatFingerprint): 75 | in_bitstring = "1001001111011000" 76 | fprint = fp_type.from_bitstring(in_bitstring) 77 | out_bitstring = fprint.to_bitstring() 78 | assert in_bitstring == out_bitstring 79 | 80 | def test_vector_io(self): 81 | from e3fp.fingerprint.fprint import ( 82 | Fingerprint, 83 | CountFingerprint, 84 | FloatFingerprint, 85 | ) 86 | import numpy as np 87 | 88 | for fp_type in (Fingerprint, CountFingerprint, FloatFingerprint): 89 | in_vector = np.array([0, 0, 1, 0, 1, 0, 1, 0, 0], dtype=np.bool_) 90 | fprint = fp_type.from_vector(in_vector) 91 | out_vector = fprint.to_vector(sparse=False) 92 | np.testing.assert_array_equal(in_vector, out_vector) 93 | 94 | def test_rdkit_io(self): 95 | from e3fp.fingerprint.fprint import ( 96 | Fingerprint, 97 | CountFingerprint, 98 | FloatFingerprint, 99 | ) 100 | 101 | for fp_type in (Fingerprint, CountFingerprint, FloatFingerprint): 102 | indices = [3, 1, 4, 5] 103 | bits = 32 104 | fprint1 = fp_type.from_indices(indices, bits=bits) 105 | rdkit_fprint1 = fprint1.to_rdkit() 106 | fprint2 = fp_type.from_rdkit(rdkit_fprint1) 107 | rdkit_fprint2 = fprint2.to_rdkit() 108 | assert rdkit_fprint1 == rdkit_fprint2 109 | 110 | def test_basic_properties(self): 111 | from e3fp.fingerprint.fprint import ( 112 | Fingerprint, 113 | CountFingerprint, 114 | FloatFingerprint, 115 | ) 116 | import numpy as np 117 | 118 | bits = 1024 119 | for i in range(10): 120 | indices = np.random.randint(0, bits, 30) 121 | unique_inds = np.unique(indices) 122 | level = int(np.random.randint(0, 10)) 123 | for fp_type in (Fingerprint, CountFingerprint, FloatFingerprint): 124 | fp = fp_type.from_indices(indices, bits=bits, level=level) 125 | assert fp.bits == bits 126 | assert len(fp) == bits 127 | assert fp.bit_count == unique_inds.size 128 | assert fp.density == pytest.approx(float(unique_inds.size) / bits) 129 | 130 | 131 | class TestFingerprintAlgebra: 132 | pass 133 | 134 | 135 | class TestFingerprintComparison: 136 | pass 137 | -------------------------------------------------------------------------------- /src/e3fp/fingerprint/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | """Efficient comparison metrics for fingerprints and their databases. 2 | 3 | Author: Seth Axen 4 | E-mail: seth.axen@gmail.com 5 | """ 6 | import logging 7 | 8 | from ..fprint import Fingerprint 9 | from ..util import E3FPBitsValueError 10 | from ..db import FingerprintDatabase 11 | from . import array_metrics 12 | from . import fprint_metrics 13 | 14 | 15 | def tanimoto(A, B=None): 16 | """Compute Tanimoto coefficients between fingerprints. 17 | 18 | Fingerprints must have same number of bits. If not bit-fingerprints, 19 | arrays will be cast to binary. For non-binary data, use `soergel`. If only 20 | one fingerprint/database is provided, it is compared to self. 21 | 22 | Parameters 23 | ---------- 24 | A, B : Fingerprint or FingerprintDatabase 25 | Fingerprint(s) to be compared 26 | 27 | Returns 28 | ------- 29 | tanimoto : float or ndarray [shape (num_fps_A, num_fps_B)] 30 | Pairwise tanimoto(s) between fingerprint(s) in `A` and `B`. 
31 | 32 | See Also 33 | -------- 34 | cosine, dice, pearson, soergel 35 | """ 36 | A, B = _check_item_pair(A, B, fp_type=Fingerprint) 37 | if isinstance(A, Fingerprint): 38 | return fprint_metrics.tanimoto(A, B) 39 | return array_metrics.tanimoto(A.array, B.array) 40 | 41 | 42 | def soergel(A, B=None): 43 | """Compute Soergel similarities between fingerprints. 44 | 45 | Soergel similarity is the complement of the Soergel distance and is 46 | analogous to the Tanimoto coefficient for count/float fingerprints. For 47 | binary data, it is equivalent to `tanimoto`. 48 | 49 | Parameters 50 | ---------- 51 | A, B : Fingerprint or FingerprintDatabase 52 | Fingerprint(s) to be compared 53 | 54 | Returns 55 | ------- 56 | soergel : float or ndarray [shape (num_fps_A, num_fps_B)] 57 | 58 | See Also 59 | -------- 60 | cosine, dice, pearson, tanimoto 61 | 62 | """ 63 | A, B = _check_item_pair(A, B) 64 | if isinstance(A, Fingerprint): 65 | return fprint_metrics.soergel(A, B) 66 | return array_metrics.soergel(A.array, B.array) 67 | 68 | 69 | def dice(A, B=None): 70 | """Compute Dice coefficients between fingerprints. 71 | 72 | Fingerprints must have same number of bits. If not bit-fingerprints, 73 | arrays will be cast to binary. If only one fingerprint/database is 74 | provided, it is compared to self. 75 | 76 | Parameters 77 | ---------- 78 | A, B : Fingerprint or FingerprintDatabase 79 | Fingerprint(s) to be compared 80 | 81 | Returns 82 | ------- 83 | dice : float or ndarray [shape (num_fps_A, num_fps_B)] 84 | 85 | See Also 86 | -------- 87 | cosine, pearson, soergel, tanimoto 88 | """ 89 | A, B = _check_item_pair(A, B, fp_type=Fingerprint) 90 | if isinstance(A, Fingerprint): 91 | return fprint_metrics.dice(A, B) 92 | return array_metrics.dice(A.array, B.array) 93 | 94 | 95 | def cosine(A, B=None): 96 | """Compute cosine similarities between fingerprints. 97 | 98 | Fingerprints must have same number of bits. If only one 99 | fingerprint/database is provided, it is compared to self. 100 | 101 | Parameters 102 | ---------- 103 | A, B : Fingerprint or FingerprintDatabase 104 | Fingerprint(s) to be compared 105 | 106 | Returns 107 | ------- 108 | cosine : float or ndarray [shape (num_fps_A, num_fps_B)] 109 | 110 | See Also 111 | -------- 112 | dice, pearson, soergel, tanimoto 113 | """ 114 | A, B = _check_item_pair(A, B) 115 | if isinstance(A, Fingerprint): 116 | return fprint_metrics.cosine(A, B) 117 | return array_metrics.cosine(A.array, B.array) 118 | 119 | 120 | def pearson(A, B=None): 121 | """Compute Pearson correlation between fingerprints. 122 | 123 | Fingerprints must have same number of bits. If only one 124 | fingerprint/database is provided, it is compared to self. 
125 | 126 | Parameters 127 | ---------- 128 | A, B : Fingerprint or FingerprintDatabase 129 | Fingerprint(s) to be compared 130 | 131 | Returns 132 | ------- 133 | pearson : float or ndarray [shape (num_fps_A, num_fps_B)] 134 | 135 | See Also 136 | -------- 137 | cosine, dice, soergel, tanimoto 138 | """ 139 | A, B = _check_item_pair(A, B) 140 | if isinstance(A, Fingerprint): 141 | return fprint_metrics.pearson(A, B) 142 | return array_metrics.pearson(A.array, B.array) 143 | 144 | 145 | def _check_item(item, fp_type=None, force_db=False): 146 | if force_db and isinstance(item, Fingerprint): 147 | if not fp_type: 148 | fp_type = item.__class__ 149 | db = FingerprintDatabase(fp_type=fp_type) 150 | db.add_fingerprints([item]) 151 | item = db 152 | elif fp_type and isinstance(item, FingerprintDatabase): 153 | logging.debug( 154 | "Casting database fingerprints to {}.".format(fp_type.__name__) 155 | ) 156 | item = item.as_type(fp_type, copy=False) 157 | return item 158 | 159 | 160 | def _check_item_pair(A, B, fp_type=None, force_db=False): 161 | try: 162 | if B is not None and A.bits != B.bits: 163 | raise E3FPBitsValueError( 164 | "Fingerprints must have same number of bits." 165 | ) 166 | except AttributeError: 167 | raise TypeError("Items must be Fingerprint or FingerprintDatabase.") 168 | if isinstance(A, FingerprintDatabase) or isinstance( 169 | B, FingerprintDatabase 170 | ): 171 | force_db = True 172 | A = _check_item(A, fp_type=fp_type, force_db=force_db) 173 | if B is None: 174 | B = A 175 | else: 176 | B = _check_item(B, fp_type=fp_type, force_db=force_db) 177 | return A, B 178 | -------------------------------------------------------------------------------- /src/e3fp/config/params.py: -------------------------------------------------------------------------------- 1 | """Get E3FP default parameters and read parameters from files. 2 | 3 | Author: Seth Axen 4 | E-mail: seth.axen@gmail.com 5 | """ 6 | import os 7 | import copy 8 | import ast 9 | 10 | from configparser import ( 11 | ConfigParser, 12 | NoSectionError, 13 | DuplicateSectionError, 14 | ) 15 | 16 | CONFIG_DIR = os.path.dirname(os.path.realpath(__file__)) 17 | DEF_PARAM_FILE = os.path.join(CONFIG_DIR, "defaults.cfg") 18 | 19 | 20 | def read_params(params=None, fill_defaults=False): 21 | """Get combination of provided parameters and default parameters. 22 | 23 | Parameters 24 | ---------- 25 | params : str or ConfigParser, optional 26 | User provided parameters as an INI file or `ConfigParser`. 27 | Any parameters provided will replace default parameters. 28 | fill_defaults : bool, optional 29 | Fill values that aren't provided with package defaults, if `params` 30 | is file. 31 | 32 | Returns 33 | ------- 34 | all_params : ConfigParser 35 | Combination of default and user-provided parameters. 36 | """ 37 | if isinstance(params, ConfigParser): 38 | return copy.copy(params) 39 | 40 | params_list = [] 41 | if fill_defaults: 42 | params_list.append(DEF_PARAM_FILE) 43 | if params is not None: 44 | params_list.append(params) 45 | 46 | all_params = ConfigParser() 47 | all_params.read(params_list) 48 | 49 | return all_params 50 | 51 | 52 | def write_params(params, params_file="params.cfg"): 53 | """Write params to file. 
54 | 55 | Parameters 56 | ---------- 57 | params : ConfigParser 58 | Params 59 | params_file : str 60 | Params file 61 | """ 62 | with open(params_file, "w") as f: 63 | params.write(f) 64 | 65 | 66 | def get_value( 67 | params, section_name, param_name, dtype=str, auto=False, fallback=None 68 | ): 69 | """Get value from params with fallback. 70 | 71 | Parameters 72 | ---------- 73 | params : ConfigParser 74 | Parameters 75 | section_name : str 76 | Name of section in `params` 77 | param_name : str 78 | Name of parameter in `section` 79 | dtype : type, optional 80 | Type to return data as. 81 | auto : bool, optional 82 | Auto-discover type of value. If provided, `dtype` is ignored. 83 | fallback : any, optional 84 | Value to return if getting value fails. 85 | 86 | Returns 87 | ------- 88 | value : any 89 | Value of parameter or `fallback`. 90 | """ 91 | if auto: 92 | try: 93 | value = params.get(section_name, param_name) 94 | except ValueError: 95 | return fallback 96 | 97 | try: 98 | return ast.literal_eval(value) 99 | except (ValueError, SyntaxError): 100 | return value 101 | else: 102 | get_function = params.get 103 | if dtype is int: 104 | get_function = params.getint 105 | elif dtype is float: 106 | get_function = params.getfloat 107 | elif dtype is bool: 108 | get_function = params.getboolean 109 | 110 | try: 111 | return get_function(section_name, param_name) 112 | except ValueError: 113 | return fallback 114 | 115 | 116 | def get_default_value(*args, **kwargs): 117 | global default_params 118 | return get_value(default_params, *args, **kwargs) 119 | 120 | 121 | def update_params( 122 | params_dict, params=None, section_name=None, fill_defaults=False 123 | ): 124 | """Set `ConfigParser` values from a sections dict. 125 | 126 | Sections dict key must be parameter sections, and value must be dict 127 | matching parameter name to value. If existing `ConfigParser` is 128 | provided, parameter values are updated. 129 | 130 | Parameters 131 | ---------- 132 | params_dict : dict 133 | If `section_name` is provided, dict must match parameter names to 134 | values. If `section_name` is not provided, dict key(s) must be 135 | parameter sections, and value(s) must be parameter dict. 136 | params : ConfigParser, optional 137 | Existing parameters. 138 | section_name : str, optional 139 | Name of section to which to add parameters in `params_dict` 140 | fill_defaults : bool, optional 141 | Fill values that aren't provided with package defaults, if `params` 142 | is file. 143 | """ 144 | if params is None: 145 | params = ConfigParser() 146 | else: 147 | params = read_params(params, fill_defaults=fill_defaults) 148 | 149 | if section_name is not None: 150 | try: 151 | params.add_section(section_name) 152 | except DuplicateSectionError: 153 | pass 154 | 155 | for param_name, param_value in params_dict.items(): 156 | params.set(section_name, param_name, str(param_value)) 157 | else: 158 | sections_dict = params_dict 159 | for section_name, params_dict in sections_dict.items(): 160 | for param_name, param_value in params_dict.items(): 161 | params.set(section_name, param_name, param_value) 162 | return params 163 | 164 | 165 | def params_to_sections_dict(params, auto=True): 166 | """Get dict of sections dicts in params, with optional type discovery. 167 | 168 | Parameters 169 | ---------- 170 | params : str or ConfigParser 171 | Params to read 172 | auto : bool, optional 173 | Auto typing of parameter values. 
174 | 175 | Returns 176 | ---------- 177 | dict : dict matching sections to parameters to values. 178 | """ 179 | params = read_params(params) 180 | sections = default_params.sections() 181 | params_dicts = {} 182 | for section in sections: 183 | try: 184 | params_dict = dict(params.items(section)) 185 | except NoSectionError: 186 | continue 187 | if auto: 188 | params_dict = { 189 | param_name: get_value(params, section, param_name, auto=True) 190 | for param_name in params_dict 191 | } 192 | params_dicts[section] = params_dict 193 | return params_dicts 194 | 195 | 196 | default_params = read_params(fill_defaults=True) 197 | -------------------------------------------------------------------------------- /src/e3fp/conformer/protonation.py: -------------------------------------------------------------------------------- 1 | """Functions for generating protonation states of molecules. 2 | 3 | Author: Seth Axen 4 | E-mail: seth.axen@gmail.com 5 | """ 6 | import os 7 | import tempfile 8 | import subprocess 9 | import itertools 10 | import logging 11 | 12 | from .util import iter_to_smiles, MolItemName 13 | 14 | 15 | def smiles_dict_to_proto_smiles_dict( 16 | in_smiles_dict, 17 | max_states=3, 18 | pka=7.4, 19 | dist_cutoff=20.0, 20 | add_missing=False, 21 | parallelizer=None, 22 | chunk_size=100, 23 | ): 24 | """Generate dict of SMILES for protonated states from SMILES dict.""" 25 | kwargs = {"max_states": max_states, "pka": pka, "dist_cutoff": dist_cutoff} 26 | in_smiles_iter = ( 27 | (smiles, mol_name) for mol_name, smiles in in_smiles_dict.items() 28 | ) 29 | if parallelizer is None: 30 | proto_smiles_iter = iter( 31 | smiles_list_to_proto_smiles_list(in_smiles_iter, **kwargs) 32 | ) 33 | else: 34 | smiles_chunks_iter = ( 35 | (chunk,) 36 | for chunk in _chunk_iter_to_lists( 37 | in_smiles_iter, chunk_size=chunk_size 38 | ) 39 | ) 40 | results_iter = ( 41 | result 42 | for result, data in parallelizer.run_gen( 43 | smiles_list_to_proto_smiles_list, 44 | smiles_chunks_iter, 45 | kwargs=kwargs, 46 | ) 47 | if result is not False 48 | ) 49 | proto_smiles_iter = itertools.chain.from_iterable(results_iter) 50 | 51 | proto_smiles_dict = { 52 | mol_name: smiles for smiles, mol_name in proto_smiles_iter 53 | } 54 | if add_missing: 55 | for mol_name, smiles in in_smiles_dict.items(): 56 | proto_name = MolItemName(mol_name, proto_state_num=0).proto_name 57 | if proto_name not in proto_smiles_dict: 58 | logging.debug( 59 | ( 60 | "Protonated SMILES for {} could not be generated. " 61 | "Returning input SMILES." 
62 | ).format(mol_name) 63 | ) 64 | proto_smiles_dict[mol_name] = smiles 65 | 66 | return proto_smiles_dict 67 | 68 | 69 | def smiles_list_to_proto_smiles_list( 70 | in_smiles_list, max_states=3, pka=7.4, dist_cutoff=20.0 71 | ): 72 | """Generate list of SMILES for protonated states from single SMILES.""" 73 | in_smiles_file = tempfile.mkstemp(suffix=".smi")[1] 74 | iter_to_smiles( 75 | in_smiles_file, 76 | ((mol_name, smiles) for smiles, mol_name in in_smiles_list), 77 | ) 78 | logging.debug("Protonating SMILES in %s" % (in_smiles_file)) 79 | proc = subprocess.Popen( 80 | ( 81 | "cxcalc %s --ignore-error dominanttautomerdistribution -H %g -C " 82 | 'false -t dist -f "smiles:n,T:dist"' 83 | ).format(in_smiles_file, pka), 84 | shell=True, 85 | stdout=subprocess.PIPE, 86 | ) 87 | 88 | proto_smiles_list = [] 89 | try: 90 | stdout_iter = iter(proc.stdout.readline, b"") 91 | next(stdout_iter) 92 | curr_mol_name = None 93 | curr_states_count = 0 94 | for line in stdout_iter: 95 | try: 96 | smiles, mol_name, dist = line.rstrip("\r\n").split() 97 | except ValueError: 98 | logging.warning("Error parsing line:\n%s" % line) 99 | continue 100 | if mol_name != curr_mol_name: 101 | curr_states_count = 0 102 | curr_mol_name = mol_name 103 | if curr_states_count >= max_states: 104 | continue 105 | if float(dist) > dist_cutoff: 106 | proto_name = MolItemName( 107 | mol_name, proto_state_num=curr_states_count 108 | ).proto_name 109 | curr_states_count += 1 110 | proto_smiles_list.append((smiles, proto_name)) 111 | logging.debug("Finished protonating SMILES in %s" % (in_smiles_file)) 112 | except Exception: 113 | logging.exception("Error running cxcalc", exc_info=True) 114 | 115 | proc.kill() 116 | os.remove(in_smiles_file) 117 | return proto_smiles_list 118 | 119 | 120 | def smiles_to_proto_smiles( 121 | smiles, mol_name, max_states=3, pka=7.4, dist_cutoff=20.0 122 | ): 123 | """Generate list of SMILES for protonated states from single SMILES. 124 | 125 | This is very inefficient in batch. 
126 | """ 127 | logging.debug("Protonating SMILES in %s" % (mol_name)) 128 | proc = subprocess.Popen( 129 | ( 130 | 'cxcalc "%s %s" --ignore-error dominanttautomerdistribution -H %g ' 131 | '-C false -t dist -f "smiles:n,T:dist"' 132 | ).format(smiles, mol_name, pka), 133 | shell=True, 134 | stdout=subprocess.PIPE, 135 | ) 136 | states_count = 0 137 | proto_smiles_list = [] 138 | try: 139 | stdout_iter = iter(proc.stdout.readline, b"") 140 | next(stdout_iter) 141 | for line in stdout_iter: 142 | try: 143 | this_smiles, this_name, dist = line.rstrip("\r\n").split() 144 | except ValueError: 145 | logging.warning("Error parsing line:\n%s" % line) 146 | continue 147 | if states_count >= max_states: 148 | break 149 | if float(dist) > dist_cutoff: 150 | proto_name = MolItemName( 151 | mol_name, proto_state_num=states_count 152 | ).proto_name 153 | states_count += 1 154 | proto_smiles_list.append((smiles, proto_name)) 155 | logging.debug("Finished protonating SMILES in %s" % (mol_name)) 156 | except OSError: 157 | logging.exception( 158 | "Error running cxcalc on %s" % (mol_name), exc_info=True 159 | ) 160 | 161 | proc.kill() 162 | return proto_smiles_list 163 | 164 | 165 | def _chunk_iter_to_lists(iterable, chunk_size=100): 166 | """Yield chunks of size `chunk_size` from iterator.""" 167 | i = 0 168 | chunk = [] 169 | for item in iterable: 170 | if i >= chunk_size: 171 | yield chunk 172 | chunk = [] 173 | i = 0 174 | chunk.append(item) 175 | i += 1 176 | if len(chunk) != 0: 177 | yield chunk 178 | -------------------------------------------------------------------------------- /doc/source/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # e3fp documentation build configuration file, created by 4 | # sphinx-quickstart on Sun Jun 25 01:13:34 2017. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | # If extensions (or modules to document with autodoc) are in another directory, 16 | # add these directories to sys.path here. If the directory is relative to the 17 | # documentation root, use os.path.abspath to make it absolute, like shown here. 18 | # 19 | import importlib.metadata 20 | import os 21 | import sys 22 | 23 | e3fp_version = importlib.metadata.version('e3fp') 24 | 25 | # Set-up environment variable for programoutput 26 | os.environ['E3FP_REPO'] = os.path.abspath("../..") 27 | 28 | # -- General configuration ------------------------------------------------ 29 | 30 | # If your documentation needs a minimal Sphinx version, state it here. 31 | # 32 | # needs_sphinx = '1.0' 33 | 34 | # Add any Sphinx extension module names here, as strings. They can be 35 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 36 | # ones. 
37 | extensions = [ 38 | 'sphinx.ext.autosectionlabel', 39 | 'sphinx.ext.autosummary', 40 | 'sphinx.ext.intersphinx', 41 | 'sphinx.ext.coverage', 42 | 'sphinx.ext.ifconfig', 43 | 'sphinx.ext.viewcode', 44 | 'sphinx.ext.githubpages', 45 | 'sphinx.ext.autodoc', 46 | 'sphinx.ext.napoleon', 47 | 'sphinx.ext.doctest', 48 | 'sphinx.ext.todo', 49 | 'sphinx.ext.imgconverter', 50 | 'sphinxcontrib.programoutput'] 51 | 52 | napoleon_google_docstring = False 53 | napoleon_numpy_docstring = True 54 | napoleon_use_param = False 55 | napoleon_use_ivar = True 56 | 57 | autosummary_generate = True 58 | 59 | add_module_names = False 60 | 61 | # Add any paths that contain templates here, relative to this directory. 62 | templates_path = ['_templates'] 63 | 64 | # The suffix(es) of source filenames. 65 | # You can specify multiple suffix as a list of string: 66 | # 67 | # source_suffix = ['.rst', '.md'] 68 | source_suffix = {'.rst': 'restructuredtext'} 69 | 70 | # The master toctree document. 71 | master_doc = 'index' 72 | 73 | # General information about the project. 74 | project = u'e3fp' 75 | copyright = u'2017, Seth Axen' 76 | author = u'Seth Axen' 77 | 78 | # The version info for the project you're documenting, acts as replacement for 79 | # |version| and |release|, also used in various other places throughout the 80 | # built documents. 81 | # 82 | # The short X.Y version. 83 | version = '%s' % (e3fp_version) 84 | # The full version, including alpha/beta/rc tags. 85 | release = version 86 | 87 | # The language for content autogenerated by Sphinx. Refer to documentation 88 | # for a list of supported languages. 89 | # 90 | # This is also used if you do content translation via gettext catalogs. 91 | # Usually you set "language" from the command line for these cases. 92 | language = "en" 93 | 94 | # The reST default role (used for this markup: `text`) to use for all 95 | # documents. The autolink role functions as :obj: when the name referred can 96 | # be resolved to a Python object 97 | default_role = "autolink" 98 | 99 | # List of patterns, relative to source directory, that match files and 100 | # directories to ignore when looking for source files. 101 | # This patterns also effect to html_static_path and html_extra_path 102 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 103 | 104 | # The name of the Pygments (syntax highlighting) style to use. 105 | pygments_style = 'sphinx' 106 | 107 | # If true, `todo` and `todoList` produce output, else they produce nothing. 108 | todo_include_todos = False 109 | 110 | # Add unique prefixes to autosectionlabel to avoid duplicate labels 111 | autosectionlabel_prefix_document = True 112 | 113 | 114 | # -- Options for HTML output ---------------------------------------------- 115 | 116 | # The theme to use for HTML and HTML Help pages. See the documentation for 117 | # a list of builtin themes. 118 | html_theme = 'sphinx_rtd_theme' 119 | 120 | # Theme options are theme-specific and customize the look and feel of a theme 121 | # further. For a list of options available for each theme, see the 122 | # documentation. 123 | # 124 | # html_theme_options = {} 125 | 126 | # Add any paths that contain custom static files (such as style sheets) here, 127 | # relative to this directory. They are copied after the builtin static files, 128 | # so a file named "default.css" will overwrite the builtin "default.css". 
129 | html_static_path = ['_static'] 130 | 131 | 132 | # -- Options for HTMLHelp output ------------------------------------------ 133 | 134 | # Output file base name for HTML help builder. 135 | htmlhelp_basename = 'e3fpdoc' 136 | 137 | 138 | # -- Options for LaTeX output --------------------------------------------- 139 | 140 | latex_elements = { 141 | # The paper size ('letterpaper' or 'a4paper'). 142 | # 143 | # 'papersize': 'letterpaper', 144 | 145 | # The font size ('10pt', '11pt' or '12pt'). 146 | # 147 | # 'pointsize': '10pt', 148 | 149 | # Additional stuff for the LaTeX preamble. 150 | # 151 | # 'preamble': '', 152 | 153 | # Latex figure (float) alignment 154 | # 155 | # 'figure_align': 'htbp', 156 | } 157 | 158 | # Grouping the document tree into LaTeX files. List of tuples 159 | # (source start file, target name, title, 160 | # author, documentclass [howto, manual, or own class]). 161 | latex_documents = [ 162 | (master_doc, 'e3fp.tex', u'e3fp Documentation', 163 | u'Seth Axen', 'manual'), 164 | ] 165 | 166 | 167 | # -- Options for manual page output --------------------------------------- 168 | 169 | # One entry per manual page. List of tuples 170 | # (source start file, name, description, authors, manual section). 171 | man_pages = [ 172 | (master_doc, 'e3fp', u'e3fp Documentation', 173 | [author], 1) 174 | ] 175 | 176 | 177 | # -- Options for Texinfo output ------------------------------------------- 178 | 179 | # Grouping the document tree into Texinfo files. List of tuples 180 | # (source start file, target name, title, author, 181 | # dir menu entry, description, category) 182 | texinfo_documents = [ 183 | (master_doc, 'e3fp', u'e3fp Documentation', 184 | author, 'e3fp', 'One line description of project.', 185 | 'Miscellaneous'), 186 | ] 187 | 188 | 189 | # Example configuration for intersphinx: refer to the Python standard library. 190 | intersphinx_mapping = { 191 | 'python': ('https://docs.python.org/3/', None), 192 | 'numpy': ('https://numpy.org/doc/stable/', None), 193 | 'scipy': ('https://docs.scipy.org/doc/scipy/', None), 194 | } 195 | -------------------------------------------------------------------------------- /doc/source/dev/index.rst: -------------------------------------------------------------------------------- 1 | Developer Notes 2 | =============== 3 | 4 | We welcome contributions to E3FP! These notes are designed to help developers 5 | contribute code 6 | 7 | Authoring Code 8 | -------------- 9 | 10 | Code Formatting 11 | ~~~~~~~~~~~~~~~ 12 | 13 | E3FP's code should be *readable*. To ensure this, we rigorously follow the 14 | PEP8_ style conventions and PEP257_ docstring conventions, which maximize 15 | readability of the code and ease of future development. You may check your 16 | code for conformation to these conventions with the pycodestyle_ and 17 | pydocstyle_ utilities, respectively. Where the code is necessarily 18 | complicated, inline comments should reorient the reader. 19 | 20 | Utility Methods and Classes 21 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~ 22 | 23 | Three sets of utility methods and classes are provided: `e3fp.util`, 24 | `e3fp.conformer.util`, and `e3fp.fingerprint.util`. These provide general and 25 | often-used functionality in their corresponding packages. Additionally, they 26 | provide E3FP-specific errors and exceptions. 27 | 28 | Warnings and Errors 29 | ~~~~~~~~~~~~~~~~~~~ 30 | 31 | By default, warnings in Python are silent. We therefore provide a warning base 32 | class `e3fp.util.E3FPWarning` that is not silent by default. 
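For example, because the package registers an ``always`` warnings filter for this
class, a warning issued with it is printed every time it is triggered rather than
only once (a minimal sketch; the message text is purely illustrative):

>>> import warnings
>>> from e3fp.util import E3FPWarning
>>> warnings.warn("example warning message", E3FPWarning)  # doctest: +SKIP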
We provide several
33 | general warnings:
34 |
35 | :py:class:`.E3FPDeprecationWarning`
36 |     warns when a deprecated method is called or class is instantiated.
37 |
38 |     .. seealso::
39 |
40 |         `Deprecation`_
41 |
42 | :py:class:`.E3FPEfficiencyWarning`
43 |     warns when a method, module version, or combination of parameters is known
44 |     to be inefficient.
45 |
46 |     .. note::
47 |
48 |         If possible, the warning message should advise on a more efficient
49 |         approach.
50 |
51 | E3FP-specific errors should inherit the `e3fp.util.E3FPError` base class. Several
52 | fingerprinting-specific errors are defined in `e3fp.fingerprint.util`.
53 |
54 | Deprecation
55 | ~~~~~~~~~~~
56 |
57 | Whenever changing the interface or behavior of a user-facing method or class,
58 | it is proper to deprecate it for at least one release, so that users have
59 | time to update their scripts accordingly. A deprecated method should issue
60 | an `e3fp.util.E3FPDeprecationWarning`, notify the user in which release to
61 | expect the method or class to be removed, and update the documentation
62 | accordingly. This functionality is automated with the `e3fp.util.deprecated`
63 | decorator, as shown in this example:
64 |
65 | >>> import sys
66 | >>> sys.stderr = sys.stdout
67 | >>> from e3fp.util import deprecated
68 | >>> @deprecated("1.1", remove_version="1.3", msg="Function no longer needed.")
69 | ... def deprecated_method():
70 | ...     """A method to demonstrate method deprecation."""
71 | ...     pass
72 | >>> deprecated_method()
73 | ...: E3FPDeprecationWarning: Function `deprecated_method` was deprecated in 1.1 and will be removed in 1.3. Function no longer needed.
74 |
75 | In the API documentation, the method will appear as:
76 |
77 | .. function:: deprecated_method()
78 |
79 |     .. note:: Deprecated in e3fp 1.1.
80 |        `deprecated_method` will be removed in e3fp 1.3. Function no longer needed.
81 |
82 |     A method to demonstrate method deprecation.
83 |
84 | .. note::
85 |     If no `remove_version` is specified, then the remove version defaults to the
86 |     next release after deprecation. For example, if the method was deprecated in
87 |     1.1, it is by default marked for removal in 1.2.
88 |
89 | Contributing Code
90 | ~~~~~~~~~~~~~~~~~
91 |
92 | Before contributing code to E3FP, for major modifications it is advisable to
93 | submit an issue to the
94 | `issue tracker`_ to enable other
95 | developers to contribute to the design of the code and to reduce the amount of
96 | work necessary to conform the code to E3FP's standards. After writing the code,
97 | create a `pull request`_. This is best even if you have push access to the
98 | E3FP repo, as it enables the test suite to be run on the new code prior to
99 | merging it with the remaining code base.
100 |
101 | Writing Tests
102 | ~~~~~~~~~~~~~
103 |
104 | The standard in E3FP is to commit a test for new functionality simultaneously
105 | with the new functionality or within the same pull request. While this slows
106 | development, it prevents building a large backlog of untested methods and
107 | classes.
108 |
109 | These should ideally be unit tests, though for some complicated
110 | functionalities, such as fingerprinting, integration tests are also
111 | necessary. For these complicated functions, specific units may still be
112 | tested using :py:mod:`unittest.mock`. For example,
113 | :py:meth:`unittest.mock.patch` may be used to force a high-level method to
114 | produce a specific output. For examples, see the `fingerprinting tests
115 | `_.
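A minimal sketch of this pattern is shown below; the wrapper function and the
patched return value are purely illustrative and are not part of E3FP:

.. code-block:: python

    from unittest import mock

    def count_fingerprints(mol):
        """Hypothetical wrapper around E3FP fingerprinting."""
        from e3fp.pipeline import fprints_from_mol
        return len(fprints_from_mol(mol))

    def test_count_fingerprints():
        # Patch the fingerprinter so the test exercises only the wrapper
        # logic and never runs conformer generation or fingerprinting.
        with mock.patch(
            "e3fp.pipeline.fprints_from_mol", return_value=["fp_a", "fp_b"]
        ):
            assert count_fingerprints(mol=None) == 2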
116 | 117 | Continuous Integration 118 | ~~~~~~~~~~~~~~~~~~~~~~ 119 | 120 | E3FP uses `GitHub Actions`_ for continuous integration. This ensures that each commit 121 | and pull request passes all tests on a variety of a systems and for all 122 | supported versions of Python. Additionally, GitHub Actions updates code coverage on 123 | Codecov_ and tests all usage examples in the documentation using `doctest`. 124 | 125 | Documentation 126 | ------------- 127 | 128 | In general, it is best to document the rationale and basic usage of a module, 129 | class, or method in its docstring instead of in a separate documentation file. 130 | See, for example, the docstring for `e3fp.fingerprint.db.FingerprintDatabase`. 131 | We use a variety of tools to ensure that our documentation is always 132 | up-to-date. The official documentation is hosted on ReadtheDocs_ and is 133 | automatically generated when new code is committed to the repository. 134 | 135 | Documenting Code 136 | ~~~~~~~~~~~~~~~~ 137 | 138 | E3FP uses NumPy's `docstring conventions`_ for all docstrings. These are 139 | parsed by Sphinx_ using Napoleon_. All usage examples must be fully 140 | functional, as these are tested using `doctest`. 141 | 142 | The purpose of a docstring is to explain the purpose of a class/method, any 143 | relevant implementation details, its parameters, its attributes, its outputs, 144 | and its usage. The goal is clarity. For self-evident methods with descriptive 145 | variables, a simple one- ine summary is all that is needed. For complicated use 146 | cases, often involving other methods/classes, it is better to document the 147 | usage elsewhere in the documentation. 148 | 149 | Documentation Usage 150 | ~~~~~~~~~~~~~~~~~~~ 151 | 152 | Coming soon. 153 | 154 | .. todo:: 155 | Write documentation usage 156 | 157 | Releasing Code 158 | -------------- 159 | 160 | .. todo:: 161 | Write release protocol 162 | 163 | .. _PEP8: https://www.python.org/dev/peps/pep-0008/ 164 | .. _PEP257: https://www.python.org/dev/peps/pep-0257/ 165 | .. _pycodestyle: http://pycodestyle.pycqa.org/en/latest/ 166 | .. _pydocstyle: http://pydocstyle.pycqa.org/en/latest/ 167 | .. _docstring conventions: https://github.com/numpy/numpy/blob/master/doc/HOWTO_DOCUMENT.rst.txt 168 | .. _Napoleon: http://www.sphinx-doc.org/en/stable/ext/napoleon.html 169 | .. _Sphinx: http://www.sphinx-doc.org/en/stable/index.html 170 | .. _doctest: https://docs.python.org/3/library/doctest.html 171 | .. _pull request: https://help.github.com/articles/creating-a-pull-request/ 172 | .. _GitHub Actions: https://github.com/keiserlab/e3fp/actions 173 | .. _Codecov: https://codecov.io/github/keiserlab/e3fp 174 | 175 | .. include:: ../substitutions.rst 176 | -------------------------------------------------------------------------------- /doc/source/usage/fingerprints/fprints.rst: -------------------------------------------------------------------------------- 1 | Fingerprints 2 | ============ 3 | 4 | The simplest interface for molecular fingerprints are through three classes in 5 | `e3fp.fingerprint.fprint`: 6 | 7 | :py:class:`.Fingerprint` 8 | a fingerprint with "on" bits 9 | 10 | :py:class:`.CountFingerprint` 11 | a fingerprint with counts for each "on" bit 12 | 13 | :py:class:`.FloatFingerprint` 14 | a fingerprint with float values for each "on" bit, generated for example by 15 | averaging conformer fingerprints. 
16 | 17 | In addition to storing "on" indices and, for the latter two, corresponding 18 | values, they store fingerprint properties, such as name, level, and any 19 | arbitrary property. They also provide simple interfaces for fingerprint 20 | comparison and some basic processing. 21 | 22 | .. note:: Many of these operations are more efficient when operating on a 23 | :py:class:`.FingerprintDatabase`. See 24 | :ref:`usage/fingerprints/storage:Fingerprint Storage` for more information. 25 | 26 | In the below examples, we will focus on :py:class:`.Fingerprint` and 27 | :py:class:`.CountFingerprint`. First, we execute the necessary imports. 28 | 29 | .. testsetup:: 30 | 31 | import numpy as np 32 | np.random.seed(0) 33 | 34 | .. doctest:: 35 | 36 | >>> from e3fp.fingerprint.fprint import Fingerprint, CountFingerprint 37 | >>> import numpy as np 38 | 39 | .. seealso:: 40 | 41 | :ref:`usage/fingerprints/storage:Fingerprint Storage`, 42 | :ref:`usage/fingerprints/comparison:Fingerprint Comparison` 43 | 44 | Creation and Conversion 45 | ----------------------- 46 | 47 | Here we create a bit-fingerprint with random "on" indices. 48 | 49 | >>> bits = 2**32 50 | >>> indices = np.sort(np.random.randint(0, bits, 30)) 51 | >>> indices 52 | array([ 243580376, 305097549, ..., 3975407269, 4138900056]) 53 | >>> fp1 = Fingerprint(indices, bits=bits, level=0) 54 | >>> fp1 55 | Fingerprint(indices=array([243580376, ..., 4138900056]), level=0, bits=4294967296, name=None) 56 | 57 | This fingerprint is extremely sparse. 58 | 59 | >>> fp1.bit_count 60 | 30 61 | >>> fp1.density 62 | 6.984919309616089e-09 63 | 64 | We can therefore "fold" the fingerprint through a series of bitwise "OR" 65 | operations on halves of the sparse vector until it is of a specified length, 66 | with minimal collision of bits. 67 | 68 | >>> fp_folded = fp1.fold(1024) 69 | >>> fp_folded 70 | Fingerprint(indices=array([9, 70, ..., 845, 849]), level=0, bits=1024, name=None) 71 | >>> fp_folded.bit_count 72 | 29 73 | >>> fp_folded.density 74 | 0.0283203125 75 | 76 | A :py:class:`.CountFingerprint` may be created by additionally providing a dictionary 77 | mapping indices with nonzero counts to those counts. 78 | 79 | >>> indices2 = np.sort(np.random.randint(0, bits, 60)) 80 | >>> counts = dict(zip(indices2, np.random.randint(1, 10, indices2.size))) 81 | >>> counts 82 | {80701568: 8, 580757632: 7, ..., 800291326: 5, 4057322111: 7} 83 | >>> cfp1 = CountFingerprint(counts=counts, bits=bits, level=0) 84 | >>> cfp1 85 | CountFingerprint(counts={80701568: 8, 580757632: 7, ..., 3342157822: 2, 4057322111: 7}, level=0, bits=4294967296, name=None) 86 | 87 | Unlike folding a bit fingerprint, folding a count fingerprint by default 88 | performs a "SUM" operation on colliding counts. 89 | 90 | >>> cfp1.bit_count 91 | 60 92 | >>> cfp_folded = cfp1.fold(1024) 93 | >>> cfp_folded 94 | CountFingerprint(counts={128: 15, 257: 4, ..., 1022: 2, 639: 7}, level=0, bits=1024, name=None) 95 | >>> cfp_folded.bit_count 96 | 57 97 | 98 | It is trivial to interconvert the fingerprints. 99 | 100 | >>> cfp_folded2 = CountFingerprint.from_fingerprint(fp_folded) 101 | >>> cfp_folded2 102 | CountFingerprint(counts={9: 1, 87: 1, ..., 629: 1, 763: 1}, level=0, bits=1024, name=None) 103 | >>> cfp_folded2.indices[:5] 104 | array([ 9, 70, 72, 87, 174]) 105 | >>> fp_folded.indices[:5] 106 | array([ 9, 70, 72, 87, 174]) 107 | 108 | RDKit Morgan fingerprints (analogous to ECFP) may easily be converted to a 109 | :py:class:`.Fingerprint`.
110 | 111 | >>> from rdkit import Chem 112 | >>> from rdkit.Chem import AllChem 113 | >>> mol = Chem.MolFromSmiles('Cc1ccccc1') 114 | >>> mfp = AllChem.GetMorganFingerprintAsBitVect(mol, 2) 115 | >>> mfp 116 | 117 | >>> Fingerprint.from_rdkit(mfp) 118 | Fingerprint(indices=array([389, 1055, ..., 1873, 1920]), level=-1, bits=2048, name=None) 119 | 120 | Likewise, :py:class:`.Fingerprint` can be easily converted to a NumPy ndarray or 121 | SciPy sparse matrix. 122 | 123 | >>> fp_folded.to_vector() 124 | <1x1024 sparse matrix of type '' 125 | ...with 29 stored elements in Compressed Sparse Row format> 126 | >>> fp_folded.to_vector(sparse=False) 127 | array([False, False, False, ..., False, False, False], dtype=bool) 128 | >>> np.where(fp_folded.to_vector(sparse=False))[0] 129 | array([ 9, 70, 72, 87, ...]) 130 | >>> cfp_folded.to_vector(sparse=False) 131 | array([0, 0, 0, ..., 0, 2, 0], dtype=uint16) 132 | >>> cfp_folded.to_vector(sparse=False).sum() 133 | 252 134 | 135 | Algebra 136 | ------- 137 | 138 | Basic algebraic functions may be performed on fingerprints. If either 139 | fingerprint is a bit fingerprint, all algebraic functions are bit-wise. 140 | The following bit-wise operations are supported: 141 | 142 | Equality 143 | >>> fp1 = Fingerprint([0, 1, 6, 8, 12], bits=16) 144 | >>> fp2 = Fingerprint([1, 2, 4, 8, 11, 12], bits=16) 145 | >>> fp1 == fp2 146 | False 147 | >>> fp1_copy = Fingerprint.from_fingerprint(fp1) 148 | >>> fp1 == fp1_copy 149 | True 150 | >>> fp1_copy.level = 5 151 | >>> fp1 == fp1_copy 152 | False 153 | 154 | Union/OR 155 | >>> fp1 + fp2 156 | Fingerprint(indices=array([0, 1, 2, 4, 6, 8, 11, 12]), level=-1, bits=16, name=None) 157 | >>> fp1 | fp2 158 | Fingerprint(indices=array([0, 1, 2, 4, 6, 8, 11, 12]), level=-1, bits=16, name=None) 159 | 160 | Intersection/AND 161 | >>> fp1 & fp2 162 | Fingerprint(indices=array([1, 8, 12]), level=-1, bits=16, name=None) 163 | 164 | Difference/AND NOT 165 | >>> fp1 - fp2 166 | Fingerprint(indices=array([0, 6]), level=-1, bits=16, name=None) 167 | >>> fp2 - fp1 168 | Fingerprint(indices=array([2, 4, 11]), level=-1, bits=16, name=None) 169 | 170 | XOR 171 | >>> fp1 ^ fp2 172 | Fingerprint(indices=array([0, 2, 4, 6, 11]), level=-1, bits=16, name=None) 173 | 174 | With count or float fingerprints, bit-wise operations are still possible, but 175 | algebraic operations are applied to counts. 176 | 177 | >>> fp1 = CountFingerprint(counts={0: 3, 1: 2, 5: 1, 9: 3}, bits=16) 178 | >>> fp2 = CountFingerprint(counts={1: 2, 5: 2, 7: 3, 10: 7}, bits=16) 179 | >>> fp1 + fp2 180 | CountFingerprint(counts={0: 3, 1: 4, 5: 3, 7: 3, 9: 3, 10: 7}, level=-1, bits=16, name=None) 181 | >>> fp1 - fp2 182 | CountFingerprint(counts={0: 3, 1: 0, 5: -1, 7: -3, 9: 3, 10: -7}, level=-1, bits=16, name=None) 183 | >>> fp1 * 3 184 | CountFingerprint(counts={0: 9, 1: 6, 5: 3, 9: 9}, level=-1, bits=16, name=None) 185 | >>> fp1 / 2 186 | FloatFingerprint(counts={0: 1.5, 1: 1.0, 5: 0.5, 9: 1.5}, level=-1, bits=16, name=None) 187 | 188 | Finally, fingerprints may be batch added and averaged, producing either a count 189 | or float fingerprint when sensible. 
190 | 191 | >>> from e3fp.fingerprint.fprint import add, mean 192 | >>> fps = [Fingerprint(np.random.randint(0, 32, 8), bits=32) for i in range(100)] 193 | >>> add(fps) 194 | CountFingerprint(counts={0: 23, 1: 23, ..., 30: 20, 31: 14}, level=-1, bits=32, name=None) 195 | >>> mean(fps) 196 | FloatFingerprint(counts={0: 0.23, 1: 0.23, ..., 30: 0.2, 31: 0.14}, level=-1, bits=32, name=None) 197 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | 9 | This version of the GNU Lesser General Public License incorporates 10 | the terms and conditions of version 3 of the GNU General Public 11 | License, supplemented by the additional permissions listed below. 12 | 13 | 0. Additional Definitions. 14 | 15 | As used herein, "this License" refers to version 3 of the GNU Lesser 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 17 | General Public License. 18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions. 48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 
64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 
129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 160 | 161 | If the Library as you received it specifies that a proxy can decide 162 | whether future versions of the GNU Lesser General Public License shall 163 | apply, that proxy's public statement of acceptance of any version is 164 | permanent authorization for you to choose that version for the 165 | Library. 166 | -------------------------------------------------------------------------------- /tests/test_metrics.py: -------------------------------------------------------------------------------- 1 | """Tests for fingerprint comparison metrics. 
2 | 3 | Author: Seth Axen 4 | E-mail: seth.axen@gmail.com 5 | """ 6 | import pytest 7 | 8 | import numpy as np 9 | from scipy.sparse import csr_matrix 10 | from scipy.spatial.distance import cdist 11 | from e3fp.fingerprint import metrics, fprint, db 12 | from e3fp.fingerprint.metrics import array_metrics, fprint_metrics 13 | 14 | 15 | def _create_random_sparse(nrows, nbits=1024, perc_pos=0.1, counts=False): 16 | arr = csr_matrix( 17 | np.random.uniform(0, 1, (nrows, nbits)) > (1 - perc_pos), 18 | dtype=np.double, 19 | ) 20 | if counts: 21 | arr.data = np.random.randint(1, 30, arr.data.shape[0]).astype( 22 | np.double 23 | ) 24 | return arr 25 | 26 | def soergeldist(x, y): 27 | return np.abs(x - y).sum() / np.maximum(x, y).sum() 28 | 29 | 30 | class TestArrayMetrics: 31 | 32 | """Tests for array comparison metrics""" 33 | 34 | @staticmethod 35 | def _eval(func, X, Y=None, dense=False, **kwargs): 36 | if dense: 37 | X = X.toarray() 38 | if Y is not None: 39 | Y = Y.toarray() 40 | return func(X, Y, **kwargs) 41 | 42 | @pytest.mark.parametrize("dense", [True, False]) 43 | @pytest.mark.parametrize( 44 | "func,cdist_metric,counts", 45 | [ 46 | (array_metrics.tanimoto, "jaccard", False), 47 | (array_metrics.dice, "dice", False), 48 | (array_metrics.cosine, "cosine", False), 49 | (array_metrics.cosine, "cosine", True), 50 | (array_metrics.pearson, "correlation", False), 51 | (array_metrics.pearson, "correlation", True), 52 | (array_metrics.soergel, soergeldist, False), 53 | (array_metrics.soergel, soergeldist, True), 54 | ], 55 | ) 56 | def test_metrics_vs_cdist(self, func, cdist_metric, counts, dense): 57 | X = _create_random_sparse(10, counts=counts) 58 | Y = _create_random_sparse(8, counts=counts) 59 | expect_score = 1.0 - cdist(X.toarray(), Y.toarray(), metric=cdist_metric) 60 | score = self._eval(func, X, Y, dense=dense) 61 | assert type(score) is np.ndarray 62 | np.testing.assert_allclose(score, expect_score) 63 | # test self-comparison 64 | expect_score = 1.0 - cdist(X.toarray(), X.toarray(), metric=cdist_metric) 65 | score = self._eval(func, X, dense=dense) 66 | np.testing.assert_allclose(score, expect_score) 67 | 68 | @pytest.mark.parametrize("dense", [True, False]) 69 | def test_tanimoto_soergel_equal_for_binary(self, dense): 70 | X = _create_random_sparse(10, counts=False) 71 | Y = _create_random_sparse(8, counts=False) 72 | tscore = self._eval(array_metrics.tanimoto, X, Y, dense=dense) 73 | sscore = self._eval(array_metrics.soergel, X, Y, dense=dense) 74 | np.testing.assert_allclose(tscore, sscore) 75 | 76 | 77 | class TestFlexibleMetrics: 78 | 79 | """Tests for flexible comparison metrics""" 80 | 81 | metric_names = ["tanimoto", "soergel", "dice", "cosine", "pearson"] 82 | count_metric_names = ["soergel", "cosine", "pearson"] 83 | 84 | def test_binary_fprint_vs_fprint(self): 85 | fp1 = fprint.Fingerprint.from_vector( 86 | _create_random_sparse(1, counts=False, perc_pos=0.5) 87 | ) 88 | fp2 = fprint.Fingerprint.from_vector( 89 | _create_random_sparse(1, counts=False, perc_pos=0.5) 90 | ) 91 | for metric_name in self.metric_names: 92 | gen_score = getattr(metrics, metric_name)(fp1, fp2) 93 | fp_score = getattr(fprint_metrics, metric_name)(fp1, fp2) 94 | assert gen_score == pytest.approx(fp_score) 95 | array_score = getattr(array_metrics, metric_name)( 96 | fp1.to_vector(sparse=True), fp2.to_vector(sparse=True) 97 | ) 98 | assert gen_score == pytest.approx(array_score[0][0]) 99 | 100 | def test_count_fprint_vs_fprint(self): 101 | fp1 = fprint.CountFingerprint.from_vector( 102 | 
_create_random_sparse(1, nbits=32, counts=True, perc_pos=0.5) 103 | ) 104 | fp2 = fprint.CountFingerprint.from_vector( 105 | _create_random_sparse(1, nbits=32, counts=True, perc_pos=0.5) 106 | ) 107 | for metric_name in self.count_metric_names: 108 | gen_score = getattr(metrics, metric_name)(fp1, fp2) 109 | fp_score = getattr(fprint_metrics, metric_name)(fp1, fp2) 110 | assert gen_score == pytest.approx(fp_score) 111 | array_score = getattr(array_metrics, metric_name)( 112 | fp1.to_vector(sparse=True), fp2.to_vector(sparse=True) 113 | ) 114 | assert gen_score == pytest.approx(array_score[0][0]) 115 | 116 | def test_binary_fprint_vs_db(self): 117 | fp_array = _create_random_sparse(1, counts=False, perc_pos=0.5) 118 | fp = fprint.Fingerprint.from_vector(fp_array) 119 | db_array = _create_random_sparse(10, counts=False, perc_pos=0.5) 120 | fp_names = [str(i) for i in range(db_array.shape[0])] 121 | fdb = db.FingerprintDatabase.from_array( 122 | db_array, fp_names, fp_type=fprint.Fingerprint 123 | ) 124 | for metric_name in self.metric_names: 125 | gen_score = getattr(metrics, metric_name)(fp, fdb) 126 | array_score = getattr(array_metrics, metric_name)( 127 | fp_array, db_array 128 | ) 129 | np.testing.assert_allclose(gen_score, array_score) 130 | gen_score = getattr(metrics, metric_name)(fdb, fp) 131 | np.testing.assert_allclose(gen_score.T, array_score) 132 | 133 | def test_count_fprint_vs_db(self): 134 | fp_array = _create_random_sparse(1, counts=True, perc_pos=0.5) 135 | fp = fprint.CountFingerprint.from_vector(fp_array) 136 | db_array = _create_random_sparse(10, counts=True, perc_pos=0.5) 137 | fp_names = [str(i) for i in range(db_array.shape[0])] 138 | fdb = db.FingerprintDatabase.from_array( 139 | db_array, fp_names, fp_type=fprint.CountFingerprint 140 | ) 141 | for metric_name in self.count_metric_names: 142 | gen_score = getattr(metrics, metric_name)(fp, fdb) 143 | array_score = getattr(array_metrics, metric_name)( 144 | fp_array, db_array 145 | ) 146 | np.testing.assert_allclose(gen_score, array_score) 147 | # Check if reverse order produces transpose 148 | gen_score = getattr(metrics, metric_name)(fdb, fp) 149 | np.testing.assert_allclose(gen_score.T, array_score) 150 | 151 | def test_binary_db_vs_db(self): 152 | db_array1 = _create_random_sparse(1, counts=False, perc_pos=0.5) 153 | fp_names = [str(i) for i in range(db_array1.shape[0])] 154 | db1 = db.FingerprintDatabase.from_array( 155 | db_array1, fp_names, fp_type=fprint.Fingerprint 156 | ) 157 | db_array2 = _create_random_sparse(1, counts=False, perc_pos=0.5) 158 | fp_names = [str(i) for i in range(db_array2.shape[0])] 159 | db2 = db.FingerprintDatabase.from_array( 160 | db_array2, fp_names, fp_type=fprint.Fingerprint 161 | ) 162 | for metric_name in self.metric_names: 163 | gen_score = getattr(metrics, metric_name)(db1, db2) 164 | array_score = getattr(array_metrics, metric_name)( 165 | db_array1, db_array2 166 | ) 167 | np.testing.assert_allclose(gen_score, array_score) 168 | 169 | def test_count_db_vs_db(self): 170 | db_array1 = _create_random_sparse(1, counts=True, perc_pos=0.5) 171 | fp_names = [str(i) for i in range(db_array1.shape[0])] 172 | db1 = db.FingerprintDatabase.from_array( 173 | db_array1, fp_names, fp_type=fprint.CountFingerprint 174 | ) 175 | db_array2 = _create_random_sparse(1, counts=True, perc_pos=0.5) 176 | fp_names = [str(i) for i in range(db_array2.shape[0])] 177 | db2 = db.FingerprintDatabase.from_array( 178 | db_array2, fp_names, fp_type=fprint.CountFingerprint 179 | ) 180 | for metric_name in 
self.count_metric_names: 181 | gen_score = getattr(metrics, metric_name)(db1, db2) 182 | array_score = getattr(array_metrics, metric_name)( 183 | db_array1, db_array2 184 | ) 185 | np.testing.assert_allclose(gen_score, array_score) 186 | -------------------------------------------------------------------------------- /src/e3fp/fingerprint/metrics/array_metrics.py: -------------------------------------------------------------------------------- 1 | """Fingerprint array comparison metrics. 2 | 3 | Each is fully compatible with both dense and sparse inputs. 4 | 5 | Author: Seth Axen 6 | E-mail: seth.axen@gmail.com 7 | """ 8 | from __future__ import division 9 | 10 | import numpy as np 11 | import scipy 12 | from scipy.sparse import csr_matrix, issparse, vstack 13 | import scipy.sparse.linalg 14 | import scipy.spatial 15 | from e3fp.util import maybe_jit 16 | 17 | 18 | def tanimoto(X, Y=None): 19 | """Compute the Tanimoto coefficients between `X` and `Y`. 20 | 21 | Data must be binary. This is not checked. 22 | 23 | Parameters 24 | ---------- 25 | X : array_like or sparse matrix 26 | with shape (`n_fprints_X`, `n_bits`). 27 | Y : array_like or sparse matrix, optional 28 | with shape (`n_fprints_Y`, `n_bits`). 29 | 30 | Returns 31 | ------- 32 | tanimoto : array of shape (`n_fprints_X`, `n_fprints_Y`) 33 | 34 | See Also 35 | -------- 36 | soergel: Analog to Tanimoto for non-binary data. 37 | cosine, dice, pearson 38 | """ 39 | X, Y = _check_array_pair(X, Y) 40 | Xbits, Ybits, XYbits = _get_bitcount_arrays(X, Y, return_XYbits=True) 41 | with np.errstate(divide="ignore"): # handle 0 in denominator 42 | return np.asarray(np.nan_to_num(XYbits / (Xbits + Ybits.T - XYbits))) 43 | 44 | 45 | def soergel(X, Y=None): 46 | """Compute the Soergel similarities between `X` and `Y`. 47 | 48 | Soergel similarity is the complement of Soergel distance and can be 49 | thought of as the analog of the Tanimoto coefficient for count/float-based 50 | data. For binary data, it is equivalent to the Tanimoto coefficient. 51 | 52 | Parameters 53 | ---------- 54 | X : array_like or sparse matrix 55 | with shape (`n_fprints_X`, `n_bits`). 56 | Y : array_like or sparse matrix, optional 57 | with shape (`n_fprints_Y`, `n_bits`). 58 | 59 | Returns 60 | ------- 61 | soergel : array of shape (`n_fprints_X`, `n_fprints_Y`) 62 | 63 | Notes 64 | -------- 65 | If Numba is available, this function is jit-compiled and much more efficient. 66 | 67 | See Also 68 | -------- 69 | tanimoto: A fast version of this function for binary data. 70 | pearson: Pearson correlation, also appropriate for non-binary data. 71 | cosine, dice 72 | """ 73 | X, Y = _check_array_pair(X, Y) 74 | S = np.empty((X.shape[0], Y.shape[0]), dtype=float) 75 | if issparse(X): 76 | return _sparse_soergel(X.data, X.indices, X.indptr, 77 | Y.data, Y.indices, Y.indptr, S) 78 | return _dense_soergel(X, Y, S) 79 | 80 | def dice(X, Y=None): 81 | """Compute the Dice coefficients between `X` and `Y`. 82 | 83 | Data must be binary. This is not checked. 84 | 85 | Parameters 86 | ---------- 87 | X : array_like or sparse matrix 88 | with shape (`n_fprints_X`, `n_bits`). 89 | Y : array_like or sparse matrix, optional 90 | with shape (`n_fprints_Y`, `n_bits`). 
91 | 92 | Returns 93 | ------- 94 | dice : array of shape (`n_fprints_X`, `n_fprints_Y`) 95 | 96 | See Also 97 | -------- 98 | cosine, soergel, tanimoto, pearson 99 | """ 100 | X, Y = _check_array_pair(X, Y) 101 | Xbits, Ybits, XYbits = _get_bitcount_arrays(X, Y, return_XYbits=True) 102 | with np.errstate(divide="ignore"): # handle 0 in denominator 103 | return np.asarray(np.nan_to_num(2 * XYbits / (Xbits + Ybits.T))) 104 | 105 | 106 | def cosine(X, Y=None, assume_binary=False): 107 | """Compute the Cosine similarities between `X` and `Y`. 108 | 109 | Parameters 110 | ---------- 111 | X : array_like or sparse matrix 112 | with shape (`n_fprints_X`, `n_bits`). 113 | Y : array_like or sparse matrix, optional 114 | with shape (`n_fprints_Y`, `n_bits`). 115 | assume_binary : bool, optional 116 | Assume data is binary (results in efficiency boost). If data is not 117 | binary, the result will be incorrect. 118 | 119 | Returns 120 | ------- 121 | cosine : array of shape (`n_fprints_X`, `n_fprints_Y`) 122 | 123 | See Also 124 | -------- 125 | dice, soergel, tanimoto 126 | """ 127 | X, Y = _check_array_pair(X, Y) 128 | if not issparse(X): 129 | return 1.0 - scipy.spatial.distance.cdist(X, Y, metric="cosine") 130 | if assume_binary: 131 | Xbits, Ybits, XYbits = _get_bitcount_arrays(X, Y, return_XYbits=True) 132 | with np.errstate(divide="ignore"): # handle 0 in denominator 133 | return np.asarray(np.nan_to_num(XYbits / np.sqrt(Xbits * Ybits.T))) 134 | else: 135 | return _sparse_cosine(X, Y) 136 | 137 | 138 | def pearson(X, Y=None): 139 | """Compute the Pearson correlation between `X` and `Y`. 140 | 141 | Parameters 142 | ---------- 143 | X : array_like or sparse matrix 144 | with shape (`n_fprints_X`, `n_bits`). 145 | Y : array_like or sparse matrix, optional 146 | with shape (`n_fprints_Y`, `n_bits`). 147 | 148 | Returns 149 | ------- 150 | pearson : array of shape (`n_fprints_X`, `n_fprints_Y`) 151 | 152 | 153 | See Also 154 | -------- 155 | soergel: Soergel similarity for non-binary data 156 | cosine, dice, tanimoto 157 | """ 158 | X, Y = _check_array_pair(X, Y) 159 | Xlen = X.shape[0] 160 | if issparse(X): 161 | X = vstack((X, Y), format="csr") 162 | X = X - X.mean(axis=1) 163 | cov = (X * X.T) / (X.shape[1] - 1.0) 164 | d = np.sqrt(np.diag(cov)) 165 | with np.errstate(divide="ignore"): # handle 0 in denominator 166 | pearson = cov / np.outer(d, d) 167 | else: 168 | with np.errstate(divide="ignore"): # handle 0 in denominator 169 | pearson = np.corrcoef(X, Y) 170 | return np.asarray(np.nan_to_num(pearson[:Xlen, Xlen:])) 171 | 172 | 173 | def _check_array(arr, dtype=float, force_sparse=False): 174 | if force_sparse or issparse(arr): 175 | return csr_matrix(arr, copy=False, dtype=dtype) 176 | else: 177 | return arr.astype(dtype, copy=False) 178 | 179 | 180 | def _check_array_pair(X, Y=None, dtype=float, force_sparse=False): 181 | if Y is not None and X.shape[1] != Y.shape[1]: 182 | raise ValueError("Arrays must have same width.") 183 | if force_sparse or issparse(X) or issparse(Y): 184 | force_sparse = True # ensure if one is sparse, all are sparse. 
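    # Coerce X (and Y, if given) to a shared dtype and sparse/dense format;
    # when Y is omitted, reusing X lets downstream helpers detect
    # self-comparison with a simple identity check (``Y is X``).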
185 | X = _check_array(X, dtype=dtype, force_sparse=force_sparse) 186 | if Y is None or Y is X: 187 | Y = X 188 | else: 189 | Y = _check_array(Y, dtype=dtype, force_sparse=force_sparse) 190 | return X, Y 191 | 192 | 193 | def _get_bitcount_arrays(X, Y, return_XYbits=False): 194 | if issparse(X): 195 | Xbits = np.sum(X, axis=1) 196 | if Y is X: 197 | Ybits = Xbits 198 | else: 199 | Ybits = np.sum(Y, axis=1) 200 | if return_XYbits: 201 | XYbits = (X * Y.T).toarray() 202 | return Xbits, Ybits, XYbits 203 | else: 204 | Xbits = np.sum(X, axis=1, keepdims=True) 205 | if Y is X: 206 | Ybits = Xbits 207 | else: 208 | Ybits = np.sum(Y, axis=1, keepdims=True) 209 | if return_XYbits: 210 | XYbits = np.dot(X, Y.T) 211 | return Xbits, Ybits, XYbits 212 | return Xbits, Ybits 213 | 214 | 215 | def _sparse_cosine(X, Y): 216 | Xnorm = scipy.sparse.linalg.norm(X, axis=1) 217 | if Y is X: 218 | Ynorm = Xnorm 219 | else: 220 | Ynorm = scipy.sparse.linalg.norm(Y, axis=1) 221 | XY = (X * Y.T).toarray() 222 | with np.errstate(divide="ignore"): # handle 0 in denominator 223 | return np.nan_to_num(XY / np.outer(Xnorm, Ynorm)) 224 | 225 | @maybe_jit(nopython=True, nogil=True, cache=True) 226 | def _dense_soergel(X, Y, S): 227 | for ix in range(S.shape[0]): 228 | for iy in range(S.shape[1]): 229 | sum_abs_diff = 0 230 | sum_max = 0 231 | for j in range(X.shape[1]): 232 | diff = X[ix, j] - Y[iy, j] 233 | if diff > 0: 234 | sum_abs_diff += diff 235 | sum_max += X[ix, j] 236 | else: 237 | sum_abs_diff -= diff 238 | sum_max += Y[iy, j] 239 | 240 | if sum_max == 0: 241 | S[ix, iy] = 0 242 | continue 243 | S[ix, iy] = 1 - sum_abs_diff / sum_max 244 | return S 245 | 246 | @maybe_jit(nopython=True, nogil=True, cache=True) 247 | def _sparse_soergel(Xdata, Xindices, Xindptr, Ydata, Yindices, Yindptr, S): 248 | for ix in range(S.shape[0]): 249 | if Xindptr[ix] == Xindptr[ix + 1]: 250 | for iy in range(S.shape[1]): # no X values in row 251 | S[ix, iy] = 0 252 | continue 253 | jxindmax = Xindptr[ix + 1] - 1 254 | for iy in range(S.shape[1]): 255 | if Yindptr[iy] == Yindptr[iy + 1]: # no Y values in row 256 | S[ix, iy] = 0 257 | continue 258 | 259 | sum_abs_diff = 0 260 | sum_max = 0 261 | # Implementation of the final step of merge sort 262 | jyindmax = Yindptr[iy + 1] - 1 263 | jx = Xindptr[ix] 264 | jy = Yindptr[iy] 265 | while jx <= jxindmax and jy <= jyindmax: 266 | jxind = Xindices[jx] 267 | jyind = Yindices[jy] 268 | if jxind < jyind: 269 | sum_max += Xdata[jx] 270 | sum_abs_diff += Xdata[jx] 271 | jx += 1 272 | elif jyind < jxind: 273 | sum_max += Ydata[jy] 274 | sum_abs_diff += Ydata[jy] 275 | jy += 1 276 | else: 277 | diff = Xdata[jx] - Ydata[jy] 278 | if diff > 0: 279 | sum_abs_diff += diff 280 | sum_max += Xdata[jx] 281 | else: 282 | sum_abs_diff -= diff 283 | sum_max += Ydata[jy] 284 | jx += 1 285 | jy += 1 286 | 287 | while jx <= jxindmax: 288 | sum_max += Xdata[jx] 289 | sum_abs_diff += Xdata[jx] 290 | jx += 1 291 | 292 | while jy <= jyindmax: 293 | sum_max += Ydata[jy] 294 | sum_abs_diff += Ydata[jy] 295 | jy += 1 296 | 297 | if sum_max == 0: 298 | S[ix, iy] = 0 299 | continue 300 | S[ix, iy] = 1 - sum_abs_diff / sum_max 301 | return S 302 | -------------------------------------------------------------------------------- /src/e3fp/fingerprint/array_ops.py: -------------------------------------------------------------------------------- 1 | """Various array operations. 
2 | 3 | Author: Seth Axen 4 | E-mail: seth.axen@gmail.com 5 | """ 6 | import numpy as np 7 | from scipy.spatial.distance import pdist, squareform 8 | 9 | QUATERNION_DTYPE = float 10 | X_AXIS, Y_AXIS, Z_AXIS = np.identity(3, dtype=float) 11 | EPS = 1e-12 # epsilon, a number close to 0 12 | 13 | 14 | # Vector Algebra Methods 15 | def as_unit(v, axis=1): 16 | """Return array of unit vectors parallel to vectors in `v`. 17 | 18 | Parameters 19 | ---------- 20 | v : ndarray of float 21 | axis : int, optional 22 | Axis along which to normalize length. 23 | 24 | Returns 25 | ------- 26 | ndarray of float : Unit vector of `v`, i.e. `v` divided by its 27 | magnitude along `axis`. 28 | """ 29 | u = np.array(v, dtype=float, copy=True) 30 | if u.ndim == 1: 31 | sqmag = u.dot(u) 32 | if sqmag >= EPS: 33 | u /= sqmag ** 0.5 34 | else: 35 | if axis == 1: 36 | sqmag = np.einsum("...ij,...ij->...i", u, u) 37 | else: 38 | sqmag = np.einsum("...ij,...ij->...j", u, u) 39 | 40 | sqmag[sqmag < EPS] = 1.0 41 | u /= np.expand_dims(np.sqrt(sqmag), axis) 42 | return u 43 | 44 | 45 | def make_distance_matrix(coords): 46 | """Build pairwise distance matrix from coordinates. 47 | 48 | Parameters 49 | ---------- 50 | coords : ndarray of float 51 | an Mx3 array of cartesian coordinates. 52 | 53 | Returns 54 | ------- 55 | ndarray of float : square symmetrical distance matrix 56 | """ 57 | return squareform(pdist(coords)) 58 | 59 | 60 | def make_transform_matrix(center, y=None, z=None): 61 | """Make 4x4 homogenous transformation matrix. 62 | 63 | Given Nx4 array A where A[:, 4] = 1., the transform matrix M should be 64 | used with dot(M, A.T).T. Order of operations is 1. translation, 2. align 65 | `y` x `z` plane to yz-plane 3. align `y` to y-axis. 66 | 67 | Parameters 68 | ---------- 69 | center : 1x3 array of float 70 | Coordinate that should be centered after transformation. 71 | y : None or 1x3 array of float 72 | Vector that should lie on the y-axis after transformation 73 | z : None or 1x3 array of float 74 | Vector that after transformation should lie on yz-plane in direction 75 | of z-axis. 76 | 77 | Returns 78 | ------- 79 | 4x4 array of float 80 | 4x4 homogenous transformation matrix. 81 | """ 82 | translate = np.identity(4, dtype=float) 83 | translate[:3, 3] = -np.asarray(center, dtype=float) 84 | if y is not None: 85 | y = np.atleast_2d(y) 86 | if z is None: 87 | rotate = np.identity(4, dtype=float) 88 | rotate[:3, :3] = make_rotation_matrix(y, Y_AXIS) 89 | else: 90 | z = np.atleast_2d(z) 91 | rotate_norm = np.identity(4, dtype=float) 92 | x_unit = as_unit(np.cross(y, z)) 93 | rotate_norm[:3, :3] = make_rotation_matrix(x_unit, X_AXIS) 94 | new_y = np.dot(rotate_norm[:3, :3], y.flatten()) 95 | rotate_y = np.identity(4, dtype=float) 96 | rotate_y[:3, :3] = make_rotation_matrix(new_y.flatten(), Y_AXIS) 97 | rotate = np.dot(rotate_y, rotate_norm) 98 | transform = np.dot(rotate, translate) 99 | else: 100 | transform = translate 101 | return transform 102 | 103 | 104 | def make_rotation_matrix(v0, v1): 105 | """Create 3x3 matrix of rotation from `v0` onto `v1`. 106 | 107 | Should be used by dot(R, v0.T).T. 108 | 109 | Parameters 110 | ---------- 111 | v0 : 1x3 array of float 112 | Initial vector before alignment. 113 | v1 : 1x3 array of float 114 | Vector to which to align `v0`. 
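
    Returns
    -------
    3x3 array of float
        Rotation matrix `R` such that ``np.dot(R, v0.T).T`` is parallel to `v1`.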
115 | """ 116 | v0 = as_unit(v0) 117 | v1 = as_unit(v1) 118 | u = np.cross(v0.ravel(), v1.ravel()) 119 | if np.all(u == 0.0): 120 | return np.identity(3, dtype=float) 121 | sin_ang = u.dot(u) ** 0.5 122 | u /= sin_ang 123 | cos_ang = np.dot(v0, v1.T) 124 | # fmt: off 125 | ux = np.array([[ 0., -u[2], u[1]], 126 | [ u[2], 0., -u[0]], 127 | [-u[1], u[0], 0.]], dtype=float) 128 | # fmt: on 129 | rot = ( 130 | cos_ang * np.identity(3, dtype=float) 131 | + sin_ang * ux 132 | + (1 - cos_ang) * np.outer(u, u) 133 | ) 134 | return rot 135 | 136 | 137 | def transform_array(transform_matrix, a): 138 | """Pad an array with 1s, transform, and return with original dimensions. 139 | 140 | Parameters 141 | ---------- 142 | transform_matrix : 4x4 array of float 143 | 4x4 homogenous transformation matrix 144 | a : Nx3 array of float 145 | Array of 3-D coordinates. 146 | 147 | Returns 148 | ------- 149 | Nx3 array of float : Transformed array 150 | """ 151 | return unpad_array(np.dot(transform_matrix, pad_array(a).T).T) 152 | 153 | 154 | def pad_array(a, n=1.0, axis=1): 155 | """Return `a` with row of `n` appended to `axis`. 156 | 157 | Parameters 158 | ---------- 159 | a : ndarray 160 | Array to pad 161 | n : float or int, optional 162 | Value to pad `a` with 163 | axis : int, optional 164 | Axis of `a` to pad with `n`. 165 | 166 | Returns 167 | ------- 168 | ndarray 169 | Padded array. 170 | """ 171 | if a.ndim == 1: 172 | pad = np.ones(a.shape[0] + 1, dtype=a.dtype) * n 173 | pad[: a.shape[0]] = a 174 | else: 175 | shape = list(a.shape) 176 | shape[axis] += 1 177 | pad = np.ones(shape, dtype=a.dtype) 178 | pad[: a.shape[0], : a.shape[1]] = a 179 | return pad 180 | 181 | 182 | def unpad_array(a, axis=1): 183 | """Return `a` with row removed along `axis`. 184 | 185 | Parameters 186 | ---------- 187 | a : ndarray 188 | Array from which to remove row 189 | axis : int, optional 190 | Axis from which to remove row 191 | 192 | Returns 193 | ------- 194 | ndarray 195 | Unpadded array. 196 | """ 197 | if a.ndim == 1: 198 | return a[:-1] 199 | else: 200 | shape = list(a.shape) 201 | shape[axis] -= 1 202 | return a[: shape[0], : shape[1]] 203 | 204 | 205 | def project_to_plane(vec_arr, norm): 206 | """Project array of vectors to plane with normal `norm`. 207 | 208 | Parameters 209 | ---------- 210 | vec_arr : Nx3 array 211 | Array of N 3D vectors. 212 | norm : 1x3 array 213 | Normal vector to plane. 214 | 215 | Returns 216 | ------- 217 | Nx3 array 218 | Array of vectors projected onto plane. 219 | """ 220 | unit_norm = as_unit(norm).flatten() 221 | mag_on_norm = np.dot(vec_arr, unit_norm) 222 | if vec_arr.ndim == 1: 223 | vec_on_norm = np.array(unit_norm, copy=True) 224 | vec_on_norm *= mag_on_norm 225 | else: 226 | vec_on_norm = np.tile(unit_norm, (vec_arr.shape[0], 1)) 227 | vec_on_norm *= mag_on_norm[:, None] 228 | return vec_arr - vec_on_norm 229 | 230 | 231 | def calculate_angles(vec_arr, ref, ref_norm=None): 232 | """Calculate angles between vectors in `vec_arr` and `ref` vector. 233 | 234 | If `ref_norm` is not provided, angle ranges between 0 and pi. If it is 235 | provided, angle ranges between 0 and 2pi. Note that if `ref_norm` is 236 | orthogonal to `vec_arr` and `ref`, then the angle is rotation around the 237 | axis, but if a non-orthogonal axis is provided, this may not be the case. 238 | 239 | Parameters 240 | ---------- 241 | vec_arr : Nx3 array of float 242 | Array of N 3D vectors. 243 | ref : 1x3 array of float 244 | Reference vector 245 | ref_norm : 1x3 array of float 246 | Normal vector. 
247 | 248 | Returns 249 | ------- 250 | 1-D array 251 | Array of N angles 252 | """ 253 | unit_vec_arr = as_unit(vec_arr) 254 | unit_ref = as_unit(ref).flatten() 255 | ang = np.arccos(np.clip(np.dot(unit_vec_arr, unit_ref), -1.0, 1.0)) 256 | # handle cases where a vector is the origin 257 | ang[np.all(unit_vec_arr == np.zeros(3), axis=1)] = 0.0 258 | if ref_norm is not None: 259 | sign = np.sign( 260 | np.dot(ref_norm, np.cross(unit_vec_arr, unit_ref).T) 261 | ).flatten() 262 | sign[sign == 0] = 1 263 | ang = rotate_angles(sign * ang, 2 * np.pi) 264 | return ang 265 | 266 | 267 | def rotate_angles(angles, amount): 268 | """Rotate angles by `amount`, keeping in 0 to 2pi range. 269 | 270 | Parameters 271 | ---------- 272 | angles : 1-D array of float 273 | Angles in radians 274 | amount : float 275 | Amount to rotate angles by 276 | 277 | Returns 278 | ------- 279 | 1-D array of float : Rotated angles 280 | """ 281 | return (angles + amount) % (2 * np.pi) 282 | 283 | 284 | def quaternion_to_transform_matrix(quaternion, translation=np.zeros(3)): 285 | """Convert quaternion to homogenous 4x4 transform matrix. 286 | 287 | Parameters 288 | ---------- 289 | quaternion : 4x1 array of float 290 | Quaternion describing rotation after translation. 291 | translation : 3x1 array of float, optional 292 | Translation to be performed before rotation. 293 | """ 294 | q = np.array(quaternion, dtype=float, copy=True) 295 | n = np.linalg.norm(q) 296 | if n < 1e-12: 297 | return np.identity(4, dtype=float) 298 | q /= n 299 | q = 2 * np.outer(q, q) 300 | # fmt: off 301 | transform_mat = np.array( 302 | [[1.-q[2, 2]-q[3, 3], q[1, 2]-q[3, 0], q[1, 3]+q[2, 0], 0.], 303 | [ q[1, 2]+q[3, 0], 1.-q[1, 1]-q[3, 3], q[2, 3]-q[1, 0], 0.], 304 | [ q[1, 3]-q[2, 0], q[2, 3]+q[1, 0], 1.-q[1, 1]-q[2, 2], 0.], 305 | [ 0., 0., 0., 1.]], 306 | dtype=float 307 | ) 308 | # fmt: on 309 | transform_mat[:3, 3] = translation 310 | return transform_mat 311 | 312 | 313 | def transform_matrix_to_quaternion(transform_matrix, dtype=QUATERNION_DTYPE): 314 | """Convert homogenous 4x4 transform matrix to quaternion. 315 | 316 | Parameters 317 | ---------- 318 | transform_matrix : 4x4 array of float 319 | Homogenous transformation matrix. 320 | dtype : numpy dtype, optional 321 | Datatype for returned quaternion. 322 | """ 323 | T = np.array(transform_matrix, dtype=float) 324 | R = T[:3, :3] 325 | q = np.zeros(4, dtype=dtype) 326 | q[0] = np.sqrt(1.0 + R.trace()) / 2.0 327 | q[1] = R[2, 1] - R[1, 2] 328 | q[2] = R[0, 2] - R[2, 0] 329 | q[3] = R[1, 0] - R[0, 1] 330 | q[1:4] /= 4.0 * q[0] 331 | return q 332 | -------------------------------------------------------------------------------- /tests/test_struct.py: -------------------------------------------------------------------------------- 1 | """Tests for Shell and Substruct objects. 
2 | 3 | Author: Seth Axen 4 | E-mail: seth.axen@gmail.com 5 | """ 6 | import os 7 | import pytest 8 | 9 | DATA_DIR = os.path.join(os.path.dirname(__file__), "data") 10 | PLANAR_SDF_FILE = os.path.join(DATA_DIR, "caffeine_planar.sdf.bz2") 11 | 12 | 13 | class TestShellCreation: 14 | def test_error_when_center_not_atom(self): 15 | from e3fp.fingerprint.structs import Shell 16 | 17 | with pytest.raises(TypeError): 18 | Shell(None) 19 | 20 | def test_error_when_shells_has_non_shell(self): 21 | from e3fp.fingerprint.structs import Shell 22 | 23 | atom = 0 24 | shells = [None] 25 | with pytest.raises(TypeError): 26 | Shell(atom, shells) 27 | 28 | def test_creation_with_atoms_or_ids_equivalent(self): 29 | from e3fp.fingerprint.structs import Shell 30 | from e3fp.conformer.util import mol_from_sdf 31 | 32 | mol = mol_from_sdf(PLANAR_SDF_FILE) 33 | atoms = list(mol.GetAtoms()) 34 | atom_ids = [x.GetIdx() for x in atoms] 35 | assert Shell(atoms[0], atoms[1:]) == Shell(atom_ids[0], atom_ids[1:]) 36 | 37 | def test_create_shell_no_shell(self): 38 | from e3fp.fingerprint.structs import Shell 39 | from e3fp.conformer.util import mol_from_sdf 40 | 41 | mol = mol_from_sdf(PLANAR_SDF_FILE) 42 | atoms = list(mol.GetAtoms()) 43 | center_atom = atoms[0] 44 | Shell(center_atom) 45 | 46 | def test_create_shell_with_same_center_fails(self): 47 | from e3fp.fingerprint.structs import Shell, FormatError 48 | from e3fp.conformer.util import mol_from_sdf 49 | 50 | mol = mol_from_sdf(PLANAR_SDF_FILE) 51 | atoms = list(mol.GetAtoms()) 52 | center_atom = atoms[0] 53 | with pytest.raises(FormatError): 54 | Shell(center_atom, atoms) 55 | 56 | def test_atoms_converted_to_shells(self): 57 | from e3fp.fingerprint.structs import Shell 58 | from e3fp.conformer.util import mol_from_sdf 59 | 60 | mol = mol_from_sdf(PLANAR_SDF_FILE) 61 | atoms = list(mol.GetAtoms()) 62 | center_atom = atoms[0] 63 | shell = Shell(center_atom, atoms[1:]) 64 | for s in shell.shells: 65 | assert isinstance(s, Shell) 66 | 67 | def test_creation_with_atoms_or_shells_equal(self): 68 | from e3fp.fingerprint.structs import Shell 69 | from e3fp.conformer.util import mol_from_sdf 70 | 71 | mol = mol_from_sdf(PLANAR_SDF_FILE) 72 | atoms = list(mol.GetAtoms()) 73 | shells = list(map(Shell, atoms)) 74 | center_atom = atoms[0] 75 | shell1 = Shell(center_atom, atoms[1:]) 76 | shell2 = Shell(center_atom, shells[1:]) 77 | assert shell1 == shell2 78 | 79 | def test_recursive_atom_shells_correct(self): 80 | from e3fp.fingerprint.structs import Shell 81 | from e3fp.conformer.util import mol_from_sdf 82 | 83 | mol = mol_from_sdf(PLANAR_SDF_FILE) 84 | atoms = list(mol.GetAtoms()) 85 | shell1 = Shell(atoms[5], atoms[6:8]) 86 | shell2 = Shell(atoms[2], atoms[3:5]) 87 | shell = Shell(atoms[0], (shell1, shell2)) 88 | assert shell.atoms == {x.GetIdx() for x in (atoms[0], atoms[2], atoms[5])} 89 | 90 | 91 | class TestShellComparison: 92 | def test_shells_same_center_same_atoms_equal(self): 93 | from e3fp.fingerprint.structs import Shell 94 | from e3fp.conformer.util import mol_from_sdf 95 | 96 | mol = mol_from_sdf(PLANAR_SDF_FILE) 97 | atoms = list(mol.GetAtoms()) 98 | center_atom = atoms[0] 99 | shell1 = Shell(center_atom, atoms[1:]) 100 | shell2 = Shell(center_atom, atoms[1:]) 101 | assert shell1 == shell2 102 | 103 | def test_shells_diff_center_same_atoms_nonequal(self): 104 | from e3fp.fingerprint.structs import Shell 105 | from e3fp.conformer.util import mol_from_sdf 106 | 107 | mol = mol_from_sdf(PLANAR_SDF_FILE) 108 | atoms = list(mol.GetAtoms()) 109 | shell1 = 
Shell(atoms[0], atoms[2:]) 110 | shell2 = Shell(atoms[1], atoms[2:]) 111 | assert shell1 != shell2 112 | 113 | def test_shells_same_center_diff_atoms_nonequal(self): 114 | from e3fp.fingerprint.structs import Shell 115 | from e3fp.conformer.util import mol_from_sdf 116 | 117 | mol = mol_from_sdf(PLANAR_SDF_FILE) 118 | atoms = list(mol.GetAtoms()) 119 | center_atom = atoms[0] 120 | shell1 = Shell(center_atom, atoms[1:]) 121 | shell2 = Shell(center_atom, atoms[2:]) 122 | assert shell1 != shell2 123 | 124 | def test_equal_shells_hash_to_same_value(self): 125 | from e3fp.fingerprint.structs import Shell 126 | from e3fp.conformer.util import mol_from_sdf 127 | 128 | mol = mol_from_sdf(PLANAR_SDF_FILE) 129 | atoms = list(mol.GetAtoms()) 130 | center_atom = atoms[0] 131 | shell1 = Shell(center_atom, atoms[1:]) 132 | shell2 = Shell(center_atom, atoms[1:]) 133 | assert hash(shell1) == hash(shell2) 134 | 135 | def test_same_shell_hashes_to_same_value(self): 136 | from e3fp.fingerprint.structs import Shell 137 | from e3fp.conformer.util import mol_from_sdf 138 | 139 | mol = mol_from_sdf(PLANAR_SDF_FILE) 140 | atoms = list(mol.GetAtoms()) 141 | center_atom = atoms[0] 142 | shell = Shell(center_atom, atoms[1:]) 143 | assert hash(shell) == hash(shell) 144 | 145 | 146 | class TestShellSubstructInterface: 147 | def test_recursive_shell_substruct_correct1(self): 148 | from e3fp.fingerprint.structs import Shell 149 | from e3fp.conformer.util import mol_from_sdf 150 | 151 | mol = mol_from_sdf(PLANAR_SDF_FILE) 152 | atoms = list(mol.GetAtoms()) 153 | shell1 = Shell(atoms[5], atoms[6:8]) 154 | shell2 = Shell(atoms[1], atoms[2:5]) 155 | shell = Shell(atoms[0], (shell1, shell2)) 156 | assert shell.substruct.atoms == {x.GetIdx() for x in atoms[:8]} 157 | 158 | def test_recursive_shell_substruct_correct2(self): 159 | from e3fp.fingerprint.structs import Shell 160 | from e3fp.conformer.util import mol_from_sdf 161 | 162 | mol = mol_from_sdf(PLANAR_SDF_FILE) 163 | atoms = list(mol.GetAtoms()) 164 | shell1 = Shell(atoms[1], atoms[2:5]) 165 | shell2 = Shell(atoms[5], {shell1}) 166 | shell3 = Shell(atoms[6], atoms[7:10]) 167 | shell4 = Shell(atoms[10], {shell3}) 168 | shell = Shell(atoms[0], (shell2, shell4)) 169 | assert shell.substruct.atoms == {x.GetIdx() for x in atoms[:11]} 170 | 171 | def test_shell_creation_from_substruct_without_center_fails(self): 172 | from e3fp.fingerprint.structs import Shell, Substruct, FormatError 173 | from e3fp.conformer.util import mol_from_sdf 174 | 175 | mol = mol_from_sdf(PLANAR_SDF_FILE) 176 | atoms = list(mol.GetAtoms()) 177 | substruct = Substruct(None, atoms[:2]) 178 | with pytest.raises(FormatError): 179 | Shell.from_substruct(substruct) 180 | 181 | def test_shell_creation_from_substruct(self): 182 | from e3fp.fingerprint.structs import Shell, Substruct 183 | from e3fp.conformer.util import mol_from_sdf 184 | 185 | mol = mol_from_sdf(PLANAR_SDF_FILE) 186 | atoms = list(mol.GetAtoms()) 187 | substruct = Substruct(atoms[0], atoms[:2]) 188 | shell = Shell.from_substruct(substruct) 189 | assert shell.atoms == substruct.atoms 190 | 191 | def test_substruct_creation_from_shell(self): 192 | from e3fp.fingerprint.structs import Shell, Substruct 193 | from e3fp.conformer.util import mol_from_sdf 194 | 195 | mol = mol_from_sdf(PLANAR_SDF_FILE) 196 | atoms = list(mol.GetAtoms()) 197 | shell = Shell(atoms[0], atoms[1:]) 198 | substruct = Substruct.from_shell(shell) 199 | assert shell.substruct == substruct 200 | 201 | 202 | class TestSubstructCreation: 203 | def 
test_error_when_center_not_atom(self): 204 | from e3fp.fingerprint.structs import Substruct 205 | 206 | with pytest.raises(TypeError): 207 | Substruct("foo") 208 | 209 | def test_error_when_atoms_has_non_atom(self): 210 | from e3fp.fingerprint.structs import Substruct 211 | 212 | atoms = [None] 213 | with pytest.raises(TypeError): 214 | Substruct(atoms=atoms) 215 | 216 | def test_center_atom_auto_added_to_atoms(self): 217 | from e3fp.fingerprint.structs import Substruct 218 | from e3fp.conformer.util import mol_from_sdf 219 | 220 | mol = mol_from_sdf(PLANAR_SDF_FILE) 221 | atoms = list(mol.GetAtoms()) 222 | center_atom = atoms[0] 223 | substruct = Substruct(center_atom, atoms[1:]) 224 | assert center_atom.GetIdx() in substruct.atoms 225 | 226 | 227 | class TestSubstructCreationComparison: 228 | def test_substructs_same_center_same_atoms_equal(self): 229 | from e3fp.fingerprint.structs import Substruct 230 | from e3fp.conformer.util import mol_from_sdf 231 | 232 | mol = mol_from_sdf(PLANAR_SDF_FILE) 233 | atoms = list(mol.GetAtoms()) 234 | center_atom = atoms[0] 235 | substruct1 = Substruct(center_atom, atoms) 236 | substruct2 = Substruct(center_atom, atoms) 237 | assert substruct1 == substruct2 238 | 239 | def test_substructs_diff_center_same_atoms_equal(self): 240 | from e3fp.fingerprint.structs import Substruct 241 | from e3fp.conformer.util import mol_from_sdf 242 | 243 | mol = mol_from_sdf(PLANAR_SDF_FILE) 244 | atoms = list(mol.GetAtoms()) 245 | substruct1 = Substruct(atoms[0], atoms) 246 | substruct2 = Substruct(atoms[1], atoms) 247 | assert substruct1 == substruct2 248 | 249 | def test_substructs_same_center_diff_atoms_nonequal(self): 250 | from e3fp.fingerprint.structs import Substruct 251 | from e3fp.conformer.util import mol_from_sdf 252 | 253 | mol = mol_from_sdf(PLANAR_SDF_FILE) 254 | atoms = list(mol.GetAtoms()) 255 | substruct1 = Substruct(atoms[0], atoms[1:]) 256 | substruct2 = Substruct(atoms[0], atoms[2:]) 257 | assert substruct1 != substruct2 258 | 259 | def test_equal_shells_hash_to_same_value(self): 260 | from e3fp.fingerprint.structs import Substruct 261 | from e3fp.conformer.util import mol_from_sdf 262 | 263 | mol = mol_from_sdf(PLANAR_SDF_FILE) 264 | atoms = list(mol.GetAtoms()) 265 | center_atom = atoms[0] 266 | substruct1 = Substruct(center_atom, atoms[1:]) 267 | substruct2 = Substruct(center_atom, atoms[1:]) 268 | assert hash(substruct1) == hash(substruct2) 269 | 270 | def test_same_shells_hash_to_same_value(self): 271 | from e3fp.fingerprint.structs import Substruct 272 | from e3fp.conformer.util import mol_from_sdf 273 | 274 | mol = mol_from_sdf(PLANAR_SDF_FILE) 275 | atoms = list(mol.GetAtoms()) 276 | center_atom = atoms[0] 277 | substruct = Substruct(center_atom, atoms[1:]) 278 | assert hash(substruct) == hash(substruct) 279 | -------------------------------------------------------------------------------- /src/e3fp/fingerprint/structs.py: -------------------------------------------------------------------------------- 1 | """Class for defining 3D atom environments. 
2 | 3 | Author: Seth Axen 4 | E-mail: seth.axen@gmail.com 5 | """ 6 | from __future__ import division, print_function 7 | from functools import reduce 8 | 9 | import numpy as np 10 | import rdkit.Chem 11 | 12 | import smart_open 13 | from e3fp.fingerprint import array_ops 14 | 15 | 16 | PDB_LINE = ( 17 | "HETATM{atom_id:>5d} {name:<4s} LIG A 1 " 18 | "{coord[0]:>8.3f}{coord[1]:>8.3f}{coord[2]:>8.3f}" 19 | "{occupancy:>6.2f}{temp:>6.2f} {elem:>2s}{charge:>2s}" 20 | ) 21 | 22 | 23 | class Shell(object): 24 | """A container for other Shells centered on an atom. 25 | 26 | Shells represent all atoms explicitly within a container. Atoms are 27 | represented by their ids. If atoms are provided instead of shells, they 28 | are converted to single-atom shells. A Substruct is generated from a Shell 29 | on the fly by recursion through member shells. An optional identifier may 30 | be set. 31 | """ 32 | 33 | def __init__( 34 | self, 35 | center_atom, 36 | shells=set(), 37 | radius=None, 38 | last_shell=None, 39 | identifier=None, 40 | ): 41 | if isinstance(center_atom, rdkit.Chem.Atom): 42 | center_atom = center_atom.GetIdx() 43 | elif not isinstance(center_atom, (int, np.integer)): 44 | raise TypeError("center_atom must be Atom or atom id") 45 | self._center_atom = center_atom 46 | 47 | self._shells = set() 48 | for shell in shells: 49 | if isinstance(shell, int): 50 | shell = Shell(shell) 51 | elif isinstance(shell, rdkit.Chem.Atom): 52 | shell = Shell(shell.GetIdx()) 53 | elif not isinstance(shell, Shell): 54 | raise TypeError("shells must be Shells, Atoms, or atom ids") 55 | if shell.center_atom == self.center_atom: 56 | raise FormatError( 57 | "member shells cannot be centered on same " 58 | "center_atom as new shell" 59 | ) 60 | self._shells.add(shell) 61 | self._shells = frozenset(self._shells) 62 | 63 | self.radius = radius 64 | self.last_shell = last_shell 65 | self.atoms = None 66 | self.substruct = None 67 | self.identifier = identifier 68 | self.is_duplicate = False 69 | self.duplicate = None 70 | 71 | @classmethod 72 | def from_substruct(cls, substruct): 73 | """Create shell with one shell for each atom in the substruct.""" 74 | if substruct.center_atom is None: 75 | raise FormatError( 76 | "Can only create Shell from Substruct if " 77 | "center_atom is defined" 78 | ) 79 | atoms = substruct.atoms ^ {substruct.center_atom} 80 | return cls(substruct.center_atom, [Shell(x) for x in atoms]) 81 | 82 | @property 83 | def center_atom(self): 84 | return self._center_atom 85 | 86 | @property 87 | def shells(self): 88 | return self._shells 89 | 90 | @property 91 | def atoms(self): 92 | """Get all atoms explicitly within the shell.""" 93 | if self._atoms is None: 94 | self._atoms = set([self.center_atom,]) 95 | self._atoms.update([x.center_atom for x in self.shells]) 96 | return self._atoms 97 | 98 | @atoms.setter 99 | def atoms(self, atoms): 100 | self._atoms = atoms 101 | 102 | @property 103 | def substruct(self): 104 | """Get substruct with all atoms implicitly within the shell.""" 105 | if self._substruct is None: 106 | atom_sets = [set(x.substruct.atoms) for x in self.shells] 107 | if len(atom_sets) > 0: 108 | atoms = reduce(set.union, atom_sets) 109 | else: 110 | atoms = set() 111 | self._substruct = Substruct( 112 | center_atom=self.center_atom, atoms=atoms 113 | ) 114 | self._substruct.shell = self 115 | return self._substruct 116 | 117 | @substruct.setter 118 | def substruct(self, substruct): 119 | if not isinstance(substruct, Substruct) and substruct is not None: 120 | raise 
TypeError("substruct must be of type Substruct") 121 | self._substruct = substruct 122 | 123 | def __repr__(self): 124 | return ( 125 | "Shell(center_atom={!r}, shells={!r}, radius={!r}, " 126 | "last_shell={!r}, identifier={!r})" 127 | ).format( 128 | self.center_atom, 129 | tuple(self.shells), 130 | self.radius, 131 | self.last_shell, 132 | self.identifier, 133 | ) 134 | 135 | def __str__(self): 136 | return ( 137 | "Shell(center_atom={!r}, atoms={!r}, radius={!r}, " 138 | "identifier={!r})" 139 | ).format( 140 | self.center_atom, tuple(self.atoms), self.radius, self.identifier 141 | ) 142 | 143 | def __hash__(self): 144 | return hash((self.center_atom, self.shells)) 145 | 146 | def __eq__(self, other): 147 | return (self.center_atom == other.center_atom) and ( 148 | self.shells == other.shells 149 | ) 150 | 151 | def __ne__(self, other): 152 | return not self.__eq__(other) 153 | 154 | def __len__(self): 155 | return 1 + len(self.shells) 156 | 157 | def __contains__(self, key): 158 | if isinstance(key, (int, rdkit.Chem.Atom)): 159 | key = Shell(key) 160 | return key in self.shells or key == self 161 | 162 | 163 | class Substruct(object): 164 | """A container for atoms optionally centered on an atom. 165 | 166 | A Substruct represents all atoms implicitly within a Shell. Two Substructs 167 | are equal if they contain the same atoms. 168 | """ 169 | 170 | def __init__(self, center_atom=None, atoms=set()): 171 | self.center_atom = center_atom 172 | self.shell = None 173 | self._atoms = set() 174 | for atom in atoms: 175 | if isinstance(atom, rdkit.Chem.Atom): 176 | atom = atom.GetIdx() 177 | elif not isinstance(atom, (int, np.integer)): 178 | raise TypeError("atoms must be Atom or atom id") 179 | self._atoms.add(atom) 180 | if self.center_atom is not None: 181 | self._atoms.add(self.center_atom) 182 | self._atoms = frozenset(self._atoms) 183 | self.transform_matrix = np.identity(4, dtype=float) 184 | 185 | @classmethod 186 | def from_shell(cls, shell): 187 | return shell.substruct 188 | 189 | @property 190 | def center_atom(self): 191 | return self._center_atom 192 | 193 | @center_atom.setter 194 | def center_atom(self, center_atom): 195 | if isinstance(center_atom, rdkit.Chem.Atom): 196 | center_atom = center_atom.GetIdx() 197 | elif ( 198 | not isinstance(center_atom, (int, np.integer)) 199 | and center_atom is not None 200 | ): 201 | raise TypeError("center_atom must be Atom or atom id") 202 | self._center_atom = center_atom 203 | 204 | @property 205 | def atoms(self): 206 | return self._atoms 207 | 208 | def __repr__(self): 209 | return "Substruct(center_atom={!r}, atoms={!r})".format( 210 | self.center_atom, tuple(self.atoms) 211 | ) 212 | 213 | def __str__(self): 214 | return self.__repr__() 215 | 216 | def __hash__(self): 217 | return hash(self.atoms) 218 | 219 | def __eq__(self, other): 220 | return self.atoms == other.atoms 221 | 222 | def __ne__(self, other): 223 | return not self.__eq__(other) 224 | 225 | def __len__(self): 226 | return len(self.atoms) 227 | 228 | def __contains__(self, key): 229 | if isinstance(key, rdkit.Chem.Atom): 230 | key = key.GetIdx() 231 | return key in self.atoms 232 | 233 | 234 | class FormatError(Exception): 235 | pass 236 | 237 | 238 | # methods/classes for shell i/o 239 | def shell_to_pdb( 240 | mol, shell, atom_coords, bound_atoms_dict, out_file=None, reorient=True 241 | ): 242 | """Append substructure within shell to PDB. 
243 | 244 | Parameters 245 | ---------- 246 | mol : RDKit Mol 247 | Input mol 248 | shell : Shell 249 | A shell 250 | atom_coords : dict 251 | Dict matching atom id to coordinates. 252 | bound_atoms_dict : dict 253 | Dict matching atom id to id of bound atoms. 254 | out_file : str or None, optional 255 | File to which to append coordinates. 256 | reorient : bool, optional 257 | Use the transformation matrix in the shell to align by the stereo 258 | quadrants. If no transformation matrix present, centers the center 259 | atom. 260 | 261 | Returns 262 | ------- 263 | list of str: list of PDB file lines, if `out_file` not specified 264 | """ 265 | remark = "REMARK 400" 266 | name = mol.GetProp("_Name") if mol.HasProp("_Name") else "molecule" 267 | header_lines = [remark + " COMPOUND", remark + " " + name] 268 | lines = header_lines + [ 269 | "MODEL", 270 | ] 271 | atom_ids = sorted(shell.substruct.atoms) 272 | atoms = [mol.GetAtomWithIdx(int(x)) for x in atom_ids] 273 | coords = np.asarray(list(map(atom_coords.get, atom_ids)), dtype=float) 274 | if reorient: 275 | try: 276 | coords = array_ops.transform_array(shell.transform_matrix, coords) 277 | except AttributeError: 278 | coords -= atom_coords[shell.center_atom] 279 | 280 | for i, atom_id in enumerate(atom_ids): 281 | elem = atoms[i].GetSymbol() 282 | name = "{}{:d}".format(elem, atom_id + 1) 283 | charge = atoms[i].GetFormalCharge() 284 | if charge > 0: 285 | charge = "{:d}+".format(charge) 286 | elif charge < 0: 287 | charge = "{:d}-".format(abs(charge)) 288 | else: 289 | charge = "" 290 | if atom_id == shell.center_atom: 291 | temp = 1.0 292 | elif atom_id in shell.atoms: 293 | temp = 0.5 294 | else: 295 | temp = 0.0 296 | pdb_entries = { 297 | "atom_id": atom_id, 298 | "name": name, 299 | "coord": coords[i, :].flatten(), 300 | "occupancy": 0.0, 301 | "temp": temp, 302 | "elem": elem, 303 | "charge": charge, 304 | } 305 | lines.append(PDB_LINE.format(**pdb_entries)) 306 | 307 | # PLACEHOLDER FOR WRITING BONDS TO PDB 308 | # used_bonds = set() 309 | # write_bonds = [] 310 | # for atom_id in atom_ids: 311 | # write_bonds.append(atom_id) 312 | # bound_atom_ids = bound_atoms_dict.get(atom_id, set()) 313 | # for bound_atom_id in bound_atom_ids: 314 | # if (atom_id, bound_atom_id) in used_bonds: 315 | # continue 316 | # if len(write_bonds) > 3: 317 | # lines.append("CONECT "+" ".join(map(str, write_bonds))) 318 | # write_bonds = [atom_id,] 319 | # write_bonds.append(bound_atom_id) 320 | # used_bonds.add((atom_id, bound_atom_id)) 321 | # used_bonds.add((bound_atom_id, atom_id)) 322 | 323 | # lines.append("CONECT "+" ".join(map(str, write_bonds))) 324 | # write_bonds = [] 325 | 326 | lines.append("ENDMDL") 327 | 328 | if out_file is not None: 329 | with smart_open.open(out_file, "a") as f: 330 | for line in lines: 331 | f.write(line + "\n") 332 | else: 333 | return lines 334 | -------------------------------------------------------------------------------- /src/e3fp/conformer/util.py: -------------------------------------------------------------------------------- 1 | """Utilities for handling SMILES strings and RDKit mols and conformers. 
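Provides helpers for parsing SMILES files, composing and parsing conformer
names, and reading/writing multi-conformer SDF files.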
2 | 3 | Author: Seth Axen 4 | E-mail: seth.axen@gmail.com 5 | """ 6 | import os 7 | import re 8 | import copy 9 | import logging 10 | from collections import namedtuple 11 | 12 | import rdkit 13 | import rdkit.Chem 14 | import rdkit.Chem.PropertyMol 15 | from rdkit.Chem.PropertyMol import PropertyMol 16 | from python_utilities.io_tools import touch_dir 17 | import smart_open 18 | 19 | PROTO_NAME_DELIM = "-" 20 | CONF_NAME_DELIM = "_" 21 | MOL_ITEM_REGEX = re.compile( 22 | r"(?P<{0}>.+?)(?:{1}(?P<{2}>\d+))?(?:{3}(?P<{4}>\d+))?$".format( 23 | "mol_name", 24 | PROTO_NAME_DELIM, 25 | "proto_state_num", 26 | CONF_NAME_DELIM, 27 | "conf_num", 28 | ) 29 | ) 30 | MOL_ITEM_FIELDS = ("mol_name", "proto_state_num", "conf_num") 31 | CONF_ENERGIES_PROPNAME = "_ConfEnergies" 32 | CONF_ENERGIES_DELIM = "|" 33 | CONF_ENERGY_PROPNAME = "Energy" 34 | 35 | MolItemTuple = namedtuple( 36 | "MolItemTuple", ["mol_name", "proto_state_num", "conf_num"] 37 | ) 38 | 39 | 40 | class MolItemName(object): 41 | """Class for parsing mol item names and converting to various formats.""" 42 | 43 | def __init__( 44 | self, 45 | mol_name=None, 46 | proto_state_num=None, 47 | conf_num=None, 48 | proto_delim=PROTO_NAME_DELIM, 49 | conf_delim=CONF_NAME_DELIM, 50 | ): 51 | self.mol_name = mol_name 52 | self.proto_state_num = proto_state_num 53 | self.conf_num = conf_num 54 | self.proto_delim = proto_delim 55 | self.conf_delim = conf_delim 56 | 57 | @classmethod 58 | def from_str( 59 | cls, 60 | mol_item_name, 61 | mol_item_regex=MOL_ITEM_REGEX, 62 | mol_item_fields=MOL_ITEM_FIELDS, 63 | **kwargs 64 | ): 65 | fields = cls.mol_item_name_to_dict( 66 | mol_item_name, 67 | mol_item_regex=mol_item_regex, 68 | mol_item_fields=mol_item_fields, 69 | ) 70 | return cls( 71 | fields["mol_name"], 72 | fields["proto_state_num"], 73 | fields["conf_num"], 74 | **kwargs 75 | ) 76 | 77 | def to_str(self): 78 | return self.mol_item_name 79 | 80 | @classmethod 81 | def from_tuple(cls, fields_tuple): 82 | return cls(*fields_tuple) 83 | 84 | def to_tuple(self): 85 | return MolItemTuple(self.mol_name, self.proto_state_num, self.conf_num) 86 | 87 | @property 88 | def mol_name(self): 89 | return self._mol_name 90 | 91 | @mol_name.setter 92 | def mol_name(self, mol_name): 93 | self._mol_name = mol_name 94 | 95 | def to_mol_name(self, as_proto=False): 96 | if as_proto: 97 | return self.proto_name 98 | else: 99 | return self.mol_name 100 | 101 | @property 102 | def proto_name(self): 103 | return self.to_proto_name(self.proto_state_num) 104 | 105 | def to_proto_name( 106 | self, proto_state_num=None, proto_delim=PROTO_NAME_DELIM 107 | ): 108 | if proto_state_num is not None: 109 | return "{}{}{:d}".format( 110 | self.mol_name, proto_delim, proto_state_num 111 | ) 112 | else: 113 | return self.mol_name 114 | 115 | @property 116 | def conf_name(self): 117 | return self.to_conf_name(conf_num=self.conf_num) 118 | 119 | def to_conf_name(self, conf_num=None, conf_delim=CONF_NAME_DELIM): 120 | if conf_num is not None: 121 | return "{}{}{:d}".format(self.proto_name, conf_delim, conf_num) 122 | else: 123 | return self.proto_name 124 | 125 | @property 126 | def mol_item_name(self): 127 | return self.conf_name 128 | 129 | @staticmethod 130 | def mol_item_name_to_dict( 131 | mol_item_name, 132 | mol_item_regex=MOL_ITEM_REGEX, 133 | mol_item_fields=MOL_ITEM_FIELDS, 134 | ): 135 | match = re.match(mol_item_regex, mol_item_name) 136 | groups = match.groups() 137 | fields = dict(zip(mol_item_fields, groups)) 138 | proto_state_num = fields.get("proto_state_num") 139 | if 
proto_state_num is not None: 140 | fields["proto_state_num"] = int(proto_state_num) 141 | conf_num = fields.get("conf_num") 142 | if conf_num is not None: 143 | fields["conf_num"] = int(conf_num) 144 | return fields 145 | 146 | def copy(self): 147 | return copy.copy(self) 148 | 149 | def __repr__(self): 150 | return ( 151 | "MolItemName(mol_name={}, proto_state_num={}, " 152 | "conf_num={})".format( 153 | self.mol_name, self.proto_state_num, self.conf_num 154 | ) 155 | ) 156 | 157 | def __str__(self): 158 | return self.conf_name 159 | 160 | def __eq__(self, other): 161 | return self.to_tuple() == other.to_tuple() 162 | 163 | def __ne__(self, other): 164 | return not self.__eq__(other) 165 | 166 | def __gt__(self, other): 167 | return self.to_tuple().__gt__(other.to_tuple()) 168 | 169 | def __lt__(self, other): 170 | return self.to_tuple().__lt__(other.to_tuple()) 171 | 172 | def __hash__(self): 173 | return hash(self.to_tuple()) 174 | 175 | 176 | def smiles_generator(*filenames): 177 | """Parse SMILES file(s) and yield (name, smile). 178 | 179 | Parameters 180 | ---------- 181 | files : iterable object 182 | List of files containing smiles. File must contain one smile per 183 | line, followed by a space and then the molecule name. 184 | 185 | Yields 186 | ------ 187 | tuple: 188 | `tuple` of the format (smile, name). 189 | """ 190 | for filename in filenames: 191 | with smart_open.open(filename, "r") as f: 192 | for i, line in enumerate(f): 193 | values = line.rstrip("\r\n").split() 194 | if len(values) >= 2: 195 | yield tuple(values[:2]) 196 | else: 197 | logging.warning( 198 | ( 199 | "Line {:d} of {} has {:d} entries. Expected at least" 200 | " 2.".format(i + 1, filename, len(values)) 201 | ), 202 | exc_info=True, 203 | ) 204 | 205 | 206 | def smiles_to_dict(smiles_file, unique=False, has_header=False): 207 | """Read SMILES file to dict.""" 208 | smiles_gen = smiles_generator(smiles_file) 209 | if has_header: 210 | header = next(smiles_gen) 211 | logging.info("Skipping first (header) values: {!r}".format(header)) 212 | if unique: 213 | used_smiles = set() 214 | smiles_dict = {} 215 | for smiles, name in smiles_gen: 216 | if name not in smiles_dict and smiles not in used_smiles: 217 | smiles_dict[name] = smiles 218 | used_smiles.add(smiles) 219 | else: 220 | smiles_dict = {name: smiles for smiles, name in smiles_gen} 221 | return smiles_dict 222 | 223 | 224 | def dict_to_smiles(smiles_file, smiles_dict): 225 | """Write SMILES dict to file.""" 226 | iter_to_smiles(smiles_file, sorted(smiles_dict.items())) 227 | 228 | 229 | def iter_to_smiles(smiles_file, smiles_iter): 230 | """Write iterator of (mol_name, SMILES) to file.""" 231 | with smart_open.open(smiles_file, "w") as f: 232 | for mol_name, smiles in smiles_iter: 233 | f.write("{} {}\n".format(smiles, mol_name)) 234 | 235 | 236 | def mol2_generator(*filenames): 237 | """Parse name from mol2 filename and return generator. 238 | 239 | Parameters 240 | ---------- 241 | files : iterable object 242 | List of mol2 files, where filename should be molecule name followed by 243 | ".mol2" 244 | 245 | Yields 246 | ------ 247 | tuple: 248 | `tuple` of the format (file, name). 249 | """ 250 | for filename in filenames: 251 | name = os.path.splitext(os.path.basename(filename))[0] 252 | yield (filename, name) 253 | 254 | 255 | def mol_from_smiles(smiles, name, standardise=False): 256 | """Generate a n RDKit `PropertyMol` from SMILES string. 
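    The molecule name and the input SMILES are stored on the returned mol
    as the ``_Name`` and ``_SMILES`` properties.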
257 | 258 | Parameters 259 | ---------- 260 | smile : str 261 | SMILES string 262 | name : str 263 | Name of molecule 264 | standardise : bool 265 | Clean Mol through standardisation 266 | 267 | Returns 268 | ------- 269 | RDKit PropertyMol : Molecule. 270 | """ 271 | mol = rdkit.Chem.MolFromSmiles(smiles) 272 | if mol is None: 273 | logging.error( 274 | "Mol creation failed from SMILES: {!r}".format((smiles, name)) 275 | ) 276 | return None 277 | if standardise: 278 | mol = mol_to_standardised_mol(mol, name) 279 | mol = PropertyMol(mol) 280 | mol.SetProp("_Name", name) 281 | mol.SetProp("_SMILES", smiles) 282 | return mol 283 | 284 | 285 | def mol_from_mol2(mol2_file, name=None, standardise=False): 286 | """Read a mol2 file into an RDKit `PropertyMol`. 287 | 288 | Parameters 289 | ---------- 290 | mol2_file : str 291 | path to a mol2 file 292 | name : str, optional 293 | Name of molecule. If not provided, uses file basename as name 294 | standardise : bool 295 | Clean mol through standardisation 296 | 297 | Returns 298 | ------- 299 | RDKit PropertyMol : Molecule. 300 | """ 301 | if name is None: 302 | name = os.path.splitext(os.path.basename(mol2_file))[0] 303 | mol = rdkit.Chem.MolFromMol2File(mol2_file) 304 | if standardise: 305 | mol = mol_to_standardised_mol(mol, name) 306 | mol = PropertyMol(mol) 307 | mol.SetProp("_Name", name) 308 | return mol 309 | 310 | 311 | def mol_from_sdf(sdf_file, conf_num=None, standardise=False, mode="rb"): 312 | """Read SDF file into an RDKit `Mol` object. 313 | 314 | Parameters 315 | ---------- 316 | sdf_file : str 317 | Path to an SDF file 318 | conf_num : int or None, optional 319 | Maximum number of conformers to read from file. Defaults to all. 320 | standardise : bool (default False) 321 | Clean mol through standardisation 322 | mode : str (default 'rb') 323 | Mode with which to open file 324 | 325 | Returns 326 | ------- 327 | RDKit Mol : `Mol` object with each molecule in SDF file as a conformer 328 | """ 329 | mol = None 330 | conf_energies = [] 331 | with smart_open.open(sdf_file, mode) as f: 332 | supplier = rdkit.Chem.ForwardSDMolSupplier(f) 333 | i = 0 334 | while True: 335 | if i == conf_num: 336 | break 337 | try: 338 | new_mol = next(supplier) 339 | except StopIteration: 340 | logging.debug( 341 | "Read {:d} conformers from {}.".format(i, sdf_file) 342 | ) 343 | break 344 | 345 | if new_mol.HasProp(CONF_ENERGY_PROPNAME): 346 | conf_energies.append( 347 | float(new_mol.GetProp(CONF_ENERGY_PROPNAME)) 348 | ) 349 | 350 | if mol is None: 351 | mol = rdkit.Chem.Mol(new_mol) 352 | mol.RemoveAllConformers() 353 | conf = new_mol.GetConformers()[0] 354 | mol.AddConformer(conf, assignId=True) 355 | i += 1 356 | if standardise: 357 | mol = mol_to_standardised_mol(mol) 358 | try: 359 | mol.GetProp("_Name") 360 | except KeyError: 361 | name = os.path.basename(sdf_file).split(".sdf")[0] 362 | mol.SetProp("_Name", name) 363 | 364 | if len(conf_energies) > 0: 365 | add_conformer_energies_to_mol(mol, conf_energies) 366 | mol.ClearProp(CONF_ENERGY_PROPNAME) 367 | 368 | return mol 369 | 370 | 371 | def mol_to_sdf(mol, out_file, conf_num=None): 372 | """Write RDKit `Mol` objects to an SDF file. 373 | 374 | Parameters 375 | ---------- 376 | mol : RDKit Mol 377 | A molecule containing 1 or more conformations to write to file. 378 | out_file : str 379 | Path to save SDF file. 380 | conf_num : int or None, optional 381 | Maximum number of conformers to save to file. Defaults to all. 
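
    Notes
    -----
    A minimal round trip, assuming ``mol.sdf.bz2`` exists (sketch)::

        mol = mol_from_sdf("mol.sdf.bz2")
        mol_to_sdf(mol, "out/mol_subset.sdf.bz2", conf_num=5)

    Conformer energies attached with ``add_conformer_energies_to_mol`` are
    written to each conformer record's ``Energy`` property.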
382 | """ 383 | touch_dir(os.path.dirname(out_file)) 384 | with smart_open.open(out_file, "w") as fobj: 385 | writer = rdkit.Chem.SDWriter(fobj) 386 | conf_ids = [conf.GetId() for conf in mol.GetConformers()] 387 | conf_energies = get_conformer_energies_from_mol(mol) 388 | mol.ClearProp(CONF_ENERGIES_PROPNAME) 389 | for i in conf_ids: 390 | if conf_num not in {-1, None} and i >= conf_num: 391 | break 392 | try: 393 | conf_energy = conf_energies[i] 394 | mol.SetProp(CONF_ENERGY_PROPNAME, "{:.4f}".format(conf_energy)) 395 | except (IndexError, TypeError): 396 | pass 397 | writer.write(mol, confId=i) 398 | writer.close() 399 | mol.ClearProp(CONF_ENERGY_PROPNAME) 400 | if conf_energies is not None: 401 | add_conformer_energies_to_mol(mol, conf_energies) 402 | logging.debug("Saved {:d} conformers to {}.".format(i + 1, out_file)) 403 | 404 | 405 | def mol_to_standardised_mol(mol, name=None): 406 | """Standardise mol(s).""" 407 | try: 408 | from standardiser import standardise 409 | from standardiser.utils import StandardiseException 410 | except ImportError: 411 | logging.warning( 412 | "standardiser module unavailable. Using unstandardised mol." 413 | ) 414 | return mol 415 | 416 | if name is None: 417 | try: 418 | name = mol.GetProp("_Name") 419 | except KeyError: 420 | name = repr(mol) 421 | 422 | if isinstance(mol, PropertyMol): 423 | mol_type = PropertyMol 424 | mol = rdkit.Chem.Mol(mol) 425 | else: 426 | mol_type = rdkit.Chem.Mol 427 | 428 | logging.debug("Standardising {}".format(name)) 429 | try: 430 | std_mol = standardise.run(mol) 431 | except AttributeError: # backwards-compatible with old standardiser 432 | std_mol = standardise.apply(mol) 433 | except StandardiseException: 434 | logging.error( 435 | ( 436 | "Standardisation of {} failed. Using unstandardised " 437 | "mol.".format(name) 438 | ), 439 | exc_info=True, 440 | ) 441 | return mol_type(mol) 442 | 443 | std_mol = mol_type(std_mol) 444 | try: 445 | std_mol.SetProp("_Name", mol.GetProp("_Name")) 446 | except KeyError: 447 | pass 448 | 449 | return std_mol 450 | 451 | 452 | def add_conformer_energies_to_mol(mol, energies): 453 | """Add conformer energies as mol property. 454 | 455 | See discussion at https://sourceforge.net/p/rdkit/mailman/message/27547551/ 456 | """ 457 | energies_str = CONF_ENERGIES_DELIM.join( 458 | "{:.4f}".format(e) for e in energies 459 | ) 460 | mol.SetProp(CONF_ENERGIES_PROPNAME, energies_str) 461 | return mol 462 | 463 | 464 | def get_conformer_energies_from_mol(mol): 465 | """Get conformer energies from mol.""" 466 | if not mol.HasProp(CONF_ENERGIES_PROPNAME): 467 | return None 468 | energies_str = mol.GetProp(CONF_ENERGIES_PROPNAME) 469 | energies = [float(x) for x in energies_str.split(CONF_ENERGIES_DELIM)] 470 | return energies 471 | -------------------------------------------------------------------------------- /src/e3fp/conformer/generator.py: -------------------------------------------------------------------------------- 1 | """Conformer generation. 
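Provides the ``ConformerGenerator`` class, which embeds a pool of RDKit
conformers, minimizes them with a chosen force field, and filters them by
RMSD and energy.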
2 | 3 | Author: Seth Axen 4 | E-mail: seth.axen@gmail.com 5 | """ 6 | import logging 7 | 8 | import numpy as np 9 | 10 | from rdkit import Chem 11 | from rdkit.Chem import AllChem 12 | from rdkit.Chem import PropertyMol 13 | from .util import add_conformer_energies_to_mol 14 | 15 | # Heavily modified by Seth Axen from code under the following license 16 | __author__ = "Steven Kearnes" 17 | __copyright__ = "Copyright 2014, Stanford University" 18 | __license__ = "3-clause BSD" 19 | 20 | # options 21 | FORCEFIELD_CHOICES = ("uff", "mmff94", "mmff94s") 22 | 23 | # default values 24 | NUM_CONF_DEF = -1 25 | FIRST_DEF = -1 26 | POOL_MULTIPLIER_DEF = 1 27 | RMSD_CUTOFF_DEF = 0.5 28 | MAX_ENERGY_DIFF_DEF = -1.0 29 | FORCEFIELD_DEF = "uff" 30 | SEED_DEF = -1 31 | 32 | 33 | class ConformerGenerator(object): 34 | """Generate conformers using RDKit. 35 | 36 | Procedure 37 | --------- 38 | 1. Generate a pool of conformers. 39 | 2. Minimize conformers. 40 | 3. Filter conformers using an RMSD threshold and optional minimum energy 41 | difference. 42 | 43 | Note that pruning is done _after_ minimization, which differs from the 44 | protocol described in the references. 45 | 46 | References 47 | ---------- 48 | * http://rdkit.org/docs/GettingStartedInPython.html 49 | #working-with-3d-molecules 50 | * http://pubs.acs.org/doi/full/10.1021/ci2004658 51 | * https://github.com/skearnes/rdkit-utils/blob/master/rdkit_utils/ 52 | conformers.py 53 | """ 54 | 55 | def __init__( 56 | self, 57 | num_conf: int=NUM_CONF_DEF, 58 | first: int=FIRST_DEF, 59 | rmsd_cutoff: float=RMSD_CUTOFF_DEF, 60 | max_energy_diff: float=MAX_ENERGY_DIFF_DEF, 61 | forcefield: str=FORCEFIELD_DEF, 62 | pool_multiplier: int=POOL_MULTIPLIER_DEF, 63 | seed: int=SEED_DEF, 64 | get_values: bool=False, 65 | sparse_rmsd: bool=True, 66 | store_energies: bool=True, 67 | ): 68 | """Initialize generator settings. 69 | 70 | Parameters 71 | ---------- 72 | num_conf : int, optional 73 | Maximum number of conformers to generate (after pruning). -1 74 | results in auto selection of max_conformers. 75 | first : int, optional 76 | Terminate when this number of conformers has been accepted, and 77 | only return those conformers. -1 results in all conformers being 78 | returned. 79 | pool_multiplier : int, optional 80 | Factor to multiply by max_conformers to generate the initial 81 | conformer pool. Since conformers are filtered after energy 82 | minimization, increasing the size of the pool increases the chance 83 | of identifying max_conformers unique conformers. 84 | rmsd_cutoff : float, optional 85 | RMSD cutoff for pruning conformers. If None or negative, no 86 | pruning is performed. 87 | max_energy_diff : float, optional 88 | If set, conformers with energies this amount above the minimum 89 | energy conformer are not accepted. 90 | forcefield : {'uff', 'mmff94', 'mmff94s'}, optional 91 | Force field to use for conformer energy calculation and 92 | minimization. 93 | seed : int, optional 94 | Random seed for conformer generation. If -1, the random number 95 | generator is unseeded. 96 | get_values : boolean, optional 97 | Return tuple of key values, for storage. 98 | sparse_rmsd : bool, optional 99 | If `get_values` is True, instead of returning full symmetric RMSD 100 | matrix, only return flattened upper triangle. 101 | store_energies : bool, optional 102 | Store conformer energies as property in mol. 
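
        Examples
        --------
        A generator that accepts at most 10 conformers, pruned at a 0.5
        RMSD cutoff and embedded with a fixed seed (all other settings are
        left at their defaults):

        >>> cg = ConformerGenerator(num_conf=10, rmsd_cutoff=0.5, seed=42)
        >>> cg.max_conformers
        10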
103 | """ 104 | if not isinstance(num_conf, int) or num_conf < -1 or num_conf == 0: 105 | raise ValueError("num_conf must be either -1 or a positive integer") 106 | self.max_conformers = num_conf 107 | if not isinstance(first, int) or first < -1 or first == 0: 108 | raise ValueError("first must be either -1 or a positive integer") 109 | self.first_conformers = first 110 | if not rmsd_cutoff or rmsd_cutoff < 0: 111 | rmsd_cutoff = -1.0 112 | self.rmsd_cutoff = rmsd_cutoff 113 | 114 | if max_energy_diff is None or max_energy_diff < 0: 115 | max_energy_diff = -1.0 116 | self.max_energy_diff = max_energy_diff 117 | 118 | if forcefield not in FORCEFIELD_CHOICES: 119 | raise ValueError( 120 | "%s is not a valid option for forcefield" % forcefield 121 | ) 122 | self.forcefield = forcefield 123 | if not isinstance(pool_multiplier, int) or pool_multiplier < 1: 124 | raise ValueError("pool_multiplier must be a positive integer") 125 | self.pool_multiplier = pool_multiplier 126 | self.seed = seed 127 | self.get_values = get_values 128 | self.sparse_rmsd = sparse_rmsd 129 | self.store_energies = store_energies 130 | 131 | def __call__(self, mol): 132 | """Generate conformers for a molecule. 133 | 134 | Parameters 135 | ---------- 136 | mol : RDKit Mol 137 | Molecule. 138 | 139 | Returns 140 | ------- 141 | RDKit Mol : copy of the input molecule with embedded conformers 142 | """ 143 | return self.generate_conformers(mol) 144 | 145 | def generate_conformers(self, mol): 146 | """Generate conformers for a molecule. 147 | 148 | Parameters 149 | ---------- 150 | mol : RDKit Mol 151 | Molecule. 152 | 153 | Returns 154 | ------- 155 | RDKit Mol : copy of the input molecule with embedded conformers 156 | """ 157 | # initial embedding 158 | mol = self.embed_molecule(mol) 159 | if not mol.GetNumConformers(): 160 | msg = "No conformers generated for molecule" 161 | if mol.HasProp("_Name"): 162 | name = mol.GetProp("_Name") 163 | msg += ' "{}".'.format(name) 164 | else: 165 | msg += "." 166 | raise RuntimeError(msg) 167 | 168 | # minimization and filtering 169 | self.minimize_conformers(mol) 170 | mol, indices, energies, rmsds = self.filter_conformers(mol) 171 | 172 | if self.store_energies: 173 | add_conformer_energies_to_mol(mol, energies) 174 | 175 | if self.get_values is True: 176 | if self.sparse_rmsd: 177 | rmsds_mat = rmsds[np.triu_indices_from(rmsds, k=1)] 178 | else: 179 | rmsds_mat = rmsds 180 | return mol, (self.max_conformers, indices, energies, rmsds_mat) 181 | else: 182 | return mol 183 | 184 | @staticmethod 185 | def get_num_conformers(mol): 186 | """Return ideal number of conformers from rotatable bond number in model. 187 | 188 | Parameters 189 | ---------- 190 | mol : Mol 191 | RDKit `Mol` object for molecule 192 | 193 | Yields 194 | ------ 195 | num_conf : int 196 | Target number of conformers to accept 197 | """ 198 | num_rot = AllChem.CalcNumRotatableBonds(mol) 199 | if num_rot < 8: 200 | return 50 201 | elif num_rot >= 8 and num_rot <= 12: 202 | return 200 203 | elif num_rot > 12: 204 | return 300 205 | else: 206 | return 0 207 | 208 | def embed_molecule(self, mol): 209 | """Generate conformers, possibly with pruning. 210 | 211 | Parameters 212 | ---------- 213 | mol : RDKit Mol 214 | Molecule. 
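
        Returns
        -------
        RDKit Mol : copy of the molecule with explicit hydrogens added and
            the initial pool of embedded conformers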
215 | """ 216 | log_name = mol.GetProp("_Name") if mol.HasProp("_Name") else "molecule" 217 | logging.debug("Adding hydrogens for %s" % log_name) 218 | mol = Chem.AddHs(mol) # add hydrogens 219 | logging.debug("Hydrogens added to %s" % log_name) 220 | logging.debug("Sanitizing mol for %s" % log_name) 221 | Chem.SanitizeMol(mol) 222 | logging.debug("Mol sanitized for %s" % log_name) 223 | if self.max_conformers == -1 or type(self.max_conformers) is not int: 224 | self.max_conformers = self.get_num_conformers(mol) 225 | n_confs = self.max_conformers * self.pool_multiplier 226 | if self.first_conformers == -1: 227 | self.first_conformers = self.max_conformers 228 | logging.debug("Embedding %d conformers for %s" % (n_confs, log_name)) 229 | AllChem.EmbedMultipleConfs( 230 | mol, 231 | numConfs=n_confs, 232 | maxAttempts=10 * n_confs, 233 | pruneRmsThresh=-1.0, 234 | randomSeed=self.seed, 235 | ignoreSmoothingFailures=True, 236 | ) 237 | logging.debug("Conformers embedded for %s" % log_name) 238 | return mol 239 | 240 | def get_molecule_force_field(self, mol, conf_id=None, **kwargs): 241 | """Get a force field for a molecule. 242 | 243 | Parameters 244 | ---------- 245 | mol : RDKit Mol 246 | Molecule. 247 | conf_id : int, optional 248 | ID of the conformer to associate with the force field. 249 | **kwargs : dict, optional 250 | Keyword arguments for force field constructor. 251 | """ 252 | if self.forcefield == "uff": 253 | ff = AllChem.UFFGetMoleculeForceField( 254 | mol, confId=conf_id, **kwargs 255 | ) 256 | elif self.forcefield.startswith("mmff"): 257 | AllChem.MMFFSanitizeMolecule(mol) 258 | mmff_props = AllChem.MMFFGetMoleculeProperties( 259 | mol, mmffVariant=self.forcefield 260 | ) 261 | ff = AllChem.MMFFGetMoleculeForceField( 262 | mol, mmff_props, confId=conf_id, **kwargs 263 | ) 264 | else: 265 | raise ValueError( 266 | "Invalid forcefield " + "'{}'.".format(self.forcefield) 267 | ) 268 | return ff 269 | 270 | def minimize_conformers(self, mol): 271 | """Minimize molecule conformers. 272 | 273 | Parameters 274 | ---------- 275 | mol : RDKit Mol 276 | Molecule. 277 | """ 278 | log_name = mol.GetProp("_Name") if mol.HasProp("_Name") else "molecule" 279 | logging.debug("Minimizing conformers for %s" % log_name) 280 | for conf in mol.GetConformers(): 281 | ff = self.get_molecule_force_field(mol, conf_id=conf.GetId()) 282 | ff.Minimize() 283 | logging.debug("Conformers minimized for %s" % log_name) 284 | 285 | def get_conformer_energies(self, mol): 286 | """Calculate conformer energies. 287 | 288 | Parameters 289 | ---------- 290 | mol : RDKit Mol 291 | Molecule. 292 | 293 | Returns 294 | ------- 295 | energies : array_like 296 | Minimized conformer energies. 297 | """ 298 | num_conf = mol.GetNumConformers() 299 | energies = np.empty((num_conf,), dtype=float) 300 | for i, conf in enumerate(mol.GetConformers()): 301 | ff = self.get_molecule_force_field(mol, conf_id=conf.GetId()) 302 | energies[i] = ff.CalcEnergy() 303 | return energies 304 | 305 | def filter_conformers(self, mol): 306 | """Filter conformers which do not meet an RMSD threshold. 307 | 308 | Parameters 309 | ---------- 310 | mol : RDKit Mol 311 | Molecule. 312 | 313 | Returns 314 | ------- 315 | A new RDKit Mol containing the chosen conformers, sorted by 316 | increasing energy. 
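
        Notes
        -----
        Conformers are visited in order of increasing energy. The
        lowest-energy conformer is always accepted; each later conformer is
        rejected once ``first`` conformers have been accepted, if its energy
        exceeds the minimum energy by more than ``max_energy_diff``, or if
        its best RMSD to an already accepted conformer is below
        ``rmsd_cutoff``. RMSDs are computed on the hydrogen-stripped
        molecule.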
317 | """ 318 | log_name = mol.GetProp("_Name") if mol.HasProp("_Name") else "molecule" 319 | logging.debug("Pruning conformers for %s" % log_name) 320 | energies = self.get_conformer_energies(mol) 321 | energy_below_threshold = np.ones_like(energies, dtype=np.bool_) 322 | 323 | sort = np.argsort(energies) # sort by increasing energy 324 | confs = np.array(mol.GetConformers()) 325 | 326 | # remove hydrogens to speed up substruct match 327 | mol = Chem.RemoveHs(mol) 328 | accepted = [] # always accept lowest-energy conformer 329 | rejected = [] 330 | rmsds = np.zeros((confs.shape[0], confs.shape[0]), dtype=float) 331 | for i, fit_ind in enumerate(sort): 332 | accepted_num = len(accepted) 333 | 334 | # always accept lowest-energy conformer 335 | if accepted_num == 0: 336 | accepted.append(fit_ind) 337 | 338 | # pre-compute if Es are in acceptable range of min E 339 | if self.max_energy_diff != -1.0: 340 | energy_below_threshold = ( 341 | energies <= energies[fit_ind] + self.max_energy_diff 342 | ) 343 | 344 | continue 345 | 346 | # reject conformers after first_conformers is reached 347 | if accepted_num >= self.first_conformers: 348 | rejected.append(fit_ind) 349 | continue 350 | 351 | # check if energy is too high 352 | if not energy_below_threshold[fit_ind]: 353 | rejected.append(fit_ind) 354 | continue 355 | 356 | # get RMSD to selected conformers 357 | these_rmsds = np.zeros((accepted_num,), dtype=float) 358 | # reverse so all confs aligned to lowest energy 359 | for j, accepted_ind in self.reverse_enumerate(accepted): 360 | this_rmsd = AllChem.GetBestRMS( 361 | mol, 362 | mol, 363 | confs[accepted_ind].GetId(), 364 | confs[fit_ind].GetId(), 365 | ) 366 | # reject conformers within the RMSD threshold 367 | if this_rmsd < self.rmsd_cutoff: 368 | rejected.append(fit_ind) 369 | break 370 | else: 371 | these_rmsds[-j - 1] = this_rmsd 372 | else: 373 | rmsds[fit_ind, accepted] = these_rmsds 374 | rmsds[accepted, fit_ind] = these_rmsds 375 | accepted.append(fit_ind) 376 | 377 | # slice and order rmsds and energies to match accepted list 378 | rmsds = rmsds[np.ix_(accepted, accepted)] 379 | energies = energies[accepted] 380 | 381 | # create a new molecule with all conformers, sorted by energy 382 | new = PropertyMol.PropertyMol(mol) 383 | new.RemoveAllConformers() 384 | conf_ids = [conf.GetId() for conf in mol.GetConformers()] 385 | for i in accepted: 386 | conf = mol.GetConformer(conf_ids[i]) 387 | new.AddConformer(conf, assignId=True) 388 | 389 | logging.debug("Conformers filtered for %s" % log_name) 390 | return new, np.asarray(accepted, dtype=int), energies, rmsds 391 | 392 | @staticmethod 393 | def reverse_enumerate(iterable): 394 | """Enumerate, but with the last result first but still numbered last. 
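        Equivalent to iterating over ``enumerate(iterable)`` in reverse:
        items are yielded last-to-first, but each keeps its forward index.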
395 | 396 | Parameters 397 | ---------- 398 | iterable : 1-D iterable 399 | 400 | Returns 401 | ------- 402 | iterable : 403 | ``(index, item)`` pairs in reverse order; each index matches the forward enumeration 404 | """ 405 | return zip(reversed(range(len(iterable))), reversed(iterable)) 406 | 407 | # magic methods 408 | def __repr__(self): 409 | return """ConformerGenerator(num_conf=%r, first=%r,\ 410 | \n pool_multiplier=%r, rmsd_cutoff=%r,\ 411 | \n max_energy_diff=%r, forcefield=%r,\ 412 | \n get_values=%r, sparse_rmsd=%r)""" % ( 413 | self.max_conformers, 414 | self.first_conformers, 415 | self.pool_multiplier, 416 | self.rmsd_cutoff, 417 | self.max_energy_diff, 418 | self.forcefield, 419 | self.get_values, 420 | self.sparse_rmsd, 421 | ) 422 | 423 | def __str__(self): 424 | return self.__repr__() 425 | --------------------------------------------------------------------------------
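A typical end-to-end use of the two conformer modules above builds a mol from SMILES, generates and filters conformers, and writes the result to SDF. The sketch below is illustrative only: the aspirin SMILES, the seed, and the output path are placeholder choices, not part of the library.

from e3fp.conformer.util import mol_from_smiles, mol_to_sdf
from e3fp.conformer.generator import ConformerGenerator

# Build an RDKit PropertyMol from a SMILES string (placeholder molecule).
mol = mol_from_smiles("CC(=O)OC1=CC=CC=C1C(=O)O", "aspirin")

# Accept up to 10 conformers, pruning any pair closer than 0.5 RMSD.
generator = ConformerGenerator(num_conf=10, rmsd_cutoff=0.5, seed=42)
mol = generator.generate_conformers(mol)

# Write the accepted conformers (and their energies) to a compressed SDF.
mol_to_sdf(mol, "out/aspirin_conformers.sdf.bz2")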