├── .github
├── CODEOWNERS
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── PULL_REQUEST_TEMPLATE.md
├── SECURITY.md
└── workflows
│ ├── code-check.yml
│ ├── doc.yml
│ ├── release.yml
│ └── test.yml
├── .gitignore
├── CHANGELOG.md
├── LICENSE
├── README.md
├── binder
├── environment.yml
└── postBuild
├── codecov.yml
├── datamol
├── __init__.py
├── _sanifix4.py
├── _version.py
├── align.py
├── cluster.py
├── conformers
│ ├── __init__.py
│ ├── _conformers.py
│ └── _features.py
├── convert.py
├── data
│ ├── __init__.py
│ ├── cdk2.sdf
│ ├── chembl_approved_drugs.parquet
│ ├── chembl_drugs.csv
│ ├── chembl_samples.csv
│ ├── freesolv.csv
│ ├── reactions.json
│ ├── salts_solvents.smi
│ ├── solubility.test.sdf
│ └── solubility.train.sdf
├── descriptors
│ ├── __init__.py
│ ├── compute.py
│ └── descriptors.py
├── fp.py
├── fragment
│ ├── __init__.py
│ ├── _assemble.py
│ └── _fragment.py
├── graph.py
├── io.py
├── isomers
│ ├── __init__.py
│ ├── _enumerate.py
│ └── _structural.py
├── log.py
├── mcs.py
├── mol.py
├── molar.py
├── predictors
│ ├── __init__.py
│ └── esol.py
├── reactions
│ ├── __init__.py
│ ├── _attachments.py
│ └── _reactions.py
├── scaffold
│ ├── __init__.py
│ └── _fuzzy.py
├── similarity.py
├── types.py
├── utils
│ ├── __init__.py
│ ├── decorators.py
│ ├── fs.py
│ ├── jobs.py
│ ├── perf.py
│ └── testing.py
└── viz
│ ├── __init__.py
│ ├── _circle_grid.py
│ ├── _conformers.py
│ ├── _lasso_highlight.py
│ ├── _substructure.py
│ ├── _viz.py
│ └── utils.py
├── docs
├── CNAME
├── api
│ ├── datamol.align.md
│ ├── datamol.cluster.md
│ ├── datamol.conformers.md
│ ├── datamol.convert.md
│ ├── datamol.data.md
│ ├── datamol.descriptors.md
│ ├── datamol.fp.md
│ ├── datamol.fragment.md
│ ├── datamol.graph.md
│ ├── datamol.io.md
│ ├── datamol.isomers.md
│ ├── datamol.log.md
│ ├── datamol.mol.md
│ ├── datamol.molar.md
│ ├── datamol.reactions.md
│ ├── datamol.scaffold.md
│ ├── datamol.similarity.md
│ ├── datamol.utils.fs.md
│ ├── datamol.utils.md
│ └── datamol.viz.md
├── assets
│ ├── css
│ │ ├── custom-datamol.css
│ │ ├── custom.css
│ │ └── tweak-width.css
│ └── js
│ │ └── google-analytics.js
├── contribute.md
├── images
│ ├── logo-black.png
│ ├── logo-black.svg
│ ├── logo-title.svg
│ ├── logo.png
│ └── logo.svg
├── index.md
├── license.md
├── tutorials
│ ├── Aligning.ipynb
│ ├── Clustering.ipynb
│ ├── Conformers.ipynb
│ ├── Descriptors.ipynb
│ ├── Filesystem.ipynb
│ ├── Fragment.ipynb
│ ├── Fuzzy_Scaffolds.ipynb
│ ├── Preprocessing.ipynb
│ ├── Reactions.ipynb
│ ├── Scaffolds.ipynb
│ ├── The_Basics.ipynb
│ ├── Visualization.ipynb
│ ├── data
│ │ ├── Enamine_DNA_Libary_5530cmpds_20200831_SMALL.sdf
│ │ └── ReactionBlock.rxn
│ └── images
│ │ ├── Aligning_1.png
│ │ ├── Aligning_2.png
│ │ ├── Conformers_1.png
│ │ ├── Descriptors_1.png
│ │ ├── Fragment_1.png
│ │ ├── Fragment_2.png
│ │ ├── Fragment_3.png
│ │ ├── Preprocess_1.png
│ │ └── Scaffolds_1.png
└── usage.md
├── env.yml
├── mkdocs.yml
├── notebooks
└── Get_ChEMBL_Approved_Drugs.ipynb
├── pyproject.toml
└── tests
├── conftest.py
├── data
├── TUBB3-observations-last-broken.sdf
├── TUBB3-observations.sdf
├── TUBB3-observations.sdf.gz
├── freesolv.csv
├── freesolv.xlsx
└── test.mol2
├── test_align.py
├── test_cluster.py
├── test_conformers.py
├── test_convert.py
├── test_data.py
├── test_descriptors.py
├── test_fp.py
├── test_fragment.py
├── test_graph.py
├── test_import.py
├── test_io.py
├── test_isomers.py
├── test_log.py
├── test_mcs.py
├── test_mol.py
├── test_molar.py
├── test_notebooks.py
├── test_predictors.py
├── test_reactions.py
├── test_scaffold.py
├── test_similarity.py
├── test_utils_fs.py
├── test_utils_jobs.py
├── test_utils_perf.py
├── test_viz.py
├── test_viz_lasso_highlight.py
└── test_viz_substrcture.py
/.github/CODEOWNERS:
--------------------------------------------------------------------------------
1 | * @hadim
2 |
--------------------------------------------------------------------------------
/.github/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Covenant Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | We as members, contributors, and leaders pledge to make participation in our
6 | community a harassment-free experience for everyone, regardless of age, body
7 | size, visible or invisible disability, ethnicity, sex characteristics, gender
8 | identity and expression, level of experience, education, socio-economic status,
9 | nationality, personal appearance, race, religion, or sexual identity
10 | and orientation.
11 |
12 | We pledge to act and interact in ways that contribute to an open, welcoming,
13 | diverse, inclusive, and healthy community.
14 |
15 | ## Our Standards
16 |
17 | Examples of behavior that contributes to a positive environment for our
18 | community include:
19 |
20 | * Demonstrating empathy and kindness toward other people
21 | * Being respectful of differing opinions, viewpoints, and experiences
22 | * Giving and gracefully accepting constructive feedback
23 | * Accepting responsibility and apologizing to those affected by our mistakes,
24 | and learning from the experience
25 | * Focusing on what is best not just for us as individuals, but for the
26 | overall community
27 |
28 | Examples of unacceptable behavior include:
29 |
30 | * The use of sexualized language or imagery, and sexual attention or
31 | advances of any kind
32 | * Trolling, insulting or derogatory comments, and personal or political attacks
33 | * Public or private harassment
34 | * Publishing others' private information, such as a physical or email
35 | address, without their explicit permission
36 | * Other conduct which could reasonably be considered inappropriate in a
37 | professional setting
38 |
39 | ## Enforcement Responsibilities
40 |
41 | Community leaders are responsible for clarifying and enforcing our standards of
42 | acceptable behavior and will take appropriate and fair corrective action in
43 | response to any behavior that they deem inappropriate, threatening, offensive,
44 | or harmful.
45 |
46 | Community leaders have the right and responsibility to remove, edit, or reject
47 | comments, commits, code, wiki edits, issues, and other contributions that are
48 | not aligned to this Code of Conduct, and will communicate reasons for moderation
49 | decisions when appropriate.
50 |
51 | ## Scope
52 |
53 | This Code of Conduct applies within all community spaces, and also applies when
54 | an individual is officially representing the community in public spaces.
55 | Examples of representing our community include using an official e-mail address,
56 | posting via an official social media account, or acting as an appointed
57 | representative at an online or offline event.
58 |
59 | ## Enforcement
60 |
61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
62 | reported to the community leaders responsible for enforcement at
63 | .
64 | All complaints will be reviewed and investigated promptly and fairly.
65 |
66 | All community leaders are obligated to respect the privacy and security of the
67 | reporter of any incident.
68 |
69 | ## Enforcement Guidelines
70 |
71 | Community leaders will follow these Community Impact Guidelines in determining
72 | the consequences for any action they deem in violation of this Code of Conduct:
73 |
74 | ### 1. Correction
75 |
76 | **Community Impact**: Use of inappropriate language or other behavior deemed
77 | unprofessional or unwelcome in the community.
78 |
79 | **Consequence**: A private, written warning from community leaders, providing
80 | clarity around the nature of the violation and an explanation of why the
81 | behavior was inappropriate. A public apology may be requested.
82 |
83 | ### 2. Warning
84 |
85 | **Community Impact**: A violation through a single incident or series
86 | of actions.
87 |
88 | **Consequence**: A warning with consequences for continued behavior. No
89 | interaction with the people involved, including unsolicited interaction with
90 | those enforcing the Code of Conduct, for a specified period of time. This
91 | includes avoiding interactions in community spaces as well as external channels
92 | like social media. Violating these terms may lead to a temporary or
93 | permanent ban.
94 |
95 | ### 3. Temporary Ban
96 |
97 | **Community Impact**: A serious violation of community standards, including
98 | sustained inappropriate behavior.
99 |
100 | **Consequence**: A temporary ban from any sort of interaction or public
101 | communication with the community for a specified period of time. No public or
102 | private interaction with the people involved, including unsolicited interaction
103 | with those enforcing the Code of Conduct, is allowed during this period.
104 | Violating these terms may lead to a permanent ban.
105 |
106 | ### 4. Permanent Ban
107 |
108 | **Community Impact**: Demonstrating a pattern of violation of community
109 | standards, including sustained inappropriate behavior, harassment of an
110 | individual, or aggression toward or disparagement of classes of individuals.
111 |
112 | **Consequence**: A permanent ban from any sort of public interaction within
113 | the community.
114 |
115 | ## Attribution
116 |
117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage],
118 | version 2.0, available at
119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.
120 |
121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct
122 | enforcement ladder](https://github.com/mozilla/diversity).
123 |
124 | [homepage]: https://www.contributor-covenant.org
125 |
126 | For answers to common questions about this code of conduct, see the FAQ at
127 | https://www.contributor-covenant.org/faq. Translations are available at
128 | https://www.contributor-covenant.org/translations.
129 |
--------------------------------------------------------------------------------
/.github/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | see documentation directly at https://docs.datamol.io/stable/contribute.html
2 |
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | ## Changelogs
2 |
3 | - _enumerate the changes of that PR._
4 |
5 | ---
6 |
7 | _Checklist:_
8 |
9 | - [ ] _Was this PR discussed in an issue? It is recommended to first discuss a new feature in a GitHub issue before opening a PR._
10 | - [ ] _Add tests to cover the fixed bug(s) or the new introduced feature(s) (if appropriate)._
11 | - [ ] _Update the API documentation if a new function is added or an existing one is deleted._
12 | - [ ] _Write concise and explanatory changelogs below._
13 | - [ ] _If possible, assign one of the following labels to the PR: `feature`, `fix` or `test` (or ask a maintainer to do it for you)._
14 |
15 | ---
16 |
17 | _discussion related to that PR_
18 |
--------------------------------------------------------------------------------
/.github/SECURITY.md:
--------------------------------------------------------------------------------
1 | # Security Policy
2 |
3 | Please report any security-related issues directly to hadrien@valencediscovery.com.
4 |
--------------------------------------------------------------------------------
/.github/workflows/code-check.yml:
--------------------------------------------------------------------------------
1 | name: code-check
2 |
3 | on:
4 | push:
5 | branches: ["main"]
6 | tags: ["*"]
7 | pull_request:
8 | branches:
9 | - "*"
10 | - "!gh-pages"
11 |
12 | jobs:
13 | python-format-black:
14 | name: Python lint [black]
15 | runs-on: ubuntu-latest
16 | steps:
17 | - name: Checkout the code
18 | uses: actions/checkout@v4
19 |
20 | - name: Set up Python
21 | uses: actions/setup-python@v4
22 | with:
23 | python-version: "3.10"
24 |
25 | - name: Install black
26 | run: |
27 | pip install "black>=24"
28 |
29 | - name: Lint
30 | run: black --check .
31 |
32 | python-lint-ruff:
33 | name: Python lint [ruff]
34 | runs-on: ubuntu-latest
35 | steps:
36 | - name: Checkout the code
37 | uses: actions/checkout@v4
38 |
39 | - name: Set up Python
40 | uses: actions/setup-python@v4
41 | with:
42 | python-version: "3.10"
43 |
44 | - name: Install ruff
45 | run: |
46 | pip install ruff
47 |
48 | - name: Lint
49 | run: ruff .
50 |
--------------------------------------------------------------------------------
/.github/workflows/doc.yml:
--------------------------------------------------------------------------------
1 | name: doc
2 |
3 | on:
4 | push:
5 | branches: ["main"]
6 |
7 | # Prevent doc runs on `main` from conflicting with each other.
8 | concurrency:
9 | group: doc-${{ github.ref }}
10 | cancel-in-progress: true
11 |
12 | jobs:
13 | doc:
14 | runs-on: "ubuntu-latest"
15 | timeout-minutes: 30
16 |
17 | defaults:
18 | run:
19 | shell: bash -l {0}
20 |
21 | steps:
22 | - name: Checkout the code
23 | uses: actions/checkout@v4
24 |
25 | - name: Setup mamba
26 | uses: mamba-org/setup-micromamba@v1
27 | with:
28 | environment-file: env.yml
29 | environment-name: my_env
30 | cache-environment: true
31 | cache-downloads: true
32 |
33 | - name: Install library
34 | run: python -m pip install --no-deps .
35 |
36 | - name: Configure git
37 | run: |
38 | git config --global user.name "${GITHUB_ACTOR}"
39 | git config --global user.email "${GITHUB_ACTOR}@users.noreply.github.com"
40 |
41 | - name: Deploy the doc
42 | run: |
43 | echo "Get the gh-pages branch"
44 | git fetch origin gh-pages
45 |
46 | echo "Build and deploy the doc on main"
47 | mike deploy --push main
48 |
--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
1 | name: release
2 |
3 | on:
4 | workflow_dispatch:
5 | inputs:
6 | release-version:
7 | description: "A valid Semver version string"
8 | required: true
9 |
10 | permissions:
11 | contents: write
12 | pull-requests: write
13 |
14 | jobs:
15 | release:
16 | # Do not release if not triggered from the default branch
17 | if: github.ref == format('refs/heads/{0}', github.event.repository.default_branch)
18 |
19 | runs-on: ubuntu-latest
20 | timeout-minutes: 30
21 |
22 | defaults:
23 | run:
24 | shell: bash -l {0}
25 |
26 | steps:
27 | - name: Checkout the code
28 | uses: actions/checkout@v4
29 |
30 | - name: Setup mamba
31 | uses: mamba-org/setup-micromamba@v1
32 | with:
33 | environment-file: env.yml
34 | environment-name: my_env
35 | cache-environment: true
36 | cache-downloads: true
37 | create-args: >-
38 | pip
39 | semver
40 | python-build
41 | setuptools_scm
42 |
43 | - name: Check the version is valid semver
44 | run: |
45 | RELEASE_VERSION="${{ inputs.release-version }}"
46 |
47 | {
48 | pysemver check $RELEASE_VERSION
49 | } || {
50 | echo "The version '$RELEASE_VERSION' is not a valid Semver version string."
51 | echo "Please use a valid semver version string. More details at https://semver.org/"
52 | echo "The release process is aborted."
53 | exit 1
54 | }
55 |
56 | - name: Check the version is higher than the latest one
57 | run: |
58 | # Retrieve the git tags first
59 | git fetch --prune --unshallow --tags &> /dev/null
60 |
61 | RELEASE_VERSION="${{ inputs.release-version }}"
62 | LATEST_VERSION=$(git describe --abbrev=0 --tags)
63 |
64 | IS_HIGHER_VERSION=$(pysemver compare $RELEASE_VERSION $LATEST_VERSION)
65 |
66 | if [ "$IS_HIGHER_VERSION" != "1" ]; then
67 | echo "The version '$RELEASE_VERSION' is not higher than the latest version '$LATEST_VERSION'."
68 | echo "The release process is aborted."
69 | exit 1
70 | fi
71 |
72 | - name: Build Changelog
73 | id: github_release
74 | uses: mikepenz/release-changelog-builder-action@v4
75 | env:
76 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
77 | with:
78 | toTag: "main"
79 |
80 | - name: Configure git
81 | run: |
82 | git config --global user.name "${GITHUB_ACTOR}"
83 | git config --global user.email "${GITHUB_ACTOR}@users.noreply.github.com"
84 |
85 | - name: Create and push git tag
86 | env:
87 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
88 | run: |
89 | # Tag the release
90 | git tag -a "${{ inputs.release-version }}" -m "Release version ${{ inputs.release-version }}"
91 |
92 | # Checkout the git tag
93 | git checkout "${{ inputs.release-version }}"
94 |
95 | # Push the modified changelogs
96 | git push origin main
97 |
98 | # Push the tags
99 | git push origin "${{ inputs.release-version }}"
100 |
101 | - name: Install library
102 | run: python -m pip install --no-deps .
103 |
104 | - name: Build the wheel and sdist
105 | run: python -m build --no-isolation
106 |
107 | - name: Publish package to PyPI
108 | uses: pypa/gh-action-pypi-publish@release/v1
109 | with:
110 | password: ${{ secrets.PYPI_API_TOKEN }}
111 | packages-dir: dist/
112 |
113 | - name: Create GitHub Release
114 | uses: softprops/action-gh-release@de2c0eb89ae2a093876385947365aca7b0e5f844
115 | with:
116 | tag_name: ${{ inputs.release-version }}
117 | body: ${{steps.github_release.outputs.changelog}}
118 |
119 | - name: Deploy the doc
120 | run: |
121 | echo "Get the gh-pages branch"
122 | git fetch origin gh-pages
123 |
124 | echo "Build and deploy the doc on ${{ inputs.release-version }}"
125 | mike deploy --push stable
126 | mike deploy --push ${{ inputs.release-version }}
127 |
--------------------------------------------------------------------------------
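The two shell steps above validate the requested version before anything is tagged. Below is a rough Python equivalent of that logic, shown only as an illustration: it swaps the `pysemver` CLI for `packaging.version` (which datamol already uses in `datamol/_version.py` and is more permissive than strict SemVer), and the version strings are made up.

```python
from packaging.version import Version, InvalidVersion

release_version = "0.12.3"  # stands in for ${{ inputs.release-version }}
latest_version = "0.12.2"   # stands in for `git describe --abbrev=0 --tags`

# Step 1: the string must parse as a valid version.
try:
    new = Version(release_version)
except InvalidVersion:
    raise SystemExit(f"'{release_version}' is not a valid version string.")

# Step 2: it must be strictly higher than the latest tag.
if new <= Version(latest_version):
    raise SystemExit(f"'{release_version}' is not higher than '{latest_version}'.")
```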
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
1 | name: test
2 |
3 | on:
4 | push:
5 | branches: ["main"]
6 | tags: ["*"]
7 | pull_request:
8 | branches:
9 | - "*"
10 | - "!gh-pages"
11 | schedule:
12 | - cron: "0 4 * * MON"
13 |
14 | jobs:
15 | test:
16 | strategy:
17 | fail-fast: false
18 | matrix:
19 | python-version: ["3.10", "3.11"]
20 | os: ["ubuntu-latest", "macos-latest", "windows-latest"]
21 | rdkit-version: ["2023.09", "2024.03"]
22 |
23 | runs-on: ${{ matrix.os }}
24 | timeout-minutes: 30
25 |
26 | defaults:
27 | run:
28 | shell: bash -l {0}
29 |
30 | name: |
31 | os=${{ matrix.os }}
32 | - python=${{ matrix.python-version }}
33 | - rdkit=${{ matrix.rdkit-version }}
34 |
35 | steps:
36 | - name: Checkout the code
37 | uses: actions/checkout@v4
38 |
39 | - name: Setup mamba
40 | uses: mamba-org/setup-micromamba@v1
41 | with:
42 | environment-file: env.yml
43 | environment-name: my_env
44 | cache-environment: true
45 | cache-downloads: true
46 | create-args: >-
47 | python=${{ matrix.python-version }}
48 | rdkit=${{ matrix.rdkit-version }}
49 |
50 | - name: Install library
51 | run: python -m pip install --no-deps -e . # `-e` required for correct `coverage` run.
52 |
53 | - name: Run tests
54 | run: pytest
55 |
56 | - name: Codecov Upload
57 | uses: codecov/codecov-action@v4
58 | with:
59 | files: ./coverage.xml
60 | flags: unittests
61 | name: codecov-umbrella
62 | fail_ci_if_error: false
63 | verbose: false
64 | env_vars: ${{ matrix.os }},${{ matrix.python-version }},${{ matrix.rdkit-version }}
65 |
66 | - name: Test building the doc
67 | run: mkdocs build
68 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.env
2 | cov.xml
3 | coverage.xml
4 |
5 | .vscode/
6 |
7 | .ipynb_checkpoints/
8 |
9 | *.py[cod]
10 |
11 | # C extensions
12 | *.so
13 |
14 | # Packages
15 | *.egg
16 | *.egg-info
17 | dist
18 | build
19 | eggs
20 | parts
21 | bin
22 | var
23 | sdist
24 | develop-eggs
25 | .installed.cfg
26 | lib
27 | lib64
28 |
29 | # Installer logs
30 | pip-log.txt
31 |
32 | # Unit test / coverage reports
33 | .coverage*
34 | .tox
35 | nosetests.xml
36 | htmlcov
37 |
38 | # Translations
39 | *.mo
40 |
41 | # Mr Developer
42 | .mr.developer.cfg
43 | .project
44 | .pydevproject
45 |
46 | # Complexity
47 | output/*.html
48 | output/*/index.html
49 |
50 | # Sphinx
51 | docs/_build
52 |
53 | MANIFEST
54 |
55 | *.tif
56 |
57 | # Rever
58 | rever/
59 |
60 | # Dev notebook
61 | dev.ipynb
62 |
63 | # MkDocs
64 | site/
65 |
66 | .idea/
67 | __pycache__
68 | .DS_Store
69 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |

3 |
4 |
5 |
6 | datamol - molecular processing made easy
7 |
8 |
9 |
10 | Docs
11 | |
12 |
13 | Homepage
14 |
15 |
16 |
17 | ---
18 |
19 | [](https://zenodo.org/badge/latestdoi/341603042)
20 | [](https://mybinder.org/v2/gh/datamol-io/datamol/main?urlpath=lab/tree/docs/tutorials/The_Basics.ipynb)
21 | [](https://pypi.org/project/datamol/)
22 | [](https://anaconda.org/conda-forge/datamol)
23 | [](https://pypi.org/project/datamol/)
24 | [](https://anaconda.org/conda-forge/datamol)
25 | [](https://pypi.org/project/datamol/)
26 | [](https://github.com/datamol-io/datamol/blob/main/LICENSE)
27 | [](https://github.com/datamol-io/datamol/stargazers)
28 | [](https://github.com/datamol-io/datamol/network/members)
29 | [](https://codecov.io/gh/datamol-io/datamol)
30 |
31 | Datamol is a Python library to work with molecules. It's a layer built on top of [RDKit](https://www.rdkit.org/) and aims to be as light as possible.
32 |
33 | - 🐍 Simple pythonic API
34 | - ⚗️ RDKit first: all you manipulate are `rdkit.Chem.Mol` objects.
35 | - ✅ Manipulating molecules often relies on many options; Datamol provides good defaults by design.
36 | - 🧠 Performance matters: built-in efficient parallelization when possible with an optional progress bar.
37 | - 🕹️ Modern IO: out-of-the-box support for remote paths using `fsspec` to read and write multiple formats (sdf, xlsx, csv, etc).
38 |
39 | ## Try Online
40 |
41 | Visit [](https://mybinder.org/v2/gh/datamol-io/datamol/main?urlpath=lab/tree/docs/tutorials/The_Basics.ipynb) and try Datamol online.
42 |
43 | ## Documentation
44 |
45 | Visit <https://docs.datamol.io>.
46 |
47 | ## Installation
48 |
49 | Use conda:
50 |
51 | ```bash
52 | mamba install -c conda-forge datamol
53 | ```
54 |
55 | ## Quick API Tour
56 |
57 | ```python
58 | import datamol as dm
59 |
60 | # Common functions
61 | mol = dm.to_mol("O=C(C)Oc1ccccc1C(=O)O", sanitize=True)
62 | fp = dm.to_fp(mol)
63 | selfies = dm.to_selfies(mol)
64 | inchi = dm.to_inchi(mol)
65 |
66 | # Standardize and sanitize
67 | mol = dm.to_mol("O=C(C)Oc1ccccc1C(=O)O")
68 | mol = dm.fix_mol(mol)
69 | mol = dm.sanitize_mol(mol)
70 | mol = dm.standardize_mol(mol)
71 |
72 | # Dataframe manipulation
73 | df = dm.data.freesolv()
74 | mols = dm.from_df(df)
75 |
76 | # 2D viz
77 | legends = [dm.to_smiles(mol) for mol in mols[:10]]
78 | dm.viz.to_image(mols[:10], legends=legends)
79 |
80 | # Generate conformers
81 | smiles = "O=C(C)Oc1ccccc1C(=O)O"
82 | mol = dm.to_mol(smiles)
83 | mol_with_conformers = dm.conformers.generate(mol)
84 |
85 | # 3D viz (using nglview)
86 | dm.viz.conformers(mol, n_confs=10)
87 |
88 | # Compute SASA from conformers
89 | sasa = dm.conformers.sasa(mol_with_conformers)
90 |
91 | # Easy IO
92 | mols = dm.read_sdf("s3://my-awesome-data-lake/smiles.sdf", as_df=False)
93 | dm.to_sdf(mols, "gs://data-bucket/smiles.sdf")
94 | ```
95 |
96 | ## How to cite
97 |
98 | Please cite Datamol if you use it in your research: [](https://zenodo.org/badge/latestdoi/341603042).
99 |
100 | ## Compatibilities
101 |
102 | Version compatibilities are an essential topic for production-software stacks. We are cautious about documenting compatibility between `datamol`, `python` and `rdkit`.
103 |
104 | The table below lists the Python and RDKit versions with which each minor version of Datamol **has been tested** during its whole lifecycle. _This does not mean other combinations do not work, only that they have not been tested._
105 |
106 | | `datamol` | `python` | `rdkit` |
107 | | --------- | ------------------- | ----------------------------- |
108 | | `0.12.x` | `[3.10, 3.11]` | `[2023.03, 2023.09]` |
109 | | `0.11.x` | `[3.9, 3.10, 3.11]` | `[2022.09, 2023.03]` |
110 | | `0.10.x` | `[3.9, 3.10, 3.11]` | `[2022.03, 2022.09]` |
111 | | `0.9.x` | `[3.9, 3.10, 3.11]` | `[2022.03, 2022.09]` |
112 | | `0.8.x` | `[3.8, 3.9, 3.10]` | `[2021.09, 2022.03, 2022.09]` |
113 | | `0.7.x` | `[3.8, 3.9]` | `[2021.09, 2022.03]` |
114 | | `0.6.x` | `[3.8, 3.9]` | `[2021.09]` |
115 | | `0.5.x` | `[3.8, 3.9]` | `[2021.03, 2021.09]` |
116 | | `0.4.x` | `[3.8, 3.9]` | `[2020.09, 2021.03]` |
117 | | `0.3.x` | `[3.8, 3.9]` | `[2020.09, 2021.03]` |
118 |
119 | ## CI Status
120 |
121 | The CI runs tests and performs code quality checks for the following combinations:
122 |
123 | - The three major platforms: Windows, OSX and Linux.
124 | - The two latest Python versions.
125 | - The two latest RDKit versions.
126 |
127 | | | `main` |
128 | | --------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
129 | | Lib build & Testing | [](https://github.com/datamol-io/datamol/actions/workflows/test.yml) |
130 | | Code Sanity (linting and type analysis) | [](https://github.com/datamol-io/datamol/actions/workflows/code-check.yml) |
131 | | Documentation Build | [](https://github.com/datamol-io/datamol/actions/workflows/doc.yml) |
132 |
133 | ## License
134 |
135 | Under the Apache-2.0 license. See [LICENSE](LICENSE).
136 |
--------------------------------------------------------------------------------
/binder/environment.yml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 |
4 | dependencies:
5 | - python >=3.8
6 | - pip
7 | - tqdm
8 | - loguru
9 | - joblib
10 | - fsspec >=2021.9
11 | - s3fs >=2021.9
12 | - gcsfs >=2021.9
13 | - platformdirs
14 | - packaging
15 | - typing_extensions
16 | - importlib_resources
17 |
18 | # Scientific
19 | - pandas
20 | - numpy
21 | - scipy
22 | - pillow
23 | - matplotlib
24 | - scikit-learn
25 |
26 | # Chemistry
27 | - rdkit >=2021.03
28 | - selfies
29 |
30 | # Optional deps
31 | - openpyxl
32 | - networkx
33 | - nglview
34 | - xlsxwriter
35 | - pyarrow
36 |
37 | # Dev
38 | - pytest >=6.0
39 | - pytest-cov
40 | - pytest-xdist
41 | - black >=24
42 | - jupyterlab
43 | - mypy
44 | - codecov
45 | - nbconvert
46 |
47 | # Doc
48 | - mkdocs
49 | - mkdocs-material >=7.1.1
50 | - mkdocs-material-extensions
51 | - mkdocstrings
52 | - mkdocstrings-python
53 | - mkdocs-jupyter
54 | - markdown-include
55 | - mdx_truly_sane_lists
56 | - mike >=1.0.0
57 | - seaborn
58 |
--------------------------------------------------------------------------------
/binder/postBuild:
--------------------------------------------------------------------------------
1 | pip install -e .
2 |
--------------------------------------------------------------------------------
/codecov.yml:
--------------------------------------------------------------------------------
1 | coverage:
2 | range: "50...80"
3 | status:
4 | project:
5 | default:
6 | threshold: 1%
7 | patch: false
8 |
--------------------------------------------------------------------------------
/datamol/_sanifix4.py:
--------------------------------------------------------------------------------
1 | """
2 | sanifix4.py
3 | Original code from rdkit [James Davidson]
4 | """
5 |
6 | from rdkit import Chem, RDLogger
7 |
8 |
9 | logger = RDLogger.logger()
10 |
11 |
12 | def _FragIndicesToMol(oMol, indices):
13 | em = Chem.EditableMol(Chem.Mol())
14 |
15 | newIndices = {}
16 | for i, idx in enumerate(indices):
17 | em.AddAtom(oMol.GetAtomWithIdx(idx))
18 | newIndices[idx] = i
19 |
20 | for i, idx in enumerate(indices):
21 | at = oMol.GetAtomWithIdx(idx)
22 | for bond in at.GetBonds():
23 | if bond.GetBeginAtomIdx() == idx:
24 | oidx = bond.GetEndAtomIdx()
25 | else:
26 | oidx = bond.GetBeginAtomIdx()
27 | # make sure every bond only gets added once:
28 | if oidx < idx:
29 | continue
30 | em.AddBond(newIndices[idx], newIndices[oidx], bond.GetBondType())
31 | res = em.GetMol()
32 | res.ClearComputedProps()
33 | Chem.GetSymmSSSR(res)
34 | res.UpdatePropertyCache(False)
35 | res._idxMap = newIndices
36 | return res
37 |
38 |
39 | def _recursivelyModifyNs(mol, matches, indices=None):
40 | if indices is None:
41 | indices = []
42 | res = None
43 | while len(matches) and res is None:
44 | tIndices = indices[:]
45 | nextIdx = matches.pop(0)
46 | tIndices.append(nextIdx)
47 | nm = Chem.Mol(mol.ToBinary())
48 | nm.GetAtomWithIdx(nextIdx).SetNoImplicit(True)
49 | nm.GetAtomWithIdx(nextIdx).SetNumExplicitHs(1)
50 | cp = Chem.Mol(nm.ToBinary())
51 | try:
52 | Chem.SanitizeMol(cp)
53 | except ValueError:
54 | res, indices = _recursivelyModifyNs(nm, matches, indices=tIndices)
55 | else:
56 | indices = tIndices
57 | res = cp
58 | return res, indices
59 |
60 |
61 | def AdjustAromaticNs(m, nitrogenPattern="[n&D2&H0;r5,r6]"):
62 | """
63 | The default nitrogen pattern matches Ns in 5- and 6-membered rings so that it is
64 | able to fix: O=c1ccncc1
65 | """
66 | Chem.GetSymmSSSR(m)
67 | m.UpdatePropertyCache(False)
68 |
69 | # break non-ring bonds linking rings:
70 | em = Chem.EditableMol(m)
71 | linkers = m.GetSubstructMatches(Chem.MolFromSmarts("[r]!@[r]"))
72 | plsFix = set()
73 | for a, b in linkers:
74 | em.RemoveBond(a, b)
75 | plsFix.add(a)
76 | plsFix.add(b)
77 | nm = em.GetMol()
78 | for at in plsFix:
79 | at = nm.GetAtomWithIdx(at)
80 | if at.GetIsAromatic() and at.GetAtomicNum() == 7:
81 | at.SetNumExplicitHs(1)
82 | at.SetNoImplicit(True)
83 |
84 | # build molecules from the fragments:
85 | fragLists = Chem.GetMolFrags(nm)
86 | frags = [_FragIndicesToMol(nm, x) for x in fragLists]
87 |
88 | # loop through the fragments in turn and try to aromatize them:
89 | ok = True
90 | for i, frag in enumerate(frags):
91 | cp = Chem.Mol(frag)
92 | try:
93 | Chem.SanitizeMol(cp)
94 | except ValueError:
95 | matches = [x[0] for x in frag.GetSubstructMatches(Chem.MolFromSmarts(nitrogenPattern))]
96 | lres, indices = _recursivelyModifyNs(frag, matches)
97 | if not lres:
98 | # print 'frag %d failed (%s)'%(i,str(fragLists[i]))
99 | ok = False
100 | break
101 | else:
102 | revMap = {}
103 | for k, v in frag._idxMap.items():
104 | revMap[v] = k
105 | for idx in indices:
106 | oatom = m.GetAtomWithIdx(revMap[idx])
107 | oatom.SetNoImplicit(True)
108 | oatom.SetNumExplicitHs(1)
109 | if not ok:
110 | return None
111 | return m
112 |
113 |
114 | def sanifix(m):
115 | if m is None:
116 | return None
117 | try:
118 | m.UpdatePropertyCache(False)
119 | cp = Chem.Mol(m.ToBinary())
120 | Chem.SanitizeMol(cp)
121 | return cp
122 | except ValueError as e:
123 | logger.debug(f"{Chem.MolToSmiles(m)} failed due to {e}")
124 | try:
125 | m = AdjustAromaticNs(m)
126 | if m is not None:
127 | Chem.SanitizeMol(m)
128 | return m
129 | except Exception as ee:
130 | logger.debug(f"{Chem.MolToSmiles(m)} failed due to {ee}")
131 | return None
132 | except RuntimeError as e:
133 | logger.debug(f"{Chem.MolToSmiles(m)} failed due to {e}")
134 | logger.info(f"The faulty smiles is: {Chem.MolToSmiles(m)}")
135 | raise e
136 |
--------------------------------------------------------------------------------
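A minimal sketch of how `sanifix` can be exercised directly on the `O=c1ccncc1` example mentioned in the `AdjustAromaticNs` docstring. Note that `_sanifix4` is a private module; this is for illustration only.

```python
from rdkit import Chem
from datamol._sanifix4 import sanifix

# Parse without sanitization so the aromatic-nitrogen problem is still present.
mol = Chem.MolFromSmiles("O=c1ccncc1", sanitize=False)

# sanifix() tries a plain sanitization first and falls back to AdjustAromaticNs().
fixed = sanifix(mol)
if fixed is not None:
    print(Chem.MolToSmiles(fixed))
```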
/datamol/_version.py:
--------------------------------------------------------------------------------
1 | try:
2 | from importlib.metadata import version
3 | from importlib.metadata import PackageNotFoundError
4 | except ModuleNotFoundError:
5 | # Fall back to the `importlib_metadata` backport for Python < 3.8.
6 | from importlib_metadata import version
7 | from importlib_metadata import PackageNotFoundError
8 |
9 |
10 | import rdkit
11 | import packaging.version
12 |
13 |
14 | try:
15 | __version__ = version("datamol")
16 | except PackageNotFoundError:
17 | # package is not installed
18 | __version__ = "dev"
19 |
20 | CURRENT_RDKIT_VERSION = rdkit.__version__
21 | CURRENT_RDKIT_VERSION_OBJ = packaging.version.parse(CURRENT_RDKIT_VERSION)
22 |
23 |
24 | def is_lower_than_current_rdkit_version(rdkit_version: str):
25 | return CURRENT_RDKIT_VERSION_OBJ < packaging.version.parse(rdkit_version)
26 |
27 |
28 | def is_greater_than_current_rdkit_version(rdkit_version: str):
29 | return CURRENT_RDKIT_VERSION_OBJ > packaging.version.parse(rdkit_version)
30 |
31 |
32 | def is_lower_eq_than_current_rdkit_version(rdkit_version: str):
33 | return CURRENT_RDKIT_VERSION_OBJ <= packaging.version.parse(rdkit_version)
34 |
35 |
36 | def is_greater_eq_than_current_rdkit_version(rdkit_version: str):
37 | return CURRENT_RDKIT_VERSION_OBJ >= packaging.version.parse(rdkit_version)
38 |
--------------------------------------------------------------------------------
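A small usage sketch of the helpers above; they compare the installed RDKit version against a given version string. The thresholds below are illustrative and mirror how `is_lower_than_current_rdkit_version` is used in `datamol/descriptors/descriptors.py`.

```python
import rdkit

from datamol._version import (
    is_lower_than_current_rdkit_version,
    is_greater_eq_than_current_rdkit_version,
)

# True when the installed RDKit is older than 2021.09 (e.g. to select a legacy API).
needs_legacy_api = is_lower_than_current_rdkit_version("2021.09")

# True when the installed RDKit is at least 2023.09.
has_recent_rdkit = is_greater_eq_than_current_rdkit_version("2023.09")

print(rdkit.__version__, needs_legacy_api, has_recent_rdkit)
```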
/datamol/conformers/__init__.py:
--------------------------------------------------------------------------------
1 | from ._conformers import generate
2 | from ._conformers import cluster
3 | from ._conformers import rmsd
4 | from ._conformers import return_centroids
5 | from ._conformers import translate
6 | from ._conformers import align_conformers
7 |
8 | from ._features import sasa
9 | from ._features import get_coords
10 | from ._features import center_of_mass
11 | from ._features import keep_conformers
12 |
--------------------------------------------------------------------------------
/datamol/conformers/_features.py:
--------------------------------------------------------------------------------
1 | from typing import Union
2 | from typing import List
3 | from typing import Optional
4 |
5 | import numpy as np
6 |
7 | from ..types import Mol
8 | from ..utils.jobs import JobRunner
9 | from ..utils import decorators
10 | from ..mol import PERIODIC_TABLE
11 | from ..mol import copy_mol
12 |
13 |
14 | @decorators.disable_on_os("win")
15 | def sasa(
16 | mol: Mol,
17 | conf_id: Optional[Union[int, List[int]]] = None,
18 | n_jobs: int = 1,
19 | ) -> np.ndarray:
20 | """Compute Solvent Accessible Surface Area of all the conformers
21 | using FreeSASA (https://freesasa.github.io/). Values are returned
22 | as an array and also stored within each conformer as a property
23 | called `rdkit_free_sasa`.
24 |
25 | Example:
26 |
27 | ```python
28 | smiles = "O=C(C)Oc1ccccc1C(=O)O"
29 | mol = dm.to_mol(smiles)
30 | mol = dm.conformers.generate(mol)
31 |
32 | # Compute SASA for all the conformers without parallelization
33 | sasa_values = dm.conformers.sasa(mol, conf_id=None, n_jobs=1)
34 |
35 | # If minimization has been enabled (defaults to True)
36 | # you can access the computed energy.
37 | conf = mol.GetConformer(0)
38 | props = conf.GetPropsAsDict()
39 | print(props)
40 | # {'rdkit_uff_energy': 1.7649408317784008}
41 | ```
42 |
43 | Args:
44 | mol: a molecule
45 | conf_id: Id of the conformers to compute. If None, compute all.
46 | n_jobs: Number of jobs for parallelization. Set to 1 to disable
47 | and -1 to use all cores.
48 |
49 | Returns:
50 | sasa_values: an array of SASA values, one per computed conformer.
51 | """
52 | from rdkit.Chem import rdFreeSASA
53 |
54 | if mol.GetNumConformers() == 0:
55 | raise ValueError(
56 | "The molecule has 0 conformers. You can generate conformers with `dm.conformers.generate(mol)`."
57 | )
58 |
59 | # Get Van der Waals radii (angstrom)
60 | radii = [PERIODIC_TABLE.GetRvdw(atom.GetAtomicNum()) for atom in mol.GetAtoms()]
61 |
62 | # Which conformers to compute
63 | conf_ids = []
64 | if conf_id is None:
65 | # If None compute for all the conformers
66 | conf_ids = list(range(mol.GetNumConformers())) # type: ignore
67 | elif isinstance(conf_id, int):
68 | conf_ids = [conf_id]
69 | else:
70 | conf_ids = conf_id
71 |
72 | # Compute solvent accessible surface area
73 | def _get_sasa(i):
74 | conf = mol.GetConformer(i)
75 | sasa = rdFreeSASA.CalcSASA(mol, radii, confIdx=conf.GetId())
76 | conf.SetDoubleProp("rdkit_free_sasa", sasa)
77 | return sasa
78 |
79 | runner = JobRunner(n_jobs=n_jobs)
80 | sasa_values = runner(_get_sasa, conf_ids)
81 | return np.array(sasa_values)
82 |
83 |
84 | def get_coords(mol: Mol, conf_id: int = -1):
85 | """Get the coordinate of a conformer of a molecule.
86 |
87 | Args:
88 | mol: a molecule.
89 | conf_id: a conformer id.
90 | """
91 |
92 | if mol.GetNumConformers() == 0:
93 | raise ValueError("Molecule does not have any conformers.")
94 |
95 | conf = mol.GetConformer(id=conf_id)
96 | return conf.GetPositions()
97 |
98 |
99 | def center_of_mass(
100 | mol: Mol,
101 | use_atoms: bool = True,
102 | digits: Optional[int] = None,
103 | conf_id: int = -1,
104 | ) -> np.ndarray:
105 | """Compute the center of mass of a conformer of a molecule.
106 |
107 | Args:
108 | mol: a molecule
109 | use_atoms: Whether to compute the true center of mass or the geometrical center.
110 | digits: Number of digits to round to.
111 | conf_id: the conformer id.
112 |
113 | Returns:
114 | cm: Center of mass or geometrical center
115 | """
116 | coords = get_coords(mol, conf_id=conf_id)
117 | atom_weight = np.ones((coords.shape[0]))
118 |
119 | if use_atoms:
120 | atom_weight = np.array([atom.GetMass() for atom in mol.GetAtoms()])
121 |
122 | atom_weight = atom_weight[:, None]
123 | atom_weight /= atom_weight.sum()
124 | center = (coords * atom_weight).sum(axis=0)
125 |
126 | if digits is not None:
127 | center = center.round(digits)
128 |
129 | return center
130 |
131 |
132 | def keep_conformers(
133 | mol: Mol,
134 | indices_to_keep: Union[int, List[int]] = -1,
135 | assign_id: bool = True,
136 | copy: bool = True,
137 | ):
138 | """Keep only the specified conformer(s) in `indices_to_keep`.
139 |
140 | Args:
141 | mol: A molecule.
142 | indices_to_keep: An index or a list of indices of conformers to keep.
143 | assign_id: Whether to assign the kept conformers an id or keep the original one.
144 | copy: Whether to copy the molecule or not.
145 | """
146 |
147 | if copy:
148 | mol = copy_mol(mol)
149 |
150 | if not isinstance(indices_to_keep, list):
151 | indices_to_keep = [indices_to_keep]
152 |
153 | # Extract conformers to keep
154 | confs_to_keep = [mol.GetConformer(conf_id) for conf_id in indices_to_keep]
155 |
156 | # Copy current mol and remove all conformers
157 | mol2 = copy_mol(mol)
158 | mol2.RemoveAllConformers()
159 |
160 | # Add conformers
161 | _ = [mol2.AddConformer(conf, assignId=assign_id) for conf in confs_to_keep]
162 |
163 | # Cleanup
164 | mol = mol2
165 |
166 | return mol
167 |
--------------------------------------------------------------------------------
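A short usage sketch for the feature helpers above, following the same aspirin SMILES used in the `sasa()` docstring (assuming conformer generation succeeds with the default settings):

```python
import datamol as dm

mol = dm.to_mol("O=C(C)Oc1ccccc1C(=O)O")
mol = dm.conformers.generate(mol)

# Coordinates and mass-weighted center of the first conformer.
coords = dm.conformers.get_coords(mol, conf_id=0)
center = dm.conformers.center_of_mass(mol, use_atoms=True, digits=3, conf_id=0)

# Keep only the first conformer (its id is reassigned because assign_id=True).
mol = dm.conformers.keep_conformers(mol, indices_to_keep=0)
print(coords.shape, center, mol.GetNumConformers())
```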
/datamol/data/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | The data module aims to provide a fast and convenient access to various molecular datasets.
3 |
4 | ---
5 | """
6 |
7 | from typing import Optional
8 | from typing import cast
9 | from typing import Union
10 | from typing import List
11 | from typing import overload
12 | from typing import Literal
13 |
14 | import sys
15 | import io
16 | import functools
17 |
18 | try:
19 | import importlib.resources as importlib_resources
20 | except ImportError:
21 | import importlib_resources
22 |
23 | import pandas as pd
24 |
25 | from ..types import Mol
26 | from ..io import read_sdf
27 | from ..convert import from_df
28 | from ..convert import render_mol_df
29 |
30 |
31 | @functools.lru_cache()
32 | def datamol_data_file_path(filename: str, dm_module: str = "datamol.data") -> str:
33 | if sys.version_info < (3, 9, 0):
34 | with importlib_resources.path(dm_module, filename) as p:
35 | data_path = p
36 | else:
37 | data_path = importlib_resources.files(dm_module).joinpath(filename)
38 |
39 | return str(data_path)
40 |
41 |
42 | def open_datamol_data_file(
43 | filename: str,
44 | open_binary: bool = False,
45 | dm_module: str = "datamol.data",
46 | ):
47 | if sys.version_info < (3, 9, 0):
48 | if open_binary:
49 | file_context_manager = importlib_resources.open_binary(dm_module, filename)
50 | else:
51 | file_context_manager = importlib_resources.open_text(dm_module, filename)
52 | else:
53 | if open_binary:
54 | mode = "rb"
55 | else:
56 | mode = "r"
57 |
58 | file_context_manager = (
59 | importlib_resources.files(dm_module).joinpath(filename).open(mode=mode)
60 | )
61 |
62 | # NOTE(hadim): we assume the file always exists
63 | file_context_manager = cast(io.TextIOWrapper, file_context_manager)
64 |
65 | return file_context_manager
66 |
67 |
68 | @overload
69 | def freesolv(as_df: Literal[True] = True) -> pd.DataFrame: ...
70 |
71 |
72 | @overload
73 | def freesolv(as_df: Literal[False] = False) -> List[Mol]: ...
74 |
75 |
76 | @overload
77 | def freesolv(as_df: bool = True) -> Union[List[Mol], pd.DataFrame]: ...
78 |
79 |
80 | def freesolv(as_df: bool = True) -> Union[List[Mol], pd.DataFrame]:
81 | """Return the FreeSolv dataset as a dataframe.
82 |
83 | The dataset contains 642 molecules and the following columns:
84 | `['iupac', 'smiles', 'expt', 'calc']`.
85 |
86 | Warning:
87 | This dataset is only meant to be used as a toy dataset for pedagogic and
88 | testing purposes. **It is not** a dataset for benchmarking, analysis or
89 | model training.
90 | """
91 |
92 | with open_datamol_data_file("freesolv.csv") as f:
93 | data = pd.read_csv(f)
94 |
95 | if not as_df:
96 | data = from_df(data)
97 |
98 | return data
99 |
100 |
101 | @overload
102 | def cdk2(as_df: Literal[True] = True, mol_column: Optional[str] = "mol") -> pd.DataFrame: ...
103 |
104 |
105 | @overload
106 | def cdk2(as_df: Literal[False] = False, mol_column: Optional[str] = "mol") -> List[Mol]: ...
107 |
108 |
109 | @overload
110 | def cdk2(
111 | as_df: bool = True, mol_column: Optional[str] = "mol"
112 | ) -> Union[List[Mol], pd.DataFrame]: ...
113 |
114 |
115 | def cdk2(as_df: bool = True, mol_column: Optional[str] = "mol"):
116 | """Return the RDKit CDK2 dataset from `RDConfig.RDDocsDir, 'Book/data/cdk2.sdf'`.
117 |
118 | Args:
119 | as_df: Whether to return a list of mols or a pandas DataFrame.
120 | mol_column: Name of the mol column. Only relevant if `as_df` is True.
121 | """
122 |
123 | with open_datamol_data_file("cdk2.sdf", open_binary=True) as f:
124 | data = read_sdf(f, as_df=as_df, mol_column=mol_column)
125 | return data
126 |
127 |
128 | @overload
129 | def solubility(as_df: Literal[True] = True, mol_column: Optional[str] = "mol") -> pd.DataFrame: ...
130 |
131 |
132 | @overload
133 | def solubility(as_df: Literal[False] = False, mol_column: Optional[str] = "mol") -> List[Mol]: ...
134 |
135 |
136 | @overload
137 | def solubility(
138 | as_df: bool = True, mol_column: Optional[str] = "mol"
139 | ) -> Union[List[Mol], pd.DataFrame]: ...
140 |
141 |
142 | def solubility(as_df: bool = True, mol_column: Optional[str] = "mol"):
143 | """Return the RDKit solubility dataset from `RDConfig.RDDocsDir, 'Book/data/solubility.{train|test}.sdf'`.
144 |
145 | The dataframe or the list of molecules will contain a `split` column, with values `train` or `test`.
146 |
147 | Args:
148 | as_df: Whether to return a list of mols or a pandas DataFrame.
149 | mol_column: Name of the mol column. Only relevant if `as_df` is True.
150 | """
151 |
152 | with open_datamol_data_file("solubility.train.sdf", open_binary=True) as f:
153 | train = read_sdf(f, as_df=True, mol_column="mol", smiles_column=None)
154 |
155 | with open_datamol_data_file("solubility.test.sdf", open_binary=True) as f:
156 | test = read_sdf(f, as_df=True, mol_column="mol", smiles_column=None)
157 |
158 | train = cast(pd.DataFrame, train)
159 | test = cast(pd.DataFrame, test)
160 |
161 | train["split"] = "train"
162 | test["split"] = "test"
163 |
164 | # NOTE(hadim): LMAO RDkit consistency xD
165 | test = test.rename(columns={"SMILES": "smiles"})
166 |
167 | data = pd.concat([train, test], ignore_index=True)
168 |
169 | if as_df:
170 | if mol_column is None:
171 | data = data.drop(columns=["mol"])
172 |
173 | render_mol_df(data)
174 | return data
175 |
176 | return from_df(data, mol_column=mol_column)
177 |
178 |
179 | @overload
180 | def chembl_drugs(as_df: Literal[True] = True) -> pd.DataFrame: ...
181 |
182 |
183 | @overload
184 | def chembl_drugs(as_df: Literal[False] = False) -> List[Mol]: ...
185 |
186 |
187 | def chembl_drugs(as_df: bool = True) -> Union[List[Mol], pd.DataFrame]:
188 | """A list of ~2.5k molecules from ChEMBL (all approved drugs) in SMILES format.
189 | Includes metadata indicating year of first approval, molecule chembl id, molecule type and pref_name.
190 |
191 | The list was generated with ['Get_ChEMBL_Approved_Drugs.ipynb'](https://github.com/datamol-io/datamol/blob/main/notebooks/Get_ChEMBL_Approved_Drugs.ipynb) on 2023-10-18.
192 | The notebook uses the chembl_webresource_client API to collect ChEMBL IDs and metadata, then keeps only small molecules with a valid SMILES and a first approval date.
193 | """
194 | with open_datamol_data_file("chembl_approved_drugs.parquet", open_binary=True) as f:
195 | data = pd.read_parquet(f)
196 |
197 | if not as_df:
198 | data = from_df(data)
199 |
200 | return data
201 |
202 |
203 | @overload
204 | def chembl_samples(as_df: Literal[True] = True) -> pd.DataFrame: ...
205 |
206 |
207 | @overload
208 | def chembl_samples(as_df: Literal[False] = False) -> List[Mol]: ...
209 |
210 |
211 | def chembl_samples(as_df: bool = True) -> Union[List[Mol], pd.DataFrame]:
212 | """A list of ~2k molecules from ChEMBL.
213 |
214 | Originally proposed by Patrick Walters at .
215 | """
216 |
217 | with open_datamol_data_file("chembl_samples.csv") as f:
218 | data = pd.read_csv(f)
219 |
220 | if not as_df:
221 | data = from_df(data)
222 |
223 | return data
224 |
--------------------------------------------------------------------------------
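A quick sketch of the loaders above. The FreeSolv call matches the Quick API Tour in the README; the `as_df=False` variants return lists of RDKit molecules.

```python
import datamol as dm

# FreeSolv as a dataframe with columns ['iupac', 'smiles', 'expt', 'calc'].
df = dm.data.freesolv()

# The same dataset as a list of rdkit.Chem.Mol objects.
mols = dm.data.freesolv(as_df=False)

# The RDKit CDK2 SDF, with the parsed molecules stored in a "mol" column.
cdk2_df = dm.data.cdk2(as_df=True, mol_column="mol")
print(len(df), len(mols), len(cdk2_df))
```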
/datamol/data/chembl_approved_drugs.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datamol-io/datamol/0312388b956e2b4eeb72d791167cfdb873c7beab/datamol/data/chembl_approved_drugs.parquet
--------------------------------------------------------------------------------
/datamol/descriptors/__init__.py:
--------------------------------------------------------------------------------
1 | from .descriptors import mw
2 | from .descriptors import fsp3
3 | from .descriptors import n_hba
4 | from .descriptors import n_hbd
5 | from .descriptors import n_lipinski_hba
6 | from .descriptors import n_lipinski_hbd
7 | from .descriptors import n_rings
8 | from .descriptors import n_hetero_atoms
9 | from .descriptors import n_heavy_atoms
10 | from .descriptors import n_rotatable_bonds
11 | from .descriptors import n_radical_electrons
12 | from .descriptors import tpsa
13 | from .descriptors import qed
14 | from .descriptors import clogp
15 | from .descriptors import sas
16 | from .descriptors import n_NHOH
17 | from .descriptors import n_NO
18 | from .descriptors import formal_charge
19 | from .descriptors import n_aliphatic_carbocycles
20 | from .descriptors import n_aliphatic_heterocyles
21 | from .descriptors import n_aliphatic_rings
22 | from .descriptors import n_aromatic_carbocycles
23 | from .descriptors import n_aromatic_heterocyles
24 | from .descriptors import n_aromatic_rings
25 | from .descriptors import n_saturated_carbocycles
26 | from .descriptors import n_saturated_heterocyles
27 | from .descriptors import n_saturated_rings
28 | from .descriptors import n_aromatic_atoms
29 | from .descriptors import n_aromatic_atoms_proportion
30 | from .descriptors import refractivity
31 | from .descriptors import n_rigid_bonds
32 | from .descriptors import n_stereo_centers
33 | from .descriptors import n_charged_atoms
34 | from .descriptors import n_stereo_centers_unspecified
35 | from .descriptors import n_spiro_atoms
36 |
37 | from .compute import any_rdkit_descriptor
38 | from .compute import compute_many_descriptors
39 | from .compute import batch_compute_many_descriptors
40 |
--------------------------------------------------------------------------------
/datamol/descriptors/compute.py:
--------------------------------------------------------------------------------
1 | from typing import Callable
2 | from typing import Dict
3 | from typing import List
4 | from typing import Union
5 | from typing import Optional
6 |
7 | import functools
8 |
9 | import pandas as pd
10 |
11 | from rdkit.Chem import Descriptors
12 | from rdkit.Chem import rdMolDescriptors
13 |
14 | from .. import Mol
15 | from ..utils.jobs import parallelized
16 |
17 | from .descriptors import mw
18 | from .descriptors import fsp3
19 | from .descriptors import n_lipinski_hba
20 | from .descriptors import n_lipinski_hbd
21 | from .descriptors import n_rings
22 | from .descriptors import n_hetero_atoms
23 | from .descriptors import n_heavy_atoms
24 | from .descriptors import n_rotatable_bonds
25 | from .descriptors import n_radical_electrons
26 | from .descriptors import tpsa
27 | from .descriptors import qed
28 | from .descriptors import clogp
29 | from .descriptors import sas
30 | from .descriptors import n_aliphatic_carbocycles
31 | from .descriptors import n_aliphatic_heterocyles
32 | from .descriptors import n_aliphatic_rings
33 | from .descriptors import n_aromatic_carbocycles
34 | from .descriptors import n_aromatic_heterocyles
35 | from .descriptors import n_aromatic_rings
36 | from .descriptors import n_saturated_carbocycles
37 | from .descriptors import n_saturated_heterocyles
38 | from .descriptors import n_saturated_rings
39 |
40 |
41 | def any_rdkit_descriptor(name: str) -> Callable:
42 | """Return a descriptor function by name either from
43 | `rdkit.Chem import Descriptors` or `rdkit.Chem.rdMolDescriptors`.
44 |
45 | Args:
46 | name: Descriptor name.
47 | """
48 | fn = getattr(Descriptors, name, None)
49 |
50 | if fn is None:
51 | fn = getattr(rdMolDescriptors, name, None)
52 |
53 | if fn is None:
54 | raise ValueError(f"Descriptor {name} not found.")
55 |
56 | return fn
57 |
58 |
59 | _DEFAULT_PROPERTIES_FN = {
60 | "mw": mw,
61 | "fsp3": fsp3,
62 | "n_lipinski_hba": n_lipinski_hba,
63 | "n_lipinski_hbd": n_lipinski_hbd,
64 | "n_rings": n_rings,
65 | "n_hetero_atoms": n_hetero_atoms,
66 | "n_heavy_atoms": n_heavy_atoms,
67 | "n_rotatable_bonds": n_rotatable_bonds,
68 | "n_radical_electrons": n_radical_electrons,
69 | "tpsa": tpsa,
70 | "qed": qed,
71 | "clogp": clogp,
72 | "sas": sas,
73 | "n_aliphatic_carbocycles": n_aliphatic_carbocycles,
74 | "n_aliphatic_heterocyles": n_aliphatic_heterocyles,
75 | "n_aliphatic_rings": n_aliphatic_rings,
76 | "n_aromatic_carbocycles": n_aromatic_carbocycles,
77 | "n_aromatic_heterocyles": n_aromatic_heterocyles,
78 | "n_aromatic_rings": n_aromatic_rings,
79 | "n_saturated_carbocycles": n_saturated_carbocycles,
80 | "n_saturated_heterocyles": n_saturated_heterocyles,
81 | "n_saturated_rings": n_saturated_rings,
82 | }
83 |
84 |
85 | def compute_many_descriptors(
86 | mol: Mol,
87 | properties_fn: Optional[Dict[str, Union[Callable, str]]] = None,
88 | add_properties: bool = True,
89 | ) -> dict:
90 | """Compute a list of opinionated molecular properties.
91 |
92 | Args:
93 | mol: A molecule.
94 | properties_fn: A dict mapping property names to functions that compute them. If None,
95 | a default set of properties is used. If a value is a string,
96 | `dm.descriptors.any_rdkit_descriptor()` is used to retrieve the descriptor
97 | function.
98 | add_properties: Whether to add the computed properties to the default list.
99 |
100 | Returns:
101 | Computed properties as a dict.
102 | """
103 |
104 | if properties_fn is None:
105 | properties_fn = _DEFAULT_PROPERTIES_FN
106 | elif add_properties:
107 | [properties_fn.setdefault(k, v) for k, v in _DEFAULT_PROPERTIES_FN.items()]
108 |
109 | props = {}
110 | for k, v in properties_fn.items():
111 | if isinstance(v, str):
112 | v = any_rdkit_descriptor(v)
113 |
114 | props[k] = v(mol)
115 |
116 | return props
117 |
118 |
119 | def batch_compute_many_descriptors(
120 | mols: List[Mol],
121 | properties_fn: Optional[Dict[str, Union[Callable, str]]] = None,
122 | add_properties: bool = True,
123 | n_jobs: int = 1,
124 | batch_size: Optional[int] = None,
125 | progress: bool = False,
126 | progress_leave: bool = True,
127 | ) -> pd.DataFrame:
128 | """Compute a list of opinionated molecular properties on a list of molecules.
129 |
130 | Args:
131 | mols: A list of molecules.
132 | properties_fn: A dict mapping property names to functions that compute them. If None,
133 | a default set of properties is used. If a value is a string,
134 | `dm.descriptors.any_rdkit_descriptor()` is used to retrieve the descriptor
135 | function.
136 | add_properties: Whether to add the computed properties to the default list.
137 |
138 | Returns:
139 | A dataframe of computed properties with one row per input molecule.
140 | """
141 |
142 | compute_fn = functools.partial(
143 | compute_many_descriptors,
144 | properties_fn=properties_fn,
145 | add_properties=add_properties,
146 | )
147 |
148 | props = parallelized(
149 | compute_fn,
150 | mols,
151 | batch_size=batch_size,
152 | progress=progress,
153 | n_jobs=n_jobs,
154 | tqdm_kwargs=dict(leave=progress_leave),
155 | )
156 | return pd.DataFrame(props)
157 |
--------------------------------------------------------------------------------
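A usage sketch for the two entry points above. The extra descriptor name (`CalcNumAmideBonds`, resolved through `any_rdkit_descriptor`) and the dict key `n_amide_bonds` are only illustrative.

```python
import datamol as dm

mol = dm.to_mol("O=C(C)Oc1ccccc1C(=O)O")

# Default, opinionated descriptor set as a dict.
props = dm.descriptors.compute_many_descriptors(mol)

# Add an arbitrary RDKit descriptor by name on top of the defaults.
props = dm.descriptors.compute_many_descriptors(
    mol,
    properties_fn={"n_amide_bonds": "CalcNumAmideBonds"},
    add_properties=True,
)

# Same computation over a list of molecules, returned as a dataframe.
df = dm.descriptors.batch_compute_many_descriptors([mol] * 5, n_jobs=1, progress=False)
print(df.shape)
```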
/datamol/descriptors/descriptors.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 |
4 | from rdkit.Chem import Descriptors
5 | from rdkit.Chem import rdMolDescriptors
6 | from rdkit.Chem import RDConfig
7 | from rdkit.Chem import Lipinski
8 | from rdkit.Chem import rdmolops
9 | from rdkit.Chem import Crippen
10 |
11 |
12 | from .. import Mol
13 | from ..convert import from_smarts
14 | from ..log import no_rdkit_log
15 | from .._version import is_lower_than_current_rdkit_version
16 |
17 |
18 | @no_rdkit_log
19 | def _sasscorer(mol: Mol):
20 | sys.path.append(os.path.join(RDConfig.RDContribDir, "SA_Score"))
21 | try:
22 | import sascorer # type:ignore
23 | except ImportError:
24 | raise ImportError(
25 | "Could not import sascorer. If you installed rdkit-pypi with `pip`, please uninstall it and reinstall rdkit with `conda` or `mamba`."
26 | )
27 |
28 | return sascorer.calculateScore(mol)
29 |
30 |
31 | _AROMATIC_QUERY = from_smarts("a")
32 |
33 | mw = rdMolDescriptors.CalcExactMolWt
34 | fsp3 = rdMolDescriptors.CalcFractionCSP3
35 | tpsa = rdMolDescriptors.CalcTPSA
36 | qed = Descriptors.qed
37 | clogp = Descriptors.MolLogP # type: ignore
38 | sas = _sasscorer
39 | formal_charge = rdmolops.GetFormalCharge
40 | refractivity = Crippen.MolMR
41 |
42 | n_hba = rdMolDescriptors.CalcNumHBA
43 | n_hbd = rdMolDescriptors.CalcNumHBD
44 | n_lipinski_hba = rdMolDescriptors.CalcNumLipinskiHBA
45 | n_lipinski_hbd = rdMolDescriptors.CalcNumLipinskiHBD
46 | n_rings = rdMolDescriptors.CalcNumRings
47 | n_hetero_atoms = rdMolDescriptors.CalcNumHeteroatoms
48 |
49 |
50 | if is_lower_than_current_rdkit_version("2021.09"):
51 | n_heavy_atoms = Descriptors.HeavyAtomCount # type: ignore
52 | else:
53 | n_heavy_atoms = rdMolDescriptors.CalcNumHeavyAtoms
54 |
55 | n_rotatable_bonds = rdMolDescriptors.CalcNumRotatableBonds
56 | n_radical_electrons = Descriptors.NumRadicalElectrons
57 | n_NHOH = Lipinski.NHOHCount
58 | n_NO = Lipinski.NOCount
59 | n_spiro_atoms = rdMolDescriptors.CalcNumSpiroAtoms
60 |
61 | n_aliphatic_carbocycles = rdMolDescriptors.CalcNumAliphaticCarbocycles
62 | n_aliphatic_heterocyles = rdMolDescriptors.CalcNumAliphaticHeterocycles
63 | n_aliphatic_rings = rdMolDescriptors.CalcNumAliphaticRings
64 |
65 | n_aromatic_carbocycles = rdMolDescriptors.CalcNumAromaticCarbocycles
66 | n_aromatic_heterocyles = rdMolDescriptors.CalcNumAromaticHeterocycles
67 | n_aromatic_rings = rdMolDescriptors.CalcNumAromaticRings
68 |
69 | n_saturated_carbocycles = rdMolDescriptors.CalcNumSaturatedCarbocycles
70 | n_saturated_heterocyles = rdMolDescriptors.CalcNumSaturatedHeterocycles
71 | n_saturated_rings = rdMolDescriptors.CalcNumSaturatedRings
72 |
73 |
74 | def n_rigid_bonds(mol: Mol) -> int:
75 | """Compute the number of rigid bonds in a molecule.
76 |
77 | Rigid bonds are bonds that are not single and not in rings.
78 |
79 | Args:
80 | mol: A molecule.
81 |
82 | Returns:
83 | n_rigid_bonds: number of rigid bonds in the molecule
84 | """
85 | non_rigid_bonds_count = from_smarts("*-&!@*")
86 | n_rigid_bonds = mol.GetNumBonds() - len(mol.GetSubstructMatches(non_rigid_bonds_count))
87 | return n_rigid_bonds
88 |
89 |
90 | def n_aromatic_atoms(mol: Mol) -> int:
91 | """Calculate the number of aromatic atoms."""
92 | matches = mol.GetSubstructMatches(_AROMATIC_QUERY)
93 | return len(matches)
94 |
95 |
96 | def n_aromatic_atoms_proportion(mol: Mol) -> float:
97 | """Calculate the aromatic proportion: # aromatic atoms / # heavy atoms.
98 |
99 | Only heavy atoms are considered.
100 |
101 | Args:
102 | mol: A molecule.
103 | """
104 | return n_aromatic_atoms(mol) / mol.GetNumHeavyAtoms()
105 |
106 |
107 | def n_stereo_centers(mol: Mol) -> int:
108 | """Compute the number of stereocenters in a molecule.
109 |
110 | Args:
111 | mol: A molecule.
112 |
113 | Returns:
114 | n_stereo_centers: number of stereocenters in the molecule
115 | """
116 | n = 0
117 | try:
118 | rdmolops.FindPotentialStereo(mol, cleanIt=False)
119 | n = rdMolDescriptors.CalcNumAtomStereoCenters(mol)
120 | except Exception:
121 | pass
122 | return n
123 |
124 |
125 | def n_stereo_centers_unspecified(mol: Mol) -> int:
126 | """Compute the number of unspecified stereocenters in a molecule.
127 |
128 | Args:
129 | mol: A molecule.
130 |
131 | Returns:
132 | n_stereo_centers_unspecified: number of unspecified stereocenters in the molecule
133 | """
134 | n = 0
135 | try:
136 | rdmolops.FindPotentialStereo(mol, cleanIt=False)
137 | n = rdMolDescriptors.CalcNumUnspecifiedAtomStereoCenters(mol)
138 | except Exception:
139 | pass
140 | return n
141 |
142 |
143 | def n_charged_atoms(mol: Mol) -> int:
144 | """Compute the number of charged atoms in a molecule.
145 |
146 | Args:
147 | mol: A molecule.
148 |
149 | Returns:
150 | n_charged_atoms: number of charged atoms in the molecule
151 | """
152 | return sum([at.GetFormalCharge() != 0 for at in mol.GetAtoms()])
153 |
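
A minimal usage sketch for the descriptor callables defined above. It assumes only `dm.to_mol` from the public datamol API plus a direct import of this module; the SMILES string is an arbitrary example.

```python
import datamol as dm
from datamol.descriptors import descriptors as descr

# Arbitrary example molecule (aspirin).
mol = dm.to_mol("CC(=O)Oc1ccccc1C(=O)O")

# Every descriptor in this module is a plain callable taking an RDKit Mol.
print(descr.mw(mol))                # exact molecular weight
print(descr.tpsa(mol))              # topological polar surface area
print(descr.n_aromatic_atoms(mol))  # number of aromatic atoms
print(descr.n_rigid_bonds(mol))     # all bonds except non-ring single bonds
print(descr.n_charged_atoms(mol))   # atoms with a non-zero formal charge
```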
--------------------------------------------------------------------------------
/datamol/fragment/__init__.py:
--------------------------------------------------------------------------------
1 | from ._fragment import brics
2 | from ._fragment import frag
3 | from ._fragment import recap
4 | from ._fragment import anybreak
5 | from ._fragment import mmpa_frag
6 | from ._fragment import mmpa_cut
7 |
8 | from ._assemble import assemble_fragment_order
9 | from ._assemble import break_mol
10 | from ._assemble import build
11 |
--------------------------------------------------------------------------------
/datamol/fragment/_fragment.py:
--------------------------------------------------------------------------------
1 | from typing import Set
2 | from typing import Optional
3 | from typing import Any
4 |
5 | from rdkit import Chem
6 | from rdkit.Chem import BRICS
7 | from rdkit.Chem import Recap
8 | from rdkit.Chem import rdMMPA
9 |
10 | from rdkit.Chem.Fraggle import FraggleSim
11 |
12 | import datamol as dm
13 |
14 |
15 | def brics(
16 | mol: Chem.rdchem.Mol,
17 | singlepass: bool = True,
18 | remove_parent: bool = False,
19 | sanitize: bool = True,
20 | fix: bool = True,
21 | ):
22 | """Run BRICS on the molecules and potentially fix dummy atoms.
23 |
24 | Args:
25 | mol: a molecule.
26 | singlepass: Single pass for `BRICSDecompose`.
27 | remove_parent: Remove parent from the fragments.
28 | sanitize: Whether to sanitize the fragments.
29 | fix: Whether to fix the fragments.
30 | """
31 | frags = BRICS.BRICSDecompose(mol, returnMols=True, singlePass=singlepass)
32 | frags = list(frags)
33 |
34 | if fix:
35 | frags = [dm.fix_mol(x) for x in frags]
36 | if sanitize:
37 | frags = [dm.sanitize_mol(x) for x in frags]
38 | if remove_parent:
39 | frags.pop(0)
40 |
41 | frags = [x for x in frags if x is not None]
42 |
43 | return frags
44 |
45 |
46 | def frag(
47 | mol: Chem.rdchem.Mol,
48 | remove_parent: bool = False,
49 | sanitize: bool = True,
50 | fix: bool = True,
51 | ):
52 | """Generate all possible fragmentation of a molecule.
53 |
54 | Args:
55 | mol: a molecule.
56 | remove_parent: Remove parent from the fragments.
57 | sanitize: Whether to sanitize the fragments.
58 | fix: Whether to fix the fragments.
59 | """
60 | frags = FraggleSim.generate_fraggle_fragmentation(mol)
61 |
62 | smiles = set([])
63 | for seq in frags:
64 | smiles |= {s.strip() for s in seq.split(".")}
65 |
66 | smiles = list(sorted(smiles, reverse=True))
67 | frags = [dm.to_mol(s) for s in smiles]
68 |
69 | if fix:
70 | frags = [dm.fix_mol(x) for x in frags]
71 | if sanitize:
72 | frags = [dm.sanitize_mol(x) for x in frags]
73 |
74 | frags = [x for x in frags if x is not None]
75 |
76 | if remove_parent:
77 | return frags
78 | return [mol] + frags
79 |
80 |
81 | def recap(
82 | mol: Chem.rdchem.Mol,
83 | remove_parent: bool = False,
84 | sanitize: bool = True,
85 | fix: bool = True,
86 | ):
87 | """Fragment the molecule using the recap algorithm.
88 |
89 | Args:
90 | mol: a molecule.
91 | remove_parent: Remove parent from the fragments.
92 | sanitize: Whether to sanitize the fragments.
93 | fix: Whether to fix the fragments.
94 | """
95 | res = Recap.RecapDecompose(mol)
96 | frags = [dm.to_mol(x) for x in res.GetAllChildren().keys()]
97 |
98 | if fix:
99 | frags = [dm.fix_mol(x) for x in frags]
100 | if sanitize:
101 | frags = [dm.sanitize_mol(x) for x in frags]
102 |
103 | frags = [x for x in frags if x is not None]
104 |
105 | if remove_parent:
106 | return frags
107 | return [mol] + frags
108 |
109 |
110 | def anybreak(
111 | mol: Chem.rdchem.Mol,
112 | remove_parent: bool = False,
113 | sanitize: bool = True,
114 | fix: bool = True,
115 | ):
116 | """Fragment molecule by applying brics first, then fall back to frag.
117 |
118 | Args:
119 | mol: a molecule.
120 | remove_parent: Remove parent from the fragments.
121 | sanitize: Whether to sanitize the fragments.
122 | fix: Whether to fix the fragments.
123 | """
124 | frags = []
125 | try:
126 | frags = brics(mol, fix=fix, remove_parent=remove_parent, sanitize=sanitize)
127 | except Exception:
128 | pass
129 |
130 | if len(frags) == 0:
131 | frags = frag(mol, remove_parent=remove_parent, sanitize=sanitize, fix=fix)
132 |
133 | return frags
134 |
135 |
136 | def mmpa_frag(
137 | mol: dm.Mol,
138 | pattern: Optional[str] = None,
139 | max_cut: int = 1,
140 | max_bond_cut: int = 20,
141 | h_split: bool = False,
142 | ) -> Optional[Set[dm.Mol]]:
143 | """Fragment molecule on specific bonds suitable for a MMPA analysis.
144 |
145 | Args:
146 | mol: Molecule to fragment.
147 | pattern: Bond pattern to split on. Will use default rdkit pattern
148 | '[#6+0;!$(*=,#[!#6])]!@!=!#[*]' if not provided.
149 | max_cut: Number of cuts.
150 | max_bond_cut: Maximum number of bond to cut. Default to 20.
151 | h_split: Whether to split at hydrogen position too.
152 | This is equivalent to enabling the addition of new fragments.
153 |
154 | Returns:
155 | Set of fragments.
156 | """
157 |
158 | frags = []
159 | if pattern is None:
160 | frags = rdMMPA.FragmentMol(
161 | mol,
162 | maxCuts=max_cut,
163 | resultsAsMols=False,
164 | maxCutBonds=max_bond_cut,
165 | )
166 | elif pattern:
167 | frags = rdMMPA.FragmentMol(
168 | mol,
169 | pattern=pattern,
170 | maxCuts=max_cut,
171 | resultsAsMols=False,
172 | maxCutBonds=max_bond_cut,
173 | )
174 |
175 | if h_split:
176 | mol = dm.add_hs(mol)
177 | frags += rdMMPA.FragmentMol(
178 | mol,
179 | pattern="[#1]!@!=!#[!#1]",
180 | maxCuts=1,
181 | resultsAsMols=False,
182 | maxCutBonds=max_bond_cut,
183 | )
184 | return set(frags)
185 |
186 |
187 | def mmpa_cut(mol: dm.Mol, rdkit_pattern: bool = False) -> Optional[Set[Any]]:
188 | """Cut molecules to perform mmpa analysis later
189 |
190 | Args:
191 | mol: Molecule to fragment.
192 | rdkit_pattern: Whether to perform the fragmentation
193 | using the default RDKit pattern: "[#6+0;!$(*=,#[!#6])]!@!=!#[*]".
194 |
195 | Returns:
196 | Set of 'smiles,core,chains' strings.
197 | """
198 |
199 | if mol is None:
200 | return mol
201 |
202 | outlines = set()
203 |
204 | smiles = dm.to_smiles(mol)
205 |
206 | if rdkit_pattern:
207 | frags = mmpa_frag(mol, max_cut=3, max_bond_cut=30)
208 | else:
209 | # heavy atoms
210 | frags = mmpa_frag(mol, pattern="[!#1]!@!=!#[!#1]", max_cut=4, max_bond_cut=30)
211 | frags.update(mmpa_frag(mol, pattern="[!#1]!@!=!#[!#1]", max_cut=3, max_bond_cut=30))
212 |
213 | frags = set(frags)
214 | for core, chains in frags:
215 | output = f"{smiles},{core},{chains}\n"
216 | outlines.add(output)
217 |
218 | # hydrogen splitting
219 | mol = dm.add_hs(mol)
220 | smiles = dm.to_smiles(mol)
221 |
222 | n = mol.GetNumHeavyAtoms()
223 | if n < 60:
224 | frags = mmpa_frag(mol, pattern=None, max_cut=1, max_bond_cut=100, h_split=True)
225 | for core, chains in frags:
226 | output = f"{smiles},{core},{chains}\n"
227 | outlines.add(output)
228 |
229 | return outlines
230 |
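
A short usage sketch for the fragmentation helpers above, relying only on the exports listed in `datamol/fragment/__init__.py`; the SMILES string is an arbitrary example.

```python
import datamol as dm
from datamol.fragment import anybreak, brics

# Arbitrary example molecule (paracetamol).
mol = dm.to_mol("CC(=O)Nc1ccc(O)cc1")

# BRICS fragmentation; fragments are returned as RDKit Mol objects.
frags = brics(mol)
print([dm.to_smiles(f) for f in frags])

# `anybreak` tries BRICS first and falls back to Fraggle-based
# fragmentation when BRICS raises or returns nothing.
frags = anybreak(mol, remove_parent=True)
print(len(frags))
```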
--------------------------------------------------------------------------------
/datamol/isomers/__init__.py:
--------------------------------------------------------------------------------
1 | from ._structural import IsomerEnumerator
2 |
3 | from ._enumerate import enumerate_stereoisomers
4 | from ._enumerate import enumerate_tautomers
5 | from ._enumerate import enumerate_structisomers
6 | from ._enumerate import count_stereoisomers
7 | from ._enumerate import remove_stereochemistry
8 | from ._enumerate import canonical_tautomer
9 |
--------------------------------------------------------------------------------
/datamol/log.py:
--------------------------------------------------------------------------------
1 | from rdkit import RDLogger
2 | from rdkit import rdBase
3 | from functools import wraps
4 |
5 |
6 | class without_rdkit_log:
7 | """Context manager to disable RDKit logs. By default all logs are disabled.
8 |
9 | Example:
10 |
11 | ```python
12 | import datamol as dm
13 |
14 | with dm.without_rdkit_log():
15 | mol = dm.to_mol("CCCCO") # potential RDKit logs won't show
16 | ```
17 | """
18 |
19 | def __init__(
20 | self,
21 | mute_errors: bool = True,
22 | mute_warning: bool = True,
23 | mute_info: bool = True,
24 | mute_debug: bool = True,
25 | enable: bool = True,
26 | ):
27 | if enable is False:
28 | mute_errors = False
29 | mute_warning = False
30 | mute_info = False
31 | mute_debug = False
32 |
33 | # Get current log state
34 | self.previous_status = self._get_log_status()
35 |
36 | # Init the desired log state to apply during the context
37 | self.desired_status = {}
38 | self.desired_status["rdApp.error"] = not mute_errors
39 | self.desired_status["rdApp.warning"] = not mute_warning
40 | self.desired_status["rdApp.debug"] = not mute_debug
41 | self.desired_status["rdApp.info"] = not mute_info
42 |
43 | def _get_log_status(self):
44 | """Get the current log status of RDKit logs."""
45 | log_status = rdBase.LogStatus()
46 | log_status = {st.split(":")[0]: st.split(":")[1] for st in log_status.split("\n")}
47 | log_status = {k: True if v == "enabled" else False for k, v in log_status.items()}
48 | return log_status
49 |
50 | def _apply_log_status(self, log_status):
51 | """Apply an RDKit log status."""
52 | for k, v in log_status.items():
53 | if v is True:
54 | rdBase.EnableLog(k)
55 | else:
56 | rdBase.DisableLog(k)
57 |
58 | def __enter__(self):
59 | self._apply_log_status(self.desired_status)
60 |
61 | def __exit__(self, *args, **kwargs):
62 | self._apply_log_status(self.previous_status)
63 |
64 |
65 | def disable_rdkit_log():
66 | """Disable all rdkit logs."""
67 | for log_level in RDLogger._levels:
68 | rdBase.DisableLog(log_level)
69 |
70 |
71 | def enable_rdkit_log():
72 | """Enable all rdkit logs."""
73 | for log_level in RDLogger._levels:
74 | rdBase.EnableLog(log_level)
75 |
76 |
77 | def no_rdkit_log(
78 | func=None,
79 | *,
80 | mute_errors: bool = True,
81 | mute_warning: bool = True,
82 | mute_info: bool = True,
83 | mute_debug: bool = True,
84 | enable: bool = True,
85 | ):
86 | """Decorator to disable RDKit logs.
87 |
88 | This decorator can be used to suppress RDKit logs when executing a specific function.
89 | By default, all log levels (error, warning, info, and debug) are muted.
90 |
91 | Args:
92 | mute_errors: Whether to mute error logs (default is True).
93 | mute_warning: Whether to mute warning logs (default is True).
94 | mute_info: Whether to mute info logs (default is True).
95 | mute_debug: Whether to mute debug logs (default is True).
96 | enable: Whether to enable the log muting (default is True). If set to False, no logs will be muted.
97 |
98 | Example:
99 | ```python
100 | @no_rdkit_log()
101 | def example_function():
102 | # Your function code here
103 | pass
104 |
105 | example_function() # RDKit logs won't show during this function's execution
106 | ```
107 | """
108 |
109 | if func is None:
110 | return lambda f: no_rdkit_log(
111 | f,
112 | mute_errors=mute_errors,
113 | mute_warning=mute_warning,
114 | mute_info=mute_info,
115 | mute_debug=mute_debug,
116 | enable=enable,
117 | )
118 |
119 | @wraps(func)
120 | def wrapper(*args, **kwargs):
121 | with without_rdkit_log(mute_errors, mute_warning, mute_info, mute_debug, enable):
122 | return func(*args, **kwargs)
123 |
124 | return wrapper
125 |
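
The decorator works both bare (as used on `_sasscorer` in the descriptors module) and with keyword arguments (as in the docstring above). A small sketch, assuming only `dm.to_mol` from the public API:

```python
import datamol as dm
from datamol.log import disable_rdkit_log, enable_rdkit_log, no_rdkit_log

@no_rdkit_log  # bare form: all RDKit log levels are muted
def parse(smiles: str):
    return dm.to_mol(smiles)

@no_rdkit_log(mute_errors=False)  # parameterized form: keep error logs visible
def parse_keep_errors(smiles: str):
    return dm.to_mol(smiles)

# Module-level switches, e.g. at the top of a script or notebook.
disable_rdkit_log()
mol = parse("c1ccccc1")
enable_rdkit_log()
```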
--------------------------------------------------------------------------------
/datamol/mcs.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 | from typing import Any
3 |
4 | from rdkit.Chem import rdFMCS
5 |
6 | import datamol as dm
7 |
8 | ALLOWED_ATOM_COMPARE = ["CompareAny", "CompareAnyHeavyAtom", "CompareElements", "CompareIsotopes"]
9 | ALLOWED_BOND_COMPARE = ["CompareAny", "CompareOrder", "CompareOrderExact"]
10 | ALLOWED_RING_COMPARE = ["IgnoreRingFusion", "PermissiveRingFusion", "StrictRingFusion"]
11 |
12 |
13 | def find_mcs(
14 | mols: List[dm.Mol],
15 | maximize_bonds: bool = True,
16 | threshold: float = 0.0,
17 | timeout: int = 5,
18 | verbose: bool = False,
19 | match_valences: bool = False,
20 | ring_matches_ring_only: bool = True,
21 | complete_rings_only: bool = False,
22 | match_chiral_tag: bool = False,
23 | seed_smarts: str = "",
24 | atom_compare: str = "CompareElements",
25 | bond_compare: str = "CompareOrder",
26 | ring_compare: str = "IgnoreRingFusion",
27 | with_details: bool = False,
28 | **kwargs: Any,
29 | ):
30 | """Find the maximum common substructure from a list of molecules.
31 |
32 | Args:
33 | mols: List of molecules.
34 | maximize_bonds: Maximize the number of bonds in the substructure.
35 | threshold: The threshold for the MCS (between 0 and 1).
36 | timeout: The timeout for the MCS search, in seconds.
37 | verbose: Whether to enable verbose mode.
38 | match_valences: Whether to match valences.
39 | ring_matches_ring_only: Whether ring atoms are only allowed to match other ring atoms.
40 | complete_rings_only: Whether to match complete rings only.
41 | match_chiral_tag: Whether to match chiral tags.
42 | seed_smarts: The seed SMARTS.
43 | atom_compare: One of "CompareAny", "CompareAnyHeavyAtom", "CompareElements",
44 | "CompareIsotopes".
45 | bond_compare: One of "CompareAny", "CompareOrder", "CompareOrderExact".
46 | ring_compare: One of "IgnoreRingFusion", "PermissiveRingFusion", "StrictRingFusion".
47 | with_details: Whether to return the RDKit MCS object or just the SMARTS string.
48 | **kwargs: Additional arguments for the MCS.
49 | """
50 |
51 | if atom_compare not in ALLOWED_ATOM_COMPARE:
52 | raise ValueError(f"atom_compare must be one of {ALLOWED_ATOM_COMPARE}")
53 |
54 | if bond_compare not in ALLOWED_BOND_COMPARE:
55 | raise ValueError(f"bond_compare must be one of {ALLOWED_BOND_COMPARE}")
56 |
57 | if ring_compare not in ALLOWED_RING_COMPARE:
58 | raise ValueError(f"ring_compare must be one of {ALLOWED_RING_COMPARE}")
59 |
60 | args = {}
61 | args["maximizeBonds"] = maximize_bonds
62 | args["threshold"] = threshold
63 | args["timeout"] = timeout
64 | args["verbose"] = verbose
65 | args["matchValences"] = match_valences
66 | args["ringMatchesRingOnly"] = ring_matches_ring_only
67 | args["completeRingsOnly"] = complete_rings_only
68 | args["matchChiralTag"] = match_chiral_tag
69 | args["seedSmarts"] = seed_smarts
70 | args["atomCompare"] = rdFMCS.AtomCompare.names[atom_compare]
71 | args["bondCompare"] = rdFMCS.BondCompare.names[bond_compare]
72 | args["ringCompare"] = rdFMCS.RingCompare.names[ring_compare]
73 |
74 | args.update(kwargs)
75 |
76 | mcs = rdFMCS.FindMCS(mols, **args)
77 |
78 | if with_details:
79 | return mcs
80 |
81 | smarts = mcs.smartsString
82 | if smarts == "":
83 | smarts = None
84 | return smarts
85 |
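
A short usage sketch for `find_mcs`, importing it from the module path directly (whether it is also re-exported at the top level of `datamol` is not shown here); the SMILES strings are arbitrary examples.

```python
import datamol as dm
from datamol.mcs import find_mcs

mols = [dm.to_mol(s) for s in ["c1ccccc1CC(=O)O", "c1ccccc1CCO"]]

# SMARTS string of the maximum common substructure (None if nothing is found).
smarts = find_mcs(mols, timeout=2)
print(smarts)

# With `with_details=True`, the full RDKit MCSResult object is returned instead.
result = find_mcs(mols, with_details=True)
print(result.numAtoms, result.numBonds, result.canceled)
```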
--------------------------------------------------------------------------------
/datamol/molar.py:
--------------------------------------------------------------------------------
1 | """A set of utility functions to convert between various units and formats used in drug discovery.
2 | """
3 |
4 | from typing import Union
5 | from typing import Iterable
6 |
7 | import numpy as np
8 |
9 |
10 | _MOLAR_SCALES = {"M": 1, "mM": 1e-3, "uM": 1e-6, "nM": 1e-9, "pM": 1e-12, "fM": 1e-15}
11 |
12 |
13 | def molar_to_log(
14 | values: Union[float, Iterable[float], np.ndarray],
15 | unit: str,
16 | ) -> Union[float, Iterable[float], np.ndarray]:
17 | """Convert a molar concentration (XC50 for example) to its log scaled value (pXC50).
18 |
19 | Args:
20 | values: A molar concentration (can be a scalar, a list or an array).
21 | unit: The unit of the input concentration. Choose from:
22 | `{'M', 'fM', 'mM', 'nM', 'pM', 'uM'}`.
23 | """
24 |
25 | if unit not in _MOLAR_SCALES:
26 | raise ValueError(
27 | f"The unit '{unit}' is not supported. Choose from {set(_MOLAR_SCALES.keys())}."
28 | )
29 |
30 | return -1 * np.log10(np.array(values) * _MOLAR_SCALES[unit])
31 |
32 |
33 | def log_to_molar(
34 | values: Union[float, Iterable[float], np.ndarray],
35 | unit: str,
36 | ) -> Union[float, Iterable[float], np.ndarray]:
37 | """Convert a log-scaled molar concentration (pXC50 for example) to its unscaled value (XC50).
38 |
39 | Args:
40 | values: A log-scaled molar concentration (can be a scalar, a list or an array).
41 | unit: The unit of the input concentration. Choose from:
42 | `{'M', 'fM', 'mM', 'nM', 'pM', 'uM'}`.
43 | """
44 |
45 | if unit not in _MOLAR_SCALES:
46 | raise ValueError(
47 | f"The unit '{unit}' is not supported. Choose from {set(_MOLAR_SCALES.keys())}."
48 | )
49 |
50 | return 10 ** (-1 * np.array(values, dtype="float")) / _MOLAR_SCALES[unit]
51 |
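
A quick numerical check of the two conversions above: a 1 uM concentration corresponds to a log-scaled value of 6, since -log10(1e-6) = 6.

```python
import numpy as np

from datamol.molar import log_to_molar, molar_to_log

# 1 uM -> pXC50 of 6.
print(molar_to_log(1, "uM"))  # ~6.0

# And back: a log-scaled value of 6 expressed in uM is 1 uM.
print(log_to_molar(6.0, "uM"))  # ~1.0

# Arrays work as well.
print(molar_to_log(np.array([1, 10, 100]), "nM"))  # ~[9.0, 8.0, 7.0]
```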
--------------------------------------------------------------------------------
/datamol/predictors/__init__.py:
--------------------------------------------------------------------------------
1 | from .esol import esol
2 | from .esol import esol_from_data
3 |
--------------------------------------------------------------------------------
/datamol/predictors/esol.py:
--------------------------------------------------------------------------------
1 | from typing import Union
2 |
3 | import pandas as pd
4 |
5 |
6 | from .. import Mol
7 |
8 | from ..descriptors.descriptors import clogp
9 | from ..descriptors.descriptors import mw
10 | from ..descriptors.descriptors import n_rotatable_bonds
11 | from ..descriptors.descriptors import n_aromatic_atoms_proportion
12 |
13 |
14 | _ESOL_INTERCEPT = 0.26121066137801696
15 | _ESOL_COEF = {
16 | "mw": -0.0066138847738667125,
17 | "clogp": -0.7416739523408995,
18 | "n_rotatable_bonds": 0.003451545565957996,
19 | "n_aromatic_atoms_proportion": -0.42624840441316975,
20 | }
21 |
22 |
23 | def esol(mol: Mol):
24 | """Compute the solubility descriptor ESOL.
25 |
26 | Note that the intermediate descriptors will be computed on-the-fly. If you prefer
27 | precomputing those then you can use `esol_from_data`.
28 |
29 | Source: https://github.com/PatWalters/solubility/blob/d1536c58afe5e0e7ac4c96e2ffef496d5b98664b/esol.py
30 | """
31 |
32 | esol = (
33 | _ESOL_INTERCEPT
34 | + _ESOL_COEF["clogp"] * clogp(mol)
35 | + _ESOL_COEF["mw"] * mw(mol)
36 | + _ESOL_COEF["n_rotatable_bonds"] * n_rotatable_bonds(mol)
37 | + _ESOL_COEF["n_aromatic_atoms_proportion"] * n_aromatic_atoms_proportion(mol)
38 | )
39 |
40 | return esol
41 |
42 |
43 | def esol_from_data(data: Union[pd.Series, pd.DataFrame, dict]):
44 | """Compute the solubility descriptor ESOL.
45 |
46 | `data` must contain the following intermediate descriptors:
47 |
48 | - `clogp`: `dm.descriptors.clogp`
49 | - `mw`: `dm.descriptors.mw`
50 | - `n_rotatable_bonds`: `dm.descriptors.n_rotatable_bonds`
51 | - `n_aromatic_atoms_proportion`: `dm.descriptors.n_aromatic_atoms_proportion`
52 |
53 | Source: https://github.com/PatWalters/solubility/blob/d1536c58afe5e0e7ac4c96e2ffef496d5b98664b/esol.py
54 |
55 | Args:
56 | data: A dataframe or series containing the intermediate descriptors.
57 | """
58 |
59 | esol = (
60 | _ESOL_INTERCEPT
61 | + _ESOL_COEF["clogp"] * data["clogp"]
62 | + _ESOL_COEF["mw"] * data["mw"]
63 | + _ESOL_COEF["n_rotatable_bonds"] * data["n_rotatable_bonds"]
64 | + _ESOL_COEF["n_aromatic_atoms_proportion"] * data["n_aromatic_atoms_proportion"]
65 | )
66 |
67 | return esol
68 |
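
A small sketch showing both entry points. The descriptor callables are imported from the same module path used at the top of this file; the SMILES string is an arbitrary example.

```python
import datamol as dm
from datamol.descriptors.descriptors import (
    clogp,
    mw,
    n_aromatic_atoms_proportion,
    n_rotatable_bonds,
)
from datamol.predictors import esol, esol_from_data

mol = dm.to_mol("CCO")

# Intermediate descriptors computed on-the-fly.
print(esol(mol))

# Or from precomputed values (a dict, a pandas Series or a DataFrame row).
data = {
    "clogp": clogp(mol),
    "mw": mw(mol),
    "n_rotatable_bonds": n_rotatable_bonds(mol),
    "n_aromatic_atoms_proportion": n_aromatic_atoms_proportion(mol),
}
print(esol_from_data(data))
```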
--------------------------------------------------------------------------------
/datamol/reactions/__init__.py:
--------------------------------------------------------------------------------
1 | from ._reactions import is_reaction_ok
2 | from ._reactions import select_reaction_output
3 | from ._reactions import apply_reaction
4 | from ._reactions import can_react
5 | from ._reactions import inverse_reaction
6 | from ._reactions import find_reactant_position
7 | from ._reactions import ATTACHING_RXN
8 | from ._reactions import rxn_from_smarts
9 | from ._reactions import rxn_to_smarts
10 | from ._reactions import rxn_from_block
11 | from ._reactions import rxn_from_block_file
12 | from ._reactions import rxn_to_block
13 | from ._reactions import rxn_to_block_file
14 |
15 | from ._attachments import add_brackets_to_attachment_points
16 | from ._attachments import convert_attach_to_isotope
17 | from ._attachments import num_attachment_points
18 | from ._attachments import open_attach_points
19 |
--------------------------------------------------------------------------------
/datamol/reactions/_attachments.py:
--------------------------------------------------------------------------------
1 | from typing import cast
2 | from typing import Union
3 |
4 | import re
5 | import operator
6 |
7 | import datamol as dm
8 | from rdkit import Chem
9 |
10 | ATTACHMENT_POINT_TOKEN = "*"
11 | ATTACHMENT_POINT_NUM_REGEXP = r"\[{}:?(\d*)\]".format(re.escape(ATTACHMENT_POINT_TOKEN))
12 | ATTACHMENT_POINT_REGEXP = r"(?:{0}|\[{0}[^\]]*\])".format(re.escape(ATTACHMENT_POINT_TOKEN))
13 | # Matches a bare attachment point token not already wrapped in brackets.
14 | ATTACHMENT_POINT_NO_BRACKETS_REGEXP = r"(?<!\[){}(?!\])".format(
15 | re.escape(ATTACHMENT_POINT_TOKEN)
16 | )
17 |
18 |
19 | def add_brackets_to_attachment_points(smiles: str) -> str:
20 | """
21 | Adds brackets to the attachment points (if they don't have them).
22 | Example: "CC(C)CO*" to "CC(C)CO[*]"
23 |
24 | Args:
25 | smiles: A smiles string.
26 |
27 | Returns:
28 | A smiles string with brackets.
29 | """
30 | return re.sub(
31 | ATTACHMENT_POINT_NO_BRACKETS_REGEXP,
32 | "[{}]".format(ATTACHMENT_POINT_TOKEN),
33 | smiles,
34 | )
35 |
36 |
37 | def convert_attach_to_isotope(
38 | mol_or_smiles: Union[dm.Mol, str],
39 | same_isotope: bool = False,
40 | as_smiles: bool = False,
41 | ) -> Union[dm.Mol, str]:
42 | """Convert attachment to isotope mapping.
43 |
44 | Examples: "O=C(NCc1cnc([*])c1)[*]" to "O=C(NCc1cnc([1*])c1)[2*]"
45 |
46 | Args:
47 | mol_or_smiles: A Mol object or a smiles to be converted
48 | same_isotope: Whether to convert to the same isotope.
49 | Example: "O=C(NCc1cnc([*])c1)[*]" to "O=C(NCc1cnc([1*])c1)[1*]"
50 |
51 | Returns:
52 | Converted Mol object or SMILES.
53 | """
54 | mol = dm.to_mol(mol_or_smiles)
55 | smiles = dm.to_smiles(mol)
56 | smiles = cast(str, smiles)
57 |
58 | smiles = add_brackets_to_attachment_points(smiles)
59 |
60 | # reg matching seems to be the most effective
61 | subs_reg = r"[\g<1>{}]"
62 | if same_isotope:
63 | subs_reg = "[1{}]"
64 |
65 | smiles = re.sub(ATTACHMENT_POINT_NUM_REGEXP, subs_reg.format(ATTACHMENT_POINT_TOKEN), smiles)
66 |
67 | if as_smiles:
68 | return smiles
69 | return dm.to_mol(smiles)
70 |
71 |
72 | def num_attachment_points(mol_or_smiles: Union[dm.Mol, str]) -> int:
73 | """
74 | Get the number of attachment points in the given molecule or SMILES.
75 |
76 | Args:
77 | mol_or_smiles: A Mol object or a SMILES string.
78 |
79 | Returns:
80 | Number of attachment points of the given molecule.
81 | """
82 | if isinstance(mol_or_smiles, dm.Mol):
83 | mol = cast(dm.Mol, mol_or_smiles)
84 | n_points = len(
85 | [atom for atom in mol.GetAtoms() if atom.GetSymbol() == ATTACHMENT_POINT_TOKEN]
86 | )
87 | else:
88 | n_points = len(re.findall(ATTACHMENT_POINT_REGEXP, mol_or_smiles))
89 |
90 | return n_points
91 |
92 |
93 | def open_attach_points(
94 | mol: dm.Mol,
95 | fix_atom_map: bool = False,
96 | bond_type: dm.BondType = dm.SINGLE_BOND,
97 | ) -> dm.Mol:
98 | """Compute attachment points on a molecule.
99 | This will highlight all valid attachment point on the current molecule instead.
100 |
101 | Args:
102 | mol: A Mol object to be processed.
103 | fix_atom_map: Whether to fix the atom mapping of the molecule.
104 | bond_type: The bond type to be opened.
105 |
106 | Returns:
107 | Molecule with open attachment points
108 | """
109 |
110 | emol = Chem.rdchem.RWMol(dm.to_mol(mol))
111 | with dm.log.without_rdkit_log():
112 | atoms = [
113 | (a.GetIdx(), a)
114 | for a in emol.GetAtoms()
115 | if a.GetSymbol() != ATTACHMENT_POINT_TOKEN
116 | and a.GetImplicitValence() > 0
117 | and (not a.HasProp("_protected") or a.GetProp("_protected") != "1")
118 | ]
119 | atoms.sort(reverse=True, key=operator.itemgetter(0))
120 |
121 | for atom in atoms:
122 | new_atom = Chem.rdchem.Atom(ATTACHMENT_POINT_TOKEN)
123 | new_atom.SetAtomMapNum(1 if fix_atom_map else atom[0])
124 | new_index = emol.AddAtom(new_atom)
125 | emol.UpdatePropertyCache(strict=False)
126 | if bond_type is not None:
127 | emol.AddBond(atom[0], new_index, bond_type)
128 | else:
129 | emol.AddBond(atom[0], new_index)
130 |
131 | mol = dm.sanitize_mol(emol)
132 | return mol
133 |
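
A short sketch of the attachment-point helpers exported in `datamol/reactions/__init__.py`; the expected outputs follow the docstring examples above.

```python
from datamol.reactions import (
    add_brackets_to_attachment_points,
    convert_attach_to_isotope,
    num_attachment_points,
)

print(add_brackets_to_attachment_points("CC(C)CO*"))  # CC(C)CO[*]

smiles = "O=C(NCc1cnc([*])c1)[*]"
print(num_attachment_points(smiles))  # 2

# Map each attachment point to a distinct isotope label.
print(convert_attach_to_isotope(smiles, as_smiles=True))
```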
--------------------------------------------------------------------------------
/datamol/scaffold/__init__.py:
--------------------------------------------------------------------------------
1 | from ._fuzzy import trim_side_chain
2 | from ._fuzzy import fuzzy_scaffolding
3 |
--------------------------------------------------------------------------------
/datamol/similarity.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 | from typing import Optional
3 | from typing import Union
4 | from typing import Any
5 |
6 | import functools
7 |
8 | import numpy as np
9 | from sklearn.metrics import pairwise_distances_chunked
10 | from scipy.spatial import distance
11 |
12 | import datamol as dm
13 |
14 |
15 | def pdist(
16 | mols: List[Union[str, dm.Mol]],
17 | n_jobs: Optional[int] = 1,
18 | squareform: bool = True,
19 | **fp_args: Any,
20 | ) -> np.ndarray:
21 | """Compute the pairwise tanimoto distance between the fingerprints of all the
22 | molecules in the input set.
23 |
24 | Args:
25 | mols: list of molecules
26 | n_jobs: Number of jobs for parallelization. Set to 1 for no
27 | parallelization. Set to -1 to use all available cores.
28 | squareform: Whether to return in square form (matrix) or in a condensed
29 | form (1D vector).
30 | **fp_args: Additional kwargs passed to `dm.to_fp()`.
31 |
32 | Returns:
33 | dist_mat
34 | """
35 |
36 | fps = dm.parallelized(
37 | functools.partial(dm.to_fp, as_array=True, **fp_args),
38 | mols,
39 | n_jobs=n_jobs,
40 | )
41 |
42 | fps_array = np.array(fps)
43 |
44 | dist_mat = distance.pdist(fps_array, metric="jaccard")
45 |
46 | if squareform:
47 | dist_mat = distance.squareform(dist_mat, force="tomatrix")
48 |
49 | return dist_mat
50 |
51 |
52 | def cdist(
53 | mols1: List[Union[str, dm.Mol]],
54 | mols2: List[Union[str, dm.Mol]],
55 | n_jobs: Optional[int] = 1,
56 | distances_chunk: bool = False,
57 | distances_chunk_memory: int = 1024,
58 | distances_n_jobs: int = -1,
59 | **fp_args: Any,
60 | ) -> np.ndarray:
61 | """Compute the tanimoto distance between the fingerprints of each pair of
62 | molecules of the two collections of inputs.
63 |
64 | Args:
65 | mols1: list of molecules.
66 | mols2: list of molecules.
67 | n_jobs: Number of jobs for fingerprint computation. Set to 1 for no
68 | parallelization. Set to -1 to use all available cores.
69 | distances_chunk: Whether to use chunked computation.
70 | distances_chunk_memory: Memory size in MB to use for chunked computation.
71 | distances_n_jobs: Number of jobs for parallelization.
72 | **fp_args: Additional kwargs passed to `dm.to_fp()`.
73 |
74 | Returns:
75 | distmat
76 | """
77 |
78 | fps1 = dm.parallelized(
79 | functools.partial(dm.to_fp, as_array=True, **fp_args),
80 | mols1,
81 | n_jobs=n_jobs,
82 | )
83 |
84 | fps2 = dm.parallelized(
85 | functools.partial(dm.to_fp, as_array=True, **fp_args),
86 | mols2,
87 | n_jobs=n_jobs,
88 | )
89 |
90 | fps1_array = np.array(fps1).astype(bool)
91 | fps2_array = np.array(fps2).astype(bool)
92 |
93 | if distances_chunk:
94 | distances = pairwise_distances_chunked(
95 | fps1_array,
96 | fps2_array,
97 | metric="jaccard",
98 | n_jobs=distances_n_jobs,
99 | working_memory=distances_chunk_memory,
100 | )
101 | distances_array = np.vstack(list(distances))
102 | else:
103 | distances_array = distance.cdist(fps1_array, fps2_array, metric="jaccard")
104 |
105 | return distances_array
106 |
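
A minimal sketch of both distance helpers, using default fingerprint arguments; the SMILES strings are arbitrary examples.

```python
import datamol as dm
from datamol.similarity import cdist, pdist

mols = [dm.to_mol(s) for s in ["CCO", "CCN", "c1ccccc1", "CC(=O)O"]]

# Pairwise Tanimoto distance matrix (square form by default).
dist_mat = pdist(mols)
print(dist_mat.shape)  # (4, 4)

# Distances between every pair across two collections.
dists = cdist(mols[:2], mols[2:])
print(dists.shape)  # (2, 2)
```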
--------------------------------------------------------------------------------
/datamol/types.py:
--------------------------------------------------------------------------------
1 | # NOTE(hadim): typing_extensions can be replaced by typing once we drop support for Python 3.9.
2 | from typing_extensions import TypeAlias
3 | from typing import Union
4 | from typing import Tuple
5 |
6 | from rdkit import Chem
7 | from rdkit.Chem import rdChemReactions
8 |
9 | Mol: TypeAlias = Chem.rdchem.Mol
10 | BondType: TypeAlias = Chem.rdchem.BondType
11 | ChemicalReaction: TypeAlias = rdChemReactions.ChemicalReaction
12 | Atom: TypeAlias = Chem.rdchem.Atom
13 | Bond: TypeAlias = Chem.rdchem.Bond
14 |
15 | RDKitColor = Union[Tuple[float, float, float, float], Tuple[float, float, float]]
16 | DatamolColor = Union[RDKitColor, str]
17 |
--------------------------------------------------------------------------------
/datamol/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .jobs import JobRunner
2 | from .jobs import parallelized
3 | from .jobs import parallelized_with_batches
4 |
5 | from . import fs
6 | from . import perf
7 |
8 | from . import decorators
9 |
--------------------------------------------------------------------------------
/datamol/utils/decorators.py:
--------------------------------------------------------------------------------
1 | from typing import Callable
2 | from typing import List
3 | from typing import Union
4 |
5 | import platform
6 | from functools import wraps
7 |
8 |
9 | def disable_on_os(os_names: Union[str, List[str]]):
10 | """A decorator to disable a function raising an error if the OS detected is not supported.
11 |
12 | Args:
13 | os_names: OS names to disable this function. Valid OS names are: `["linux", "osx", "win"]`.
14 | """
15 |
16 | if isinstance(os_names, str):
17 | os_names = [os_names]
18 |
19 | valid_os_names = []
20 | for os_name in os_names:
21 | if os_name == "linux":
22 | valid_os_names.append("Linux")
23 | elif os_name == "win":
24 | valid_os_names.append("Windows")
25 | elif os_name == "osx":
26 | valid_os_names.append("Darwin")
27 | else:
28 | valid_os_names.append(os_name)
29 |
30 | def real_decorator(function: Callable):
31 | @wraps(function)
32 | def wrapper(*args, **kwargs):
33 | if platform.system() not in valid_os_names:
34 | retval = function(*args, **kwargs)
35 | return retval
36 | else:
37 | raise NotImplementedError(
38 | f"The function {function.__name__} is not supported"
39 | f" for the platform '{platform.system()}'."
40 | )
41 |
42 | return wrapper
43 |
44 | return real_decorator
45 |
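
A short sketch of `disable_on_os`; `compute_something` is a hypothetical function used purely for illustration.

```python
from datamol.utils.decorators import disable_on_os

@disable_on_os("win")  # disabled on Windows only
def compute_something() -> int:
    # Hypothetical stand-in for platform-specific logic.
    return 42

# Runs on Linux and macOS; raises NotImplementedError on Windows.
print(compute_something())
```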
--------------------------------------------------------------------------------
/datamol/utils/perf.py:
--------------------------------------------------------------------------------
1 | import time
2 |
3 | from loguru import logger
4 |
5 |
6 | duration_intervals = (
7 | ("weeks", 604800), # 60 * 60 * 24 * 7
8 | ("days", 86400), # 60 * 60 * 24
9 | ("h", 3600), # 60 * 60
10 | ("min", 60),
11 | ("s", 1),
12 | ("ms", 1e-3),
13 | ("us", 1e-6),
14 | )
15 |
16 |
17 | def human_duration(seconds: float, granularity: int = 1):
18 | # NOTE(hadim): far from being perfect.
19 |
20 | result = []
21 | duration: float = seconds
22 | for name, count in duration_intervals:
23 | value = duration // count
24 | if value:
25 | duration -= value * count
26 | result.append(f"{value:.0f}{name}")
27 | return ", ".join(result[:granularity])
28 |
29 |
30 | class watch_duration:
31 | """A Python decorator to measure execution time with logging capability.
32 |
33 | Args:
34 | log: Whether to log the measured duration.
35 | log_human_duration: Whether to log duration in a human way
36 | depending on the amount.
37 |
38 | Example:
39 |
40 | ```python
41 | def fn(n):
42 | for i in range(n):
43 | print(i)
44 | time.sleep(0.2)
45 |
46 | with dm.utils.perf.watch_duration(log=True) as w:
47 | fn(5)
48 |
49 | print(w.duration)
50 | ```
51 | """
52 |
53 | def __init__(self, log: bool = True, log_human_duration: bool = True):
54 | self.log = log
55 | self.log_human_duration = log_human_duration
56 |
57 | self.start = None
58 | self.end = None
59 | self.duration = None
60 | self.duration_minutes = None
61 |
62 | def __enter__(self):
63 | self.start = time.time()
64 | return self
65 |
66 | def __exit__(self, *_):
67 | assert self.start is not None
68 |
69 | self.end = time.time()
70 | self.duration = self.end - self.start
71 | self.duration_minutes = self.duration / 60
72 |
73 | if self.log:
74 | if self.log_human_duration:
75 | logger.info(f"Duration {human_duration(self.duration)}.")
76 | else:
77 | logger.info(f"Duration {self.duration_minutes:.2f} minutes")
78 |
--------------------------------------------------------------------------------
/datamol/utils/testing.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 | from typing import Optional
3 | from typing import Union
4 |
5 | import functools
6 |
7 | import numpy as np
8 | from scipy.spatial import distance
9 |
10 | from rdkit import Chem
11 | from rdkit.DataManip.Metric import GetTanimotoDistMat # type: ignore
12 | from rdkit.DataStructs.cDataStructs import TanimotoSimilarity
13 |
14 | import datamol as dm
15 |
16 |
17 | def pdist_rdkit(
18 | mols: List[Union[str, Chem.rdchem.Mol]],
19 | n_jobs: Optional[int] = 1,
20 | squareform: bool = True,
21 | **fp_args,
22 | ) -> np.ndarray:
23 | """Equivalent to `dm.similarity.pdist` but uses the RDKit API.
24 |
25 | Important:
26 | This function is only used for testing and should not be used in production.
27 | """
28 |
29 | fps = dm.parallelized(
30 | functools.partial(dm.to_fp, as_array=False, **fp_args),
31 | mols,
32 | n_jobs=n_jobs,
33 | )
34 |
35 | fps = list(fps) # type: ignore
36 |
37 | dist = GetTanimotoDistMat(fps)
38 |
39 | # Put in squareform: `scipy.spatial.distance.squareform` is incompatible with RDKit returned vector.
40 | dist_mat = np.zeros((len(fps), len(fps)))
41 | dist_mat[np.tril_indices_from(dist_mat, -1)] = dist
42 | dist_mat += dist_mat.T
43 |
44 | if not squareform:
45 | dist_mat = distance.squareform(dist_mat, force="tovector")
46 |
47 | return dist_mat
48 |
49 |
50 | def cdist_rdkit(
51 | mols1: List[Union[str, Chem.rdchem.Mol]],
52 | mols2: List[Union[str, Chem.rdchem.Mol]],
53 | n_jobs: Optional[int] = 1,
54 | **fp_args,
55 | ) -> np.ndarray:
56 | """Equivalent to `dm.similarity.cdist` but uses the RDKit API.
57 |
58 | Important:
59 | This function is only used for testing and should not be used in production.
60 | """
61 |
62 | fps1 = dm.parallelized(
63 | functools.partial(dm.to_fp, as_array=False, **fp_args),
64 | mols1,
65 | n_jobs=n_jobs,
66 | )
67 |
68 | fps2 = dm.parallelized(
69 | functools.partial(dm.to_fp, as_array=False, **fp_args),
70 | mols2,
71 | n_jobs=n_jobs,
72 | )
73 |
74 | fps1 = list(fps1) # type: ignore
75 | fps2 = list(fps2) # type: ignore
76 |
77 | dist_mat = np.zeros((len(fps1), len(fps2)))
78 | for i in range(len(fps1)):
79 | for j in range(len(fps2)):
80 | d = 1 - TanimotoSimilarity(fps1[i], fps2[j])
81 | dist_mat[i, j] = d
82 |
83 | return dist_mat
84 |
--------------------------------------------------------------------------------
/datamol/viz/__init__.py:
--------------------------------------------------------------------------------
1 | from . import utils
2 |
3 | from ._viz import to_image
4 |
5 | from ._substructure import match_substructure
6 |
7 | from ._conformers import conformers
8 |
9 | from ._circle_grid import circle_grid
10 | from ._circle_grid import MolsCircleGrid
11 |
12 | from ._lasso_highlight import lasso_highlight_image
13 |
--------------------------------------------------------------------------------
/datamol/viz/_conformers.py:
--------------------------------------------------------------------------------
1 | from typing import Union
2 | from typing import List
3 | from typing import Optional
4 |
5 | import copy
6 | import itertools
7 |
8 | from rdkit import Chem
9 | from rdkit.Chem import rdMolAlign
10 |
11 |
12 | def _get_nglview():
13 | try:
14 | import nglview as nv
15 |
16 | return nv
17 | except ImportError:
18 | raise ImportError("You must install nglview from https://github.com/nglviewer/nglview.")
19 |
20 |
21 | def _get_ipywidgets():
22 | try:
23 | import ipywidgets as widgets
24 |
25 | return widgets
26 | except ImportError:
27 | raise ImportError(
28 | "You must install ipywidgets from https://github.com/jupyter-widgets/ipywidgets/."
29 | )
30 |
31 |
32 | def conformers(
33 | mol: Chem.rdchem.Mol,
34 | conf_id: int = -1,
35 | n_confs: Optional[Union[int, List[int]]] = None,
36 | align_conf: bool = True,
37 | n_cols: int = 3,
38 | sync_views: bool = True,
39 | remove_hs: bool = True,
40 | width: str = "auto",
41 | ):
42 | """Visualize the conformer(s) of a molecule.
43 |
44 | Args:
45 | mol: a molecule.
46 | conf_id: The ID of the conformer to show. -1 shows
47 | the first conformer. Only works if `n_confs` is None.
48 | n_confs: Can be a number of conformers
49 | to show or a list of conformer indices. When None, only the first
50 | conformer is displayed. When -1, show all conformers.
51 | align_conf: Whether to align conformers together.
52 | n_cols: Number of columns. Defaults to 3.
53 | sync_views: Whether to sync the multiple views.
54 | remove_hs: Whether to remove the hydrogens of the conformers.
55 | width: The width of the returned view. Defaults to "auto".
56 | """
57 |
58 | widgets = _get_ipywidgets()
59 | nv = _get_nglview()
60 |
61 | if mol.GetNumConformers() == 0:
62 | raise ValueError(
63 | "The molecule has 0 conformers. You can generate conformers with `dm.conformers.generate(mol)`."
64 | )
65 |
66 | # Clone the molecule
67 | mol = copy.deepcopy(mol)
68 |
69 | if remove_hs:
70 | mol = Chem.RemoveHs(mol) # type: ignore
71 | else:
72 | mol = Chem.AddHs(mol) # type: ignore
73 |
74 | if n_confs is None:
75 | return nv.show_rdkit(mol, conf_id=conf_id)
76 |
77 | # If n_confs is int, convert to list of conformer IDs
78 | if n_confs == -1:
79 | n_confs = [conf.GetId() for conf in mol.GetConformers()]
80 | elif isinstance(n_confs, int):
81 | if n_confs > mol.GetNumConformers():
82 | n_confs = mol.GetNumConformers()
83 | n_confs = list(range(n_confs)) # type: ignore
84 |
85 | if align_conf:
86 | rdMolAlign.AlignMolConformers(mol, confIds=n_confs)
87 |
88 | # Get number of rows
89 | n_rows = len(n_confs) // n_cols
90 | n_rows += 1 if (len(n_confs) % n_cols) > 0 else 0
91 |
92 | # Create a grid
93 | grid = widgets.GridspecLayout(n_rows, n_cols) # type: ignore
94 |
95 | # Create and add views to the grid.
96 | widget_coords = itertools.product(range(n_rows), range(n_cols))
97 | views = []
98 | for i, (conf_id, (x, y)) in enumerate(zip(n_confs, widget_coords)):
99 | view = nv.show_rdkit(mol, conf_id=conf_id)
100 | view.layout.width = width
101 | view.layout.align_self = "stretch"
102 | grid[x, y] = view
103 | views.append(view)
104 |
105 | # Sync views
106 | if sync_views:
107 | for view in views:
108 | view._set_sync_camera(views)
109 |
110 | return grid
111 |
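
A usage sketch for the viewer above. It assumes a Jupyter environment with `nglview` and `ipywidgets` installed, and relies on `dm.conformers.generate(mol)` as referenced in the error message above.

```python
import datamol as dm

mol = dm.to_mol("CC(=O)Nc1ccc(O)cc1")
mol = dm.conformers.generate(mol)

# Show all conformers on a 3-column grid with synced cameras
# (only renders inside a Jupyter notebook with nglview/ipywidgets installed).
view = dm.viz.conformers(mol, n_confs=-1, n_cols=3)
view
```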
--------------------------------------------------------------------------------
/datamol/viz/_substructure.py:
--------------------------------------------------------------------------------
1 | from typing import Any
2 | from typing import Union
3 | from typing import List
4 |
5 | import datamol as dm
6 |
7 | from ._viz import to_image
8 |
9 |
10 | def match_substructure(
11 | mols: Union[List[dm.Mol], dm.Mol],
12 | queries: Union[List[dm.Mol], dm.Mol],
13 | highlight_bonds: bool = True,
14 | copy: bool = True,
15 | **kwargs: Any,
16 | ):
17 | """Generate an image of molecule(s) with substructure matches for a given
18 | pattern or substructure.
19 |
20 | Args:
21 | mols: One or more molecules.
22 | queries: One or more queries.
23 | highlight_bonds: Whether to also highlight the bonds matching the patterns.
24 | copy: Whether to copy the molecules and the queries.
25 | kwargs: Other kwargs passed to `dm.viz.to_image`.
26 | """
27 |
28 | # NOTE(hadim): `MolsToGridImage` used in `to_image` can't use a list of list of indices
29 | # for every molecule, so it's not really possible to have different colors for different
30 | # matches in the same molecule.
31 | # In the future, we will implement our custom `MolsToGridImage` in order to have more control
32 | # over the colors used.
33 | # For the same reason, we don't bother about colors here.
34 |
35 | if isinstance(mols, dm.Mol):
36 | mols = [mols]
37 |
38 | if isinstance(queries, dm.Mol):
39 | queries = [queries]
40 |
41 | # Copy mols and patterns
42 | if copy:
43 | mols = [dm.copy_mol(mol) for mol in mols]
44 | queries = [dm.copy_mol(mol) for mol in queries]
45 |
46 | all_atom_indices = []
47 | all_bond_indices = []
48 |
49 | for mol in mols:
50 | atom_indices = []
51 | bond_indices = []
52 |
53 | for query in queries:
54 | if highlight_bonds:
55 | atom_matches, bond_matches = dm.substructure_matching_bonds(mol, query)
56 | atom_indices += atom_matches
57 | bond_indices += bond_matches
58 | else:
59 | atom_indices += list(mol.GetSubstructMatches(query, uniquify=True)) # type: ignore
60 | bond_indices += []
61 |
62 | # NOTE(hadim): we must flatten the atom/bond indices, since `MolsToGridImage`
63 | # doesn't accept multiple lists of indices for every single molecule.
64 | bond_indices = [item for sublist in bond_indices for item in sublist]
65 | atom_indices = [item for sublist in atom_indices for item in sublist]
66 |
67 | all_atom_indices.append(atom_indices)
68 | all_bond_indices.append(bond_indices)
69 |
70 | image = to_image(
71 | mols,
72 | highlight_atom=all_atom_indices,
73 | highlight_bond=all_bond_indices,
74 | **kwargs,
75 | )
76 |
77 | return image
78 |
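
A small sketch of `match_substructure`; `from_smarts` is imported from `datamol.convert` as elsewhere in this repo, and the SMILES/SMARTS strings are arbitrary examples.

```python
import datamol as dm
from datamol.convert import from_smarts

mols = [dm.to_mol(s) for s in ["CC(=O)Nc1ccc(O)cc1", "CC(=O)Oc1ccccc1C(=O)O"]]
query = from_smarts("C(=O)[O,N]")  # ester/amide carbonyl pattern

# Returns a grid image with the matching atoms and bonds highlighted.
img = dm.viz.match_substructure(mols, queries=query)
```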
--------------------------------------------------------------------------------
/datamol/viz/_viz.py:
--------------------------------------------------------------------------------
1 | from typing import Union
2 | from typing import List
3 | from typing import Tuple
4 | from typing import Optional
5 | from typing import Any
6 | from loguru import logger
7 |
8 | from rdkit.Chem import Draw
9 |
10 | import datamol as dm
11 |
12 | from .utils import prepare_mol_for_drawing
13 | from .utils import image_to_file
14 |
15 |
16 | def to_image(
17 | mols: Union[List[Union[dm.Mol, str]], dm.Mol, str],
18 | legends: Union[List[Union[str, None]], str, None] = None,
19 | n_cols: int = 4,
20 | use_svg: bool = True,
21 | mol_size: Union[Tuple[int, int], int] = (300, 300),
22 | highlight_atom: Optional[List[List[int]]] = None,
23 | highlight_bond: Optional[List[List[int]]] = None,
24 | outfile: Optional[str] = None,
25 | max_mols: int = 32,
26 | max_mols_ipython: int = 50,
27 | copy: bool = True,
28 | indices: bool = False,
29 | bond_indices: bool = False,
30 | bond_line_width: int = 2,
31 | stereo_annotations: bool = True,
32 | legend_fontsize: int = 16,
33 | kekulize: bool = True,
34 | align: Union[dm.Mol, str, bool] = False,
35 | **kwargs: Any,
36 | ):
37 | """Generate an image out of a molecule or a list of molecules.
38 |
39 | Args:
40 | mols: One or a list of molecules.
41 | legends: A string or a list of strings used as legends for the molecules.
42 | n_cols: Number of columns in the grid (i.e. the number of molecules per row).
43 | use_svg: Whether to output an SVG (or a PNG).
44 | mol_size: An int or a tuple of ints defining the size per molecule.
45 | highlight_atom: the atoms to highlight.
46 | highlight_bond: The bonds to highlight.
47 | outfile: Path where to save the image (local or remote path).
48 | max_mols: The maximum number of molecules to display.
49 | max_mols_ipython: The maximum number of molecules to display when running within an IPython environment.
50 | copy: Whether to copy the molecules or not.
51 | indices: Whether to draw the atom indices.
52 | bond_indices: Whether to draw the bond indices.
53 | bond_line_width: The width of the bond lines.
54 | legend_fontsize: Font size for the legend.
55 | kekulize: Run the kekulization routine on the molecules. Skipped if it fails.
56 | align: Whether to align the 2D coordinates of the molecules.
57 | - If set to True, align all molecules with `dm.align.auto_align_many()`.
58 | - If set to a molecule, it is used as a template for alignment with `dm.align.template_align()`.
59 | - If set to False, no alignment is performed.
60 | For a more custom alignment, we suggest using directly the module `dm.align` instead.
61 | **kwargs: Additional arguments to pass to the drawing function. See RDKit
62 | documentation related to `MolDrawOptions` for more details at
63 | https://www.rdkit.org/docs/source/rdkit.Chem.Draw.rdMolDraw2D.html.
64 | """
65 |
66 | if isinstance(mol_size, int):
67 | mol_size = (mol_size, mol_size)
68 |
69 | if isinstance(mols, (dm.Mol, str)):
70 | mols = [mols]
71 |
72 | # Convert smiles to molecules if strings are provided as input for API consistency
73 | mols = mols[:] # avoid in place modification
74 | for i in range(len(mols)):
75 | if isinstance(mols[i], str):
76 | mols[i] = dm.to_mol(mols[i])
77 |
78 | if isinstance(legends, str):
79 | legends = [legends]
80 |
81 | if copy:
82 | mols = [dm.copy_mol(mol) for mol in mols]
83 |
84 | if max_mols is not None:
85 | mols = mols[:max_mols]
86 |
87 | if legends is not None:
88 | legends = legends[:max_mols]
89 |
90 | # Whether to align the molecules
91 | if isinstance(align, (dm.Mol, str)):
92 | mols = [dm.align.template_align(mol, template=align) for mol in mols]
93 | elif align is True:
94 | mols = dm.align.auto_align_many(mols)
95 |
96 | # Prepare molecules before drawing
97 | mols = [prepare_mol_for_drawing(mol, kekulize=kekulize) for mol in mols]
98 |
99 | _highlight_atom = highlight_atom
100 | if highlight_atom is not None and isinstance(highlight_atom[0], int):
101 | _highlight_atom = [highlight_atom]
102 |
103 | _highlight_bond = highlight_bond
104 | if highlight_bond is not None and isinstance(highlight_bond[0], int):
105 | _highlight_bond = [highlight_bond]
106 |
107 | # Don't make the grid wider than the number of molecules
108 | if len(mols) < n_cols:
109 | n_cols = len(mols)
110 |
111 | draw_options = Draw.rdMolDraw2D.MolDrawOptions()
112 | draw_options.legendFontSize = legend_fontsize
113 | draw_options.addAtomIndices = indices
114 | draw_options.addBondIndices = bond_indices
115 | draw_options.addStereoAnnotation = stereo_annotations
116 | draw_options.bondLineWidth = bond_line_width
117 |
118 | # Add the custom drawing options.
119 | _kwargs = {}
120 | for k, v in kwargs.items():
121 | if hasattr(draw_options, k):
122 | setattr(draw_options, k, v)
123 | else:
124 | _kwargs[k] = v
125 |
126 | # Check if we are in a Jupyter notebook or IPython display context
127 | # If so, conditionally add the maxMols argument
128 | in_notebook = dm.viz.utils.is_ipython_session()
129 |
130 | if in_notebook:
131 | _kwargs["maxMols"] = max_mols_ipython
132 | if max_mols > max_mols_ipython:
133 | logger.warning(
134 | f"You have set max_mols to {max_mols}, which is higher than max_mols_ipython ({max_mols_ipython}). "
135 | "Consider increasing max_mols_ipython if you want to display all molecules in an IPython environment."
136 | )
137 |
138 | image = Draw.MolsToGridImage(
139 | mols,
140 | legends=legends,
141 | molsPerRow=n_cols,
142 | useSVG=use_svg,
143 | subImgSize=mol_size,
144 | highlightAtomLists=_highlight_atom,
145 | highlightBondLists=_highlight_bond,
146 | drawOptions=draw_options,
147 | **_kwargs,
148 | )
149 |
150 | if outfile is not None:
151 | image_to_file(image, outfile, as_svg=use_svg)
152 | return image
153 |
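
A short sketch of `to_image`. SMILES inputs are converted internally as shown above; the output path is a hypothetical local file (remote paths also work via fsspec).

```python
import datamol as dm

smiles = ["CCO", "c1ccccc1", "CC(=O)Nc1ccc(O)cc1"]

# Build a 2-column grid, one legend per molecule, saved as an SVG.
img = dm.viz.to_image(
    smiles,
    legends=["ethanol", "benzene", "paracetamol"],
    n_cols=2,
    mol_size=250,
    use_svg=True,
    outfile="mols.svg",  # hypothetical local path
)
```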
--------------------------------------------------------------------------------
/datamol/viz/utils.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 | from typing import Union
3 |
4 | import io
5 | import fsspec
6 |
7 | from rdkit.Chem import Draw
8 | from matplotlib import colors as mcolors
9 |
10 | import PIL.Image
11 | import PIL.PngImagePlugin
12 |
13 | import datamol as dm
14 |
15 | from datamol.types import RDKitColor
16 | from datamol.types import DatamolColor
17 |
18 |
19 | def prepare_mol_for_drawing(mol: Optional[dm.Mol], kekulize: bool = True) -> Optional[dm.Mol]:
20 | """Prepare the molecule before drawing to avoid any error due to unsanitized molecule
21 | or incorrect valence or aromaticity.
22 |
23 | Code is inspired from `rdkit.Chem.Draw._moltoimg`.
24 |
25 | Args:
26 | mol: A molecule to prepare. If set to None, the function will return None.
27 | kekulize: Whether to kekulize the molecule.
28 | """
29 |
30 | if mol is None:
31 | return None
32 |
33 | try:
34 | with dm.without_rdkit_log():
35 | # Check for implicit and explicit valence
36 | if mol.NeedsUpdatePropertyCache(): # type: ignore
37 | mol.UpdatePropertyCache(False) # type: ignore
38 |
39 | # Check for aromaticity
40 | if dm.is_lower_than_current_rdkit_version("2022.09"):
41 | _kekulize = Draw._okToKekulizeMol(mol, kekulize) # type: ignore
42 | else:
43 | _kekulize = Draw.shouldKekulize(mol, kekulize)
44 |
45 | # Run the rdkit preparation procedure
46 | _mol = Draw.rdMolDraw2D.PrepareMolForDrawing(mol, kekulize=_kekulize)
47 |
48 | except ValueError: # <- can happen on a kekulization failure
49 | # Run the rdkit preparation procedure with kekulize set to `False`
50 | _mol = Draw.rdMolDraw2D.PrepareMolForDrawing(mol, kekulize=False)
51 |
52 | return _mol
53 |
54 |
55 | def is_ipython_session() -> bool:
56 | try:
57 | kernel_name = get_ipython().__class__.__name__ # noqa: F821 # type: ignore
58 | module_name = get_ipython().__class__.__module__ # noqa: F821 # type: ignore
59 |
60 | if kernel_name == "ZMQInteractiveShell" or module_name == "google.colab._shell":
61 | return True
62 | except Exception:
63 | pass
64 |
65 | return False
66 |
67 |
68 | def drawer_to_image(drawer: Draw.rdMolDraw2D.MolDraw2D):
69 | """Convert an RDkit drawer to an image. The image can be either a PNG or SVG depending on the
70 | drawer class. The returned image type will depend on whether the Python session is an IPython one or not.
71 |
72 | This function matches the behavior of `datamol.to_image` and `rdkit.Chem.Draw.MolDraw2DToImage`.
73 |
74 | Args:
75 | drawer: An RDkit drawer.
76 |
77 | Returns:
78 | An image: either PNG or SVG depending on the drawer class. If within an IPython sessions,
79 | IPython display objects are returned.
80 | """
81 |
82 | is_svg = isinstance(drawer, Draw.rdMolDraw2D.MolDraw2DSVG)
83 |
84 | if is_ipython_session():
85 | if is_svg:
86 | from IPython.core.display import SVG
87 |
88 | return SVG(drawer.GetDrawingText())
89 | else:
90 | from IPython.core.display import Image
91 |
92 | return Image(drawer.GetDrawingText())
93 | else:
94 | if is_svg:
95 | return drawer.GetDrawingText()
96 | else:
97 | from PIL import Image
98 |
99 | return Image.open(io.BytesIO(drawer.GetDrawingText()))
100 |
101 |
102 | def image_to_file(
103 | image: Union[
104 | str,
105 | PIL.PngImagePlugin.PngImageFile,
106 | bytes,
107 | PIL.Image.Image,
108 | ],
109 | outfile,
110 | as_svg: bool = False,
111 | ):
112 | """Save image to file. The image can be either a PNG or SVG depending
113 |
114 | Args:
115 | image: Image to save to a file
116 | outfile: Path to the output file where to save the image
117 | as_svg: Whether the image is an SVG or not
118 | """
119 |
120 | with fsspec.open(outfile, "wb") as f:
121 | if as_svg:
122 | if isinstance(image, str):
123 | # in a terminal process
124 | f.write(image.encode()) # type: ignore
125 | else:
126 | # in a jupyter kernel process
127 | f.write(image.data.encode()) # type: ignore
128 | else:
129 | if isinstance(image, PIL.PngImagePlugin.PngImageFile): # type: ignore
130 | # in a terminal process
131 | image.save(f) # type: ignore
132 | else:
133 | # in a jupyter kernel process
134 | f.write(image.data) # type: ignore
135 |
136 |
137 | def to_rdkit_color(color: Optional[DatamolColor]) -> Optional[RDKitColor]:
138 | """If required convert a datamol color (rgb, rgba or hex string) to an RDKit
139 | color (rgb or rgba).
140 |
141 | Args:
142 | color: A datamol color: hex, rgb, rgba or None.
143 | """
144 | if color is None:
145 | return None
146 |
147 | if isinstance(color, str):
148 | return mcolors.to_rgba(color) # type: ignore
149 | if isinstance(color, (tuple, list)) and len(color) in [3, 4] and any(x > 1 for x in color):
150 | return tuple(x / 255 if i < 3 else x for i, x in enumerate(color))
151 |
152 | return color
153 |
--------------------------------------------------------------------------------
/docs/CNAME:
--------------------------------------------------------------------------------
1 | docs.datamol.io
2 |
--------------------------------------------------------------------------------
/docs/api/datamol.align.md:
--------------------------------------------------------------------------------
1 | # `datamol.align`
2 |
3 | ::: datamol.align
4 |
--------------------------------------------------------------------------------
/docs/api/datamol.cluster.md:
--------------------------------------------------------------------------------
1 | # `datamol.cluster`
2 |
3 | ::: datamol.cluster
4 |
--------------------------------------------------------------------------------
/docs/api/datamol.conformers.md:
--------------------------------------------------------------------------------
1 | # `datamol.conformers`
2 |
3 | ::: datamol.conformers._conformers
4 | ::: datamol.conformers._features
5 |
--------------------------------------------------------------------------------
/docs/api/datamol.convert.md:
--------------------------------------------------------------------------------
1 | # `datamol.convert`
2 |
3 | ::: datamol.convert
4 |
--------------------------------------------------------------------------------
/docs/api/datamol.data.md:
--------------------------------------------------------------------------------
1 | # `datamol.data`
2 |
3 | ::: datamol.data
4 |
--------------------------------------------------------------------------------
/docs/api/datamol.descriptors.md:
--------------------------------------------------------------------------------
1 | # `datamol.descriptors`
2 |
3 | ::: datamol.descriptors.descriptors
4 | ::: datamol.descriptors.compute
5 |
--------------------------------------------------------------------------------
/docs/api/datamol.fp.md:
--------------------------------------------------------------------------------
1 | # `datamol.fp`
2 |
3 | ::: datamol.fp
4 |
--------------------------------------------------------------------------------
/docs/api/datamol.fragment.md:
--------------------------------------------------------------------------------
1 | # `datamol.fragment`
2 |
3 | ::: datamol.fragment._fragment
4 | ::: datamol.fragment._assemble
5 |
--------------------------------------------------------------------------------
/docs/api/datamol.graph.md:
--------------------------------------------------------------------------------
1 | # `datamol.graph`
2 |
3 | ::: datamol.graph
4 |
--------------------------------------------------------------------------------
/docs/api/datamol.io.md:
--------------------------------------------------------------------------------
1 | # `datamol.io`
2 |
3 | ::: datamol.io
4 |
--------------------------------------------------------------------------------
/docs/api/datamol.isomers.md:
--------------------------------------------------------------------------------
1 | # `datamol.isomers`
2 |
3 | ::: datamol.isomers._enumerate
4 | ::: datamol.isomers._structural
5 |
--------------------------------------------------------------------------------
/docs/api/datamol.log.md:
--------------------------------------------------------------------------------
1 | # `datamol.log`
2 |
3 | ::: datamol.log
4 |
--------------------------------------------------------------------------------
/docs/api/datamol.mol.md:
--------------------------------------------------------------------------------
1 | # `datamol.mol`
2 |
3 | ::: datamol.mol
4 |
--------------------------------------------------------------------------------
/docs/api/datamol.molar.md:
--------------------------------------------------------------------------------
1 | # `datamol.molar`
2 |
3 | ::: datamol.molar
4 |
--------------------------------------------------------------------------------
/docs/api/datamol.reactions.md:
--------------------------------------------------------------------------------
1 | # `datamol.reactions`
2 |
3 | ::: datamol.reactions._reactions
4 | ::: datamol.reactions._attachments
5 |
--------------------------------------------------------------------------------
/docs/api/datamol.scaffold.md:
--------------------------------------------------------------------------------
1 | # `datamol.scaffold`
2 |
3 | ::: datamol.scaffold._fuzzy
4 |
--------------------------------------------------------------------------------
/docs/api/datamol.similarity.md:
--------------------------------------------------------------------------------
1 | # `datamol.similarity`
2 |
3 | ::: datamol.similarity
4 |
--------------------------------------------------------------------------------
/docs/api/datamol.utils.fs.md:
--------------------------------------------------------------------------------
1 | # `datamol.utils.fs`
2 |
3 | ::: datamol.utils.fs
4 |
5 |
--------------------------------------------------------------------------------
/docs/api/datamol.utils.md:
--------------------------------------------------------------------------------
1 | # `datamol.utils`
2 |
3 | ::: datamol.utils.decorators
4 | ::: datamol.utils.jobs
5 | ::: datamol.utils.perf
6 |
--------------------------------------------------------------------------------
/docs/api/datamol.viz.md:
--------------------------------------------------------------------------------
1 | # `datamol.viz`
2 |
3 | ## Visualize molecules in 2D or 3D
4 |
5 | ::: datamol.viz.to_image
6 | ::: datamol.viz.conformers
7 |
8 | ## Specific plotting functions
9 |
10 | ::: datamol.viz.MolsCircleGrid
11 | ::: datamol.viz.circle_grid
12 |
13 | ## Visualize a 2D molecule with highlighted substructures
14 |
15 | ::: datamol.viz.lasso_highlight_image
16 |
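As a quick orientation, here is a minimal usage sketch mirroring the overview page (the full set of options is documented by the rendered signatures above):

```python
import datamol as dm

# Load a small built-in dataset and build a few molecules
data = dm.data.freesolv()
mols = [dm.to_mol(s) for s in data["smiles"].iloc[:4]]

# Render a grid image of the molecules with their SMILES as legends
legends = [dm.to_smiles(m) for m in mols]
img = dm.viz.to_image(mols, legends=legends)
```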
--------------------------------------------------------------------------------
/docs/assets/css/custom-datamol.css:
--------------------------------------------------------------------------------
1 | :root {
2 | --datamol-primary: #F89D4C;
3 | --datamol-secondary: #343a40;
4 |
5 | /* Primary color shades */
6 | --md-primary-fg-color: var(--datamol-primary);
7 | --md-primary-fg-color--light: var(--datamol-primary);
8 | --md-primary-fg-color--dark: var(--datamol-primary);
9 | --md-primary-bg-color: var(--datamol-secondary);
10 | --md-primary-bg-color--light: var(--datamol-secondary);
11 | --md-text-link-color: var(--datamol-secondary);
12 |
13 | /* Accent color shades */
14 | --md-accent-fg-color: var(--datamol-secondary);
15 | --md-accent-fg-color--transparent: var(--datamol-secondary);
16 | --md-accent-bg-color: var(--datamol-secondary);
17 | --md-accent-bg-color--light: var(--datamol-secondary);
18 | }
19 |
20 | :root>* {
21 | /* Code block color shades */
22 | --md-code-bg-color: hsla(0, 0%, 96%, 1);
23 | --md-code-fg-color: hsla(200, 18%, 26%, 1);
24 |
25 | /* Footer */
26 | --md-footer-bg-color: var(--datamol-primary);
27 | /* --md-footer-bg-color--dark: hsla(0, 0%, 0%, 0.32); */
28 | --md-footer-fg-color: var(--datamol-secondary);
29 | --md-footer-fg-color--light: var(--datamol-secondary);
30 | --md-footer-fg-color--lighter: var(--datamol-secondary);
31 |
32 | }
33 |
34 | .md-header {
35 | background-image: linear-gradient(to right, #F89D4C, #E20000);
36 | }
37 |
38 | .md-footer {
39 | background-image: linear-gradient(to right, #F89D4C, #E20000);
40 | }
41 |
42 | .md-tabs {
43 | background-image: linear-gradient(to right, #F4F6F9, #E2CEC3);
44 | }
45 |
46 | .md-header__topic {
47 | color: rgb(255, 255, 255);
48 | }
49 |
50 | .md-source__repository,
51 | .md-source__icon,
52 | .md-search__input,
53 | .md-search__input::placeholder,
54 | .md-search__input~.md-search__icon,
55 | .md-footer__inner.md-grid,
56 | .md-copyright__highlight,
57 | .md-copyright,
58 | .md-footer-meta.md-typeset a,
59 | .md-version {
60 | color: rgb(255, 255, 255) !important;
61 | }
62 |
63 | .md-search__form {
64 | background-color: rgba(255, 255, 255, 0.2);
65 | }
66 |
67 | .md-search__input {
68 | color: #222222 !important;
69 | }
70 |
71 | .md-header__topic {
72 | color: rgb(255, 255, 255);
73 | font-size: 1.4em;
74 | }
75 |
76 | /* Increase the size of the logo */
77 | .md-header__button.md-logo img,
78 | .md-header__button.md-logo svg {
79 | height: 2rem !important;
80 | }
81 |
82 | /* Reduce the margin around the logo */
83 | .md-header__button.md-logo {
84 | margin: 0.4em;
85 | padding: 0.4em;
86 | }
87 |
88 | /* Remove the `In` and `Out` block in rendered Jupyter notebooks */
89 | .md-container .jp-Cell-outputWrapper .jp-OutputPrompt.jp-OutputArea-prompt,
90 | .md-container .jp-Cell-inputWrapper .jp-InputPrompt.jp-InputArea-prompt {
91 | display: none !important;
92 | }
93 |
--------------------------------------------------------------------------------
/docs/assets/css/custom.css:
--------------------------------------------------------------------------------
1 | /* Indentation. */
2 | div.doc-contents:not(.first) {
3 | padding-left: 25px;
4 | border-left: 4px solid rgb(230, 230, 230);
5 | margin-bottom: 80px;
6 | }
7 |
8 | /* Don't capitalize names. */
9 | h5.doc-heading {
10 | text-transform: none !important;
11 | }
12 |
13 | /* Don't use vertical space on hidden ToC entries. */
14 | .hidden-toc::before {
15 | margin-top: 0 !important;
16 | padding-top: 0 !important;
17 | }
18 |
19 | /* Don't show permalink of hidden ToC entries. */
20 | .hidden-toc a.headerlink {
21 | display: none;
22 | }
23 |
24 | /* Avoid breaking parameters name, etc. in table cells. */
25 | td code {
26 | word-break: normal !important;
27 | }
28 |
29 | /* For pieces of Markdown rendered in table cells. */
30 | td p {
31 | margin-top: 0 !important;
32 | margin-bottom: 0 !important;
33 | }
34 |
--------------------------------------------------------------------------------
/docs/assets/css/tweak-width.css:
--------------------------------------------------------------------------------
1 | @media only screen and (min-width: 76.25em) {
2 | .md-main__inner {
3 | max-width: none;
4 | padding-left: 2em;
5 | padding-right: 2em;
6 | }
7 | .md-sidebar--primary {
8 | left: 0;
9 | }
10 | .md-sidebar--secondary {
11 | right: 0;
12 | margin-left: 0;
13 | -webkit-transform: none;
14 | transform: none;
15 | }
16 | }
17 |
--------------------------------------------------------------------------------
/docs/assets/js/google-analytics.js:
--------------------------------------------------------------------------------
1 | var gtag_id = "G-0L9PP26N2H";
2 |
3 | var script = document.createElement("script");
4 | script.src = "https://www.googletagmanager.com/gtag/js?id=" + gtag_id;
5 | document.head.appendChild(script);
6 |
7 | window.dataLayer = window.dataLayer || [];
8 | function gtag(){dataLayer.push(arguments);}
9 | gtag('js', new Date());
10 | gtag('config', gtag_id);
11 |
--------------------------------------------------------------------------------
/docs/contribute.md:
--------------------------------------------------------------------------------
1 | # Contribute
2 |
3 | The following documents the development lifecycle of Datamol.
4 |
5 | ## Set up a dev environment
6 |
7 | ```bash
8 | mamba env create -n datamol -f env.yml
9 | mamba activate datamol
10 | pip install -e .
11 | ```
12 |
13 | ## Set up a dev environment with a dev container
14 |
15 | This repository is set up to use a [dev container](https://docs.github.com/en/codespaces/setting-up-your-project-for-codespaces/introduction-to-dev-containers). You can use it locally with VS Code or any editor supporting dev containers, as well as on GitHub Codespaces.
16 |
17 | The environment is based on the Micromamba Docker image.
18 |
19 | ## Continuous Integration
20 |
21 | Datamol uses GitHub Actions to:
22 |
23 | - **Build and test** `datamol`.
24 | - Multiple combinations of OS, Python and RDKit versions are tested.
25 | - **Check** the code:
26 | - Formatting with `black`.
27 | - Static type check with `mypy`.
28 | - **Documentation**: build and deploy the documentation on `main` and for every new git tag.
29 |
30 | ## Run tests
31 |
32 | ```bash
33 | pytest
34 | ```
35 |
36 | ## Build the documentation
37 |
38 | You can build and serve the documentation locally with:
39 |
40 | ```bash
41 | # Build and serve the doc
42 | mike serve
43 | ```
44 |
45 | ### Multi-versioning
46 |
47 | The doc is built for each push on `main` and for every git tag using [mike](https://github.com/jimporter/mike). Everything is automated using GitHub Actions. Please refer to the official mike documentation for the details.
48 |
49 | ## Release a new version
50 |
51 | The process is fully automated by executing the [`release` GH Action](https://github.com/datamol-io/datamol/actions/workflows/release.yml).
52 |
--------------------------------------------------------------------------------
/docs/images/logo-black.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datamol-io/datamol/0312388b956e2b4eeb72d791167cfdb873c7beab/docs/images/logo-black.png
--------------------------------------------------------------------------------
/docs/images/logo-black.svg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/docs/images/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datamol-io/datamol/0312388b956e2b4eeb72d791167cfdb873c7beab/docs/images/logo.png
--------------------------------------------------------------------------------
/docs/images/logo.svg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | # Overview
2 |
3 | Datamol is a Python library to work with molecules. It's a layer built on top of [RDKit](https://www.rdkit.org/) and aims to be as light as possible.
4 |
5 | - 🐍 Simple pythonic API
6 | - ⚗️ RDKit first: all you manipulate are `rdkit.Chem.Mol` objects.
7 | - ✅ Manipulating molecules often relies on many options; Datamol provides good defaults by design.
8 | - 🧠 Performance matters: built-in efficient parallelization when possible, with an optional progress bar.
9 | - 🕹️ Modern IO: out-of-the-box support for remote paths using `fsspec` to read and write multiple formats (SDF, XLSX, CSV, etc.).
10 |
11 | Visit our website at [datamol.io](https://datamol.io).
12 |
13 | ## Installation
14 |
15 | Use conda:
16 |
17 | ```bash
18 | mamba install -c conda-forge datamol
19 | ```
20 |
21 | _**Tip:** You can replace `mamba` with `conda`._
22 |
23 | _**Note:** We highly recommend using a [Conda Python distribution](https://github.com/conda-forge/miniforge) to install Datamol. The package is also pip installable if you need it: `pip install datamol`._
24 |
25 | ## Quick API Tour
26 |
27 | ```python
28 | import datamol as dm
29 |
30 | # Common functions
31 | mol = dm.to_mol("O=C(C)Oc1ccccc1C(=O)O", sanitize=True)
32 | fp = dm.to_fp(mol)
33 | selfies = dm.to_selfies(mol)
34 | inchi = dm.to_inchi(mol)
35 |
36 | # Standardize and sanitize
37 | mol = dm.to_mol("O=C(C)Oc1ccccc1C(=O)O")
38 | mol = dm.fix_mol(mol)
39 | mol = dm.sanitize_mol(mol)
40 | mol = dm.standardize_mol(mol)
41 |
42 | # Dataframe manipulation
43 | df = dm.data.freesolv()
44 | mols = dm.from_df(df)
45 |
46 | # 2D viz
47 | legends = [dm.to_smiles(mol) for mol in mols[:10]]
48 | dm.viz.to_image(mols[:10], legends=legends)
49 |
50 | # Generate conformers
51 | smiles = "O=C(C)Oc1ccccc1C(=O)O"
52 | mol = dm.to_mol(smiles)
53 | mol_with_conformers = dm.conformers.generate(mol)
54 |
55 | # 3D viz (using nglview)
56 | dm.viz.conformers(mol, n_confs=10)
57 |
58 | # Compute SASA from conformers
59 | sasa = dm.conformers.sasa(mol_with_conformers)
60 |
61 | # Easy IO
62 | mols = dm.read_sdf("s3://my-awesome-data-lake/smiles.sdf", as_df=False)
63 | dm.to_sdf(mols, "gs://data-bucket/smiles.sdf")
64 | ```
65 |
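The feature list above mentions built-in parallelization. As a minimal sketch, assuming the `dm.parallelized` helper documented under `datamol.utils` is exposed at the top level:

```python
import datamol as dm

# Load a small built-in dataset of SMILES strings
data = dm.data.freesolv()
smiles = data["smiles"].iloc[:100].tolist()

# Convert the SMILES strings to molecules in parallel, with an optional progress bar
mols = dm.parallelized(dm.to_mol, smiles, n_jobs=-1, progress=True)
```
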
66 | ## How to cite
67 |
68 | Please cite Datamol if you use it in your research. A citable DOI is available via [Zenodo](https://zenodo.org/badge/latestdoi/341603042).
69 |
70 | ## Compatibilities
71 |
72 | Version compatibility is an essential topic for production software stacks. We are careful to document the compatibility between `datamol`, `python` and `rdkit`.
73 |
74 | The table below lists the Python and RDKit versions against which each minor version of Datamol **has been tested** during its lifecycle. _It does not mean that other combinations do not work, only that they are not tested._
75 |
76 | | `datamol` | `python` | `rdkit` |
77 | | --------- | ------------------- | ----------------------------- |
78 | | `0.12.x` | `[3.10, 3.11]` | `[2023.03, 2023.09]` |
79 | | `0.11.x` | `[3.9, 3.10, 3.11]` | `[2022.09, 2023.03]` |
80 | | `0.10.x` | `[3.9, 3.10, 3.11]` | `[2022.03, 2022.09]` |
81 | | `0.9.x` | `[3.9, 3.10, 3.11]` | `[2022.03, 2022.09]` |
82 | | `0.8.x` | `[3.8, 3.9, 3.10]` | `[2021.09, 2022.03, 2022.09]` |
83 | | `0.7.x` | `[3.8, 3.9]` | `[2021.09, 2022.03]` |
84 | | `0.6.x` | `[3.8, 3.9]` | `[2021.09]` |
85 | | `0.5.x` | `[3.8, 3.9]` | `[2021.03, 2021.09]` |
86 | | `0.4.x` | `[3.8, 3.9]` | `[2020.09, 2021.03]` |
87 | | `0.3.x` | `[3.8, 3.9]` | `[2020.09, 2021.03]` |
88 |
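To check which combination is installed locally, a quick sanity check (assuming `datamol` exposes `__version__` at the top level):

```python
import sys

import datamol as dm
import rdkit

print("python :", sys.version.split()[0])
print("rdkit  :", rdkit.__version__)
print("datamol:", dm.__version__)
```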
--------------------------------------------------------------------------------
/docs/license.md:
--------------------------------------------------------------------------------
1 | ```
2 | {!LICENSE!}
3 | ```
4 |
--------------------------------------------------------------------------------
/docs/tutorials/data/ReactionBlock.rxn:
--------------------------------------------------------------------------------
1 | $RXN
2 |
3 | ISIS 082120061354
4 |
5 | 2 1
6 | $MOL
7 |
8 | -ISIS- 08210613542D
9 |
10 | 3 2 0 0 0 0 0 0 0 0999 V2000
11 | -1.4340 -0.6042 0.0000 C 0 0 0 0 0 0 0 0 0 2 0 0
12 | -0.8639 -0.9333 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0
13 | -1.4340 0.0542 0.0000 O 0 0 0 0 0 0 0 0 0 1 0 0
14 | 1 2 1 0 0 0 0
15 | 1 3 2 0 0 0 0
16 | M END
17 | $MOL
18 |
19 | -ISIS- 08210613542D
20 |
21 | 1 0 0 0 0 0 0 0 0 0999 V2000
22 | 2.2125 -0.7833 0.0000 N 0 0 0 0 0 0 0 0 0 3 0 0
23 | M END
24 | $MOL
25 |
26 | -ISIS- 08210613542D
27 |
28 | 3 2 0 0 0 0 0 0 0 0999 V2000
29 | 9.5282 -0.8083 0.0000 N 0 0 0 0 0 0 0 0 0 3 0 0
30 | 8.9579 -0.4792 0.0000 C 0 0 0 0 0 0 0 0 0 2 0 0
31 | 8.9579 0.1792 0.0000 O 0 0 0 0 0 0 0 0 0 1 0 0
32 | 1 2 1 0 0 0 0
33 | 2 3 2 0 0 0 0
34 | M END
35 |
--------------------------------------------------------------------------------
/docs/tutorials/images/Aligning_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datamol-io/datamol/0312388b956e2b4eeb72d791167cfdb873c7beab/docs/tutorials/images/Aligning_1.png
--------------------------------------------------------------------------------
/docs/tutorials/images/Aligning_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datamol-io/datamol/0312388b956e2b4eeb72d791167cfdb873c7beab/docs/tutorials/images/Aligning_2.png
--------------------------------------------------------------------------------
/docs/tutorials/images/Conformers_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datamol-io/datamol/0312388b956e2b4eeb72d791167cfdb873c7beab/docs/tutorials/images/Conformers_1.png
--------------------------------------------------------------------------------
/docs/tutorials/images/Descriptors_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datamol-io/datamol/0312388b956e2b4eeb72d791167cfdb873c7beab/docs/tutorials/images/Descriptors_1.png
--------------------------------------------------------------------------------
/docs/tutorials/images/Fragment_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datamol-io/datamol/0312388b956e2b4eeb72d791167cfdb873c7beab/docs/tutorials/images/Fragment_1.png
--------------------------------------------------------------------------------
/docs/tutorials/images/Fragment_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datamol-io/datamol/0312388b956e2b4eeb72d791167cfdb873c7beab/docs/tutorials/images/Fragment_2.png
--------------------------------------------------------------------------------
/docs/tutorials/images/Fragment_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datamol-io/datamol/0312388b956e2b4eeb72d791167cfdb873c7beab/docs/tutorials/images/Fragment_3.png
--------------------------------------------------------------------------------
/docs/tutorials/images/Preprocess_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datamol-io/datamol/0312388b956e2b4eeb72d791167cfdb873c7beab/docs/tutorials/images/Preprocess_1.png
--------------------------------------------------------------------------------
/docs/tutorials/images/Scaffolds_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datamol-io/datamol/0312388b956e2b4eeb72d791167cfdb873c7beab/docs/tutorials/images/Scaffolds_1.png
--------------------------------------------------------------------------------
/docs/usage.md:
--------------------------------------------------------------------------------
1 | # Usage
2 |
3 | ## How to use
4 |
5 | Datamol has been designed to be used with a single import:
6 |
7 | ```python
8 | import datamol as dm
9 | ```
10 |
11 | All `datamol` functions are available under `dm`.
12 |
13 | ## Lazy loading
14 |
15 | Datamol uses lazy loading to dynamically expose all of its API without imposing a long import time during `import datamol as dm`. In case of trouble, you can always disable lazy loading by setting the environment variable `DATAMOL_DISABLE_LAZY_LOADING` to `1`. Please report any issue [on the datamol repo](https://github.com/datamol-io/datamol/issues).
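
A minimal sketch of disabling lazy loading from Python (assuming the variable is read at import time, so it must be set before the first `import datamol`):

```python
import os

# Must be set before datamol is imported for the first time
os.environ["DATAMOL_DISABLE_LAZY_LOADING"] = "1"

import datamol as dm  # noqa: E402
```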
16 |
--------------------------------------------------------------------------------
/env.yml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 |
4 | dependencies:
5 | - python >=3.8
6 | - pip
7 | - tqdm
8 | - loguru
9 | - joblib
10 | - fsspec >=2021.9
11 | - s3fs >=2021.9
12 | - gcsfs >=2021.9
13 | - platformdirs
14 | - packaging
15 | - typing_extensions
16 | - importlib_resources
17 |
18 | # Scientific
19 | - pandas
20 | - numpy
21 | - scipy
22 | - pillow
23 | - matplotlib
24 | - scikit-learn
25 |
26 | # Chemistry
27 | - rdkit
28 | - selfies
29 |
30 | # Optional deps
31 | - openpyxl
32 | - networkx
33 | - nglview
34 | - xlsxwriter
35 | - pyarrow
36 |
37 | # Dev
38 | - pytest >=6.0
39 | - pytest-cov
40 | - pytest-xdist
41 | - black >=24
42 | - ruff
43 | - jupyterlab
44 | - mypy
45 | - codecov
46 | - nbconvert
47 |
48 | # Doc
49 | - mkdocs <1.6
50 | - mkdocs-material >=7.1.1
51 | - mkdocs-material-extensions
52 | - mkdocstrings
53 | - mkdocstrings-python
54 | - mkdocs-jupyter
55 | - markdown-include
56 | - mdx_truly_sane_lists
57 | - mike >=1.0.0
58 | - seaborn
59 |
--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
1 | site_name: "datamol"
2 | site_description: "A python library to work with molecules. Built on top of RDKit."
3 | repo_url: "https://github.com/datamol-io/datamol"
4 | repo_name: "datamol-io/datamol"
5 | copyright: Copyright 2020 - 2023 datamol.io
6 |
7 | site_url: ""
8 | remote_branch: "gh-pages"
9 | use_directory_urls: false
10 | docs_dir: "docs"
11 |
12 | # Fail on warnings to detect issues with types and docstring
13 | strict: true
14 |
15 | nav:
16 | - Overview: index.md
17 | - Usage: usage.md
18 | - Tutorials:
19 | - The Basics: tutorials/The_Basics.ipynb
20 | - Preprocessing: tutorials/Preprocessing.ipynb
21 | - Descriptors: tutorials/Descriptors.ipynb
22 | - Chemical Reactions: tutorials/Reactions.ipynb
23 | - Scaffolds: tutorials/Scaffolds.ipynb
24 | - Aligning: tutorials/Aligning.ipynb
25 | - Fuzzy_Scaffolds: tutorials/Fuzzy_Scaffolds.ipynb
26 | - Clustering: tutorials/Clustering.ipynb
27 | - Fragment: tutorials/Fragment.ipynb
28 | - Conformers: tutorials/Conformers.ipynb
29 | - Visualization: tutorials/Visualization.ipynb
30 | - Datamol Filesystem Module: tutorials/Filesystem.ipynb
31 | - API:
32 | - datamol.align: api/datamol.align.md
33 | - datamol.cluster: api/datamol.cluster.md
34 | - datamol.conformers: api/datamol.conformers.md
35 | - datamol.convert: api/datamol.convert.md
36 | - datamol.data: api/datamol.data.md
37 | - datamol.descriptors: api/datamol.descriptors.md
38 | - datamol.fp: api/datamol.fp.md
39 | - datamol.fragment: api/datamol.fragment.md
40 | - datamol.graph: api/datamol.graph.md
41 | - datamol.io: api/datamol.io.md
42 | - datamol.isomers: api/datamol.isomers.md
43 | - datamol.log: api/datamol.log.md
44 | - datamol.molar: api/datamol.molar.md
45 | - datamol.mol: api/datamol.mol.md
46 | - datamol.reactions: api/datamol.reactions.md
47 | - datamol.scaffold: api/datamol.scaffold.md
48 | - datamol.similarity: api/datamol.similarity.md
49 | - datamol.utils: api/datamol.utils.md
50 | - datamol.utils.fs: api/datamol.utils.fs.md
51 | - datamol.viz: api/datamol.viz.md
52 |
53 | - Contribute: contribute.md
54 | - License: license.md
55 |
56 | theme:
57 | name: material
58 | # NOTE(hadim): to customize the material primary and secondary
59 | # color check `docs/assets/css/custom-datamol.css`.
60 | features:
61 | - navigation.tabs
62 | - navigation.expand
63 | favicon: images/logo-black.png
64 | logo: images/logo.svg
65 |
66 | extra_css:
67 | - assets/css/custom.css
68 | - assets/css/custom-datamol.css
69 | - assets/css/tweak-width.css
70 |
71 | extra_javascript:
72 | - assets/js/google-analytics.js
73 |
74 | markdown_extensions:
75 | - admonition
76 | - markdown_include.include
77 | - pymdownx.emoji
78 | - pymdownx.magiclink
79 | - pymdownx.superfences
80 | - pymdownx.tabbed
81 | - pymdownx.tasklist
82 | # For `tab_length=2` in the markdown extension
83 | # See https://github.com/mkdocs/mkdocs/issues/545
84 | - mdx_truly_sane_lists
85 | - toc:
86 | permalink: true
87 | toc_depth: 4
88 |
89 | watch:
90 | - datamol/
91 |
92 | plugins:
93 | - search
94 |
95 | - mkdocstrings:
96 | handlers:
97 | python:
98 | setup_commands:
99 | - import sys
100 | - sys.path.append("docs")
101 | - sys.path.append("datamol")
102 | options:
103 | new_path_syntax: true
104 | show_root_heading: false
105 | heading_level: 3
106 | show_root_full_path: false
107 |
108 | - mkdocs-jupyter:
109 | execute: false
110 | # kernel_name: python3
111 |
112 | - mike:
113 | version_selector: true
114 |
115 | extra:
116 | version:
117 | # Multi versioning provider for mkdocs-material (used for the JS selector)
118 | provider: mike
119 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools", "setuptools-scm"]
3 | build-backend = "setuptools.build_meta"
4 |
5 | [project]
6 | name = "datamol"
7 | description = "A python library to work with molecules. Built on top of RDKit."
8 | authors = [{ name = "Hadrien Mary", email = "hadrien@valencediscovery.com" }]
9 | readme = "README.md"
10 | dynamic = ["version"]
11 | requires-python = ">=3.8"
12 | license = { text = "Apache" }
13 | classifiers = [
14 | "Development Status :: 5 - Production/Stable",
15 | "Intended Audience :: Developers",
16 | "Intended Audience :: Healthcare Industry",
17 | "Intended Audience :: Science/Research",
18 | "Topic :: Scientific/Engineering :: Artificial Intelligence",
19 | "Topic :: Scientific/Engineering :: Bio-Informatics",
20 | "Topic :: Scientific/Engineering :: Information Analysis",
21 | "Topic :: Scientific/Engineering :: Medical Science Apps.",
22 | "Natural Language :: English",
23 | "Operating System :: OS Independent",
24 | "Programming Language :: Python",
25 | "Programming Language :: Python :: 3",
26 | "Programming Language :: Python :: 3.8",
27 | "Programming Language :: Python :: 3.9",
28 | "Programming Language :: Python :: 3.10",
29 | "Programming Language :: Python :: 3.11",
30 | ]
31 | dependencies = [
32 | "tqdm",
33 | "loguru",
34 | "joblib",
35 | "fsspec>=2021.9",
36 | "pandas",
37 | "numpy",
38 | "scipy",
39 | "matplotlib",
40 | "pillow",
41 | "selfies",
42 | "platformdirs",
43 | "scikit-learn",
44 | "packaging",
45 | "typing-extensions",
46 | "importlib-resources",
47 | "rdkit",
48 | ]
49 |
50 | [project.urls]
51 | Website = "https://datamol.io"
52 | "Source Code" = "https://github.com/datamol-io/datamol"
53 | "Bug Tracker" = "https://github.com/datamol-io/datamol/issues"
54 | Documentation = "https://docs.datamol.io"
55 |
56 | [tool.setuptools]
57 | include-package-data = true
58 |
59 | [tool.setuptools_scm]
60 | fallback_version = "dev"
61 |
62 | [tool.setuptools.packages.find]
63 | where = ["."]
64 | include = ["datamol", "datamol.*"]
65 | exclude = []
66 | namespaces = true
67 |
68 | [tool.setuptools.package-data]
69 | "datamol.data" = ["*"]
70 |
71 | [tool.black]
72 | line-length = 100
73 | target-version = ['py39', 'py310']
74 | include = '\.pyi?$'
75 |
76 | [tool.pytest.ini_options]
77 | minversion = "6.0"
78 | addopts = "--verbose --cov=datamol --cov-fail-under=85 --cov-report xml --cov-report term --durations=10 -n auto"
79 | testpaths = ["tests"]
80 | filterwarnings = [
81 | "ignore::DeprecationWarning:rdkit.Chem.MolStandardize",
82 | "ignore::DeprecationWarning:jupyter_client",
83 | "ignore::DeprecationWarning:pkg_resources",
84 | "ignore::DeprecationWarning:joblib.externals.loky.backend",
85 | "ignore::DeprecationWarning:dateutil.tz.tz",
86 | "ignore::DeprecationWarning:joblib._utils",
87 | "ignore::DeprecationWarning:openpyxl.packaging.core",
88 | "ignore::DeprecationWarning:tqdm.std",
89 | ]
90 |
91 | [tool.coverage.run]
92 | source = ["datamol/"]
93 | disable_warnings = ["no-data-collected"]
94 | data_file = ".coverage/coverage"
95 |
96 | [tool.coverage.report]
97 | omit = ["datamol/__init__.py", "datamol/_version.py"]
98 |
99 | [tool.coverage.xml]
100 | output = "coverage.xml"
101 |
102 | [tool.mypy]
103 | exclude = []
104 | ignore_missing_imports = true
105 |
106 | [tool.pyright]
107 | reportShadowedImports = false
108 |
109 | [tool.ruff]
110 | ignore = [
111 | "E501", # Never enforce `E501` (line length violations).
112 | "E731", # Do not assign a lambda expression, use a def
113 | ]
114 | line-length = 110
115 | target-version = "py311"
116 |
117 | [tool.ruff.per-file-ignores]
118 | "__init__.py" = [
119 | "F401", # imported but unused
120 | "E402", # Module level import not at top of file
121 | ]
122 |
123 | [tool.ruff.pycodestyle]
124 | max-doc-length = 150
125 |
--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
1 | import platform
2 | import pathlib
3 | from loguru import logger
4 |
5 | import pytest
6 |
7 |
8 | DATA_DIR_PATH = pathlib.Path(__file__).parent.resolve() / "data"
9 |
10 |
11 | @pytest.fixture
12 | def current_platform():
13 | if platform.system() == "Linux":
14 | return "linux"
15 | elif platform.system() == "Darwin":
16 | return "osx"
17 | elif platform.system() == "Windows":
18 | return "win"
19 | else:
20 | return platform.system()
21 |
22 |
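# This autouse fixture honors the `skip_platform` marker registered below,
# e.g. `@pytest.mark.skip_platform("win")` skips a test on Windows.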
23 | @pytest.fixture(autouse=True)
24 | def skip_by_platform(request, current_platform):
25 | if request.node.get_closest_marker("skip_platform"):
26 | if request.node.get_closest_marker("skip_platform").args[0] == current_platform:
27 | pytest.skip(f"skipped on this platform: {current_platform}")
28 |
29 |
30 | def pytest_configure(config):
31 | config.addinivalue_line(
32 | "markers",
33 | "skip_platform(current_platform): skip test for a given platform from `['linux', 'osx', 'win']`",
34 | )
35 |
36 |
37 | @pytest.fixture
38 | def datadir(request):
39 | return DATA_DIR_PATH
40 |
41 |
42 | # Mandatory for the below monkeypatch function.
43 | from _pytest.logging import caplog as _caplog # noqa: E402, F401
44 |
45 |
46 | @pytest.fixture
47 | def caplog(_caplog): # noqa: F811
48 | """Monkeypatching the pytest caplog to work with loguru.
49 |
50 | See https://loguru.readthedocs.io/en/latest/resources/migration.html#making-things-work-with-pytest-and-caplog
51 | """
52 | import logging
53 |
54 | class PropagateHandler(logging.Handler):
55 | def emit(self, record):
56 | logging.getLogger(record.name).handle(record)
57 |
58 | handler_id = logger.add(PropagateHandler(), format="{message}")
59 | yield _caplog
60 | logger.remove(handler_id)
61 |
--------------------------------------------------------------------------------
/tests/data/TUBB3-observations.sdf.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datamol-io/datamol/0312388b956e2b4eeb72d791167cfdb873c7beab/tests/data/TUBB3-observations.sdf.gz
--------------------------------------------------------------------------------
/tests/data/freesolv.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datamol-io/datamol/0312388b956e2b4eeb72d791167cfdb873c7beab/tests/data/freesolv.xlsx
--------------------------------------------------------------------------------
/tests/data/test.mol2:
--------------------------------------------------------------------------------
1 | @MOLECULE
2 | mol_first
3 | 11 11 1 0 0
4 | SMALL
5 | AMBER ff14SB
6 |
7 | @ATOM
8 | 1 C1 -0.0167 1.3778 0.0096 C.ar 1 UNK 0.0267
9 | 2 C2 0.0021 -0.0041 0.0020 C.ar 1 UNK -0.0438
10 | 3 C3 1.2218 -0.6631 -0.0131 C.ar 1 UNK -0.0592
11 | 4 C4 2.3820 0.0960 -0.0201 C.ar 1 UNK -0.0438
12 | 5 C5 2.2849 1.4746 -0.0118 C.ar 1 UNK 0.0267
13 | 6 N6 1.1072 2.0677 0.0026 N.ar 1 UNK -0.2647
14 | 7 H7 -0.9627 1.8988 0.0169 H 1 UNK 0.0840
15 | 8 H8 -0.9217 -0.5635 0.0075 H 1 UNK 0.0639
16 | 9 H9 1.2671 -1.7422 -0.0190 H 1 UNK 0.0624
17 | 10 H10 3.3495 -0.3839 -0.0316 H 1 UNK 0.0639
18 | 11 H11 3.1838 2.0731 -0.0171 H 1 UNK 0.0840
19 | @BOND
20 | 1 1 6 ar
21 | 2 1 2 ar
22 | 3 1 7 1
23 | 4 2 3 ar
24 | 5 2 8 1
25 | 6 3 4 ar
26 | 7 3 9 1
27 | 8 4 5 ar
28 | 9 4 10 1
29 | 10 5 6 ar
30 | 11 5 11 1
31 | @SUBSTRUCTURE
32 | 1 UNK 1 RESIDUE 4 A UNK 0 ROOT
33 |
34 | @MOLECULE
35 | mol_sec
36 | 9 9 1 0 0
37 | SMALL
38 | AMBER ff14SB
39 |
40 |
41 | @ATOM
42 | 1 C1 1.2973 -0.3859 -0.0124 C.2 1 UNK 0.0838
43 | 2 N2 0.0021 -0.0041 0.0020 N.pl3 1 UNK -0.3106
44 | 3 H3 -0.7708 -0.5902 0.0062 H 1 UNK 0.1532
45 | 4 C4 -0.0165 1.3646 0.0095 C.2 1 UNK 0.0120
46 | 5 C5 1.2671 1.7717 -0.0005 C.2 1 UNK 0.0422
47 | 6 N6 2.0482 0.6814 -0.0138 N.2 1 UNK -0.2480
48 | 7 H7 1.6529 -1.4057 -0.0216 H 1 UNK 0.1014
49 | 8 H8 -0.8923 1.9965 0.0173 H 1 UNK 0.0806
50 | 9 H9 1.6079 2.7966 0.0017 H 1 UNK 0.0854
51 | @BOND
52 | 1 1 6 2
53 | 2 1 2 1
54 | 3 1 7 1
55 | 4 2 3 1
56 | 5 2 4 1
57 | 6 4 5 2
58 | 7 4 8 1
59 | 8 5 6 1
60 | 9 5 9 1
61 | @SUBSTRUCTURE
62 | 1 UNK 1 RESIDUE 4 A UNK 0 ROOT
63 |
64 | @MOLECULE
65 | mol_third
66 | 9 9 1 0 0
67 | SMALL
68 | AMBER ff14SB
69 |
70 |
71 | @ATOM
72 | 1 C1 1.2973 -0.3859 -0.0124 C 1 UNK 0.0838
73 | 2 N2 0.0021 -0.0041 0.0020 N 1 UNK -0.3106
74 | 3 H3 -0.7708 -0.5902 0.0062 H 1 UNK 0.1532
75 | 4 C4 -0.0165 1.3646 0.0095 C 1 UNK 0.0120
76 | 5 C5 1.2671 1.7717 -0.0005 C 1 UNK 0.0422
77 | 6 N6 2.0482 0.6814 -0.0138 N 1 UNK -0.2480
78 | 7 H7 1.6529 -1.4057 -0.0216 H 1 UNK 0.1014
79 | 8 H8 -0.8923 1.9965 0.0173 H 1 UNK 0.0806
80 | 9 H9 1.6079 2.7966 0.0017 H 1 UNK 0.0854
81 | @BOND
82 | 1 1 6 2
83 | 2 1 2 1
84 | 3 1 7 1
85 | 4 2 3 1
86 | 5 2 4 1
87 | 6 4 5 2
88 | 7 4 8 1
89 | 8 5 6 1
90 | 9 5 9 1
91 | @SUBSTRUCTURE
92 | 1 UNK 1 RESIDUE 4 A UNK 0 ROOT
93 |
94 | @MOLECULE
95 | mol_sec_f
96 | 9 9 1 0 0
97 | SMALL
98 | AMBER ff14SB
99 |
100 |
101 | @ATOM
102 | 1 C1 1.2973 -0.3859 -0.0124 C.2 1 UNK 0.0838
103 | 2 N2 0.0021 -0.0041 0.0020 N.pl3 1 UNK -0.3106
104 | 3 H3 -0.7708 -0.5902 0.0062 H 1 UNK 0.1532
105 | 4 C4 -0.0165 1.3646 0.0095 C.2 1 UNK 0.0120
106 | 5 C5 1.2671 1.7717 -0.0005 C.2 1 UNK 0.0422
107 | 6 N6 2.0482 0.6814 -0.0138 N.2 1 UNK -0.2480
108 | 7 H7 1.6529 -1.4057 -0.0216 H 1 UNK 0.1014
109 | 8 H8 -0.8923 1.9965 0.0173 H 1 UNK 0.0806
110 | 9 H9 1.6079 2.7966 0.0017 H 1 UNK 0.0854
111 |
112 | 1 1 6 2
113 | 2 1 2 1
114 | 3 1 7 1
115 | 4 2 3 1
116 | 5 2 4 1
117 | 6 4 5 2
118 | 7 4 8 1
119 | 8 5 6 1
120 | 9 5 9 1
121 | @SUBSTRUCTURE
122 | 1 UNK 1 RESIDUE 4 A UNK 0 ROOT
123 |
124 | @MOLECULE
125 | mol_sec_f1
126 | 9 9 1 0 0
127 | SMALL
128 | AMBER ff14SB
129 |
130 |
131 |
132 | 1 C1 1.2973 -0.3859 -0.0124 C.2 1 UNK 0.0838
133 | 2 N2 0.0021 -0.0041 0.0020 N.pl3 1 UNK -0.3106
134 | 3 H3 -0.7708 -0.5902 0.0062 H 1 UNK 0.1532
135 | 4 C4 -0.0165 1.3646 0.0095 C.2 1 UNK 0.0120
136 | 5 C5 1.2671 1.7717 -0.0005 C.2 1 UNK 0.0422
137 | 6 N6 2.0482 0.6814 -0.0138 N.2 1 UNK -0.2480
138 | 7 H7 1.6529 -1.4057 -0.0216 H 1 UNK 0.1014
139 | 8 H8 -0.8923 1.9965 0.0173 H 1 UNK 0.0806
140 | 9 H9 1.6079 2.7966 0.0017 H 1 UNK 0.0854
141 | @BOND
142 | 1 1 6 2
143 | 2 1 2 1
144 | 3 1 7 1
145 | 4 2 3 1
146 | 5 2 4 1
147 | 6 4 5 2
148 | 7 4 8 1
149 | 8 5 6 1
150 | 9 5 9 1
151 | @SUBSTRUCTURE
152 | 1 UNK 1 RESIDUE 4 A UNK 0 ROOT
153 |
154 | @MOLECULE
155 | mol_sec_f3
156 | 9 9 1 0 0
157 | SMALL
158 | AMBER ff14SB
159 |
160 | @ATOM
161 | @BOND
162 | 1 1 6 2
163 | 2 1 2 1
164 | 3 1 7 1
165 | 4 2 3 1
166 | 5 2 4 1
167 | 6 4 5 2
168 | 7 4 8 1
169 | 8 5 6 1
170 | 9 5 9 1
171 | @SUBSTRUCTURE
172 | 1 UNK 1 RESIDUE 4 A UNK 0 ROOT
173 |
174 | @MOLECULE
175 | mol_sec_f4
176 | 9 9 1 0 0
177 | SMALL
178 | AMBER ff14SB
179 |
180 | @ATOM
181 | @BOND
182 | @SUBSTRUCTURE
183 | 1 UNK 1 RESIDUE 4 A UNK 0 ROOT
184 |
185 |
186 |
187 | @MOLECULE
188 |
189 |
190 |
191 | @ATOM
192 | 1 C1 1.2973 -0.3859 -0.0124 C.2 1 UNK 0.0838
193 | 2 N2 0.0021 -0.0041 0.0020 N.pl3 1 UNK -0.3106
194 | 3 H3 -0.7708 -0.5902 0.0062 H 1 UNK 0.1532
195 | 4 C4 -0.0165 1.3646 0.0095 C.2 1 UNK 0.0120
196 | 5 C5 1.2671 1.7717 -0.0005 C.2 1 UNK 0.0422
197 | 6 N6 2.0482 0.6814 -0.0138 N.2 1 UNK -0.2480
198 | 7 H7 1.6529 -1.4057 -0.0216 H 1 UNK 0.1014
199 | 8 H8 -0.8923 1.9965 0.0173 H 1 UNK 0.0806
200 | 9 H9 1.6079 2.7966 0.0017 H 1 UNK 0.0854
201 |
202 | 1 1 6 2
203 | 2 1 2 1
204 | 3 1 7 1
205 | 4 2 3 1
206 | 5 2 4 1
207 | 6 4 5 2
208 | 7 4 8 1
209 | 8 5 6 1
210 | 9 5 9 1
211 | @SUBSTRUCTURE
212 | 1 UNK 1 RESIDUE 4 A UNK 0 ROOT
213 |
214 | @MOLECULE
215 | mol_sec
216 | 9 9 1 0 0
217 | SMALL
218 | AMBER ff14SB
219 |
220 |
221 | @ATOM
222 | 1 C1 1.2973 -0.3859 -0.0124 C 1 UNK 0.0838
223 | 2 N2 0.0021 -0.0041 0.0020 N 1 UNK -0.3106
224 | 3 H3 -0.7708 -0.5902 0.0062 H 1 UNK 0.1532
225 | 4 C4 -0.0165 1.3646 0.0095 C 1 UNK 0.0120
226 | 5 C5 1.2671 1.7717 -0.0005 C 1 UNK 0.0422
227 | 6 N6 2.0482 0.6814 -0.0138 N 1 UNK -0.2480
228 | 7 H7 1.6529 -1.4057 -0.0216 H 1 UNK 0.1014
229 | 8 H8 -0.8923 1.9965 0.0173 H 1 UNK 0.0806
230 | 9 H9 1.6079 2.7966 0.0017 H 1 UNK 0.0854
231 | @BOND
232 | 1 1 6 2
233 | 2 1 2 1
234 | 3 1 7 1
235 | 4 2 3 1
236 | 5 2 4 1
237 | 6 4 5 2
238 | 7 4 8 1
239 | 8 5 6 1
240 | 9 5 9 1
241 | @SUBSTRUCTURE
242 | 1 UNK 1 RESIDUE 4 A UNK 0 ROOT
243 |
244 |
--------------------------------------------------------------------------------
/tests/test_align.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | import pandas as pd
4 | import datamol as dm
5 |
6 |
7 | def test_template_align():
8 | data: pd.DataFrame = dm.cdk2(as_df=True) # type: ignore
9 | data = data.iloc[:6].copy() # type: ignore
10 |
11 | template = data.iloc[0]["mol"]
12 | data["aligned_mol"] = data["mol"].apply(lambda x: dm.align.template_align(x, template=template))
13 | assert bool(data["aligned_mol"].apply(lambda x: isinstance(x, dm.Mol)).all()) is True
14 |
15 | template = data.iloc[0]["smiles"]
16 | data["aligned_mol"] = data["smiles"].apply(
17 | lambda x: dm.align.template_align(x, template=template)
18 | )
19 | assert bool(data["aligned_mol"].apply(lambda x: isinstance(x, dm.Mol)).all()) is True
20 |
21 | template = data.iloc[0]["mol"]
22 | data["aligned_mol"] = data["mol"].apply(
23 | lambda x: dm.align.template_align(x, template=template, auto_select_coord_gen=True)
24 | )
25 | assert bool(data["aligned_mol"].apply(lambda x: isinstance(x, dm.Mol)).all()) is True
26 |
27 | template = data.iloc[0]["mol"]
28 | data["aligned_mol"] = data["mol"].apply(
29 | lambda x: dm.align.template_align(x, template=template, use_depiction=False)
30 | )
31 | assert bool(data["aligned_mol"].apply(lambda x: isinstance(x, dm.Mol)).all()) is True
32 |
33 | template = None
34 | data["aligned_mol"] = data["mol"].apply(lambda x: dm.align.template_align(x, template=template))
35 | assert bool(data["aligned_mol"].apply(lambda x: isinstance(x, dm.Mol)).all()) is True
36 |
37 | template = None
38 | data["aligned_mol"] = data["mol"].apply(
39 | lambda x: dm.align.template_align(x, template=template, copy=False)
40 | )
41 | assert bool(data["aligned_mol"].apply(lambda x: isinstance(x, dm.Mol)).all()) is True
42 |
43 | assert dm.align.template_align(None) is None
44 |
45 |
46 | def test_auto_align_many():
47 | data: pd.DataFrame = dm.solubility(as_df=True) # type: ignore
48 | data = data.iloc[:16].copy() # type: ignore
49 |
50 | excepted_cluster_size = [8, 6, 5, 6, 6]
51 |
52 | for i, partition_method in enumerate(
53 | [
54 | "cluster",
55 | "scaffold",
56 | "anongraph-scaffold",
57 | "anon-scaffold",
58 | "strip-scaffold",
59 | ]
60 | ):
61 | print(partition_method)
62 |
63 | data["aligned_mol"] = dm.align.auto_align_many(
64 | data["mol"],
65 | partition_method=partition_method,
66 | )
67 |
68 | props = data["aligned_mol"].apply(lambda x: pd.Series(x.GetPropsAsDict()))
69 |
70 | assert "dm.auto_align_many.cluster_id" in props.columns
71 | assert "dm.auto_align_many.core" in props.columns
72 | assert props["dm.auto_align_many.cluster_id"].dtype.name == "int64"
73 | assert props["dm.auto_align_many.core"].dtype.name == "object"
74 |
75 | assert props["dm.auto_align_many.cluster_id"].unique().shape[0] == excepted_cluster_size[i]
76 |
77 | with pytest.raises(ValueError):
78 | dm.align.auto_align_many(data["mol"], partition_method="invalid")
79 |
--------------------------------------------------------------------------------
/tests/test_cluster.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | import datamol as dm
4 |
5 |
6 | def test_cluster_mols():
7 | # Get some mols
8 | data = dm.data.freesolv()
9 | smiles = data["smiles"].iloc[:100].tolist()
10 | mols = [dm.to_mol(s) for s in smiles]
11 |
12 | _, mol_clusters = dm.cluster_mols(mols, cutoff=0.7)
13 | cluster_sizes = [11, 7, 5, 3, 3, 3, 2, 3, 2, 1, 2, 2, 1]
14 | assert [len(c) for c in mol_clusters[:13]] == cluster_sizes
15 |
16 |
17 | def test_pick_diverse():
18 | # Get some mols
19 | data = dm.data.freesolv()
20 | smiles = data["smiles"].iloc[:100].tolist()
21 | mols = [dm.to_mol(s) for s in smiles]
22 |
23 | indices, _ = dm.pick_diverse(mols, npick=18, seed=19)
24 |
25 | excepted_indices = np.array(
26 | [9, 14, 47, 50, 56, 61, 67, 89, 83, 90, 94, 10, 0, 96, 15, 58, 71, 21]
27 | )
28 |
29 | assert np.all(indices == excepted_indices)
30 |
31 |
32 | def test_pick_centroids():
33 | data = dm.data.freesolv()
34 | smiles = data["smiles"].iloc[:100].tolist()
35 | mols = [dm.to_mol(s) for s in smiles]
36 | indices, centroids = dm.pick_centroids(
37 | mols, npick=18, threshold=0.7, method="sphere", n_jobs=-1
38 | )
39 | excepted_indices = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 13, 14, 15, 16, 17, 18, 20])
40 |
41 | assert np.all(indices == excepted_indices)
42 |
43 |
44 | def test_assign_to_centroids():
45 | data = dm.data.freesolv()
46 | smiles = data["smiles"].iloc[:100].tolist()
47 | mols = [dm.to_mol(s) for s in smiles]
48 | indices, centroids = dm.pick_centroids(
49 | mols, npick=18, threshold=0.7, method="sphere", n_jobs=-1
50 | )
51 |
52 | cluster_map, cluster_list = dm.assign_to_centroids(mols, centroids, n_jobs=-1)
53 | # expect centroid to be in centroid list
54 | assert indices[0] in cluster_map[0]
55 | # expect no intersection after assignment
56 | map_intersection = set.intersection(*map(set, cluster_map.values()))
57 | assert len(map_intersection) == 0
58 | # expect some similar molecule in a given cluster
59 | # assert 33 in cluster_map[0]
60 |
--------------------------------------------------------------------------------
/tests/test_data.py:
--------------------------------------------------------------------------------
1 | import datamol as dm
2 |
3 |
4 | def test_freesolv():
5 | data = dm.data.freesolv()
6 | assert data.shape == (642, 4)
7 | assert list(data.columns) == ["iupac", "smiles", "expt", "calc"]
8 |
9 |
10 | def test_cdk2():
11 | data = dm.data.cdk2()
12 | assert data.shape == (47, 12)
13 | assert list(data.columns) == [
14 | "smiles",
15 | "mol",
16 | "id",
17 | "Cluster",
18 | "MODEL.SOURCE",
19 | "MODEL.CCRATIO",
20 | "r_mmffld_Potential_Energy-OPLS_2005",
21 | "r_mmffld_RMS_Derivative-OPLS_2005",
22 | "b_mmffld_Minimization_Converged-OPLS_2005",
23 | "s_st_Chirality_1",
24 | "s_st_Chirality_2",
25 | "s_st_Chirality_3",
26 | ]
27 |
28 |
29 | def test_solubility():
30 | data = dm.data.solubility()
31 | assert data.shape == (1282, 7)
32 | assert list(data.columns) == [
33 | "mol",
34 | "ID",
35 | "NAME",
36 | "SOL",
37 | "SOL_classification",
38 | "smiles",
39 | "split",
40 | ]
41 |
42 |
43 | def test_chembl_drugs():
44 | data = dm.data.chembl_drugs()
45 | assert data.shape == (2628, 5)
46 | assert list(data.columns) == [
47 | "first_approval",
48 | "molecule_chembl_id",
49 | "molecule_type",
50 | "pref_name",
51 | "smiles",
52 | ]
53 |
54 |
55 | def test_chembl_samples():
56 | data = dm.data.chembl_samples()
57 | assert data.shape == (2000, 1)
58 | assert list(data.columns) == ["smiles"]
59 |
--------------------------------------------------------------------------------
/tests/test_descriptors.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | import pandas as pd
4 | import datamol as dm
5 |
6 |
7 | def test_descriptors():
8 | smiles_list = ["CC(=O)OC1=CC=CC=C1C(=O)O", "CCN(CC)CCCC(C)NC1=C2C=CC(=CC2=NC=C1)Cl"]
9 |
10 | for smiles in smiles_list:
11 | mol = dm.to_mol(smiles)
12 |
13 | dm.descriptors.mw(mol)
14 | dm.descriptors.fsp3(mol)
15 | dm.descriptors.n_hba(mol)
16 | dm.descriptors.n_hbd(mol)
17 | dm.descriptors.n_lipinski_hba(mol)
18 | dm.descriptors.n_lipinski_hbd(mol)
19 | dm.descriptors.n_rings(mol)
20 | dm.descriptors.n_hetero_atoms(mol)
21 | dm.descriptors.n_heavy_atoms(mol)
22 | dm.descriptors.n_rotatable_bonds(mol)
23 | dm.descriptors.n_aliphatic_rings(mol)
24 | dm.descriptors.n_aromatic_rings(mol)
25 | dm.descriptors.n_saturated_rings(mol)
26 | dm.descriptors.n_radical_electrons(mol)
27 | dm.descriptors.tpsa(mol)
28 | dm.descriptors.qed(mol)
29 | dm.descriptors.clogp(mol)
30 | dm.descriptors.sas(mol)
31 | dm.descriptors.sas(mol)
32 | dm.descriptors.n_stereo_centers_unspecified(mol)
33 | dm.descriptors.n_spiro_atoms(mol)
34 |
35 | dm.descriptors.n_aliphatic_carbocycles(mol)
36 | dm.descriptors.n_aliphatic_heterocyles(mol)
37 | dm.descriptors.n_aliphatic_rings(mol)
38 | dm.descriptors.n_aromatic_carbocycles(mol)
39 | dm.descriptors.n_aromatic_heterocyles(mol)
40 | dm.descriptors.n_aromatic_rings(mol)
41 | dm.descriptors.n_saturated_carbocycles(mol)
42 | dm.descriptors.n_saturated_heterocyles(mol)
43 | dm.descriptors.n_saturated_rings(mol)
44 |
45 |
46 | def test_compute_many_descriptors():
47 | mol = dm.to_mol("CCN(CC)CCCC(C)NC1=C2C=CC(=CC2=NC=C1)Cl")
48 |
49 | true_values = pd.Series(
50 | {
51 | "mw": 319.181525512,
52 | "fsp3": 0.5,
53 | "n_lipinski_hba": 3.0,
54 | "n_lipinski_hbd": 1.0,
55 | "n_rings": 2.0,
56 | "n_hetero_atoms": 4.0,
57 | "n_heavy_atoms": 22.0,
58 | "n_rotatable_bonds": 8.0,
59 | "n_radical_electrons": 0.0,
60 | "tpsa": 28.16,
61 | "qed": 0.7564117572128701,
62 | "clogp": 4.810600000000004,
63 | "sas": 2.670786229594949,
64 | "n_aliphatic_carbocycles": 0.0,
65 | "n_aliphatic_heterocyles": 0.0,
66 | "n_aliphatic_rings": 0.0,
67 | "n_aromatic_carbocycles": 1.0,
68 | "n_aromatic_heterocyles": 1.0,
69 | "n_aromatic_rings": 2.0,
70 | "n_saturated_carbocycles": 0.0,
71 | "n_saturated_heterocyles": 0.0,
72 | "n_saturated_rings": 0.0,
73 | }
74 | )
75 |
76 | # Scenario #1
77 | props = dm.descriptors.compute_many_descriptors(mol)
78 | props = pd.Series(props)
79 |
80 | assert props.equals(true_values)
81 |
82 | # Scenario #2
83 | props = dm.descriptors.compute_many_descriptors(
84 | mol,
85 | properties_fn={"hello": lambda x: 88},
86 | add_properties=False,
87 | )
88 | assert props == {"hello": 88}
89 |
90 | # Scenario #3
91 | props = dm.descriptors.compute_many_descriptors(
92 | mol,
93 | properties_fn={"hello": lambda x: 88},
94 | add_properties=True,
95 | )
96 | props = pd.Series(props)
97 |
98 | true_values_2 = true_values.copy()
99 | true_values_2["hello"] = 88
100 | true_values_2 = true_values_2[props.index]
101 |
102 | assert true_values_2.equals(props)
103 |
104 |
105 | def test_compute_many_descriptors_with_function_as_string():
106 | mol = dm.to_mol("CC(=O)OC1=CC=CC=C1C(=O)O")
107 |
108 | results = dm.descriptors.compute_many_descriptors(
109 | mol,
110 | properties_fn={"max_partial_charge": "MaxPartialCharge"},
111 | add_properties=False,
112 | )
113 |
114 | assert "max_partial_charge" in results.keys()
115 | assert pytest.approx(0.33900378687731025) == results["max_partial_charge"]
116 |
117 |
118 | def test_batch_compute_many_descriptors():
119 | data = dm.data.freesolv()
120 | data = data.iloc[:30]
121 | mols = data["smiles"].apply(dm.to_mol).tolist()
122 |
123 | props = dm.descriptors.batch_compute_many_descriptors(
124 | mols,
125 | batch_size=64,
126 | n_jobs=-1,
127 | progress=False,
128 | )
129 |
130 | assert set(props.columns.tolist()) == {
131 | "mw",
132 | "fsp3",
133 | "n_lipinski_hba",
134 | "n_lipinski_hbd",
135 | "n_rings",
136 | "n_hetero_atoms",
137 | "n_heavy_atoms",
138 | "n_rotatable_bonds",
139 | "n_radical_electrons",
140 | "tpsa",
141 | "qed",
142 | "clogp",
143 | "sas",
144 | "n_aliphatic_carbocycles",
145 | "n_aliphatic_heterocyles",
146 | "n_aliphatic_rings",
147 | "n_aromatic_carbocycles",
148 | "n_aromatic_heterocyles",
149 | "n_aromatic_rings",
150 | "n_saturated_carbocycles",
151 | "n_saturated_heterocyles",
152 | "n_saturated_rings",
153 | }
154 | assert props.shape == (30, 22)
155 |
156 |
157 | def test_any_rdkit_descriptor():
158 | mol = dm.to_mol("CC(=O)OC1=CC=CC=C1C(=O)O")
159 |
160 | value = dm.descriptors.any_rdkit_descriptor("MaxPartialCharge")(mol)
161 | assert pytest.approx(value) == 0.33900378687731025
162 |
163 | value = dm.descriptors.any_rdkit_descriptor("CalcFractionCSP3")(mol)
164 | assert pytest.approx(value) == 0.1111111111111111
165 |
166 | with pytest.raises(ValueError):
167 | dm.descriptors.any_rdkit_descriptor("DOES NOT EXIST")
168 |
169 |
170 | def test_n_aromatic_atoms():
171 | smiles = "Nc1cnn(-c2ccccc2)c(=O)c1Cl"
172 | mol = dm.to_mol(smiles)
173 |
174 | assert dm.descriptors.n_aromatic_atoms(mol) == 12
175 | assert dm.descriptors.n_aromatic_atoms_proportion(mol) == 0.8
176 |
177 |
178 | def test_formal_charge():
179 | mol = dm.to_mol("CC(=O)NC1CCC2=CC(=C(C(=C2C3=CC=C(C(=O)C=C13)OC)OC)OC)OC")
180 | assert dm.descriptors.formal_charge(mol) == 0
181 |
182 | mol = dm.to_mol("C(CC(=O)[O-])C(C(=O)[O-])[NH3+]")
183 | assert dm.descriptors.formal_charge(mol) == -1
184 |
185 |
186 | def test_refractivity():
187 | mol = dm.to_mol("CN1C(=O)CN=C(C2=C1C=CC(=C2)Cl)C3=CC=CC=C3")
188 |
189 | value = dm.descriptors.refractivity(mol)
190 | assert pytest.approx(value, rel=2) == 81.10
191 |
192 |
193 | def test_n_rigid_bonds():
194 | mol = dm.to_mol("CC(=O)NC1CCC2=CC(=C(C(=C2C3=CC=C(C(=O)C=C13)OC)OC)OC)OC")
195 | assert dm.descriptors.n_rigid_bonds(mol) == 20
196 |
197 | mol = dm.to_mol("CN1C(=O)CN=C(C2=C1C=CC(=C2)Cl)C3=CC=CC=C3")
198 | assert dm.descriptors.n_rigid_bonds(mol) == 19
199 |
200 |
201 | def test_n_stereocenters():
202 | mol = dm.to_mol("CC(=O)NC1CCC2=CC(=C(C(=C2C3=CC=C(C(=O)C=C13)OC)OC)OC)OC")
203 |
204 | assert dm.descriptors.n_stereo_centers(mol) == 1
205 |
206 | mol = dm.to_mol("CN1C(=O)CN=C(C2=C1C=CC(=C2)Cl)C3=CC=CC=C3")
207 | assert dm.descriptors.n_stereo_centers(mol) == 0
208 |
209 |
210 | def test_n_charged_atoms():
211 | mol = dm.to_mol("C(CC(=O)[O-])C(C(=O)[O-])[NH3+]")
212 | assert dm.descriptors.n_charged_atoms(mol) == 3
213 |
--------------------------------------------------------------------------------
/tests/test_fp.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | import datamol as dm
4 |
5 |
6 | def test_to_fp():
7 | smiles = "CC(=O)Oc1ccccc1C(=O)O"
8 | mol = dm.to_mol(smiles)
9 |
10 | assert dm.to_fp(mol).shape[0] == 2048
11 | assert dm.to_fp(mol).sum() == 31
12 |
13 |
14 | def test_list_fp():
15 | assert set(dm.list_supported_fingerprints().keys()) == {
16 | "atompair",
17 | "atompair-count",
18 | "avalon-count",
19 | "ecfp",
20 | "fcfp",
21 | "ecfp-count",
22 | "erg",
23 | "estate",
24 | "fcfp-count",
25 | "layered",
26 | "maccs",
27 | "pattern",
28 | "rdkit",
29 | "topological",
30 | "topological-count",
31 | "rdkit-count",
32 | }
33 |
34 |
35 | def test_all_fps():
36 | smiles = "CC(=O)Oc1ccccc1C(=O)O"
37 | mol = dm.to_mol(smiles)
38 |
39 | fp_infos = {}
40 | for fp_type in dm.list_supported_fingerprints():
41 | fold_size = None
42 | if fp_type == "rdkit-count":
43 | fold_size = 2048
44 |
45 | print(fp_type)
46 | args = {}
47 | args["mol"] = mol
48 | args["as_array"] = True
49 | args["fp_type"] = fp_type
50 | args["fold_size"] = fold_size
51 | fp = dm.to_fp(**args)
52 |
53 | fp_infos[fp_type] = dict(size=len(fp), bits_sum=fp.sum())
54 |
55 | print(fp_infos)
56 |
57 | assert fp_infos == {
58 | "maccs": {"size": 167, "bits_sum": 21},
59 | "ecfp": {"size": 2048, "bits_sum": 31},
60 | "fcfp": {"size": 2048, "bits_sum": 22},
61 | "topological": {"size": 2048, "bits_sum": 18},
62 | "atompair": {"size": 2048, "bits_sum": 68},
63 | "rdkit": {"size": 2048, "bits_sum": 354},
64 | "pattern": {"size": 2048, "bits_sum": 173},
65 | "layered": {"size": 2048, "bits_sum": 335},
66 | "erg": {"size": 315, "bits_sum": 23.4},
67 | "estate": {"size": 79, "bits_sum": 13},
68 | "avalon-count": {"size": 512, "bits_sum": 168},
69 | "ecfp-count": {"size": 2048, "bits_sum": 42},
70 | "fcfp-count": {"size": 2048, "bits_sum": 35},
71 | "topological-count": {"size": 2048, "bits_sum": 19},
72 | "atompair-count": {"size": 2048, "bits_sum": 78},
73 | "rdkit-count": {"size": 2048, "bits_sum": 301},
74 | }
75 |
76 |
77 | def test_fp_invalid_input():
78 | args = {}
79 | args["mol"] = None
80 | args["radius"] = 3
81 |
82 | with pytest.raises(ValueError):
83 | dm.to_fp(**args)
84 |
85 | args["mol"] = "dsdsdsd"
86 | with pytest.raises(ValueError):
87 | dm.to_fp(**args)
88 |
--------------------------------------------------------------------------------
/tests/test_fragment.py:
--------------------------------------------------------------------------------
1 | import datamol as dm
2 |
3 |
4 | def test_brics():
5 | smiles = "CCCOCc1cc(c2ncccc2)ccc1"
6 | mol = dm.to_mol(smiles)
7 | frags = dm.fragment.brics(mol)
8 | assert len(frags) == 9
9 |
10 |
11 | def test_frag():
12 | smiles = "CCCOCc1cc(c2ncccc2)ccc1"
13 | mol = dm.to_mol(smiles)
14 | frags = dm.fragment.frag(mol)
15 | assert len(frags) == 9
16 |
17 |
18 | def test_recap():
19 | smiles = "CCCOCc1cc(c2ncccc2)ccc1"
20 | mol = dm.to_mol(smiles)
21 | frags = dm.fragment.recap(mol)
22 | assert len(frags) == 3
23 |
24 |
25 | def test_anybreak():
26 | smiles = "CCCOCc1cc(c2ncccc2)ccc1"
27 | mol = dm.to_mol(smiles)
28 | frags = dm.fragment.anybreak(mol)
29 | assert len(frags) == 9
30 |
31 |
32 | def test_mmpa():
33 | smiles = "CCCOCc1cc(c2ncccc2)ccc1"
34 | mol = dm.to_mol(smiles)
35 |
36 | frags = dm.fragment.mmpa_cut(mol)
37 | assert len(frags) == 39
38 | assert "CCCOCc1cccc(-c2ccccn2)c1,C(C[*:2])[*:1],C[*:1].c1ccc(-c2cccc(CO[*:2])c2)nc1\n" in frags
39 |
40 |
41 | def test_assemble():
42 | # Fragment a molecule
43 | smiles = "CCCOCc1cc(c2ncccc2)ccc1"
44 | mol = dm.to_mol(smiles)
45 | frags = dm.fragment.brics(mol)
46 |
47 | # Limit the number of fragments to work with because
48 | # assembling is computationally intensive.
49 | frags = frags[:2]
50 |
51 | # Assemble molecules from the list of fragments
52 | mols = list(dm.fragment.assemble_fragment_order(frags, max_n_mols=4))
53 |
54 | assert len(mols) == 4
55 |
56 |
57 | def test_break_mol():
58 | smiles = "CCCOCc1cc(c2ncccc2)ccc1"
59 | mol = dm.to_mol(smiles)
60 | fragments, *_, tree = dm.fragment.break_mol(mol, randomize=False, mode="brics", returnTree=True)
61 |
62 | assert fragments == ["CCC", "O", "C", "c1ccncc1", "c1ccccc1"]
63 | assert list(tree.nodes) == [0, 1, 2, 3, 4, 5, 6, 7, 8]
64 | assert list(tree.edges) == [(0, 1), (0, 2), (2, 3), (2, 4), (4, 5), (4, 6), (6, 7), (6, 8)]
65 |
66 |
67 | def test_assemble_build():
68 | mols = [[dm.to_mol("CCCO"), dm.to_mol("CCCCCCCO")], [dm.to_mol("CCC"), dm.to_mol("CCCCCCC")]]
69 |
70 | results = list(dm.fragment.build(mols))
71 | assert len(results) == 71
72 |
73 | results = list(dm.fragment.build(mols, mode="rxn"))
74 | assert len(results) == 0
75 |
76 | results = list(dm.fragment.build(mols, mode=None))
77 | assert len(results) == 0
78 |
--------------------------------------------------------------------------------
/tests/test_import.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | import datamol as dm
4 |
5 |
6 | def test_datamol_import_fails():
7 | with pytest.raises(AttributeError):
8 | dm.that_import_does_not_exist
9 |
--------------------------------------------------------------------------------
/tests/test_isomers.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | import datamol as dm
4 |
5 |
6 | def test_enumerate_tautomers():
7 | mol = dm.to_mol("OC1=CC2CCCCC2[N:1]=C1")
8 |
9 | mols = dm.enumerate_tautomers(mol, n_variants=10)
10 |
11 | assert {dm.to_smiles(m) for m in mols} == {"O=C1C=[N:1]C2CCCCC2C1", "OC1=CC2CCCCC2[N:1]=C1"}
12 |
13 |
14 | def test_enumerate_stereo():
15 | mol = dm.to_mol("OC1=CC2CCCCC2[N:1]=C1")
16 |
17 | mols = dm.enumerate_stereoisomers(mol, n_variants=10)
18 |
19 | assert {dm.to_smiles(m) for m in mols} == {
20 | "OC1=C[C@@H]2CCCC[C@@H]2[N:1]=C1",
21 | "OC1=C[C@@H]2CCCC[C@H]2[N:1]=C1",
22 | "OC1=C[C@H]2CCCC[C@@H]2[N:1]=C1",
23 | "OC1=C[C@H]2CCCC[C@H]2[N:1]=C1",
24 | }
25 |
26 |
27 | def test_enumerate_stereo_undefined_failure():
28 | mol = dm.to_mol(
29 | "N=1C(NC2CC2)=C3C(=NC1)N(/C=C/C=4C=C(C=CC4C)C(=O)NC=5C=C(C=C(C5)N6CCN(CC6)C)C(F)(F)F)C=N3"
30 | )
31 | with pytest.raises(RuntimeError):
32 | dm.enumerate_stereoisomers(mol, clean_it=True)
33 |
34 | mols = dm.enumerate_stereoisomers(mol, clean_it=False)
35 | assert len(mols) == 2 # only one double bond
36 |
37 |
38 | def test_enumerate_stereo_timeout():
39 | mol = dm.to_mol("CCCCC")
40 |
41 | # NOTE(hadim): it's impossible to predict anything given a timeout for different
42 | # machines, so here we just check that the code runs without errors
43 | dm.enumerate_stereoisomers(mol, n_variants=2, timeout_seconds=1)
44 |
45 |
46 | def test_count_stereoisomers():
47 | num_isomers_1 = dm.count_stereoisomers(dm.to_mol("CC=CC"), undefined_only=True)
48 | num_isomers_2 = dm.count_stereoisomers(dm.to_mol("CC=CC"), undefined_only=False)
49 | assert num_isomers_1 == num_isomers_2
50 |
51 | assert dm.count_stereoisomers(dm.to_mol("Br/C=C\\Br"), undefined_only=True) == 1
52 |
53 |
54 | def test_enumerate_structural():
55 | mol = dm.to_mol("CCCCC") # pentane has only three structural isomers
56 |
57 | mols_iso = dm.enumerate_structisomers(
58 | mol,
59 | n_variants=2,
60 | allow_cycle=False,
61 | depth=1,
62 | allow_double_bond=False,
63 | allow_triple_bond=False,
64 | )
65 |
66 | assert {dm.to_smiles(m) for m in mols_iso} == {"CCC(C)C"}
67 |
68 | # NOTE(hadim): disable to reduce testing time
69 | # mols_cyclo_iso = dm.enumerate_structisomers(mol, n_variants=5, depth=2, allow_cycle=True)
70 |
71 | # # expect 3 molecules with cycles
72 | # assert sum([Chem.rdMolDescriptors.CalcNumRings(x) == 1 for x in mols_cyclo_iso]) == 3 # type: ignore
73 |
74 | # mols_cyclo_iso_double = dm.enumerate_structisomers(
75 | # mol, n_variants=10, allow_cycle=True, allow_double_bond=True
76 | # )
77 | # should have mol with double link
78 | # assert sum(["=" in dm.to_smiles(x) for x in mols_cyclo_iso_double]) > 0
79 |
80 |
81 | @pytest.mark.skip_platform("win")
82 | def test_enumerate_structural_timeout():
83 | mol = dm.to_mol("CCCCC")
84 |
85 | # NOTE(hadim): it's impossible to predict anything given a timeout on different
86 | # machines, so here we just check that the code runs without errors
87 | dm.enumerate_structisomers(mol, n_variants=10, timeout_seconds=1)
88 |
89 |
90 | def test_canonical_tautomer():
91 | smiles = "Oc1c(cccc3)c3nc2ccncc12"
92 | mol = dm.to_mol(smiles)
93 |
94 | canonical_mol = dm.canonical_tautomer(mol)
95 |
96 | assert dm.to_smiles(canonical_mol) == "O=c1c2ccccc2[nH]c2ccncc12"
97 | assert dm.to_inchikey(canonical_mol) == dm.to_inchikey(mol)
98 |
99 |
100 | def test_remove_stereochemistry():
101 | mol = dm.to_mol("C[C@H]1CCC[C@@H](C)[C@@H]1Cl")
102 | mol_no_stereo = dm.remove_stereochemistry(mol)
103 | assert dm.to_smiles(mol_no_stereo) == "CC1CCCC(C)C1Cl"
104 |
--------------------------------------------------------------------------------
/tests/test_log.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | import datamol as dm
4 |
5 |
6 | @dm.no_rdkit_log
7 | def no_log_to_mol(smiles):
8 | return dm.to_mol(smiles)
9 |
10 |
11 | def check_logs_are_shown(capfd):
12 | smiles = "fake_smiles"
13 | dm.to_mol(smiles)
14 | _, err = capfd.readouterr()
15 | assert "SMILES Parse Error" in err
16 |
17 |
18 | def check_logs_are_not_shown(capfd):
19 | smiles = "fake_smiles"
20 | dm.to_mol(smiles)
21 | _, err = capfd.readouterr()
22 | assert err == ""
23 |
24 |
25 | def check_logs_are_not_shown_deco(capfd):
26 | smiles = "fake_smiles"
27 | no_log_to_mol(smiles)
28 | _, err = capfd.readouterr()
29 | assert err == ""
30 |
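# NOTE: the tests below exercise the three log-control surfaces used by the helpers
# above: the `@dm.no_rdkit_log` decorator, the `dm.without_rdkit_log()` context manager,
# and the global `dm.disable_rdkit_log()` / `dm.enable_rdkit_log()` switches.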
31 |
32 | @pytest.mark.skip_platform("win")
33 | def test_rdkit_log(capfd):
34 | """Test multiple rdkit log scenarios."""
35 |
36 | check_logs_are_shown(capfd)
37 | check_logs_are_not_shown_deco(capfd)
38 |
39 | check_logs_are_shown(capfd)
40 | with dm.without_rdkit_log():
41 | check_logs_are_not_shown(capfd)
42 | check_logs_are_shown(capfd)
43 |
44 | dm.disable_rdkit_log()
45 | check_logs_are_not_shown(capfd)
46 |
47 | dm.enable_rdkit_log()
48 | check_logs_are_shown(capfd)
49 |
50 | dm.disable_rdkit_log()
51 | with dm.without_rdkit_log():
52 | check_logs_are_not_shown(capfd)
53 | check_logs_are_not_shown(capfd)
54 |
55 |
56 | @pytest.mark.skip_platform("win")
57 | def test_rdkit_log_enable(capfd):
58 | dm.enable_rdkit_log()
59 |
60 | with dm.without_rdkit_log():
61 | check_logs_are_not_shown(capfd)
62 |
63 | with dm.without_rdkit_log(enable=False):
64 | check_logs_are_shown(capfd)
65 |
66 | check_logs_are_shown(capfd)
67 |
--------------------------------------------------------------------------------
/tests/test_mcs.py:
--------------------------------------------------------------------------------
1 | import datamol as dm
2 |
3 |
4 | def test_find_mcs():
5 | smiles_list = [
6 | "C=CC(=O)NCCOc1cc2ncnc(Nc3ccc(Br)cc3F)c2cc1NC(=O)C=C",
7 | "C=CC(=O)Nc1cc2c(Nc3ccc(F)c(Br)c3)ncnc2cc1OCCCN1CCOCC1",
8 | "C=CC(=O)Nc1cc2c(Nc3ccc(Br)cc3F)ncnc2cc1OCCCNC(=O)CN(C)C",
9 | "C=CC(=O)Nc1cc2c(Nc3ccc(Br)cc3F)ncnc2cc1OCCNC(=O)NCC",
10 | "C=CC(=O)Nc1cc2c(Nc3ccc(Br)cc3F)ncnc2cc1OCCNC(=O)CN(C)C",
11 | ]
12 | mols = [dm.to_mol(s) for s in smiles_list]
13 | smarts = dm.find_mcs(mols=mols, timeout=2)
14 |
15 | # NOTE(hadim): hashes differ across RDKit versions
16 | expected_hashes = [
17 | # RDKit >= 2023.09
18 | "762f483ac10cc0f45c5aa2c790f9ef52f8dfb337",
19 | # RDKit <= 2023.03
20 | "49eff32e405d17980fad428cf4063ec52e2c5fda",
21 | ]
22 |
23 | assert dm.hash_mol(dm.from_smarts(smarts)) in expected_hashes
24 |
--------------------------------------------------------------------------------
/tests/test_molar.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | import datamol as dm
4 | import numpy as np
5 | import pandas as pd
6 |
7 |
8 | MOLAR_TEST_VALUES = pd.DataFrame(
9 | [
10 | (1, 6, "uM"),
11 | (0.059, 7.229147988357856, "uM"),
12 | (0.024, 7.61978876, "uM"),
13 | (0.187, 6.72815839, "uM"),
14 | (0.00154, 8.8124793, "uM"),
15 | (128, 6.892790, "nM"),
16 | (0.000128, 6.892790, "mM"),
17 | ],
18 | columns=["xc50", "pxc50", "unit"],
19 | )
20 |
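# NOTE: a minimal sketch (not part of the test suite) of the relationship the table above
# presumably encodes, i.e. pXC50 = -log10(XC50 converted to molar). The helper name
# `_approx_molar_to_log` is hypothetical and only documents the expected values.
def _approx_molar_to_log(xc50, unit):
    factors = {"M": 1.0, "mM": 1e-3, "uM": 1e-6, "nM": 1e-9}
    return -np.log10(xc50 * factors[unit])
# e.g. _approx_molar_to_log(1, "uM") -> 6.0 and _approx_molar_to_log(128, "nM") -> ~6.89279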
21 |
22 | def test_molar_to_log():
23 | # test scalar
24 | value, log_value, unit = MOLAR_TEST_VALUES.iloc[0].values
25 | assert dm.molar.molar_to_log(value, unit=unit) == log_value
26 |
27 | # test arrays
28 | for unit in ["uM", "mM", "nM"]:
29 | mask = MOLAR_TEST_VALUES["unit"] == unit
30 | values = MOLAR_TEST_VALUES[mask]["xc50"].tolist()
31 | log_values = MOLAR_TEST_VALUES[mask]["pxc50"].tolist()
32 | np.testing.assert_almost_equal(dm.molar.molar_to_log(values, unit=unit), log_values)
33 |
34 | # test wrong unit
35 | with pytest.raises(ValueError):
36 | dm.molar.molar_to_log(0.000128, unit="kcal/mol")
37 |
38 |
39 | def test_log_to_molar():
40 | # test scalar
41 | value, log_value, unit = MOLAR_TEST_VALUES.iloc[0].values
42 | np.testing.assert_almost_equal(dm.molar.log_to_molar(log_value, unit=unit), value)
43 |
44 | # test arrays
45 | for unit in ["uM", "mM", "nM"]:
46 | mask = MOLAR_TEST_VALUES["unit"] == unit
47 | values = MOLAR_TEST_VALUES[mask]["xc50"].tolist()
48 | log_values = MOLAR_TEST_VALUES[mask]["pxc50"].tolist()
49 | np.testing.assert_almost_equal(
50 | dm.molar.log_to_molar(log_values, unit=unit), values, decimal=5
51 | )
52 |
53 | # test wrong unit
54 | with pytest.raises(ValueError):
55 | dm.molar.log_to_molar(7.214, unit="kcal/mol")
56 |
57 |
58 | def test_log_to_molar_with_integer():
59 | dm.molar.log_to_molar(6, unit="uM")
60 |
--------------------------------------------------------------------------------
/tests/test_notebooks.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import pathlib
3 |
4 | import nbformat
5 | import datamol as dm
6 | from nbconvert.preprocessors.execute import ExecutePreprocessor
7 |
8 | ROOT_DIR = pathlib.Path(__file__).parent.resolve()
9 |
10 | NOTEBOOK_DIR = ROOT_DIR.parent / "docs" / "tutorials"
11 |
12 | NOTEBOOK_PATHS = sorted(list(NOTEBOOK_DIR.glob("*.ipynb")))
13 |
14 | # Discard `Filesystem.ipynb` because it takes too long to run.
15 | NOTEBOOK_PATHS = list(filter(lambda x: "Filesystem.ipynb" != x.name, NOTEBOOK_PATHS))
16 |
17 |
18 | @pytest.mark.skip_platform("win")
19 | @pytest.mark.parametrize("nb_path", NOTEBOOK_PATHS, ids=[str(n.name) for n in NOTEBOOK_PATHS])
20 | def test_notebook(nb_path):
21 | # Setup and configure the processor to execute the notebook
22 | if "Visualization.ipynb" in nb_path.name and not dm.is_greater_than_current_rdkit_version(
23 | "2023.03"
24 | ):
25 | pytest.skip("Circle Grid requires rdkit>2023.03")
26 | ep = ExecutePreprocessor(timeout=600, kernel_name="python")
27 |
28 | # Open the notebook
29 | with open(nb_path) as f:
30 | nb = nbformat.read(f, as_version=nbformat.NO_CONVERT)
31 |
32 | # Execute the notebook
33 | ep.preprocess(nb, {"metadata": {"path": NOTEBOOK_DIR}})
34 |
--------------------------------------------------------------------------------
/tests/test_predictors.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | import datamol as dm
4 | import numpy as np
5 |
6 |
7 | def test_esol():
8 | smiles = "Nc1cnn(-c2ccccc2)c(=O)c1Cl"
9 | mol = dm.to_mol(smiles)
10 |
11 | assert np.allclose(dm.predictors.esol(mol), -2.627091966265316)
12 |
13 |
14 | def test_esol_from_data():
15 | data = dm.freesolv()
16 | data = data.iloc[:20]
17 |
18 | with pytest.raises(KeyError):
19 | dm.predictors.esol_from_data(data)
20 |
21 | data["mol"] = data["smiles"].apply(dm.to_mol)
22 | data["clogp"] = data["mol"].apply(dm.descriptors.clogp)
23 | data["mw"] = data["mol"].apply(dm.descriptors.mw)
24 | data["n_rotatable_bonds"] = data["mol"].apply(dm.descriptors.n_rotatable_bonds)
25 | data["n_aromatic_atoms_proportion"] = data["mol"].apply(
26 | dm.descriptors.n_aromatic_atoms_proportion
27 | )
28 |
29 | # dataframe
30 | esol_values = dm.predictors.esol_from_data(data)
31 | assert esol_values.dtype == float
32 | assert esol_values.shape == (20,)
33 |
34 | # series
35 | v = dm.predictors.esol_from_data(data.iloc[0])
36 | v = float(v)
37 | assert isinstance(v, float)
38 |
39 | # dict
40 | v = dm.predictors.esol_from_data(data.iloc[0].to_dict())
41 | v = float(v)
42 | assert isinstance(v, float)
43 |
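# NOTE: the four descriptor columns built above match the inputs of the classic Delaney
# ESOL equation, which `esol_from_data` presumably evaluates. The uncalled helper below,
# `_esol_delaney_sketch`, is a hypothetical reference for that published equation, not
# datamol's own implementation.
def _esol_delaney_sketch(clogp, mw, n_rotatable_bonds, n_aromatic_atoms_proportion):
    return (
        0.16
        - 0.63 * clogp
        - 0.0062 * mw
        + 0.066 * n_rotatable_bonds
        - 0.74 * n_aromatic_atoms_proportion
    )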
--------------------------------------------------------------------------------
/tests/test_scaffold.py:
--------------------------------------------------------------------------------
1 | import datamol as dm
2 |
3 |
4 | def test_fuzzy_scaffolding():
5 | smiles = [
6 | "Cc1ccc(NC(=O)Cn2cccn2)c(Br)c1",
7 | "COc1ccc(OC(C)C(=O)N=c2sccn2C)cc1",
8 | "CC(NC(=O)CSCc1cccs1)C1CCCO1",
9 | "CC1CCCCN1C(=O)CN1CCC[C@@H](N)C1",
10 | "CCC(CC)COC(=O)[C@H](C)N[P@](=O)(OC[C@H]1O[C@](C#N)([C@H](O)[C@@H]1O)C1=CC=C2N1N=CN=C2N)OC1=CC=CC=C1", # no way this one (Remdesivir) is in the db
11 | "COc1ccc(OC(C)C(=O)N=c2sccn2C)cc1",
12 | ]
13 |
14 | mols = [dm.to_mol(s) for s in smiles]
15 |
16 | # NOTE(hadim): different versions of rdkit (2020.09 vs 2021.03) return
17 | # different SMILES here.
18 | # assert "O=C(CN1CCC[C@@H]([*:1])C1)N1CCCCC1[*:2]" in all_scaffolds
19 | # assert "O=C(CSCc1cccs1)NC(C1CCCO1)[*:1]" in all_scaffolds
20 | # assert "O=C(N=c1sccn1[*:1])C(Oc1ccc([*:3])cc1)[*:2]" in all_scaffolds
21 |
22 | all_scaffolds, df_scf2infos, df_scf2groups = dm.scaffold.fuzzy_scaffolding(mols)
23 |
24 | assert len(all_scaffolds) == 5
25 | assert len(df_scf2infos.columns) == 3
26 |
27 | # because we are returning the output for each scf
28 | # these should be the same
29 | assert len(df_scf2infos.index) == len(df_scf2groups.index)
30 | assert list(df_scf2infos["scf"]) == list(df_scf2groups["scf"])
31 |
32 | # It is a mere coincidence that scf2infos and scf2groups have the same number of
33 | # columns. The reason there are 3 columns rather than 2 is that there can be
34 | # extra columns where a cell may have None values.
35 | assert len(df_scf2groups.columns) == 3
36 |
--------------------------------------------------------------------------------
/tests/test_similarity.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | import numpy as np
4 | import datamol as dm
5 | import datamol.utils.testing
6 |
7 |
8 | def test_pdist():
9 | smiles_list = ["CC(=O)Oc1ccccc1C(=O)O", "C1OC1CC", "c1cc2ccccc2cc1"]
10 | mols = [dm.to_mol(smiles) for smiles in smiles_list]
11 |
12 | dist_mat = dm.pdist(mols)
13 |
14 | assert dist_mat.shape == (3, 3)
15 | assert dist_mat.sum() == 5.6757105943152455
16 |
17 | dist_mat = dm.pdist(mols, n_jobs=None)
18 |
19 | assert dist_mat.shape == (3, 3)
20 | assert dist_mat.sum() == 5.6757105943152455
21 |
22 |
23 | def test_pdist_condensed():
24 | smiles_list = ["CC(=O)Oc1ccccc1C(=O)O", "C1OC1CC", "c1cc2ccccc2cc1"]
25 | mols = [dm.to_mol(smiles) for smiles in smiles_list]
26 |
27 | dist_mat = dm.pdist(mols, squareform=False)
28 |
29 | assert dist_mat.shape == (3,)
30 | assert dist_mat.sum() == 2.8378552971576227
31 |
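# NOTE: with `squareform=False` the result is the condensed vector of the 3 unique
# pairwise distances, so its sum is half the symmetric square matrix's sum
# (2.8378... == 5.6757... / 2). A hypothetical cross-check, assuming scipy is available:
#   from scipy.spatial.distance import squareform
#   assert np.isclose(squareform(dist_mat).sum(), 2 * dist_mat.sum())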
32 |
33 | def test_cdist():
34 | smiles_list1 = ["CC(=O)Oc1ccccc1C(=O)O", "C1OC1CC", "c1cc2ccccc2cc1"]
35 | mols1 = [dm.to_mol(smiles) for smiles in smiles_list1]
36 |
37 | smiles_list2 = [
38 | "COc1cc(Nc2ncc(Cl)c(-c3cccc(CC#N)c3)n2)ccc1N1CCN(C)CC1",
39 | "ON=C(O)CCCCCN=C(O)C=C1c2ccccc2-c2ccccc21",
40 | "COc1ccc(CCc2nnc(-c3ccc4nc[nH]c4c3)o2)cc1Cl",
41 | ]
42 | mols2 = [dm.to_mol(smiles) for smiles in smiles_list2]
43 |
44 | dist_mat = dm.cdist(mols1, mols2)
45 |
46 | assert dist_mat.shape == (3, 3)
47 | assert np.isclose(dist_mat.mean(), 0.9416270180919872)
48 |
49 |
50 | def test_cdist_chunked():
51 | smiles_list1 = ["CC(=O)Oc1ccccc1C(=O)O", "C1OC1CC", "c1cc2ccccc2cc1"]
52 | mols1 = [dm.to_mol(smiles) for smiles in smiles_list1]
53 |
54 | smiles_list2 = [
55 | "COc1cc(Nc2ncc(Cl)c(-c3cccc(CC#N)c3)n2)ccc1N1CCN(C)CC1",
56 | "ON=C(O)CCCCCN=C(O)C=C1c2ccccc2-c2ccccc21",
57 | "COc1ccc(CCc2nnc(-c3ccc4nc[nH]c4c3)o2)cc1Cl",
58 | ]
59 | mols2 = [dm.to_mol(smiles) for smiles in smiles_list2]
60 |
61 | d1 = dm.cdist(mols1, mols2, distances_chunk=True)
62 | d2 = dm.cdist(mols1, mols2, distances_chunk=False)
63 |
64 | assert d1.shape == d2.shape
65 | assert np.allclose(d1, d2)
66 |
67 |
68 | def test_cdist_pdist_consistent():
69 | smiles_list1 = ["CC(=O)Oc1ccccc1C(=O)O", "C1OC1CC", "c1cc2ccccc2cc1"]
70 | mols1 = [dm.to_mol(smiles) for smiles in smiles_list1]
71 |
72 | dist_mat = dm.cdist(mols1, mols1)
73 | dist_mat2 = dm.pdist(mols1)
74 |
75 | assert np.isclose(dist_mat.mean(), dist_mat2.mean())
76 | assert np.allclose(dist_mat, dist_mat2)
77 |
78 |
79 | def test_cdist_pdist_invalid_input():
80 | smiles_list = ["CC(=O)Oc1ccccc1C(=O)O", "C1OC1CC", "c1cc2ccccc2cc1", "dsdsdsd"]
81 |
82 | with pytest.raises(ValueError):
83 | dm.similarity.cdist(smiles_list, smiles_list)
84 |
85 | with pytest.raises(ValueError):
86 | dm.similarity.pdist(smiles_list)
87 |
88 |
89 | def test_datamol_pdist_same_as_rdkit():
90 | smiles_list = [
91 | "COc1cc(Nc2ncc(Cl)c(-c3cccc(CC#N)c3)n2)ccc1N1CCN(C)CC1",
92 | "ON=C(O)CCCCCN=C(O)C=C1c2ccccc2-c2ccccc21",
93 | "COc1ccc(CCc2nnc(-c3ccc4nc[nH]c4c3)o2)cc1Cl",
94 | ]
95 |
96 | dist_mat = dm.similarity.pdist(smiles_list)
97 | dist_mat_rdkit = datamol.utils.testing.pdist_rdkit(smiles_list)
98 |
99 | assert np.allclose(dist_mat, dist_mat_rdkit)
100 |
101 |
102 | def test_datamol_cdist_same_as_rdkit():
103 | smiles_list = [
104 | "COc1cc(Nc2ncc(Cl)c(-c3cccc(CC#N)c3)n2)ccc1N1CCN(C)CC1",
105 | "ON=C(O)CCCCCN=C(O)C=C1c2ccccc2-c2ccccc21",
106 | "COc1ccc(CCc2nnc(-c3ccc4nc[nH]c4c3)o2)cc1Cl",
107 | ]
108 |
109 | smiles_list2 = ["CC(=O)Oc1ccccc1C(=O)O", "C1OC1CC", "c1cc2ccccc2cc1"]
110 |
111 | dist_mat = dm.similarity.cdist(smiles_list, smiles_list2)
112 | dist_mat_rdkit = datamol.utils.testing.cdist_rdkit(smiles_list, smiles_list2)
113 |
114 | assert np.allclose(dist_mat, dist_mat_rdkit)
115 |
--------------------------------------------------------------------------------
/tests/test_utils_fs.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | import pathlib
4 |
5 | import fsspec
6 | import datamol as dm
7 |
8 |
9 | def test_copy_files(tmp_path):
10 | source_path = tmp_path / "source.txt"
11 | destination_path = tmp_path / "destination.txt"
12 |
13 | content = "hello this is a content"
14 | with open(source_path, "w") as f:
15 | f.write(content)
16 |
17 | dm.utils.fs.copy_file(source_path, destination_path)
18 |
19 | with open(destination_path) as f:
20 | assert f.read() == content
21 |
22 |
23 | def test_copy_dir(tmp_path):
24 | source_path = tmp_path / "source_dir"
25 | source_path_subdir = source_path / "a_subdir"
26 | destination_path = tmp_path / "destination_dir"
27 | destination_path_subdir = destination_path / "a_subdir"
28 |
29 | dm.utils.fs.mkdir(source_path)
30 | dm.utils.fs.mkdir(source_path_subdir)
31 |
32 | content = "hello this is a content"
33 | file1_path = source_path / "hello.txt"
34 | with open(file1_path, "w") as f:
35 | f.write(content)
36 |
37 | file2_path = source_path_subdir / "hello.txt"
38 | with open(file2_path, "w") as f:
39 | f.write(content)
40 |
41 | assert not dm.utils.fs.is_dir(destination_path_subdir)
42 | assert not dm.utils.fs.is_dir(destination_path)
43 |
44 | dm.utils.fs.copy_dir(source_path, destination_path)
45 |
46 | assert dm.utils.fs.is_dir(destination_path_subdir)
47 | assert dm.utils.fs.is_dir(destination_path)
48 | assert dm.utils.fs.is_file(file1_path)
49 | assert dm.utils.fs.is_file(file2_path)
50 |
51 | with open(file1_path) as f:
52 | assert f.read() == content
53 |
54 | with open(file2_path) as f:
55 | assert f.read() == content
56 |
57 |
58 | def test_mkdir(tmp_path):
59 | source_path = tmp_path / "source_dir"
60 | source_path_subdir = source_path / "a_subdir"
61 |
62 | dm.utils.fs.mkdir(source_path)
63 |
64 | assert dm.utils.fs.is_dir(source_path)
65 | assert not dm.utils.fs.is_dir(source_path_subdir)
66 |
67 | dm.utils.fs.mkdir(source_path_subdir)
68 |
69 | assert dm.utils.fs.is_dir(source_path)
70 | assert dm.utils.fs.is_dir(source_path_subdir)
71 |
72 |
73 | @pytest.mark.skip_platform("win")
74 | def test_cache_dir():
75 | cache_dir = dm.utils.fs.get_cache_dir("my_app")
76 | assert str(cache_dir).endswith("my_app")
77 | assert cache_dir.exists()
78 | assert cache_dir.is_dir()
79 |
80 | cache_dir = dm.utils.fs.get_cache_dir("my_app", suffix="likelydonotalreadyexist", create=False)
81 | assert str(cache_dir).endswith("likelydonotalreadyexist")
82 | assert not cache_dir.exists()
83 | assert not cache_dir.is_dir()
84 |
85 | cache_dir = dm.utils.fs.get_cache_dir("my_app", suffix="iamasuffix")
86 | assert str(cache_dir).endswith("iamasuffix")
87 | assert "my_app" in str(cache_dir)
88 | assert cache_dir.exists()
89 | assert cache_dir.is_dir()
90 |
91 |
92 | def test_get_mapper(tmp_path):
93 | fsmapper = dm.utils.fs.get_mapper(str(tmp_path / "test.txt"))
94 |
95 | # NOTE(hadim): depends on the fsspec version
96 | assert fsmapper.fs.protocol in ["file", ("file", "local")]
97 |
98 |
99 | @pytest.mark.skip_platform("win")
100 | def test_get_basename(tmp_path):
101 | assert dm.utils.fs.get_basename(str(tmp_path / "test.txt")) == "test.txt"
102 | assert dm.utils.fs.get_basename("s3://a-bucket-that-likely-do-not-exist/test.txt") == "test.txt"
103 |
104 |
105 | def test_get_extension(tmp_path):
106 | assert dm.utils.fs.get_extension(str(tmp_path / "test.txt")) == "txt"
107 | assert dm.utils.fs.get_extension("s3://a-bucket-that-likely-do-not-exist/test.txt") == "txt"
108 |
109 |
110 | def test_exists(tmp_path):
111 | tmp_file = tmp_path / "test.txt"
112 |
113 | assert not dm.utils.fs.exists(tmp_file)
114 | assert not dm.utils.fs.is_file(tmp_file)
115 |
116 | assert dm.utils.fs.is_dir(tmp_path)
117 | assert not dm.utils.fs.is_dir(tmp_path / "likely-does-not-exist")
118 |
119 | with open(tmp_file, "w") as f:
120 | f.write("hello")
121 |
122 | assert dm.utils.fs.exists(tmp_file)
123 | assert dm.utils.fs.is_file(tmp_file)
124 |
125 | assert not dm.utils.fs.is_file(open(tmp_file))
126 | assert not dm.utils.fs.is_dir(open(tmp_file))
127 |
128 |
129 | def test_get_protocol(tmp_path):
130 | assert dm.utils.fs.get_protocol(tmp_path / "ahahah.txt") == "file"
131 | assert dm.utils.fs.get_protocol("s3://a-bucket-that-likely-do-not-exist/test.txt") == "s3"
132 |
133 |
134 | def test_is_local_path(tmp_path):
135 | assert dm.utils.fs.is_local_path(tmp_path / "ahahah.txt")
136 | assert not dm.utils.fs.is_local_path("s3://a-bucket-that-likely-do-not-exist/test.txt")
137 |
138 |
139 | @pytest.mark.skip_platform("win")
140 | def test_join(tmp_path):
141 | assert (
142 | dm.utils.fs.join("s3://a-bucket-that-likely-do-not-exist", "test.txt")
143 | == "s3://a-bucket-that-likely-do-not-exist/test.txt"
144 | )
145 | assert dm.utils.fs.join(tmp_path, "test.txt") == str(tmp_path / "test.txt")
146 |
147 |
148 | def test_get_size(tmp_path):
149 | tmp_file = tmp_path / "test.txt"
150 |
151 | with open(tmp_file, "w") as f:
152 | f.write("hello")
153 |
154 | assert dm.utils.fs.get_size(tmp_file) > 0
155 | assert dm.utils.fs.get_size(open(tmp_file)) > 0
156 | assert dm.utils.fs.get_size(fsspec.open(tmp_file)) > 0
157 |
158 |
159 | def test_md5(tmp_path):
160 | tmp_file = tmp_path / "test.txt"
161 |
162 | with open(tmp_file, "w") as f:
163 | f.write("hello")
164 |
165 | assert dm.utils.fs.md5(tmp_file) == "5d41402abc4b2a76b9719d911017c592"
166 |
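# NOTE: the expected digest above is simply the MD5 of the file's bytes and can be
# reproduced with the standard library:
#   import hashlib
#   hashlib.md5(b"hello").hexdigest()  # -> "5d41402abc4b2a76b9719d911017c592"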
167 |
168 | @pytest.mark.skip_platform("win")
169 | def test_glob(tmp_path):
170 | for i in range(5):
171 | tmp_file = tmp_path / f"test_{i}.txt"
172 |
173 | with open(tmp_file, "w") as f:
174 | f.write("hello")
175 |
176 | tmp_path_regex = tmp_path / "*.txt"
177 | assert len(dm.utils.fs.glob(tmp_path_regex)) == 5
178 |
179 |
180 | def test_copy_file(tmp_path):
181 | tmp_file = tmp_path / "test.txt"
182 |
183 | assert dm.utils.fs.is_dir(tmp_path)
184 | assert dm.utils.fs.is_dir(str(tmp_path))
185 | assert dm.utils.fs.is_dir(pathlib.Path(str(tmp_path)))
186 |
187 | assert not dm.utils.fs.is_dir(tmp_path / "not_exist_dir")
188 | assert not dm.utils.fs.is_dir(str(tmp_path / "not_exist_dir"))
189 | assert not dm.utils.fs.is_dir(pathlib.Path(str(tmp_path / "not_exist_dir")))
190 |
191 | with open(tmp_file, "w") as f:
192 | f.write("hello")
193 |
194 | tmp_file2 = tmp_path / "test2.txt"
195 | assert not dm.utils.fs.is_file(tmp_file2)
196 | assert not dm.utils.fs.is_file(str(tmp_file2))
197 | assert not dm.utils.fs.is_file(pathlib.Path(str(tmp_file2)))
198 |
199 | dm.utils.fs.copy_file(tmp_file, tmp_file2)
200 |
201 | assert dm.utils.fs.is_file(tmp_file2)
202 | assert dm.utils.fs.is_file(str(tmp_file2))
203 | assert dm.utils.fs.is_file(pathlib.Path(str(tmp_file2)))
204 | assert open(tmp_file2).read() == "hello"
205 |
206 | with pytest.raises(ValueError):
207 | dm.utils.fs.copy_file(tmp_file, tmp_file2)
208 |
209 | tmp_file3 = tmp_path / "test3.txt"
210 | dm.utils.fs.copy_file(tmp_file, tmp_file3, progress=True)
211 | assert dm.utils.fs.is_file(tmp_file3)
212 | assert dm.utils.fs.is_file(str(tmp_file3))
213 | assert dm.utils.fs.is_file(pathlib.Path(str(tmp_file3)))
214 | assert open(tmp_file3).read() == "hello"
215 |
--------------------------------------------------------------------------------
/tests/test_utils_jobs.py:
--------------------------------------------------------------------------------
1 | import math
2 | import numbers
3 | import operator
4 | import unittest
5 |
6 | import numpy as np
7 | import pandas as pd
8 |
9 | from functools import reduce
10 |
11 | import datamol as dm
12 |
13 |
14 | def random_fn(*args, op="mul", **kwargs):
15 | """Apply the operator or math function `op` to the numeric inputs."""
16 | all_values = [x for x in args if isinstance(x, numbers.Number)]
17 | all_values += [x for x in kwargs.values() if isinstance(x, numbers.Number)]
18 | op_fn = getattr(operator, op, None)
19 | if op_fn is None:
20 | op_fn = getattr(math, op)
21 | return op_fn(all_values[0])
22 | return reduce(op_fn, all_values)
23 |
24 |
25 | class TestJobs(unittest.TestCase):
26 | def test_sequential(self):
27 | jobrunner = dm.JobRunner(n_jobs=None, progress=False)
28 | # practically a no-op: reducing "add" over a single value returns it unchanged
29 | o1 = jobrunner(random_fn, [9, 25, 1024], op="add")
30 | self.assertEqual(o1, [9, 25, 1024])
31 |
32 | # take the sqrt
33 | o2 = jobrunner(random_fn, [9, 25, 1024], op="sqrt")
34 | self.assertEqual(o2, [3, 5, 32])
35 |
36 | # multiply all inputs
37 | o3 = jobrunner(random_fn, [(1, 2, 3), (4, 5, 6), (3, 4, 0)], arg_type="args", op="mul")
38 | self.assertEqual(o3, [6, 4 * 5 * 6, 0])
39 |
40 | # do the same thing but with kwargs
41 | o4 = jobrunner(
42 | random_fn,
43 | iter([dict(a=1, b=2, c=3), dict(a=4, b=5, c=6), dict(a=3, b=4, c=0)]),
44 | arg_type="kwargs",
45 | op="mul",
46 | )
47 | self.assertEqual(o4, [6, 4 * 5 * 6, 0])
48 |
49 | o5 = jobrunner(random_fn, np.asarray([9, 25, 1024]), op="add")
50 | self.assertEqual(o5, [9, 25, 1024])
51 |
52 | def test_parallel(self):
53 | jobrunner1 = dm.JobRunner(n_jobs=4, progress=True) # use loky backend
54 | o1 = jobrunner1(random_fn, [9, 25, 1024], op="add")
55 | self.assertEqual(o1, [9, 25, 1024])
56 |
57 | o5 = jobrunner1(random_fn, np.asarray([9, 25, 1024]), op="add")
58 | self.assertEqual(o5, [9, 25, 1024])
59 |
60 | o3 = jobrunner1(random_fn, [(1, 2, 3), (4, 5, 6), (3, 4, 0)], arg_type="args", op="mul")
61 | self.assertEqual(o3, [6, 4 * 5 * 6, 0])
62 |
63 | # use threads instead, no progress
64 | jobrunner2 = dm.JobRunner(n_jobs=2, progress=False, prefer="threads")
65 | o2 = jobrunner2(random_fn, [9, 25, 1024], op="sqrt")
66 | self.assertEqual(o2, [3, 5, 32])
67 |
68 | o4 = jobrunner2(
69 | random_fn,
70 | iter([dict(a=1, b=2, c=3), dict(a=4, b=5, c=6), dict(a=3, b=4, c=0)]),
71 | arg_type="kwargs",
72 | op="mul",
73 | )
74 | self.assertEqual(o4, [6, 4 * 5 * 6, 0])
75 |
76 | def test_seq_vs_parallel(self):
77 | # test parallel vs sequential
78 | jobrunner = dm.JobRunner(n_jobs=4, progress=False) # use loky backend
79 | o_seq = jobrunner.sequential(
80 | random_fn, [(1, 2, 3), (4, 5, 6), (3, 4, 0)], arg_type="args", op="mul"
81 | )
82 | o_par = jobrunner.parallel(
83 | random_fn, [(1, 2, 3), (4, 5, 6), (3, 4, 0)], arg_type="args", op="mul"
84 | )
85 | self.assertEqual(o_seq, o_par)
86 |
87 | def test_parallelized(self):
88 | def fn(x):
89 | return x**2
90 |
91 | results = dm.parallelized(
92 | fn,
93 | [{"x": i} for i in range(10)],
94 | scheduler="processes",
95 | n_jobs=None,
96 | arg_type="kwargs",
97 | progress=True,
98 | )
99 | assert results == [0, 1, 4, 9, 16, 25, 36, 49, 64, 81]
100 |
101 | results = dm.parallelized(
102 | fn,
103 | [[i] for i in range(10)],
104 | scheduler="processes",
105 | n_jobs=None,
106 | arg_type="args",
107 | progress=True,
108 | )
109 | assert results == [0, 1, 4, 9, 16, 25, 36, 49, 64, 81]
110 |
111 | results = dm.parallelized(
112 | fn,
113 | range(10),
114 | scheduler="processes",
115 | n_jobs=None,
116 | progress=False,
117 | )
118 | assert results == [0, 1, 4, 9, 16, 25, 36, 49, 64, 81]
119 |
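# NOTE (summarizing the three calls above): `arg_type="kwargs"` unpacks each dict as
# keyword arguments, `arg_type="args"` unpacks each sequence as positional arguments,
# and the default passes each item as a single positional argument, e.g.:
#   dm.parallelized(fn, [{"x": 2}], arg_type="kwargs")  # -> fn(x=2)
#   dm.parallelized(fn, [[2]], arg_type="args")         # -> fn(2)
#   dm.parallelized(fn, [2])                            # -> fn(2)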
120 | def test_job_kwargs(self):
121 | def fn(x):
122 | return x**2
123 |
124 | results = dm.parallelized(
125 | fn,
126 | [{"x": i} for i in range(10)],
127 | scheduler="processes",
128 | n_jobs=None,
129 | arg_type="kwargs",
130 | progress=True,
131 | verbose=100,
132 | )
133 | assert results == [0, 1, 4, 9, 16, 25, 36, 49, 64, 81]
134 |
135 | def test_tqdm_kwargs(self):
136 | def fn(x):
137 | return x**2
138 |
139 | results = dm.parallelized(
140 | fn,
141 | [{"x": i} for i in range(10)],
142 | scheduler="processes",
143 | n_jobs=None,
144 | arg_type="kwargs",
145 | progress=True,
146 | tqdm_kwargs=dict(desc="My progress bar"),
147 | )
148 | assert results == [0, 1, 4, 9, 16, 25, 36, 49, 64, 81]
149 |
150 | def test_with_batch_size(self):
151 | def _fn(n):
152 | return n * 3
153 |
154 | def _fn_return_none(n):
155 | return None
156 |
157 | results = dm.utils.parallelized(
158 | _fn,
159 | range(997),
160 | n_jobs=-1,
161 | progress=True,
162 | batch_size=10,
163 | )
164 | assert len(results) == 997
165 |
166 | results = dm.utils.parallelized(
167 | _fn_return_none,
168 | range(997),
169 | n_jobs=-1,
170 | progress=True,
171 | batch_size=10,
172 | )
173 | assert len(results) == 997
174 |
175 | def test_with_total(self):
176 | def _fn_process_fn(_, row):
177 | datum = {}
178 | datum["smiles"] = row["smiles"]
179 | return pd.Series(datum)
180 |
181 | data = dm.freesolv()
182 | data = data.iloc[:50]
183 |
184 | # parallel mode
185 |
186 | ## check the `total` arg is ok
187 | dm.parallelized(
188 | _fn_process_fn,
189 | data.iterrows(),
190 | n_jobs=-1,
191 | progress=True,
192 | arg_type="args",
193 | total=50,
194 | )
195 |
196 | ## check collision between guessed total and provided one
197 | dm.parallelized(
198 | _fn_process_fn,
199 | list(data.iterrows()),
200 | n_jobs=-1,
201 | progress=True,
202 | arg_type="args",
203 | total=50,
204 | )
205 |
206 | # sequential mode
207 |
208 | ## check the `total` arg is ok
209 | dm.parallelized(
210 | _fn_process_fn,
211 | data.iterrows(),
212 | n_jobs=1,
213 | progress=True,
214 | arg_type="args",
215 | total=50,
216 | )
217 |
218 | ## check collision between guessed total and provided one
219 | dm.parallelized(
220 | _fn_process_fn,
221 | list(data.iterrows()),
222 | n_jobs=1,
223 | progress=True,
224 | arg_type="args",
225 | total=50,
226 | )
227 |
228 |
229 | def test_parallelized_with_batches():
230 | data = dm.freesolv()
231 | data = data.iloc[:10]
232 |
233 | def _fn1(smiles):
234 | return len(smiles)
235 |
236 | results1 = dm.parallelized(
237 | _fn1,
238 | data["smiles"],
239 | progress=False,
240 | n_jobs=-1,
241 | )
242 |
243 | def _fn2(smiles_list):
244 | return [len(s) for s in smiles_list]
245 |
246 | results2 = dm.parallelized_with_batches(
247 | _fn2,
248 | data["smiles"],
249 | batch_size=2,
250 | progress=False,
251 | n_jobs=-1,
252 | )
253 |
254 | assert results1 == results2
255 |
--------------------------------------------------------------------------------
/tests/test_utils_perf.py:
--------------------------------------------------------------------------------
1 | import datamol as dm
2 |
3 |
4 | def test_watch_duration():
5 | def fn(n):
6 | for i in range(n):
7 | print(i)
8 |
9 | with dm.utils.perf.watch_duration(log=True) as w:
10 | fn(5)
11 |
12 | assert isinstance(w.duration, float)
13 |
--------------------------------------------------------------------------------
/tests/test_viz.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | import base64
4 | import io
5 |
6 | import numpy as np
7 | import ipywidgets as widgets
8 |
9 | import PIL
10 | from PIL import Image
11 |
12 | import datamol as dm
13 |
14 |
15 | # NOTE(hadim): rdkit returns different image objects
16 | # according to the Python process context (Jupyter notebook vs terminal).
17 | # As a consequence, these tests will fail if they are executed within a
18 | # Jupyter notebook.
19 |
20 |
21 | def _convert_ipython_to_array(image):
22 | """Convert an IPython image object to a numpy array."""
23 | image_obj = base64.b64decode(str(image._repr_png_()))
24 | try:
25 | image_obj = Image.open(io.BytesIO(image_obj))
26 | return np.array(image_obj)
27 | except Exception:
28 | return np.array(image)
29 |
30 |
31 | def test_to_image():
32 | # Get a list of molecules
33 | data = dm.data.freesolv()
34 | mols = dm.from_df(data) # type: ignore
35 | mols = mols[:8]
36 |
37 | # With multiple molecules
38 | legends = [dm.to_smiles(mol) for mol in mols]
39 | image = dm.viz.to_image(mols, legends=legends, n_cols=4, mol_size=(200, 200), use_svg=False)
40 | image = _convert_ipython_to_array(image)
41 |
42 | print(type(image))
43 |
44 | image = np.array(image)
45 |
46 | assert image.dtype == np.uint8
47 | assert image.shape == (400, 800, 3)
48 | assert image.shape[1] == 200 * 4
49 |
50 | # With a single molecule
51 | mol = mols[0]
52 | legends = dm.to_smiles(mol)
53 | image = dm.viz.to_image(mol, legends=legends, mol_size=(200, 200), use_svg=False)
54 | image = _convert_ipython_to_array(image)
55 | image = np.array(image)
56 |
57 | assert image.dtype == np.uint8
58 | assert image.shape == (200, 200, 3)
59 |
60 | dm.viz.to_image(mol, indices=True, mol_size=400)
61 |
62 | # With input smiles
63 | mol = "CCCOCc1cc(c2ncccc2)ccc1"
64 | legends = mol
65 | image = dm.viz.to_image(mol, legends=legends, mol_size=(200, 200), use_svg=False)
66 | image = _convert_ipython_to_array(image)
67 | image = np.array(image)
68 |
69 | assert image.dtype == np.uint8
70 | assert image.shape == (200, 200, 3)
71 |
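# NOTE: with 8 molecules, n_cols=4 and mol_size=(200, 200), the rendered PNG is a
# 2 x 4 grid of 200 x 200 panels, hence the (400, 800, 3) RGB array asserted above;
# a single molecule yields a single (200, 200, 3) panel.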
72 |
73 | def test_to_image_incorrect_aromaticity():
74 | query = "C-c1cn(-C-2-[N,O:3]-[#6@H](-C-[#6,#8:1]-[*:2])-C(-[#8])-C-2-[#1,#8,#9:4])c2ncnc(-C)c12"
75 | mol = dm.from_smarts(query)
76 | dm.to_image(
77 | mol,
78 | mol_size=300,
79 | use_svg=False,
80 | legends="a legend",
81 | legend_fontsize=40,
82 | stereo_annotations=False,
83 | )
84 |
85 |
86 | def test_to_image_save_file(tmpdir):
87 | smiles = "CCCOCc1cc(c2ncccc2)ccc1"
88 | mol = dm.to_mol(smiles)
89 |
90 | image_path = str(tmpdir.join("mol.png"))
91 | dm.viz.to_image(mol, outfile=image_path, use_svg=False)
92 |
93 | # check whether the png is valid
94 | try:
95 | img = Image.open(image_path)
96 | img.verify()
97 | except PIL.UnidentifiedImageError:
98 | pytest.fail(f"The image {image_path} is invalid.")
99 |
100 | image_path = str(tmpdir.join("mol.svg"))
101 | dm.viz.to_image(mol, outfile=image_path, use_svg=True)
102 |
103 | # check whether the svg looks valid
104 | with open(image_path) as f:
105 | content = f.read().strip()
106 | assert content.startswith("<?xml")
107 | assert content.endswith("</svg>")
108 |
109 |
110 | def test_conformers():
111 | import nglview as nv
112 |
113 | smiles = "CCCC=O"
114 | mol = dm.to_mol(smiles)
115 | mol = dm.conformers.generate(mol)
116 |
117 | # one conformer
118 | view = dm.viz.conformers(mol)
119 | assert type(view) == nv.widget.NGLWidget
120 |
121 | # multiple conformers
122 | view = dm.viz.conformers(mol, n_confs=12)
123 | assert type(view) == widgets.GridspecLayout
124 |
125 |
126 | @pytest.mark.skipif(
127 | not dm.is_greater_than_current_rdkit_version("2023.03"),
128 | reason="Circle Grid requires rdkit>2023.03",
129 | )
130 | def test_circle_grid(tmp_path):
131 | mol = dm.to_mol("CC(=O)OC1=CC=CC=C1C(=O)O")
132 | dm.viz.circle_grid(
133 | mol,
134 | [
135 | [dm.to_mol("CCC"), dm.to_mol("CCCCCCC")],
136 | [dm.to_mol("CCCO"), dm.to_mol("CCCCCCCO")],
137 | ],
138 | outfile=str(tmp_path / "image.png"),
139 | )
140 |
141 |
142 | @pytest.mark.skipif(
143 | not dm.is_greater_than_current_rdkit_version("2023.03"),
144 | reason="Circle Grid requires rdkit>2023.03",
145 | )
146 | def test_circle_grid_with_hex_color(tmp_path):
147 | mol = dm.to_mol("CC(=O)OC1=CC=CC=C1C(=O)O")
148 | dm.viz.circle_grid(
149 | mol,
150 | [
151 | [dm.to_mol("CCC"), dm.to_mol("CCCCCCC")],
152 | [dm.to_mol("CCCO"), dm.to_mol("CCCCCCCO")],
153 | ],
154 | ring_color="#ff1472",
155 | layout_random_seed=None,
156 | )
157 |
158 |
159 | @pytest.mark.skipif(
160 | not dm.is_greater_than_current_rdkit_version("2023.03"),
161 | reason="Circle Grid requires rdkit>2023.03",
162 | )
163 | def test_circle_grid_with_angle_start(tmp_path):
164 | mol = dm.to_mol("CC(=O)OC1=CC=CC=C1C(=O)O")
165 | dm.viz.circle_grid(
166 | mol,
167 | [
168 | [dm.to_mol("CCC"), dm.to_mol("CCCCCCC"), dm.to_mol("CCCCCO")],
169 | [
170 | dm.to_mol("CCCO"),
171 | ],
172 | ],
173 | # ring_color=(0, 0, 0, 0.5),
174 | ring_color="#ff1472aa",
175 | layout_random_seed=19,
176 | ring_mol_start_angles_degrees=[90, 90],
177 | )
178 |
179 |
180 | def test_to_image_align():
181 | # Get a list of molecules
182 | data = dm.data.freesolv()
183 | mols = dm.from_df(data) # type: ignore
184 | mols = mols[:8]
185 |
186 | # With multiple molecules
187 | dm.viz.to_image(mols, align=True)
188 |
189 |
190 | def test_to_image_align_template():
191 | # Get a list of molecules
192 | data = dm.data.freesolv()
193 | mols = dm.from_df(data) # type: ignore
194 | mols = mols[:8]
195 |
196 | dm.viz.to_image(mols, align=mols[0])
197 |
--------------------------------------------------------------------------------
/tests/test_viz_lasso_highlight.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import datamol as dm
3 |
4 |
5 | # The following tests are supposed to work and should not raise any errors
6 | def test_original_working_solution_str():
7 | smi = "CO[C@@H](O)C1=C(O[C@H](F)Cl)C(C#N)=C1ONNC[NH3+]"
8 | smarts_list = "CONN"
9 | assert dm.lasso_highlight_image(smi, smarts_list)
10 |
11 |
12 | # The following tests are supposed to work and should not raise any errors
13 | def test_from_mol():
14 | smi = "CO[C@@H](O)C1=C(O[C@H](F)Cl)C(C#N)=C1ONNC[NH3+]"
15 | mol = dm.to_mol(smi)
16 | smarts_list = "CONN"
17 | assert dm.lasso_highlight_image(mol, smarts_list)
18 |
19 |
20 | def test_with_highlight():
21 | smi = "CO[C@@H](O)C1=C(O[C@H](F)Cl)C(C#N)=C1ONNC[NH3+]"
22 | mol = dm.to_mol(smi)
23 | smarts_list = "CONN"
24 | highlight_atoms = [4, 5, 6]
25 | highlight_bonds = [1, 2, 3, 4]
26 | highlight_atom_colors = {4: (230, 230, 250), 5: (230, 230, 250), 6: (230, 230, 250)}
27 | highlight_bond_colors = {
28 | 1: (230, 230, 250),
29 | 2: (230, 230, 250),
30 | 3: (230, 230, 250),
31 | 4: (230, 230, 250),
32 | }
33 | assert dm.lasso_highlight_image(
34 | mol,
35 | smarts_list,
36 | highlight_atoms=highlight_atoms,
37 | highlight_bonds=highlight_bonds,
38 | highlight_atom_colors=highlight_atom_colors,
39 | highlight_bond_colors=highlight_bond_colors,
40 | continuousHighlight=False,
41 | )
42 |
43 |
44 | def test_original_working_solution_list_single_str():
45 | smi = "CO[C@@H](O)C1=C(O[C@H](F)Cl)C(C#N)=C1ONNC[NH3+]"
46 | smarts_list = ["CONN"]
47 | assert dm.lasso_highlight_image(smi, smarts_list)
48 |
49 |
50 | def test_original_working_solution_list_str():
51 | smi = "CO[C@@H](O)C1=C(O[C@H](F)Cl)C(C#N)=C1ONNC[NH3+]"
52 | smarts_list = ["CONN", "N#CC~CO", "C=CON", "CONNCN"]
53 | assert dm.lasso_highlight_image(smi, smarts_list)
54 |
55 |
56 | def test_original_working_solution_mol():
57 | smi = "CO[C@@H](O)C1=C(O[C@H](F)Cl)C(C#N)=C1ONNC[NH3+]"
58 | smarts_list = dm.to_mol("CONN")
59 | assert dm.lasso_highlight_image(smi, smarts_list)
60 |
61 |
62 | def test_original_working_solution_list_single_mol():
63 | smi = "CO[C@@H](O)C1=C(O[C@H](F)Cl)C(C#N)=C1ONNC[NH3+]"
64 | smarts_list = [dm.to_mol("CONN")]
65 | assert dm.lasso_highlight_image(smi, smarts_list)
66 |
67 |
68 | def test_original_working_solution_list_mol():
69 | smi = "CO[C@@H](O)C1=C(O[C@H](F)Cl)C(C#N)=C1ONNC[NH3+]"
70 | smarts_list = [dm.to_mol("CONN"), dm.to_mol("N#CC~CO"), dm.to_mol("C=CON"), dm.to_mol("CONNCN")]
71 | assert dm.lasso_highlight_image(smi, smarts_list)
72 |
73 |
74 | def test_working_solution_with_more_structures_than_colors():
75 | smi = "CO[C@@H](O)C1=C(O[C@H](F)Cl)C(C#N)=C1ONNC[NH3+]"
76 | smarts_list = ["CONN", "N#CC~CO", "C=CON", "CONNCN", "FCCl", "OCO", "N#C", "N#CC", "CC#N"]
77 | assert dm.lasso_highlight_image(smi, smarts_list)
78 |
79 |
80 | def test_drawing_options():
81 | smi = "CO[C@@H](O)C1=C(O[C@H](F)Cl)C(C#N)=C1ONNC[NH3+]"
82 | smarts_list = ["CONN", "N#CC~CO", "C=CON", "CONNCN", "FCCl", "OCO", "N#C", "N#CC", "CC#N"]
83 | assert dm.lasso_highlight_image(smi, smarts_list, bondLineWidth=15)
84 |
85 |
86 | def test_wrong_drawing_options():
87 | smi = "CO[C@@H](O)C1=C(O[C@H](F)Cl)C(C#N)=C1ONNC[NH3+]"
88 | smarts_list = ["CONN", "N#CC~CO", "C=CON", "CONNCN", "FCCl", "OCO", "N#C", "N#CC", "CC#N"]
89 |
90 | with pytest.raises(ValueError):
91 | dm.lasso_highlight_image(smi, smarts_list, bondLineWidthXXXXXXX=15)
92 |
93 |
94 | def test_input_mol_is_none():
95 | smarts_list = ["CONN", "N#CC~CO", "C=CON", "CONNCN", "FCCl", "OCO", "N#C", "N#CC", "CC#N"]
96 |
97 | with pytest.raises(ValueError):
98 | dm.lasso_highlight_image(None, smarts_list)
99 |
100 |
101 | def test_search_input_error_empty_list():
102 | # should still go through but just print out the structure without any highlights
103 | smi = "CO[C@@H](O)C1=C(O[C@H](F)Cl)C(C#N)=C1ONNC[NH3+]"
104 | smarts_list = []
105 | assert dm.lasso_highlight_image(smi, smarts_list)
106 |
107 |
108 | def test_target_input_error_empty_str():
109 | with pytest.raises(ValueError):
110 | smi = ""
111 | smarts_list = ["CONN", "N#CC~CO", "C=CON", "CONNCN"]
112 | dm.lasso_highlight_image(smi, smarts_list)
113 |
114 |
115 | def test_target_input_error_None():
116 | with pytest.raises(ValueError):
117 | smi = None
118 | smarts_list = ["CONN", "N#CC~CO", "C=CON", "CONNCN"]
119 | dm.lasso_highlight_image(smi, smarts_list)
120 |
121 |
122 | def test_search_input_error_smarts_no_substructure():
123 | # This test should still continue but will just print out a structure without any highlights and a warning
124 | smi = "CO[C@@H](O)C1=C(O[C@H](F)Cl)C(C#N)=C1ONNC[NH3+]"
125 | smarts_list = ["CCCCCC"]
126 | assert dm.lasso_highlight_image(smi, smarts_list)
127 |
128 |
129 | # testing using isinstance / str(type(img)) checks so as to not bring in IPython
130 | # as a dependency for the tests
131 | def test_SVG_is_returned_explicit():
132 | smi = "CO[C@@H](O)C1=C(O[C@H](F)Cl)C(C#N)=C1ONNC[NH3+]"
133 | smarts_list = ["CC"]
134 | img = dm.lasso_highlight_image(smi, smarts_list, use_svg=True)
135 | assert isinstance(img, str)
136 |
137 |
138 | def test_SVG_is_returned_implicit():
139 | smi = "CO[C@@H](O)C1=C(O[C@H](F)Cl)C(C#N)=C1ONNC[NH3+]"
140 | smarts_list = ["CC"]
141 | img = dm.lasso_highlight_image(smi, smarts_list)
142 | assert isinstance(img, str)
143 |
144 |
145 | def test_PNG_is_returned():
146 | smi = "CO[C@@H](O)C1=C(O[C@H](F)Cl)C(C#N)=C1ONNC[NH3+]"
147 | smarts_list = ["CC"]
148 | img = dm.lasso_highlight_image(smi, smarts_list, use_svg=False)
149 |
150 | from PIL import Image
151 |
152 | assert isinstance(img, Image.Image)
153 |
154 |
155 | def test_aromatic_query_work():
156 | smi = "CC(N)Cc1c[nH]c2ccc3c(c12)CCCO3"
157 | smarts_list = ["c1ccccc1"]
158 | assert dm.lasso_highlight_image(smi, smarts_list)
159 |
160 |
161 | def test_smarts_query():
162 | smi = "CC(N)Cc1c[nH]c2ccc3c(c12)CCCO3"
163 | smarts_list = "[#6]"
164 | assert dm.lasso_highlight_image(smi, smarts_list)
165 |
166 |
167 | def test_query_and_atom_indices_list():
168 | dm.viz.lasso_highlight_image(
169 | "CC(N)Cc1c[nH]c2ccc3c(c12)CCCO3",
170 | search_molecules="c1ccccc1",
171 | atom_indices=[[4, 5, 6], [1, 2, 3, 4]],
172 | )
173 |
174 |
175 | def test_multiple_mol_lasso():
176 | img = dm.viz.lasso_highlight_image(
177 | ["CC(N)Cc1c[nH]c2ccc3c(c12)CCCO3", "c1ccccc1"],
178 | search_molecules="c1ccccc1",
179 | )
180 | assert isinstance(img, str)
181 |
182 | img = dm.viz.lasso_highlight_image(
183 | ["CC(N)Cc1c[nH]c2ccc3c(c12)CCCO3", "c1ccccc1"],
184 | search_molecules="c1ccccc1",
185 | mol_size=(200, 200),
186 | n_cols=1,
187 | use_svg=False,
188 | )
189 | from PIL import Image
190 |
191 | assert isinstance(img, Image.Image)
192 | assert img.size == (200, 400)  # one column of two 200 x 200 panels
193 |
194 |
195 | def test_multiple_mol_lasso_different_scale_legends():
196 | dm.viz.lasso_highlight_image(
197 | ["CC(N)Cc1c[nH]c2ccc3c(c12)CCCO3", "c1ccccc1"],
198 | legends=["Mol1", "Mol2"],
199 | search_molecules="c1ccccc1",
200 | n_cols=1,
201 | draw_mols_same_scale=False,
202 | )
203 |
204 |
205 | def test_atom_indices_list_of_list():
206 | dm.viz.lasso_highlight_image(
207 | "CC(N)Cc1c[nH]c2ccc3c(c12)CCCO3",
208 | search_molecules=None,
209 | atom_indices=[[4, 5, 6], [1, 2, 3, 4]],
210 | )
211 |
212 |
213 | def test_atom_indices_list():
214 | dm.viz.lasso_highlight_image(
215 | "CC(N)Cc1c[nH]c2ccc3c(c12)CCCO3",
216 | search_molecules=None,
217 | atom_indices=[4, 5, 6],
218 | )
219 |
220 |
221 | def test_with_hex_color():
222 | dm.viz.lasso_highlight_image(
223 | "CC(N)Cc1c[nH]c2ccc3c(c12)CCCO3",
224 | search_molecules=None,
225 | atom_indices=[4, 5, 6],
226 | color_list=["#ff1472"],
227 | )
228 |
--------------------------------------------------------------------------------
/tests/test_viz_substrcture.py:
--------------------------------------------------------------------------------
1 | import datamol as dm
2 |
3 |
4 | def test_match_substructure():
5 | mol1 = dm.to_mol("CC(=O)OC1=CC=CC=C1C(=O)O")
6 | mol2 = dm.to_mol("CCN(CC)CC(=O)CC(C)NC1=C2C=CC(=CC2=NC=C1)Cl")
7 |
8 | query1 = dm.from_smarts("[C;H0](=O)")
9 | query2 = dm.to_mol("CN(C)")
10 |
11 | # Test multiple scenarios
12 |
13 | dm.viz.match_substructure(
14 | mols=[mol1, mol2],
15 | queries=[query1, query2],
16 | highlight_bonds=True,
17 | use_svg=True,
18 | )
19 | dm.viz.match_substructure(
20 | mols=mol1,
21 | queries=[query1, query2],
22 | highlight_bonds=True,
23 | use_svg=True,
24 | )
25 | dm.viz.match_substructure(
26 | mols=[mol1, mol2],
27 | queries=query1,
28 | highlight_bonds=False,
29 | use_svg=False,
30 | )
31 | dm.viz.match_substructure(
32 | mols=mol1,
33 | queries=query2,
34 | highlight_bonds=True,
35 | use_svg=False,
36 | )
37 |
--------------------------------------------------------------------------------