├── .github ├── CODEOWNERS ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── PULL_REQUEST_TEMPLATE.md ├── SECURITY.md └── workflows │ ├── code-check.yml │ ├── doc.yml │ ├── release.yml │ └── test.yml ├── .gitignore ├── CHANGELOG.md ├── LICENSE ├── README.md ├── binder ├── environment.yml └── postBuild ├── codecov.yml ├── datamol ├── __init__.py ├── _sanifix4.py ├── _version.py ├── align.py ├── cluster.py ├── conformers │ ├── __init__.py │ ├── _conformers.py │ └── _features.py ├── convert.py ├── data │ ├── __init__.py │ ├── cdk2.sdf │ ├── chembl_approved_drugs.parquet │ ├── chembl_drugs.csv │ ├── chembl_samples.csv │ ├── freesolv.csv │ ├── reactions.json │ ├── salts_solvents.smi │ ├── solubility.test.sdf │ └── solubility.train.sdf ├── descriptors │ ├── __init__.py │ ├── compute.py │ └── descriptors.py ├── fp.py ├── fragment │ ├── __init__.py │ ├── _assemble.py │ └── _fragment.py ├── graph.py ├── io.py ├── isomers │ ├── __init__.py │ ├── _enumerate.py │ └── _structural.py ├── log.py ├── mcs.py ├── mol.py ├── molar.py ├── predictors │ ├── __init__.py │ └── esol.py ├── reactions │ ├── __init__.py │ ├── _attachments.py │ └── _reactions.py ├── scaffold │ ├── __init__.py │ └── _fuzzy.py ├── similarity.py ├── types.py ├── utils │ ├── __init__.py │ ├── decorators.py │ ├── fs.py │ ├── jobs.py │ ├── perf.py │ └── testing.py └── viz │ ├── __init__.py │ ├── _circle_grid.py │ ├── _conformers.py │ ├── _lasso_highlight.py │ ├── _substructure.py │ ├── _viz.py │ └── utils.py ├── docs ├── CNAME ├── api │ ├── datamol.align.md │ ├── datamol.cluster.md │ ├── datamol.conformers.md │ ├── datamol.convert.md │ ├── datamol.data.md │ ├── datamol.descriptors.md │ ├── datamol.fp.md │ ├── datamol.fragment.md │ ├── datamol.graph.md │ ├── datamol.io.md │ ├── datamol.isomers.md │ ├── datamol.log.md │ ├── datamol.mol.md │ ├── datamol.molar.md │ ├── datamol.reactions.md │ ├── datamol.scaffold.md │ ├── datamol.similarity.md │ ├── datamol.utils.fs.md │ ├── datamol.utils.md │ └── datamol.viz.md ├── assets │ ├── css │ │ ├── custom-datamol.css │ │ ├── custom.css │ │ └── tweak-width.css │ └── js │ │ └── google-analytics.js ├── contribute.md ├── images │ ├── logo-black.png │ ├── logo-black.svg │ ├── logo-title.svg │ ├── logo.png │ └── logo.svg ├── index.md ├── license.md ├── tutorials │ ├── Aligning.ipynb │ ├── Clustering.ipynb │ ├── Conformers.ipynb │ ├── Descriptors.ipynb │ ├── Filesystem.ipynb │ ├── Fragment.ipynb │ ├── Fuzzy_Scaffolds.ipynb │ ├── Preprocessing.ipynb │ ├── Reactions.ipynb │ ├── Scaffolds.ipynb │ ├── The_Basics.ipynb │ ├── Visualization.ipynb │ ├── data │ │ ├── Enamine_DNA_Libary_5530cmpds_20200831_SMALL.sdf │ │ └── ReactionBlock.rxn │ └── images │ │ ├── Aligning_1.png │ │ ├── Aligning_2.png │ │ ├── Conformers_1.png │ │ ├── Descriptors_1.png │ │ ├── Fragment_1.png │ │ ├── Fragment_2.png │ │ ├── Fragment_3.png │ │ ├── Preprocess_1.png │ │ └── Scaffolds_1.png └── usage.md ├── env.yml ├── mkdocs.yml ├── notebooks └── Get_ChEMBL_Approved_Drugs.ipynb ├── pyproject.toml └── tests ├── conftest.py ├── data ├── TUBB3-observations-last-broken.sdf ├── TUBB3-observations.sdf ├── TUBB3-observations.sdf.gz ├── freesolv.csv ├── freesolv.xlsx └── test.mol2 ├── test_align.py ├── test_cluster.py ├── test_conformers.py ├── test_convert.py ├── test_data.py ├── test_descriptors.py ├── test_fp.py ├── test_fragment.py ├── test_graph.py ├── test_import.py ├── test_io.py ├── test_isomers.py ├── test_log.py ├── test_mcs.py ├── test_mol.py ├── test_molar.py ├── test_notebooks.py ├── test_predictors.py ├── test_reactions.py ├── 
test_scaffold.py ├── test_similarity.py ├── test_utils_fs.py ├── test_utils_jobs.py ├── test_utils_perf.py ├── test_viz.py ├── test_viz_lasso_highlight.py └── test_viz_substrcture.py /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @hadim 2 | -------------------------------------------------------------------------------- /.github/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, religion, or sexual identity 10 | and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the 26 | overall community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or 31 | advances of any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email 35 | address, without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at 63 | . 64 | All complaints will be reviewed and investigated promptly and fairly. 
65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series 86 | of actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. Violating these terms may lead to a temporary or 93 | permanent ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within 113 | the community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.0, available at 119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 120 | 121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 122 | enforcement ladder](https://github.com/mozilla/diversity). 123 | 124 | [homepage]: https://www.contributor-covenant.org 125 | 126 | For answers to common questions about this code of conduct, see the FAQ at 127 | https://www.contributor-covenant.org/faq. Translations are available at 128 | https://www.contributor-covenant.org/translations. 129 | -------------------------------------------------------------------------------- /.github/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | see documentation directly at https://docs.datamol.io/stable/contribute.html 2 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## Changelogs 2 | 3 | - _enumerate the changes of that PR._ 4 | 5 | --- 6 | 7 | _Checklist:_ 8 | 9 | - [ ] _Was this PR discussed in an issue? 
It is recommended to first discuss a new feature into a GitHub issue before opening a PR._ 10 | - [ ] _Add tests to cover the fixed bug(s) or the new introduced feature(s) (if appropriate)._ 11 | - [ ] _Update the API documentation is a new function is added, or an existing one is deleted._ 12 | - [ ] _Write concise and explanatory changelogs below._ 13 | - [ ] _If possible, assign one of the following labels to the PR: `feature`, `fix` or `test` (or ask a maintainer to do it for you)._ 14 | 15 | --- 16 | 17 | _discussion related to that PR_ 18 | -------------------------------------------------------------------------------- /.github/SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | Please report any security-related issues directly to hadrien@valencediscovery.com. 4 | -------------------------------------------------------------------------------- /.github/workflows/code-check.yml: -------------------------------------------------------------------------------- 1 | name: code-check 2 | 3 | on: 4 | push: 5 | branches: ["main"] 6 | tags: ["*"] 7 | pull_request: 8 | branches: 9 | - "*" 10 | - "!gh-pages" 11 | 12 | jobs: 13 | python-format-black: 14 | name: Python lint [black] 15 | runs-on: ubuntu-latest 16 | steps: 17 | - name: Checkout the code 18 | uses: actions/checkout@v4 19 | 20 | - name: Set up Python 21 | uses: actions/setup-python@v4 22 | with: 23 | python-version: "3.10" 24 | 25 | - name: Install black 26 | run: | 27 | pip install black>=24 28 | 29 | - name: Lint 30 | run: black --check . 31 | 32 | python-lint-ruff: 33 | name: Python lint [ruff] 34 | runs-on: ubuntu-latest 35 | steps: 36 | - name: Checkout the code 37 | uses: actions/checkout@v4 38 | 39 | - name: Set up Python 40 | uses: actions/setup-python@v4 41 | with: 42 | python-version: "3.10" 43 | 44 | - name: Install ruff 45 | run: | 46 | pip install ruff 47 | 48 | - name: Lint 49 | run: ruff . 50 | -------------------------------------------------------------------------------- /.github/workflows/doc.yml: -------------------------------------------------------------------------------- 1 | name: doc 2 | 3 | on: 4 | push: 5 | branches: ["main"] 6 | 7 | # Prevent doc action on `main` to conflict with each others. 8 | concurrency: 9 | group: doc-${{ github.ref }} 10 | cancel-in-progress: true 11 | 12 | jobs: 13 | doc: 14 | runs-on: "ubuntu-latest" 15 | timeout-minutes: 30 16 | 17 | defaults: 18 | run: 19 | shell: bash -l {0} 20 | 21 | steps: 22 | - name: Checkout the code 23 | uses: actions/checkout@v4 24 | 25 | - name: Setup mamba 26 | uses: mamba-org/setup-micromamba@v1 27 | with: 28 | environment-file: env.yml 29 | environment-name: my_env 30 | cache-environment: true 31 | cache-downloads: true 32 | 33 | - name: Install library 34 | run: python -m pip install --no-deps . 
35 | 36 | - name: Configure git 37 | run: | 38 | git config --global user.name "${GITHUB_ACTOR}" 39 | git config --global user.email "${GITHUB_ACTOR}@users.noreply.github.com" 40 | 41 | - name: Deploy the doc 42 | run: | 43 | echo "Get the gh-pages branch" 44 | git fetch origin gh-pages 45 | 46 | echo "Build and deploy the doc on main" 47 | mike deploy --push main 48 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: release 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | release-version: 7 | description: "A valid Semver version string" 8 | required: true 9 | 10 | permissions: 11 | contents: write 12 | pull-requests: write 13 | 14 | jobs: 15 | release: 16 | # Do not release if not triggered from the default branch 17 | if: github.ref == format('refs/heads/{0}', github.event.repository.default_branch) 18 | 19 | runs-on: ubuntu-latest 20 | timeout-minutes: 30 21 | 22 | defaults: 23 | run: 24 | shell: bash -l {0} 25 | 26 | steps: 27 | - name: Checkout the code 28 | uses: actions/checkout@v4 29 | 30 | - name: Setup mamba 31 | uses: mamba-org/setup-micromamba@v1 32 | with: 33 | environment-file: env.yml 34 | environment-name: my_env 35 | cache-environment: true 36 | cache-downloads: true 37 | create-args: >- 38 | pip 39 | semver 40 | python-build 41 | setuptools_scm 42 | 43 | - name: Check the version is valid semver 44 | run: | 45 | RELEASE_VERSION="${{ inputs.release-version }}" 46 | 47 | { 48 | pysemver check $RELEASE_VERSION 49 | } || { 50 | echo "The version '$RELEASE_VERSION' is not a valid Semver version string." 51 | echo "Please use a valid semver version string. More details at https://semver.org/" 52 | echo "The release process is aborted." 53 | exit 1 54 | } 55 | 56 | - name: Check the version is higher than the latest one 57 | run: | 58 | # Retrieve the git tags first 59 | git fetch --prune --unshallow --tags &> /dev/null 60 | 61 | RELEASE_VERSION="${{ inputs.release-version }}" 62 | LATEST_VERSION=$(git describe --abbrev=0 --tags) 63 | 64 | IS_HIGHER_VERSION=$(pysemver compare $RELEASE_VERSION $LATEST_VERSION) 65 | 66 | if [ "$IS_HIGHER_VERSION" != "1" ]; then 67 | echo "The version '$RELEASE_VERSION' is not higher than the latest version '$LATEST_VERSION'." 68 | echo "The release process is aborted." 69 | exit 1 70 | fi 71 | 72 | - name: Build Changelog 73 | id: github_release 74 | uses: mikepenz/release-changelog-builder-action@v4 75 | env: 76 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 77 | with: 78 | toTag: "main" 79 | 80 | - name: Configure git 81 | run: | 82 | git config --global user.name "${GITHUB_ACTOR}" 83 | git config --global user.email "${GITHUB_ACTOR}@users.noreply.github.com" 84 | 85 | - name: Create and push git tag 86 | env: 87 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 88 | run: | 89 | # Tag the release 90 | git tag -a "${{ inputs.release-version }}" -m "Release version ${{ inputs.release-version }}" 91 | 92 | # Checkout the git tag 93 | git checkout "${{ inputs.release-version }}" 94 | 95 | # Push the modified changelogs 96 | git push origin main 97 | 98 | # Push the tags 99 | git push origin "${{ inputs.release-version }}" 100 | 101 | - name: Install library 102 | run: python -m pip install --no-deps . 
103 | 104 | - name: Build the wheel and sdist 105 | run: python -m build --no-isolation 106 | 107 | - name: Publish package to PyPI 108 | uses: pypa/gh-action-pypi-publish@release/v1 109 | with: 110 | password: ${{ secrets.PYPI_API_TOKEN }} 111 | packages-dir: dist/ 112 | 113 | - name: Create GitHub Release 114 | uses: softprops/action-gh-release@de2c0eb89ae2a093876385947365aca7b0e5f844 115 | with: 116 | tag_name: ${{ inputs.release-version }} 117 | body: ${{steps.github_release.outputs.changelog}} 118 | 119 | - name: Deploy the doc 120 | run: | 121 | echo "Get the gh-pages branch" 122 | git fetch origin gh-pages 123 | 124 | echo "Build and deploy the doc on ${{ inputs.release-version }}" 125 | mike deploy --push stable 126 | mike deploy --push ${{ inputs.release-version }} 127 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: test 2 | 3 | on: 4 | push: 5 | branches: ["main"] 6 | tags: ["*"] 7 | pull_request: 8 | branches: 9 | - "*" 10 | - "!gh-pages" 11 | schedule: 12 | - cron: "0 4 * * MON" 13 | 14 | jobs: 15 | test: 16 | strategy: 17 | fail-fast: false 18 | matrix: 19 | python-version: ["3.10", "3.11"] 20 | os: ["ubuntu-latest", "macos-latest", "windows-latest"] 21 | rdkit-version: ["2023.09", "2024.03"] 22 | 23 | runs-on: ${{ matrix.os }} 24 | timeout-minutes: 30 25 | 26 | defaults: 27 | run: 28 | shell: bash -l {0} 29 | 30 | name: | 31 | os=${{ matrix.os }} 32 | - python=${{ matrix.python-version }} 33 | - rdkit=${{ matrix.rdkit-version }} 34 | 35 | steps: 36 | - name: Checkout the code 37 | uses: actions/checkout@v4 38 | 39 | - name: Setup mamba 40 | uses: mamba-org/setup-micromamba@v1 41 | with: 42 | environment-file: env.yml 43 | environment-name: my_env 44 | cache-environment: true 45 | cache-downloads: true 46 | create-args: >- 47 | python=${{ matrix.python-version }} 48 | rdkit=${{ matrix.rdkit-version }} 49 | 50 | - name: Install library 51 | run: python -m pip install --no-deps -e . # `-e` required for correct `coverage` run. 
52 | 53 | - name: Run tests 54 | run: pytest 55 | 56 | - name: Codecov Upload 57 | uses: codecov/codecov-action@v4 58 | with: 59 | files: ./coverage.xml 60 | flags: unittests 61 | name: codecov-umbrella 62 | fail_ci_if_error: false 63 | verbose: false 64 | env_vars: ${{ matrix.os }},${{ matrix.python-version }},${{ matrix.rdkit-version }} 65 | 66 | - name: Test building the doc 67 | run: mkdocs build 68 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.env 2 | cov.xml 3 | coverage.xml 4 | 5 | .vscode/ 6 | 7 | .ipynb_checkpoints/ 8 | 9 | *.py[cod] 10 | 11 | # C extensions 12 | *.so 13 | 14 | # Packages 15 | *.egg 16 | *.egg-info 17 | dist 18 | build 19 | eggs 20 | parts 21 | bin 22 | var 23 | sdist 24 | develop-eggs 25 | .installed.cfg 26 | lib 27 | lib64 28 | 29 | # Installer logs 30 | pip-log.txt 31 | 32 | # Unit test / coverage reports 33 | .coverage* 34 | .tox 35 | nosetests.xml 36 | htmlcov 37 | 38 | # Translations 39 | *.mo 40 | 41 | # Mr Developer 42 | .mr.developer.cfg 43 | .project 44 | .pydevproject 45 | 46 | # Complexity 47 | output/*.html 48 | output/*/index.html 49 | 50 | # Sphinx 51 | docs/_build 52 | 53 | MANIFEST 54 | 55 | *.tif 56 | 57 | # Rever 58 | rever/ 59 | 60 | # Dev notebook 61 | dev.ipynb 62 | 63 | # MkDocs 64 | site/ 65 | 66 | .idea/ 67 | __pycache__ 68 | .DS_Store 69 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | <div align="center">
3 |     <h3>datamol - molecular processing made easy</h3>
4 |     <p>Docs | Homepage</p>
5 | </div>
16 | 17 | --- 18 | 19 | [![DOI](https://zenodo.org/badge/341603042.svg)](https://zenodo.org/badge/latestdoi/341603042) 20 | [![Binder](http://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/datamol-io/datamol/main?urlpath=lab/tree/docs/tutorials/The_Basics.ipynb) 21 | [![PyPI](https://img.shields.io/pypi/v/datamol)](https://pypi.org/project/datamol/) 22 | [![Conda](https://img.shields.io/conda/v/conda-forge/datamol?label=conda&color=success)](https://anaconda.org/conda-forge/datamol) 23 | [![PyPI - Downloads](https://img.shields.io/pypi/dm/datamol)](https://pypi.org/project/datamol/) 24 | [![Conda](https://img.shields.io/conda/dn/conda-forge/datamol)](https://anaconda.org/conda-forge/datamol) 25 | [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/datamol)](https://pypi.org/project/datamol/) 26 | [![license](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://github.com/datamol-io/datamol/blob/main/LICENSE) 27 | [![GitHub Repo stars](https://img.shields.io/github/stars/datamol-io/datamol)](https://github.com/datamol-io/datamol/stargazers) 28 | [![GitHub Repo stars](https://img.shields.io/github/forks/datamol-io/datamol)](https://github.com/datamol-io/datamol/network/members) 29 | [![Codecov](https://codecov.io/gh/datamol-io/datamol/branch/main/graph/badge.svg?token=2ETG8SA7IG)](https://codecov.io/gh/datamol-io/datamol) 30 | 31 | Datamol is a python library to work with molecules. It's a layer built on top of [RDKit](https://www.rdkit.org/) and aims to be as light as possible. 32 | 33 | - 🐍 Simple pythonic API 34 | - ⚗️ RDKit first: all you manipulate are `rdkit.Chem.Mol` objects. 35 | - ✅ Manipulating molecules often relies on many options; Datamol provides good defaults by design. 36 | - 🧠 Performance matters: built-in efficient parallelization when possible with an optional progress bar. 37 | - 🕹️ Modern IO: out-of-the-box support for remote paths using `fsspec` to read and write multiple formats (sdf, xlsx, csv, etc). 38 | 39 | ## Try Online 40 | 41 | Visit [![Binder](http://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/datamol-io/datamol/main?urlpath=lab/tree/docs/tutorials/The_Basics.ipynb) and try Datamol online. 42 | 43 | ## Documentation 44 | 45 | Visit . 
46 | 47 | ## Installation 48 | 49 | Use conda: 50 | 51 | ```bash 52 | mamba install -c conda-forge datamol 53 | ``` 54 | 55 | ## Quick API Tour 56 | 57 | ```python 58 | import datamol as dm 59 | 60 | # Common functions 61 | mol = dm.to_mol("O=C(C)Oc1ccccc1C(=O)O", sanitize=True) 62 | fp = dm.to_fp(mol) 63 | selfies = dm.to_selfies(mol) 64 | inchi = dm.to_inchi(mol) 65 | 66 | # Standardize and sanitize 67 | mol = dm.to_mol("O=C(C)Oc1ccccc1C(=O)O") 68 | mol = dm.fix_mol(mol) 69 | mol = dm.sanitize_mol(mol) 70 | mol = dm.standardize_mol(mol) 71 | 72 | # Dataframe manipulation 73 | df = dm.data.freesolv() 74 | mols = dm.from_df(df) 75 | 76 | # 2D viz 77 | legends = [dm.to_smiles(mol) for mol in mols[:10]] 78 | dm.viz.to_image(mols[:10], legends=legends) 79 | 80 | # Generate conformers 81 | smiles = "O=C(C)Oc1ccccc1C(=O)O" 82 | mol = dm.to_mol(smiles) 83 | mol_with_conformers = dm.conformers.generate(mol) 84 | 85 | # 3D viz (using nglview) 86 | dm.viz.conformers(mol, n_confs=10) 87 | 88 | # Compute SASA from conformers 89 | sasa = dm.conformers.sasa(mol_with_conformers) 90 | 91 | # Easy IO 92 | mols = dm.read_sdf("s3://my-awesome-data-lake/smiles.sdf", as_df=False) 93 | dm.to_sdf(mols, "gs://data-bucket/smiles.sdf") 94 | ``` 95 | 96 | ## How to cite 97 | 98 | Please cite Datamol if you use it in your research: [![DOI](https://zenodo.org/badge/341603042.svg)](https://zenodo.org/badge/latestdoi/341603042). 99 | 100 | ## Compatibilities 101 | 102 | Version compatibilities are an essential topic for production-software stacks. We are cautious about documenting compatibility between `datamol`, `python` and `rdkit`. 103 | 104 | See below the associated versions of Python and RDKit, for which a minor version of Datamol **has been tested** during its whole lifecycle. _It does not mean other combinations does not work but that those are not tested._ 105 | 106 | | `datamol` | `python` | `rdkit` | 107 | | --------- | ------------------- | ----------------------------- | 108 | | `0.12.x` | `[3.10, 3.11]` | `[2023.03, 2023.09]` | 109 | | `0.11.x` | `[3.9, 3.10, 3.11]` | `[2022.09, 2023.03]` | 110 | | `0.10.x` | `[3.9, 3.10, 3.11]` | `[2022.03, 2022.09]` | 111 | | `0.9.x` | `[3.9, 3.10, 3.11]` | `[2022.03, 2022.09]` | 112 | | `0.8.x` | `[3.8, 3.9, 3.10]` | `[2021.09, 2022.03, 2022.09]` | 113 | | `0.7.x` | `[3.8, 3.9]` | `[2021.09, 2022.03]` | 114 | | `0.6.x` | `[3.8, 3.9]` | `[2021.09]` | 115 | | `0.5.x` | `[3.8, 3.9]` | `[2021.03, 2021.09]` | 116 | | `0.4.x` | `[3.8, 3.9]` | `[2020.09, 2021.03]` | 117 | | `0.3.x` | `[3.8, 3.9]` | `[2020.09, 2021.03]` | 118 | 119 | ## CI Status 120 | 121 | The CI runs tests and performs code quality checks for the following combinations: 122 | 123 | - The three major platforms: Windows, OSX and Linux. 124 | - The two latest Python versions. 125 | - The two latest RDKit versions. 
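When reproducing one of the CI combinations above locally, a quick way to confirm the environment is to print the versions in use. A minimal sketch (it only assumes `datamol` is importable; `__version__` is defined in `datamol/_version.py`):

```python
import sys

import rdkit
from datamol._version import __version__ as datamol_version

print(f"datamol {datamol_version}")
print(f"python  {sys.version_info.major}.{sys.version_info.minor}")
print(f"rdkit   {rdkit.__version__}")
```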
126 | 127 | | | `main` | 128 | | --------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | 129 | | Lib build & Testing | [![test](https://github.com/datamol-io/datamol/actions/workflows/test.yml/badge.svg)](https://github.com/datamol-io/datamol/actions/workflows/test.yml) | 130 | | Code Sanity (linting and type analysis) | [![code-check](https://github.com/datamol-io/datamol/actions/workflows/code-check.yml/badge.svg)](https://github.com/datamol-io/datamol/actions/workflows/code-check.yml) | 131 | | Documentation Build | [![doc](https://github.com/datamol-io/datamol/actions/workflows/doc.yml/badge.svg)](https://github.com/datamol-io/datamol/actions/workflows/doc.yml) | 132 | 133 | ## License 134 | 135 | Under the Apache-2.0 license. See [LICENSE](LICENSE). 136 | -------------------------------------------------------------------------------- /binder/environment.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | 4 | dependencies: 5 | - python >=3.8 6 | - pip 7 | - tqdm 8 | - loguru 9 | - joblib 10 | - fsspec >=2021.9 11 | - s3fs >=2021.9 12 | - gcsfs >=2021.9 13 | - platformdirs 14 | - packaging 15 | - typing_extensions 16 | - importlib_resources 17 | 18 | # Scientific 19 | - pandas 20 | - numpy 21 | - scipy 22 | - pillow 23 | - matplotlib 24 | - scikit-learn 25 | 26 | # Chemistry 27 | - rdkit >=2021.03 28 | - selfies 29 | 30 | # Optional deps 31 | - openpyxl 32 | - networkx 33 | - nglview 34 | - xlsxwriter 35 | - pyarrow 36 | 37 | # Dev 38 | - pytest >=6.0 39 | - pytest-cov 40 | - pytest-xdist 41 | - black >=24 42 | - jupyterlab 43 | - mypy 44 | - codecov 45 | - nbconvert 46 | 47 | # Doc 48 | - mkdocs 49 | - mkdocs-material >=7.1.1 50 | - mkdocs-material-extensions 51 | - mkdocstrings 52 | - mkdocstrings-python 53 | - mkdocs-jupyter 54 | - markdown-include 55 | - mdx_truly_sane_lists 56 | - mike >=1.0.0 57 | - seaborn 58 | -------------------------------------------------------------------------------- /binder/postBuild: -------------------------------------------------------------------------------- 1 | pip install -e . 
2 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | coverage: 2 | range: "50...80" 3 | status: 4 | project: 5 | default: 6 | threshold: 1% 7 | patch: false 8 | -------------------------------------------------------------------------------- /datamol/_sanifix4.py: -------------------------------------------------------------------------------- 1 | """ 2 | sanifix4.py 3 | Original code from rdkit [James Davidson] 4 | """ 5 | 6 | from rdkit import Chem, RDLogger 7 | 8 | 9 | logger = RDLogger.logger() 10 | 11 | 12 | def _FragIndicesToMol(oMol, indices): 13 | em = Chem.EditableMol(Chem.Mol()) 14 | 15 | newIndices = {} 16 | for i, idx in enumerate(indices): 17 | em.AddAtom(oMol.GetAtomWithIdx(idx)) 18 | newIndices[idx] = i 19 | 20 | for i, idx in enumerate(indices): 21 | at = oMol.GetAtomWithIdx(idx) 22 | for bond in at.GetBonds(): 23 | if bond.GetBeginAtomIdx() == idx: 24 | oidx = bond.GetEndAtomIdx() 25 | else: 26 | oidx = bond.GetBeginAtomIdx() 27 | # make sure every bond only gets added once: 28 | if oidx < idx: 29 | continue 30 | em.AddBond(newIndices[idx], newIndices[oidx], bond.GetBondType()) 31 | res = em.GetMol() 32 | res.ClearComputedProps() 33 | Chem.GetSymmSSSR(res) 34 | res.UpdatePropertyCache(False) 35 | res._idxMap = newIndices 36 | return res 37 | 38 | 39 | def _recursivelyModifyNs(mol, matches, indices=None): 40 | if indices is None: 41 | indices = [] 42 | res = None 43 | while len(matches) and res is None: 44 | tIndices = indices[:] 45 | nextIdx = matches.pop(0) 46 | tIndices.append(nextIdx) 47 | nm = Chem.Mol(mol.ToBinary()) 48 | nm.GetAtomWithIdx(nextIdx).SetNoImplicit(True) 49 | nm.GetAtomWithIdx(nextIdx).SetNumExplicitHs(1) 50 | cp = Chem.Mol(nm.ToBinary()) 51 | try: 52 | Chem.SanitizeMol(cp) 53 | except ValueError: 54 | res, indices = _recursivelyModifyNs(nm, matches, indices=tIndices) 55 | else: 56 | indices = tIndices 57 | res = cp 58 | return res, indices 59 | 60 | 61 | def AdjustAromaticNs(m, nitrogenPattern="[n&D2&H0;r5,r6]"): 62 | """ 63 | default nitrogen pattern matches Ns in 5 rings and 6 rings in order to be able 64 | to fix: O=c1ccncc1 65 | """ 66 | Chem.GetSymmSSSR(m) 67 | m.UpdatePropertyCache(False) 68 | 69 | # break non-ring bonds linking rings: 70 | em = Chem.EditableMol(m) 71 | linkers = m.GetSubstructMatches(Chem.MolFromSmarts("[r]!@[r]")) 72 | plsFix = set() 73 | for a, b in linkers: 74 | em.RemoveBond(a, b) 75 | plsFix.add(a) 76 | plsFix.add(b) 77 | nm = em.GetMol() 78 | for at in plsFix: 79 | at = nm.GetAtomWithIdx(at) 80 | if at.GetIsAromatic() and at.GetAtomicNum() == 7: 81 | at.SetNumExplicitHs(1) 82 | at.SetNoImplicit(True) 83 | 84 | # build molecules from the fragments: 85 | fragLists = Chem.GetMolFrags(nm) 86 | frags = [_FragIndicesToMol(nm, x) for x in fragLists] 87 | 88 | # loop through the fragments in turn and try to aromatize them: 89 | ok = True 90 | for i, frag in enumerate(frags): 91 | cp = Chem.Mol(frag) 92 | try: 93 | Chem.SanitizeMol(cp) 94 | except ValueError: 95 | matches = [x[0] for x in frag.GetSubstructMatches(Chem.MolFromSmarts(nitrogenPattern))] 96 | lres, indices = _recursivelyModifyNs(frag, matches) 97 | if not lres: 98 | # print 'frag %d failed (%s)'%(i,str(fragLists[i])) 99 | ok = False 100 | break 101 | else: 102 | revMap = {} 103 | for k, v in frag._idxMap.items(): 104 | revMap[v] = k 105 | for idx in indices: 106 | oatom = m.GetAtomWithIdx(revMap[idx]) 107 | oatom.SetNoImplicit(True) 108 | 
oatom.SetNumExplicitHs(1) 109 | if not ok: 110 | return None 111 | return m 112 | 113 | 114 | def sanifix(m): 115 | if m is None: 116 | return None 117 | try: 118 | m.UpdatePropertyCache(False) 119 | cp = Chem.Mol(m.ToBinary()) 120 | Chem.SanitizeMol(cp) 121 | return cp 122 | except ValueError as e: 123 | logger.debug(f"{Chem.MolToSmiles(m)} failed due to {e}") 124 | try: 125 | m = AdjustAromaticNs(m) 126 | if m is not None: 127 | Chem.SanitizeMol(m) 128 | return m 129 | except Exception as ee: 130 | logger.debug(f"{Chem.MolToSmiles(m)} failed due to {ee}") 131 | return None 132 | except RuntimeError as e: 133 | logger.debug(f"{Chem.MolToSmiles(m)} failed due to {e}") 134 | logger.info(f"The faulty smiles is: {Chem.MolToSmiles(m)}") 135 | raise e 136 | -------------------------------------------------------------------------------- /datamol/_version.py: -------------------------------------------------------------------------------- 1 | try: 2 | from importlib.metadata import version 3 | from importlib.metadata import PackageNotFoundError 4 | except ModuleNotFoundError: 5 | # Try backported to PY<38 `importlib_metadata`. 6 | from importlib_metadata import version 7 | from importlib_metadata import PackageNotFoundError 8 | 9 | 10 | import rdkit 11 | import packaging.version 12 | 13 | 14 | try: 15 | __version__ = version("datamol") 16 | except PackageNotFoundError: 17 | # package is not installed 18 | __version__ = "dev" 19 | 20 | CURRENT_RDKIT_VERSION = rdkit.__version__ 21 | CURRENT_RDKIT_VERSION_OBJ = packaging.version.parse(CURRENT_RDKIT_VERSION) 22 | 23 | 24 | def is_lower_than_current_rdkit_version(rdkit_version: str): 25 | return CURRENT_RDKIT_VERSION_OBJ < packaging.version.parse(rdkit_version) 26 | 27 | 28 | def is_greater_than_current_rdkit_version(rdkit_version: str): 29 | return CURRENT_RDKIT_VERSION_OBJ > packaging.version.parse(rdkit_version) 30 | 31 | 32 | def is_lower_eq_than_current_rdkit_version(rdkit_version: str): 33 | return CURRENT_RDKIT_VERSION_OBJ <= packaging.version.parse(rdkit_version) 34 | 35 | 36 | def is_greater_eq_than_current_rdkit_version(rdkit_version: str): 37 | return CURRENT_RDKIT_VERSION_OBJ >= packaging.version.parse(rdkit_version) 38 | -------------------------------------------------------------------------------- /datamol/conformers/__init__.py: -------------------------------------------------------------------------------- 1 | from ._conformers import generate 2 | from ._conformers import cluster 3 | from ._conformers import rmsd 4 | from ._conformers import return_centroids 5 | from ._conformers import translate 6 | from ._conformers import align_conformers 7 | 8 | from ._features import sasa 9 | from ._features import get_coords 10 | from ._features import center_of_mass 11 | from ._features import keep_conformers 12 | -------------------------------------------------------------------------------- /datamol/conformers/_features.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | from typing import List 3 | from typing import Optional 4 | 5 | import numpy as np 6 | 7 | from ..types import Mol 8 | from ..utils.jobs import JobRunner 9 | from ..utils import decorators 10 | from ..mol import PERIODIC_TABLE 11 | from ..mol import copy_mol 12 | 13 | 14 | @decorators.disable_on_os("win") 15 | def sasa( 16 | mol: Mol, 17 | conf_id: Optional[Union[int, List[int]]] = None, 18 | n_jobs: int = 1, 19 | ) -> np.ndarray: 20 | """Compute Solvent Accessible Surface Area of all the conformers 21 | 
using FreeSASA (https://freesasa.github.io/). Values are returned 22 | as an array and also stored within each conformer as a property 23 | called `rdkit_free_sasa`. 24 | 25 | Example: 26 | 27 | ```python 28 | smiles = "O=C(C)Oc1ccccc1C(=O)O" 29 | mol = dm.to_mol(smiles) 30 | mol = dm.conformers.generate(mol) 31 | 32 | # Compute SASA for all the conformers without parallelization 33 | sasa_values = dm.conformers.sasa(mol, conf_id=None, n_jobs=1) 34 | 35 | # If minimization has been enabled (default to True) 36 | # you can access the computed energy. 37 | conf = mol.GetConformer(0) 38 | props = conf.GetPropsAsDict() 39 | print(props) 40 | # {'rdkit_uff_energy': 1.7649408317784008} 41 | ``` 42 | 43 | Args: 44 | mol: a molecule 45 | conf_id: Id of the conformers to compute. If None, compute all. 46 | n_jobs: Number of jobs for parallelization. Set to 1 to disable 47 | and -1 to use all cores. 48 | 49 | Returns: 50 | mol: the molecule with the conformers. 51 | """ 52 | from rdkit.Chem import rdFreeSASA 53 | 54 | if mol.GetNumConformers() == 0: 55 | raise ValueError( 56 | "The molecule has 0 conformers. You can generate conformers with `dm.conformers.generate(mol)`." 57 | ) 58 | 59 | # Get Van der Waals radii (angstrom) 60 | radii = [PERIODIC_TABLE.GetRvdw(atom.GetAtomicNum()) for atom in mol.GetAtoms()] 61 | 62 | # Which conformers to compute 63 | conf_ids = [] 64 | if conf_id is None: 65 | # If None compute for all the conformers 66 | conf_ids = list(range(mol.GetNumConformers())) # type: ignore 67 | elif isinstance(conf_id, int): 68 | conf_ids = [conf_id] 69 | else: 70 | conf_ids = conf_id 71 | 72 | # Compute solvent accessible surface area 73 | def _get_sasa(i): 74 | conf = mol.GetConformer(i) 75 | sasa = rdFreeSASA.CalcSASA(mol, radii, confIdx=conf.GetId()) 76 | conf.SetDoubleProp("rdkit_free_sasa", sasa) 77 | return sasa 78 | 79 | runner = JobRunner(n_jobs=n_jobs) 80 | sasa_values = runner(_get_sasa, conf_ids) 81 | return np.array(sasa_values) 82 | 83 | 84 | def get_coords(mol: Mol, conf_id: int = -1): 85 | """Get the coordinate of a conformer of a molecule. 86 | 87 | Args: 88 | mol: a molecule. 89 | conf_id: a conformer id. 90 | """ 91 | 92 | if mol.GetNumConformers() == 0: 93 | raise ValueError("Molecule does not have any conformers.") 94 | 95 | conf = mol.GetConformer(id=conf_id) 96 | return conf.GetPositions() 97 | 98 | 99 | def center_of_mass( 100 | mol: Mol, 101 | use_atoms: bool = True, 102 | digits: Optional[int] = None, 103 | conf_id: int = -1, 104 | ) -> np.ndarray: 105 | """Compute the center of mass of a conformer of a molecule. 106 | 107 | Args: 108 | mol: a molecule 109 | use_atoms: Whether to compute the true center of mass or the geometrical center. 110 | digits: Number of digits to round to. 111 | conf_id: the conformer id. 112 | 113 | Returns 114 | cm: Center of mass or geometrical center 115 | """ 116 | coords = get_coords(mol, conf_id=conf_id) 117 | atom_weight = np.ones((coords.shape[0])) 118 | 119 | if use_atoms: 120 | atom_weight = np.array([atom.GetMass() for atom in mol.GetAtoms()]) 121 | 122 | atom_weight = atom_weight[:, None] 123 | atom_weight /= atom_weight.sum() 124 | center = (coords * atom_weight).sum(axis=0) 125 | 126 | if digits is not None: 127 | center = center.round(digits) 128 | 129 | return center 130 | 131 | 132 | def keep_conformers( 133 | mol: Mol, 134 | indices_to_keep: Union[int, List[int]] = -1, 135 | assign_id: bool = True, 136 | copy: bool = True, 137 | ): 138 | """Keep on the specified conformer(s) in `indices_to_keep`. 
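
    Example:

        A minimal usage sketch; it assumes conformers were generated first
        (e.g. with `dm.conformers.generate`), so that indices 0 and 1 exist:

        ```python
        mol = dm.to_mol("O=C(C)Oc1ccccc1C(=O)O")
        mol = dm.conformers.generate(mol)
        mol = dm.conformers.keep_conformers(mol, indices_to_keep=[0, 1])
        mol.GetNumConformers()  # -> 2
        ```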
139 | 140 | Args: 141 | mol: A molecule. 142 | indices_to_keep: A indice or a least of indices of conformers to keep. 143 | assign_id: Whether to assign the kept conformers an id or keep the original one. 144 | copy: Whether to copy the molecule or not. 145 | """ 146 | 147 | if copy: 148 | mol = copy_mol(mol) 149 | 150 | if not isinstance(indices_to_keep, list): 151 | indices_to_keep = [indices_to_keep] 152 | 153 | # Extract conformers to keep 154 | confs_to_keep = [mol.GetConformer(conf_id) for conf_id in indices_to_keep] 155 | 156 | # Copy current mol and remove all conformers 157 | mol2 = copy_mol(mol) 158 | mol2.RemoveAllConformers() 159 | 160 | # Add conformers 161 | _ = [mol2.AddConformer(conf, assignId=assign_id) for conf in confs_to_keep] 162 | 163 | # Cleanup 164 | mol = mol2 165 | 166 | return mol 167 | -------------------------------------------------------------------------------- /datamol/data/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The data module aims to provide a fast and convenient access to various molecular datasets. 3 | 4 | --- 5 | """ 6 | 7 | from typing import Optional 8 | from typing import cast 9 | from typing import Union 10 | from typing import List 11 | from typing import overload 12 | from typing import Literal 13 | 14 | import sys 15 | import io 16 | import functools 17 | 18 | try: 19 | import importlib.resources as importlib_resources 20 | except ImportError: 21 | import importlib_resources 22 | 23 | import pandas as pd 24 | 25 | from ..types import Mol 26 | from ..io import read_sdf 27 | from ..convert import from_df 28 | from ..convert import render_mol_df 29 | 30 | 31 | @functools.lru_cache() 32 | def datamol_data_file_path(filename: str, dm_module: str = "datamol.data") -> str: 33 | if sys.version_info < (3, 9, 0): 34 | with importlib_resources.path(dm_module, filename) as p: 35 | data_path = p 36 | else: 37 | data_path = importlib_resources.files(dm_module).joinpath(filename) 38 | 39 | return str(data_path) 40 | 41 | 42 | def open_datamol_data_file( 43 | filename: str, 44 | open_binary: bool = False, 45 | dm_module: str = "datamol.data", 46 | ): 47 | if sys.version_info < (3, 9, 0): 48 | if open_binary: 49 | file_context_manager = importlib_resources.open_binary(dm_module, filename) 50 | else: 51 | file_context_manager = importlib_resources.open_text(dm_module, filename) 52 | else: 53 | if open_binary: 54 | mode = "rb" 55 | else: 56 | mode = "r" 57 | 58 | file_context_manager = ( 59 | importlib_resources.files(dm_module).joinpath(filename).open(mode=mode) 60 | ) 61 | 62 | # NOTE(hadim): we assume the file always exists 63 | file_context_manager = cast(io.TextIOWrapper, file_context_manager) 64 | 65 | return file_context_manager 66 | 67 | 68 | @overload 69 | def freesolv(as_df: Literal[True] = True) -> pd.DataFrame: ... 70 | 71 | 72 | @overload 73 | def freesolv(as_df: Literal[False] = False) -> List[Mol]: ... 74 | 75 | 76 | @overload 77 | def freesolv(as_df: bool = True) -> Union[List[Mol], pd.DataFrame]: ... 78 | 79 | 80 | def freesolv(as_df: bool = True) -> Union[List[Mol], pd.DataFrame]: 81 | """Return the FreeSolv dataset as a dataframe. 82 | 83 | The dataset contains 642 molecules and the following columns: 84 | `['iupac', 'smiles', 'expt', 'calc']`. 85 | 86 | Warning: 87 | This dataset is only meant to be used as a toy dataset for pedagogic and 88 | testing purposes. **It is not** a dataset for benchmarking, analysis or 89 | model training. 
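
    Example:

        A short usage sketch; the column names are the ones listed above:

        ```python
        df = dm.data.freesolv()
        df.columns.tolist()
        # ['iupac', 'smiles', 'expt', 'calc']
        ```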
90 | """ 91 | 92 | with open_datamol_data_file("freesolv.csv") as f: 93 | data = pd.read_csv(f) 94 | 95 | if not as_df: 96 | data = from_df(data) 97 | 98 | return data 99 | 100 | 101 | @overload 102 | def cdk2(as_df: Literal[True] = True, mol_column: Optional[str] = "mol") -> pd.DataFrame: ... 103 | 104 | 105 | @overload 106 | def cdk2(as_df: Literal[False] = False, mol_column: Optional[str] = "mol") -> List[Mol]: ... 107 | 108 | 109 | @overload 110 | def cdk2( 111 | as_df: bool = True, mol_column: Optional[str] = "mol" 112 | ) -> Union[List[Mol], pd.DataFrame]: ... 113 | 114 | 115 | def cdk2(as_df: bool = True, mol_column: Optional[str] = "mol"): 116 | """Return the RDKit CDK2 dataset from `RDConfig.RDDocsDir, 'Book/data/cdk2.sdf'`. 117 | 118 | Args: 119 | as_df: Whether to return a list mol or a pandas DataFrame. 120 | mol_column: Name of the mol column. Only relevant if `as_df` is True. 121 | """ 122 | 123 | with open_datamol_data_file("cdk2.sdf", open_binary=True) as f: 124 | data = read_sdf(f, as_df=as_df, mol_column=mol_column) 125 | return data 126 | 127 | 128 | @overload 129 | def solubility(as_df: Literal[True] = True, mol_column: Optional[str] = "mol") -> pd.DataFrame: ... 130 | 131 | 132 | @overload 133 | def solubility(as_df: Literal[False] = False, mol_column: Optional[str] = "mol") -> List[Mol]: ... 134 | 135 | 136 | @overload 137 | def solubility( 138 | as_df: bool = True, mol_column: Optional[str] = "mol" 139 | ) -> Union[List[Mol], pd.DataFrame]: ... 140 | 141 | 142 | def solubility(as_df: bool = True, mol_column: Optional[str] = "mol"): 143 | """Return the RDKit solubility dataset from `RDConfig.RDDocsDir, 'Book/data/solubility.{train|test}.sdf'`. 144 | 145 | The dataframe or the list of molecules with contain a `split` column, either `train` or `test`. 146 | 147 | Args: 148 | as_df: Whether to return a list mol or a pandas DataFrame. 149 | mol_column: Name of the mol column. Only relevant if `as_df` is True. 150 | """ 151 | 152 | with open_datamol_data_file("solubility.train.sdf", open_binary=True) as f: 153 | train = read_sdf(f, as_df=True, mol_column="mol", smiles_column=None) 154 | 155 | with open_datamol_data_file("solubility.test.sdf", open_binary=True) as f: 156 | test = read_sdf(f, as_df=True, mol_column="mol", smiles_column=None) 157 | 158 | train = cast(pd.DataFrame, train) 159 | test = cast(pd.DataFrame, test) 160 | 161 | train["split"] = "train" 162 | test["split"] = "test" 163 | 164 | # NOTE(hadim): LMAO RDkit consistency xD 165 | test = test.rename(columns={"SMILES": "smiles"}) 166 | 167 | data = pd.concat([train, test], ignore_index=True) 168 | 169 | if as_df: 170 | if mol_column is None: 171 | data = data.drop(columns=["mol"]) 172 | 173 | render_mol_df(data) 174 | return data 175 | 176 | return from_df(data, mol_column=mol_column) 177 | 178 | 179 | @overload 180 | def chembl_drugs(as_df: Literal[True] = True) -> pd.DataFrame: ... 181 | 182 | 183 | @overload 184 | def chembl_drugs(as_df: Literal[False] = False) -> List[Mol]: ... 185 | 186 | 187 | def chembl_drugs(as_df: bool = True) -> Union[List[Mol], pd.DataFrame]: 188 | """A list of ~2.5k molecules from ChEMBL (all approved drugs) in SMILES format. 189 | Includes metadata indicating year of first approval, molecule chembl id, molecule type and pref_name. 190 | 191 | List was generated with ['Get_ChEMBL_Approved_Drugs.ipynb'](https://github.com/datamol-io/datamol/notebooks/Get_ChEMBL_Approved_Drugs.ipynb) on 2023-10-18. 
192 | The notebook works with the chembl_webresource_client api to collect chembl IDs and metadata, then focuses on small molecules with valid SMILES and first approval date. 193 | """ 194 | with open_datamol_data_file("chembl_approved_drugs.parquet", open_binary=True) as f: 195 | data = pd.read_parquet(f) 196 | 197 | if not as_df: 198 | data = from_df(data) 199 | 200 | return data 201 | 202 | 203 | @overload 204 | def chembl_samples(as_df: Literal[True] = True) -> pd.DataFrame: ... 205 | 206 | 207 | @overload 208 | def chembl_samples(as_df: Literal[False] = False) -> List[Mol]: ... 209 | 210 | 211 | def chembl_samples(as_df: bool = True) -> Union[List[Mol], pd.DataFrame]: 212 | """A list of ~2k molecules from ChEMBL. 213 | 214 | Originally, proposed by Patrick Walters at . 215 | """ 216 | 217 | with open_datamol_data_file("chembl_samples.csv") as f: 218 | data = pd.read_csv(f) 219 | 220 | if not as_df: 221 | data = from_df(data) 222 | 223 | return data 224 | -------------------------------------------------------------------------------- /datamol/data/chembl_approved_drugs.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamol-io/datamol/0312388b956e2b4eeb72d791167cfdb873c7beab/datamol/data/chembl_approved_drugs.parquet -------------------------------------------------------------------------------- /datamol/descriptors/__init__.py: -------------------------------------------------------------------------------- 1 | from .descriptors import mw 2 | from .descriptors import fsp3 3 | from .descriptors import n_hba 4 | from .descriptors import n_hbd 5 | from .descriptors import n_lipinski_hba 6 | from .descriptors import n_lipinski_hbd 7 | from .descriptors import n_rings 8 | from .descriptors import n_hetero_atoms 9 | from .descriptors import n_heavy_atoms 10 | from .descriptors import n_rotatable_bonds 11 | from .descriptors import n_radical_electrons 12 | from .descriptors import tpsa 13 | from .descriptors import qed 14 | from .descriptors import clogp 15 | from .descriptors import sas 16 | from .descriptors import n_NHOH 17 | from .descriptors import n_NO 18 | from .descriptors import formal_charge 19 | from .descriptors import n_aliphatic_carbocycles 20 | from .descriptors import n_aliphatic_heterocyles 21 | from .descriptors import n_aliphatic_rings 22 | from .descriptors import n_aromatic_carbocycles 23 | from .descriptors import n_aromatic_heterocyles 24 | from .descriptors import n_aromatic_rings 25 | from .descriptors import n_saturated_carbocycles 26 | from .descriptors import n_saturated_heterocyles 27 | from .descriptors import n_saturated_rings 28 | from .descriptors import n_aromatic_atoms 29 | from .descriptors import n_aromatic_atoms_proportion 30 | from .descriptors import refractivity 31 | from .descriptors import n_rigid_bonds 32 | from .descriptors import n_stereo_centers 33 | from .descriptors import n_charged_atoms 34 | from .descriptors import n_stereo_centers_unspecified 35 | from .descriptors import n_spiro_atoms 36 | 37 | from .compute import any_rdkit_descriptor 38 | from .compute import compute_many_descriptors 39 | from .compute import batch_compute_many_descriptors 40 | -------------------------------------------------------------------------------- /datamol/descriptors/compute.py: -------------------------------------------------------------------------------- 1 | from typing import Callable 2 | from typing import Dict 3 | from typing import List 4 | from typing import Union 5 | from 
typing import Optional 6 | 7 | import functools 8 | 9 | import pandas as pd 10 | 11 | from rdkit.Chem import Descriptors 12 | from rdkit.Chem import rdMolDescriptors 13 | 14 | from .. import Mol 15 | from ..utils.jobs import parallelized 16 | 17 | from .descriptors import mw 18 | from .descriptors import fsp3 19 | from .descriptors import n_lipinski_hba 20 | from .descriptors import n_lipinski_hbd 21 | from .descriptors import n_rings 22 | from .descriptors import n_hetero_atoms 23 | from .descriptors import n_heavy_atoms 24 | from .descriptors import n_rotatable_bonds 25 | from .descriptors import n_radical_electrons 26 | from .descriptors import tpsa 27 | from .descriptors import qed 28 | from .descriptors import clogp 29 | from .descriptors import sas 30 | from .descriptors import n_aliphatic_carbocycles 31 | from .descriptors import n_aliphatic_heterocyles 32 | from .descriptors import n_aliphatic_rings 33 | from .descriptors import n_aromatic_carbocycles 34 | from .descriptors import n_aromatic_heterocyles 35 | from .descriptors import n_aromatic_rings 36 | from .descriptors import n_saturated_carbocycles 37 | from .descriptors import n_saturated_heterocyles 38 | from .descriptors import n_saturated_rings 39 | 40 | 41 | def any_rdkit_descriptor(name: str) -> Callable: 42 | """Return a descriptor function by name either from 43 | `rdkit.Chem import Descriptors` or `rdkit.Chem.rdMolDescriptors`. 44 | 45 | Args: 46 | name: Descriptor name. 47 | """ 48 | fn = getattr(Descriptors, name, None) 49 | 50 | if fn is None: 51 | fn = getattr(rdMolDescriptors, name, None) 52 | 53 | if fn is None: 54 | raise ValueError(f"Descriptor {name} not found.") 55 | 56 | return fn 57 | 58 | 59 | _DEFAULT_PROPERTIES_FN = { 60 | "mw": mw, 61 | "fsp3": fsp3, 62 | "n_lipinski_hba": n_lipinski_hba, 63 | "n_lipinski_hbd": n_lipinski_hbd, 64 | "n_rings": n_rings, 65 | "n_hetero_atoms": n_hetero_atoms, 66 | "n_heavy_atoms": n_heavy_atoms, 67 | "n_rotatable_bonds": n_rotatable_bonds, 68 | "n_radical_electrons": n_radical_electrons, 69 | "tpsa": tpsa, 70 | "qed": qed, 71 | "clogp": clogp, 72 | "sas": sas, 73 | "n_aliphatic_carbocycles": n_aliphatic_carbocycles, 74 | "n_aliphatic_heterocyles": n_aliphatic_heterocyles, 75 | "n_aliphatic_rings": n_aliphatic_rings, 76 | "n_aromatic_carbocycles": n_aromatic_carbocycles, 77 | "n_aromatic_heterocyles": n_aromatic_heterocyles, 78 | "n_aromatic_rings": n_aromatic_rings, 79 | "n_saturated_carbocycles": n_saturated_carbocycles, 80 | "n_saturated_heterocyles": n_saturated_heterocyles, 81 | "n_saturated_rings": n_saturated_rings, 82 | } 83 | 84 | 85 | def compute_many_descriptors( 86 | mol: Mol, 87 | properties_fn: Optional[Dict[str, Union[Callable, str]]] = None, 88 | add_properties: bool = True, 89 | ) -> dict: 90 | """Compute a list of opiniated molecular properties. 91 | 92 | Args: 93 | mol: A molecule. 94 | properties_fn: A list of functions that compute properties. If None, 95 | a default list of properties is used. If the function is a string, 96 | `dm.descriptors.any_descriptor()` is used to retrieve the descriptor 97 | function. 98 | add_properties: Whether to add the computed properties to the default list. 99 | 100 | Returns: 101 | Computed properties as a dict. 
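
    Example:

        A minimal sketch using the default descriptor set (keys such as
        `mw`, `tpsa` and `clogp` come from the default property list):

        ```python
        mol = dm.to_mol("CCO")
        props = dm.descriptors.compute_many_descriptors(mol)
        round(props["mw"], 2)  # -> 46.04 (exact molecular weight of ethanol)
        ```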
102 | """ 103 | 104 | if properties_fn is None: 105 | properties_fn = _DEFAULT_PROPERTIES_FN 106 | elif add_properties: 107 | [properties_fn.setdefault(k, v) for k, v in _DEFAULT_PROPERTIES_FN.items()] 108 | 109 | props = {} 110 | for k, v in properties_fn.items(): 111 | if isinstance(v, str): 112 | v = any_rdkit_descriptor(v) 113 | 114 | props[k] = v(mol) 115 | 116 | return props 117 | 118 | 119 | def batch_compute_many_descriptors( 120 | mols: List[Mol], 121 | properties_fn: Optional[Dict[str, Union[Callable, str]]] = None, 122 | add_properties: bool = True, 123 | n_jobs: int = 1, 124 | batch_size: Optional[int] = None, 125 | progress: bool = False, 126 | progress_leave: bool = True, 127 | ) -> pd.DataFrame: 128 | """Compute a list of opiniated molecular properties on a list of molecules. 129 | 130 | Args: 131 | mols: A list of molecules. 132 | properties_fn: A list of functions that compute properties. If None, 133 | a default list of properties is used. If the function is a string, 134 | `dm.descriptors.any_descriptor()` is used to retrieve the descriptor 135 | function. 136 | add_properties: Whether to add the computed properties to the default list. 137 | 138 | Returns: 139 | A dataframe of computed properties with one row per input molecules. 140 | """ 141 | 142 | compute_fn = functools.partial( 143 | compute_many_descriptors, 144 | properties_fn=properties_fn, 145 | add_properties=add_properties, 146 | ) 147 | 148 | props = parallelized( 149 | compute_fn, 150 | mols, 151 | batch_size=batch_size, 152 | progress=progress, 153 | n_jobs=n_jobs, 154 | tqdm_kwargs=dict(leave=progress_leave), 155 | ) 156 | return pd.DataFrame(props) 157 | -------------------------------------------------------------------------------- /datamol/descriptors/descriptors.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | from rdkit.Chem import Descriptors 5 | from rdkit.Chem import rdMolDescriptors 6 | from rdkit.Chem import RDConfig 7 | from rdkit.Chem import Lipinski 8 | from rdkit.Chem import rdmolops 9 | from rdkit.Chem import Crippen 10 | 11 | 12 | from .. import Mol 13 | from ..convert import from_smarts 14 | from ..log import no_rdkit_log 15 | from .._version import is_lower_than_current_rdkit_version 16 | 17 | 18 | @no_rdkit_log 19 | def _sasscorer(mol: Mol): 20 | sys.path.append(os.path.join(RDConfig.RDContribDir, "SA_Score")) 21 | try: 22 | import sascorer # type:ignore 23 | except ImportError: 24 | raise ImportError( 25 | "Could not import sascorer. If you installed rdkit-pypi with `pip`, please uninstall it and reinstall rdkit with `conda` or `mamba`." 
26 | ) 27 | 28 | return sascorer.calculateScore(mol) 29 | 30 | 31 | _AROMATIC_QUERY = from_smarts("a") 32 | 33 | mw = rdMolDescriptors.CalcExactMolWt 34 | fsp3 = rdMolDescriptors.CalcFractionCSP3 35 | tpsa = rdMolDescriptors.CalcTPSA 36 | qed = Descriptors.qed 37 | clogp = Descriptors.MolLogP # type: ignore 38 | sas = _sasscorer 39 | formal_charge = rdmolops.GetFormalCharge 40 | refractivity = Crippen.MolMR 41 | 42 | n_hba = rdMolDescriptors.CalcNumHBA 43 | n_hbd = rdMolDescriptors.CalcNumHBD 44 | n_lipinski_hba = rdMolDescriptors.CalcNumLipinskiHBA 45 | n_lipinski_hbd = rdMolDescriptors.CalcNumLipinskiHBD 46 | n_rings = rdMolDescriptors.CalcNumRings 47 | n_hetero_atoms = rdMolDescriptors.CalcNumHeteroatoms 48 | 49 | 50 | if is_lower_than_current_rdkit_version("2021.09"): 51 | n_heavy_atoms = Descriptors.HeavyAtomCount # type: ignore 52 | else: 53 | n_heavy_atoms = rdMolDescriptors.CalcNumHeavyAtoms 54 | 55 | n_rotatable_bonds = rdMolDescriptors.CalcNumRotatableBonds 56 | n_radical_electrons = Descriptors.NumRadicalElectrons 57 | n_NHOH = Lipinski.NHOHCount 58 | n_NO = Lipinski.NOCount 59 | n_spiro_atoms = rdMolDescriptors.CalcNumSpiroAtoms 60 | 61 | n_aliphatic_carbocycles = rdMolDescriptors.CalcNumAliphaticCarbocycles 62 | n_aliphatic_heterocyles = rdMolDescriptors.CalcNumAliphaticHeterocycles 63 | n_aliphatic_rings = rdMolDescriptors.CalcNumAliphaticRings 64 | 65 | n_aromatic_carbocycles = rdMolDescriptors.CalcNumAromaticCarbocycles 66 | n_aromatic_heterocyles = rdMolDescriptors.CalcNumAromaticHeterocycles 67 | n_aromatic_rings = rdMolDescriptors.CalcNumAromaticRings 68 | 69 | n_saturated_carbocycles = rdMolDescriptors.CalcNumSaturatedCarbocycles 70 | n_saturated_heterocyles = rdMolDescriptors.CalcNumSaturatedHeterocycles 71 | n_saturated_rings = rdMolDescriptors.CalcNumSaturatedRings 72 | 73 | 74 | def n_rigid_bonds(mol: Mol) -> int: 75 | """Compute the number of rigid bonds in a molecule. 76 | 77 | Rigid bonds are bonds that are not single and not in rings. 78 | 79 | Args: 80 | mol: A molecule. 81 | 82 | Returns: 83 | n_rigid_bonds: number of rigid bonds in the molecule 84 | """ 85 | non_rigid_bonds_count = from_smarts("*-&!@*") 86 | n_rigid_bonds = mol.GetNumBonds() - len(mol.GetSubstructMatches(non_rigid_bonds_count)) 87 | return n_rigid_bonds 88 | 89 | 90 | def n_aromatic_atoms(mol: Mol) -> int: 91 | """Calculate the number of aromatic atoms.""" 92 | matches = mol.GetSubstructMatches(_AROMATIC_QUERY) 93 | return len(matches) 94 | 95 | 96 | def n_aromatic_atoms_proportion(mol: Mol) -> int: 97 | """Calculate the aromatic proportion: # aromatic atoms/#atoms total. 98 | 99 | Args: 100 | mol: A molecule. 101 | 102 | Only heavy atoms are considered. 103 | """ 104 | return n_aromatic_atoms(mol) / mol.GetNumHeavyAtoms() 105 | 106 | 107 | def n_stereo_centers(mol: Mol) -> int: 108 | """Compute the number of stereocenters in a molecule. 109 | 110 | Args: 111 | mol: A molecule. 112 | 113 | Returns: 114 | n_stero_center: number of stereocenters in the molecule 115 | """ 116 | n = 0 117 | try: 118 | rdmolops.FindPotentialStereo(mol, cleanIt=False) 119 | n = rdMolDescriptors.CalcNumAtomStereoCenters(mol) 120 | except Exception: 121 | pass 122 | return n 123 | 124 | 125 | def n_stereo_centers_unspecified(mol: Mol) -> int: 126 | """Compute the number of unspecified stereocenters in a molecule. 127 | 128 | Args: 129 | mol: A molecule. 
130 | 131 | Returns: 132 | n_stereo_centers_unspecified: number of unspecified stereocenters in the molecule 133 | """ 134 | n = 0 135 | try: 136 | rdmolops.FindPotentialStereo(mol, cleanIt=False) 137 | n = rdMolDescriptors.CalcNumUnspecifiedAtomStereoCenters(mol) 138 | except Exception: 139 | pass 140 | return n 141 | 142 | 143 | def n_charged_atoms(mol: Mol) -> int: 144 | """Compute the number of charged atoms in a molecule. 145 | 146 | Args: 147 | mol: A molecule. 148 | 149 | Returns: 150 | n_charged_atoms: number of charged atoms in the molecule 151 | """ 152 | return sum([at.GetFormalCharge() != 0 for at in mol.GetAtoms()]) 153 | -------------------------------------------------------------------------------- /datamol/fragment/__init__.py: -------------------------------------------------------------------------------- 1 | from ._fragment import brics 2 | from ._fragment import frag 3 | from ._fragment import recap 4 | from ._fragment import anybreak 5 | from ._fragment import mmpa_frag 6 | from ._fragment import mmpa_cut 7 | 8 | from ._assemble import assemble_fragment_order 9 | from ._assemble import break_mol 10 | from ._assemble import build 11 | -------------------------------------------------------------------------------- /datamol/fragment/_fragment.py: -------------------------------------------------------------------------------- 1 | from typing import Set 2 | from typing import Optional 3 | from typing import Any 4 | 5 | from rdkit import Chem 6 | from rdkit.Chem import BRICS 7 | from rdkit.Chem import Recap 8 | from rdkit.Chem import rdMMPA 9 | 10 | from rdkit.Chem.Fraggle import FraggleSim 11 | 12 | import datamol as dm 13 | 14 | 15 | def brics( 16 | mol: Chem.rdchem.Mol, 17 | singlepass: bool = True, 18 | remove_parent: bool = False, 19 | sanitize: bool = True, 20 | fix: bool = True, 21 | ): 22 | """Run BRICS on the molecule and potentially fix dummy atoms. 23 | 24 | Args: 25 | mol: a molecule. 26 | singlepass: Single pass for `BRICSDecompose`. 27 | remove_parent: Remove parent from the fragments. 28 | sanitize: Whether to sanitize the fragments. 29 | fix: Whether to fix the fragments. 30 | """ 31 | frags = BRICS.BRICSDecompose(mol, returnMols=True, singlePass=singlepass) 32 | frags = list(frags) 33 | 34 | if fix: 35 | frags = [dm.fix_mol(x) for x in frags] 36 | if sanitize: 37 | frags = [dm.sanitize_mol(x) for x in frags] 38 | if remove_parent: 39 | frags.pop(0) 40 | 41 | frags = [x for x in frags if x is not None] 42 | 43 | return frags 44 | 45 | 46 | def frag( 47 | mol: Chem.rdchem.Mol, 48 | remove_parent: bool = False, 49 | sanitize: bool = True, 50 | fix: bool = True, 51 | ): 52 | """Generate all possible fragmentations of a molecule using the Fraggle algorithm. 53 | 54 | Args: 55 | mol: a molecule. 56 | remove_parent: Remove parent from the fragments. 57 | sanitize: Whether to sanitize the fragments. 58 | fix: Whether to fix the fragments.
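To make the fragmentation API above concrete, here is a small usage sketch. It assumes the functions are used through the `datamol.fragment` namespace re-exported in the `__init__.py` above; the input SMILES is illustrative.

```python
import datamol as dm
from datamol.fragment import brics, anybreak

mol = dm.to_mol("CC(=O)Nc1ccc(O)cc1")  # paracetamol (illustrative)

# BRICS decomposition, with dummy-atom fixing and sanitization enabled by default
frags = brics(mol)
print(len(frags), [dm.to_smiles(f) for f in frags[:3]])

# `anybreak` tries BRICS first and falls back to the Fraggle-based `frag`
frags = anybreak(mol, remove_parent=True)
```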
59 | """ 60 | frags = FraggleSim.generate_fraggle_fragmentation(mol) 61 | 62 | smiles = set([]) 63 | for seq in frags: 64 | smiles |= {s.strip() for s in seq.split(".")} 65 | 66 | smiles = list(sorted(smiles, reverse=True)) 67 | frags = [dm.to_mol(s) for s in smiles] 68 | 69 | if fix: 70 | frags = [dm.fix_mol(x) for x in frags] 71 | if sanitize: 72 | frags = [dm.sanitize_mol(x) for x in frags] 73 | 74 | frags = [x for x in frags if x is not None] 75 | 76 | if remove_parent: 77 | return frags 78 | return [mol] + frags 79 | 80 | 81 | def recap( 82 | mol: Chem.rdchem.Mol, 83 | remove_parent: bool = False, 84 | sanitize: bool = True, 85 | fix: bool = True, 86 | ): 87 | """Fragment the molecule using the recap algorithm. 88 | 89 | Args: 90 | mol: a molecule. 91 | remove_parent: Remove parent from the fragments. 92 | sanitize: Wether to sanitize the fragments. 93 | fix: Wether to fix the fragments. 94 | """ 95 | res = Recap.RecapDecompose(mol) 96 | frags = [dm.to_mol(x) for x in res.GetAllChildren().keys()] 97 | 98 | if fix: 99 | frags = [dm.fix_mol(x) for x in frags] 100 | if sanitize: 101 | frags = [dm.sanitize_mol(x) for x in frags] 102 | 103 | frags = [x for x in frags if x is not None] 104 | 105 | if remove_parent: 106 | return frags 107 | return [mol] + frags 108 | 109 | 110 | def anybreak( 111 | mol: Chem.rdchem.Mol, 112 | remove_parent: bool = False, 113 | sanitize: bool = True, 114 | fix: bool = True, 115 | ): 116 | """Fragment molecule by applying brics first, then fall back to frag. 117 | 118 | Args: 119 | mol: a molecule. 120 | remove_parent: Remove parent from the fragments. 121 | sanitize: Wether to sanitize the fragments. 122 | fix: Wether to fix the fragments. 123 | """ 124 | frags = [] 125 | try: 126 | frags = brics(mol, fix=fix, remove_parent=remove_parent, sanitize=sanitize) 127 | except Exception: 128 | pass 129 | 130 | if len(frags) == 0: 131 | frags = frag(mol, remove_parent=remove_parent, sanitize=sanitize, fix=fix) 132 | 133 | return frags 134 | 135 | 136 | def mmpa_frag( 137 | mol: dm.Mol, 138 | pattern: Optional[str] = None, 139 | max_cut: int = 1, 140 | max_bond_cut: int = 20, 141 | h_split: bool = False, 142 | ) -> Optional[Set[dm.Mol]]: 143 | """Fragment molecule on specific bonds suitable for a MMPA analysis. 144 | 145 | Args: 146 | mol: Molecule to fragment. 147 | pattern: Bond pattern to split on. Will use default rdkit pattern 148 | '[#6+0;!$(*=,#[!#6])]!@!=!#[*]' if not provided. 149 | max_cut: Number of cuts. 150 | max_bond_cut: Maximum number of bond to cut. Default to 20. 151 | h_split: Whether to split at hydrogen position too. 152 | This is equivalent to enabling the addition of new fragments. 153 | 154 | Returns: 155 | List of fragments. 156 | """ 157 | 158 | frags = [] 159 | if pattern is None: 160 | frags = rdMMPA.FragmentMol( 161 | mol, 162 | maxCuts=max_cut, 163 | resultsAsMols=False, 164 | maxCutBonds=max_bond_cut, 165 | ) 166 | elif pattern: 167 | frags = rdMMPA.FragmentMol( 168 | mol, 169 | pattern=pattern, 170 | maxCuts=max_cut, 171 | resultsAsMols=False, 172 | maxCutBonds=max_bond_cut, 173 | ) 174 | 175 | if h_split: 176 | mol = dm.add_hs(mol) 177 | frags += rdMMPA.FragmentMol( 178 | mol, 179 | pattern="[#1]!@!=!#[!#1]", 180 | maxCuts=1, 181 | resultsAsMols=False, 182 | maxCutBonds=max_bond_cut, 183 | ) 184 | return set(frags) 185 | 186 | 187 | def mmpa_cut(mol: dm.Mol, rdkit_pattern: bool = False) -> Optional[Set[Any]]: 188 | """Cut molecules to perform mmpa analysis later 189 | 190 | Args: 191 | mol: Molecule to fragment. 
192 | rdkit_pattern: Whether to perform the fragmentation 193 | using the default rdkit pattern: [#6+0;!$(*=, #[!#6])]!@!=!#[*]" 194 | 195 | Returns: 196 | List of 'smiles,core,chains' 197 | """ 198 | 199 | if mol is None: 200 | return mol 201 | 202 | outlines = set() 203 | 204 | smiles = dm.to_smiles(mol) 205 | 206 | if rdkit_pattern: 207 | frags = mmpa_frag(mol, max_cut=3, max_bond_cut=30) 208 | else: 209 | # heavy atoms 210 | frags = mmpa_frag(mol, pattern="[!#1]!@!=!#[!#1]", max_cut=4, max_bond_cut=30) 211 | frags.update(mmpa_frag(mol, pattern="[!#1]!@!=!#[!#1]", max_cut=3, max_bond_cut=30)) 212 | 213 | frags = set(frags) 214 | for core, chains in frags: 215 | output = f"{smiles},{core},{chains}\n" 216 | outlines.add(output) 217 | 218 | # hydrogen splitting 219 | mol = dm.add_hs(mol) 220 | smiles = dm.to_smiles(mol) 221 | 222 | n = mol.GetNumHeavyAtoms() 223 | if n < 60: 224 | frags = mmpa_frag(mol, pattern=None, max_cut=1, max_bond_cut=100, h_split=True) 225 | for core, chains in frags: 226 | output = f"{smiles},{core},{chains}\n" 227 | outlines.add(output) 228 | 229 | return outlines 230 | -------------------------------------------------------------------------------- /datamol/isomers/__init__.py: -------------------------------------------------------------------------------- 1 | from ._structural import IsomerEnumerator 2 | 3 | from ._enumerate import enumerate_stereoisomers 4 | from ._enumerate import enumerate_tautomers 5 | from ._enumerate import enumerate_structisomers 6 | from ._enumerate import count_stereoisomers 7 | from ._enumerate import remove_stereochemistry 8 | from ._enumerate import canonical_tautomer 9 | -------------------------------------------------------------------------------- /datamol/log.py: -------------------------------------------------------------------------------- 1 | from rdkit import RDLogger 2 | from rdkit import rdBase 3 | from functools import wraps 4 | 5 | 6 | class without_rdkit_log: 7 | """Context manager to disable RDKit logs. By default all logs are disabled. 
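The `datamol.isomers` package listed above exposes enumeration helpers; a short sketch of how they are typically called follows. It assumes the default keyword arguments of `enumerate_stereoisomers`, `enumerate_tautomers`, and `count_stereoisomers` are sufficient, and the SMILES are illustrative.

```python
import datamol as dm
from datamol.isomers import (
    count_stereoisomers,
    enumerate_stereoisomers,
    enumerate_tautomers,
)

mol = dm.to_mol("CC(O)C(N)C=O")  # two unspecified stereocenters (illustrative)

print(count_stereoisomers(mol))
stereo = enumerate_stereoisomers(mol)
print([dm.to_smiles(m) for m in stereo])

# Tautomer enumeration works the same way
tautomers = enumerate_tautomers(dm.to_mol("O=C1CCCCC1"))
print(len(tautomers))
```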
8 | 9 | Example: 10 | 11 | ```python 12 | import datamol as dm 13 | 14 | with dm.without_rdkit_log(): 15 | mol = dm.to_mol("CCCCO") # potential RDKit logs won't show 16 | ``` 17 | """ 18 | 19 | def __init__( 20 | self, 21 | mute_errors: bool = True, 22 | mute_warning: bool = True, 23 | mute_info: bool = True, 24 | mute_debug: bool = True, 25 | enable: bool = True, 26 | ): 27 | if enable is False: 28 | mute_errors = False 29 | mute_warning = False 30 | mute_info = False 31 | mute_debug = False 32 | 33 | # Get current log state 34 | self.previous_status = self._get_log_status() 35 | 36 | # Init the desired log state to apply during in the context 37 | self.desired_status = {} 38 | self.desired_status["rdApp.error"] = not mute_errors 39 | self.desired_status["rdApp.warning"] = not mute_warning 40 | self.desired_status["rdApp.debug"] = not mute_debug 41 | self.desired_status["rdApp.info"] = not mute_info 42 | 43 | def _get_log_status(self): 44 | """Get the current log status of RDKit logs.""" 45 | log_status = rdBase.LogStatus() 46 | log_status = {st.split(":")[0]: st.split(":")[1] for st in log_status.split("\n")} 47 | log_status = {k: True if v == "enabled" else False for k, v in log_status.items()} 48 | return log_status 49 | 50 | def _apply_log_status(self, log_status): 51 | """Apply an RDKit log status.""" 52 | for k, v in log_status.items(): 53 | if v is True: 54 | rdBase.EnableLog(k) 55 | else: 56 | rdBase.DisableLog(k) 57 | 58 | def __enter__(self): 59 | self._apply_log_status(self.desired_status) 60 | 61 | def __exit__(self, *args, **kwargs): 62 | self._apply_log_status(self.previous_status) 63 | 64 | 65 | def disable_rdkit_log(): 66 | """Disable all rdkit logs.""" 67 | for log_level in RDLogger._levels: 68 | rdBase.DisableLog(log_level) 69 | 70 | 71 | def enable_rdkit_log(): 72 | """Enable all rdkit logs.""" 73 | for log_level in RDLogger._levels: 74 | rdBase.EnableLog(log_level) 75 | 76 | 77 | def no_rdkit_log( 78 | func=None, 79 | *, 80 | mute_errors: bool = True, 81 | mute_warning: bool = True, 82 | mute_info: bool = True, 83 | mute_debug: bool = True, 84 | enable: bool = True, 85 | ): 86 | """Decorator to disable RDKit logs. 87 | 88 | This decorator can be used to suppress RDKit logs when executing a specific function. 89 | By default, all log levels (error, warning, info, and debug) are muted. 90 | 91 | Args: 92 | mute_errors : Whether to mute error logs (default is True). 93 | mute_warning : Whether to mute warning logs (default is True). 94 | mute_info : Whether to mute info logs (default is True). 95 | mute_debug : Whether to mute debug logs (default is True). 96 | enable: Whether to enable the log muting (default is True). If set to False, no logs will be muted. 
97 | 98 | Example: 99 | ```python 100 | @no_rdkit_log() 101 | def example_function(): 102 | # Your function code here 103 | pass 104 | 105 | example_function() # RDKit logs won't show during this function's execution 106 | ``` 107 | """ 108 | 109 | if func is None: 110 | return lambda f: no_rdkit_log( 111 | f, 112 | mute_errors=mute_errors, 113 | mute_warning=mute_warning, 114 | mute_info=mute_info, 115 | mute_debug=mute_debug, 116 | enable=enable, 117 | ) 118 | 119 | @wraps(func) 120 | def wrapper(*args, **kwargs): 121 | with without_rdkit_log(mute_errors, mute_warning, mute_info, mute_debug, enable): 122 | return func(*args, **kwargs) 123 | 124 | return wrapper 125 | -------------------------------------------------------------------------------- /datamol/mcs.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from typing import Any 3 | 4 | from rdkit.Chem import rdFMCS 5 | 6 | import datamol as dm 7 | 8 | ALLOWED_ATOM_COMPARE = ["CompareAny", "CompareAnyHeavyAtom", "CompareElements", "CompareIsotopes"] 9 | ALLOWED_BOND_COMPARE = ["CompareAny", "CompareOrder", "CompareOrderExact"] 10 | ALLOWED_RING_COMPARE = ["IgnoreRingFusion", "PermissiveRingFusion", "StrictRingFusion"] 11 | 12 | 13 | def find_mcs( 14 | mols: List[dm.Mol], 15 | maximize_bonds: bool = True, 16 | threshold: float = 0.0, 17 | timeout: int = 5, 18 | verbose: bool = False, 19 | match_valences: bool = False, 20 | ring_matches_ring_only: bool = True, 21 | complete_rings_only: bool = False, 22 | match_chiral_tag: bool = False, 23 | seed_smarts: str = "", 24 | atom_compare: str = "CompareElements", 25 | bond_compare: str = "CompareOrder", 26 | ring_compare: str = "IgnoreRingFusion", 27 | with_details: bool = False, 28 | **kwargs: Any, 29 | ): 30 | """Find the maximum common substructure from a list of molecules. 31 | 32 | Args: 33 | mols: List of molecules. 34 | maximize_bonds: Maximize the number of bonds in the substructure. 35 | threshold: The threshold for the MCS (between 0 and 1). 36 | timeout: The timeout for the MCS. 37 | verbose: Whether to enable verbose mode. 38 | match_valences: Whether to match valences. 39 | ring_matches_ring_only: Whether to match rings only. 40 | complete_rings_only: Whether to match complete rings only. 41 | match_chiral_tag: Whether to match chiral tags. 42 | seed_smarts: The seed SMARTS. 43 | atom_compare: One of "CompareAny", "CompareAnyHeavyAtom", "CompareElements", 44 | "CompareIsotopes". 45 | bond_compare: One of "CompareAny", "CompareOrder", "CompareOrderExact". 46 | ring_compare: One of "IgnoreRingFusion", "PermissiveRingFusion", "StrictRingFusion". 47 | with_details: Whether to return the RDKit MCS object or just the SMARTS string. 48 | **kwargs: Additional arguments for the MCS. 
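For reference, a minimal sketch of how `find_mcs` is typically called. The import goes through `datamol.mcs` as defined in this file; the molecules and the `timeout` value are illustrative.

```python
import datamol as dm
from datamol.mcs import find_mcs

mols = [
    dm.to_mol("c1ccccc1CC(=O)O"),
    dm.to_mol("c1ccccc1CCC(=O)O"),
    dm.to_mol("c1ccccc1OCC(=O)O"),
]

# Returns the MCS as a SMARTS string (or None if nothing is found)
smarts = find_mcs(mols, timeout=2)
print(smarts)

# With `with_details=True` the full RDKit result object is returned instead
result = find_mcs(mols, with_details=True)
print(result.numAtoms, result.numBonds)
```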
49 | """ 50 | 51 | if atom_compare not in ALLOWED_ATOM_COMPARE: 52 | raise ValueError(f"atom_compare must be one of {ALLOWED_ATOM_COMPARE}") 53 | 54 | if bond_compare not in ALLOWED_BOND_COMPARE: 55 | raise ValueError(f"bond_compare must be one of {ALLOWED_BOND_COMPARE}") 56 | 57 | if ring_compare not in ALLOWED_RING_COMPARE: 58 | raise ValueError(f"ring_compare must be one of {ALLOWED_RING_COMPARE}") 59 | 60 | args = {} 61 | args["maximizeBonds"] = maximize_bonds 62 | args["threshold"] = threshold 63 | args["timeout"] = timeout 64 | args["verbose"] = verbose 65 | args["matchValences"] = match_valences 66 | args["ringMatchesRingOnly"] = ring_matches_ring_only 67 | args["completeRingsOnly"] = complete_rings_only 68 | args["matchChiralTag"] = match_chiral_tag 69 | args["seedSmarts"] = seed_smarts 70 | args["atomCompare"] = rdFMCS.AtomCompare.names[atom_compare] 71 | args["bondCompare"] = rdFMCS.BondCompare.names[bond_compare] 72 | args["ringCompare"] = rdFMCS.RingCompare.names[ring_compare] 73 | 74 | args.update(kwargs) 75 | 76 | mcs = rdFMCS.FindMCS(mols, **args) 77 | 78 | if with_details: 79 | return mcs 80 | 81 | smarts = mcs.smartsString 82 | if smarts == "": 83 | smarts = None 84 | return smarts 85 | -------------------------------------------------------------------------------- /datamol/molar.py: -------------------------------------------------------------------------------- 1 | """A set of utility functions to convert between various units and formats used in drug discovery. 2 | """ 3 | 4 | from typing import Union 5 | from typing import Iterable 6 | 7 | import numpy as np 8 | 9 | 10 | _MOLAR_SCALES = {"M": 1, "mM": 1e-3, "uM": 1e-6, "nM": 1e-9, "pM": 1e-12, "fM": 1e-15} 11 | 12 | 13 | def molar_to_log( 14 | values: Union[float, Iterable[float], np.ndarray], 15 | unit: str, 16 | ) -> Union[float, Iterable[float], np.ndarray]: 17 | """Convert a molar concentration (XC50 for example) to its log scaled value (pXC50). 18 | 19 | Args: 20 | values: A molar concentration (can be a scalar, a list or an array). 21 | unit: The unit of the input concentration. Choose from: 22 | `{'M', 'fM', 'mM', 'nM', 'pM', 'uM'}`. 23 | """ 24 | 25 | if unit not in _MOLAR_SCALES: 26 | raise ValueError( 27 | f"The unit '{unit}' is not supported. Choose from {set(_MOLAR_SCALES.keys())}." 28 | ) 29 | 30 | return -1 * np.log10(np.array(values) * _MOLAR_SCALES[unit]) 31 | 32 | 33 | def log_to_molar( 34 | values: Union[float, Iterable[float], np.ndarray], 35 | unit: str, 36 | ) -> Union[float, Iterable[float], np.ndarray]: 37 | """Convert a log-scaled molar concentration (pXC50 for example) to its unscaled value (XC50). 38 | 39 | Args: 40 | values: A log-scaled molar concentration (can be a scalar, a list or an array). 41 | unit: The unit of the input concentration. Choose from: 42 | `{'M', 'fM', 'mM', 'nM', 'pM', 'uM'}`. 43 | """ 44 | 45 | if unit not in _MOLAR_SCALES: 46 | raise ValueError( 47 | f"The unit '{unit}' is not supported. Choose from {set(_MOLAR_SCALES.keys())}." 
48 | ) 49 | 50 | return 10 ** (-1 * np.array(values, dtype="float")) / _MOLAR_SCALES[unit] 51 | -------------------------------------------------------------------------------- /datamol/predictors/__init__.py: -------------------------------------------------------------------------------- 1 | from .esol import esol 2 | from .esol import esol_from_data 3 | -------------------------------------------------------------------------------- /datamol/predictors/esol.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | import pandas as pd 4 | 5 | 6 | from .. import Mol 7 | 8 | from ..descriptors.descriptors import clogp 9 | from ..descriptors.descriptors import mw 10 | from ..descriptors.descriptors import n_rotatable_bonds 11 | from ..descriptors.descriptors import n_aromatic_atoms_proportion 12 | 13 | 14 | _ESOL_INTERCEPT = 0.26121066137801696 15 | _ESOL_COEF = { 16 | "mw": -0.0066138847738667125, 17 | "clogp": -0.7416739523408995, 18 | "n_rotatable_bonds": 0.003451545565957996, 19 | "n_aromatic_atoms_proportion": -0.42624840441316975, 20 | } 21 | 22 | 23 | def esol(mol: Mol): 24 | """Compute the solubility descriptor ESOL. 25 | 26 | Note that the intermediate descriptors will be computed on-the-fly. If you prefer 27 | precomputing those then you can use `esol_from_data`. 28 | 29 | Source: https://github.com/PatWalters/solubility/blob/d1536c58afe5e0e7ac4c96e2ffef496d5b98664b/esol.py 30 | """ 31 | 32 | esol = ( 33 | _ESOL_INTERCEPT 34 | + _ESOL_COEF["clogp"] * clogp(mol) 35 | + _ESOL_COEF["mw"] * mw(mol) 36 | + _ESOL_COEF["n_rotatable_bonds"] * n_rotatable_bonds(mol) 37 | + _ESOL_COEF["n_aromatic_atoms_proportion"] * n_aromatic_atoms_proportion(mol) 38 | ) 39 | 40 | return esol 41 | 42 | 43 | def esol_from_data(data: Union[pd.Series, pd.DataFrame, dict]): 44 | """Compute the solubility descriptor ESOL. 45 | 46 | `data` must contains the following intermediate descriptors: 47 | 48 | - `clogp`: `dm.descriptors.clogp` 49 | - `mw`: `dm.descriptors.mw` 50 | - `n_rotatable_bonds`: `dm.descriptors.n_rotatable_bonds` 51 | - `n_aromatic_atoms_proportion`: `dm.descriptors.n_aromatic_atoms_proportion` 52 | 53 | Source: https://github.com/PatWalters/solubility/blob/d1536c58afe5e0e7ac4c96e2ffef496d5b98664b/esol.py 54 | 55 | Args: 56 | data: A dataframe or series containing the intermediate descriptors. 
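To illustrate the two entry points of the ESOL predictor defined in this file, a minimal sketch follows. The descriptor imports mirror the ones used by `esol.py` itself; the molecule is illustrative.

```python
import datamol as dm
from datamol.predictors import esol, esol_from_data
from datamol.descriptors.descriptors import (
    clogp,
    mw,
    n_rotatable_bonds,
    n_aromatic_atoms_proportion,
)

mol = dm.to_mol("CC(=O)Oc1ccccc1C(=O)O")  # aspirin (illustrative)

# Intermediate descriptors computed on the fly
print(esol(mol))

# Or from precomputed intermediate descriptors
data = {
    "clogp": clogp(mol),
    "mw": mw(mol),
    "n_rotatable_bonds": n_rotatable_bonds(mol),
    "n_aromatic_atoms_proportion": n_aromatic_atoms_proportion(mol),
}
print(esol_from_data(data))
```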
57 | """ 58 | 59 | esol = ( 60 | _ESOL_INTERCEPT 61 | + _ESOL_COEF["clogp"] * data["clogp"] 62 | + _ESOL_COEF["mw"] * data["mw"] 63 | + _ESOL_COEF["n_rotatable_bonds"] * data["n_rotatable_bonds"] 64 | + _ESOL_COEF["n_aromatic_atoms_proportion"] * data["n_aromatic_atoms_proportion"] 65 | ) 66 | 67 | return esol 68 | -------------------------------------------------------------------------------- /datamol/reactions/__init__.py: -------------------------------------------------------------------------------- 1 | from ._reactions import is_reaction_ok 2 | from ._reactions import select_reaction_output 3 | from ._reactions import apply_reaction 4 | from ._reactions import can_react 5 | from ._reactions import inverse_reaction 6 | from ._reactions import find_reactant_position 7 | from ._reactions import ATTACHING_RXN 8 | from ._reactions import rxn_from_smarts 9 | from ._reactions import rxn_to_smarts 10 | from ._reactions import rxn_from_block 11 | from ._reactions import rxn_from_block_file 12 | from ._reactions import rxn_to_block 13 | from ._reactions import rxn_to_block_file 14 | 15 | from ._attachments import add_brackets_to_attachment_points 16 | from ._attachments import convert_attach_to_isotope 17 | from ._attachments import num_attachment_points 18 | from ._attachments import open_attach_points 19 | -------------------------------------------------------------------------------- /datamol/reactions/_attachments.py: -------------------------------------------------------------------------------- 1 | from typing import cast 2 | from typing import Union 3 | 4 | import re 5 | import operator 6 | 7 | import datamol as dm 8 | from rdkit import Chem 9 | 10 | ATTACHMENT_POINT_TOKEN = "*" 11 | ATTACHMENT_POINT_NUM_REGEXP = r"\[{}:?(\d*)\]".format(re.escape(ATTACHMENT_POINT_TOKEN)) 12 | ATTACHMENT_POINT_REGEXP = r"(?:{0}|\[{0}[^\]]*\])".format(re.escape(ATTACHMENT_POINT_TOKEN)) 13 | ATTACHMENT_POINT_NO_BRACKETS_REGEXP = r"(? str: 20 | """ 21 | Adds brackets to the attachment points (if they don't have them). 22 | Example: "CC(C)CO*" to "CC(C)CO[*]" 23 | 24 | Args: 25 | smiles: A smiles string. 26 | 27 | Returns: 28 | A smiles string with brackets. 29 | """ 30 | return re.sub( 31 | ATTACHMENT_POINT_NO_BRACKETS_REGEXP, 32 | "[{}]".format(ATTACHMENT_POINT_TOKEN), 33 | smiles, 34 | ) 35 | 36 | 37 | def convert_attach_to_isotope( 38 | mol_or_smiles: Union[dm.Mol, str], 39 | same_isotope: bool = False, 40 | as_smiles: bool = False, 41 | ) -> Union[dm.Mol, str]: 42 | """Convert attachment to isotope mapping. 43 | 44 | Examples: "O=C(NCc1cnc([*])c1)[*]" to "O=C(NCc1cnc([1*])c1)[2*]" 45 | 46 | Args: 47 | mol_or_smiles: A Mol object or a smiles to be converted 48 | same_isotope: Whether convert to the same isotope. 49 | Example: "O=C(NCc1cnc([*])c1)[*]" to "O=C(NCc1cnc([1*])c1)[1*]" 50 | 51 | Returns: 52 | Converted Mol object or SMILES. 
53 | """ 54 | mol = dm.to_mol(mol_or_smiles) 55 | smiles = dm.to_smiles(mol) 56 | smiles = cast(str, smiles) 57 | 58 | smiles = add_brackets_to_attachment_points(smiles) 59 | 60 | # reg matching seems to be the most effective 61 | subs_reg = r"[\g<1>{}]" 62 | if same_isotope: 63 | subs_reg = "[1{}]" 64 | 65 | smiles = re.sub(ATTACHMENT_POINT_NUM_REGEXP, subs_reg.format(ATTACHMENT_POINT_TOKEN), smiles) 66 | 67 | if as_smiles: 68 | return smiles 69 | return dm.to_mol(smiles) 70 | 71 | 72 | def num_attachment_points(mol_or_smiles: Union[dm.Mol, str]) -> int: 73 | """ 74 | Get the number of attachment point in the 75 | 76 | Args: 77 | mol_or_smiles: A Mol object or a smiles to be converted 78 | 79 | Returns: 80 | Number of attachment points of the given molecule. 81 | """ 82 | if isinstance(mol_or_smiles, dm.Mol): 83 | mol = cast(dm.Mol, mol_or_smiles) 84 | n_points = len( 85 | [atom for atom in mol.GetAtoms() if atom.GetSymbol() == ATTACHMENT_POINT_TOKEN] 86 | ) 87 | else: 88 | n_points = len(re.findall(ATTACHMENT_POINT_REGEXP, mol_or_smiles)) 89 | 90 | return n_points 91 | 92 | 93 | def open_attach_points( 94 | mol: dm.Mol, 95 | fix_atom_map: bool = False, 96 | bond_type: dm.BondType = dm.SINGLE_BOND, 97 | ) -> dm.Mol: 98 | """Compute attachment points on a molecule. 99 | This will highlight all valid attachment point on the current molecule instead. 100 | 101 | Args: 102 | mol: A Mol object to be processed. 103 | fix_atom_map: Whether fix the atom mapping of the molecule. 104 | bond_type: The bond type to be opened. 105 | 106 | Returns: 107 | Molecule with open attachment points 108 | """ 109 | 110 | emol = Chem.rdchem.RWMol(dm.to_mol(mol)) 111 | with dm.log.without_rdkit_log(): 112 | atoms = [ 113 | (a.GetIdx(), a) 114 | for a in emol.GetAtoms() 115 | if a.GetSymbol() != ATTACHMENT_POINT_TOKEN 116 | and a.GetImplicitValence() > 0 117 | and (not a.HasProp("_protected") or a.GetProp("_protected") != "1") 118 | ] 119 | atoms.sort(reverse=True, key=operator.itemgetter(0)) 120 | 121 | for atom in atoms: 122 | new_atom = Chem.rdchem.Atom(ATTACHMENT_POINT_TOKEN) 123 | new_atom.SetAtomMapNum(1 if fix_atom_map else atom[0]) 124 | new_index = emol.AddAtom(new_atom) 125 | emol.UpdatePropertyCache(strict=False) 126 | if bond_type is not None: 127 | emol.AddBond(atom[0], new_index, bond_type) 128 | else: 129 | emol.AddBond(atom[0], new_index) 130 | 131 | mol = dm.sanitize_mol(emol) 132 | return mol 133 | -------------------------------------------------------------------------------- /datamol/scaffold/__init__.py: -------------------------------------------------------------------------------- 1 | from ._fuzzy import trim_side_chain 2 | from ._fuzzy import fuzzy_scaffolding 3 | -------------------------------------------------------------------------------- /datamol/similarity.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from typing import Optional 3 | from typing import Union 4 | from typing import Any 5 | 6 | import functools 7 | 8 | import numpy as np 9 | from sklearn.metrics import pairwise_distances_chunked 10 | from scipy.spatial import distance 11 | 12 | import datamol as dm 13 | 14 | 15 | def pdist( 16 | mols: List[Union[str, dm.Mol]], 17 | n_jobs: Optional[int] = 1, 18 | squareform: bool = True, 19 | **fp_args: Any, 20 | ) -> np.ndarray: 21 | """Compute the pairwise tanimoto distance between the fingerprints of all the 22 | molecules in the input set. 
23 | 24 | Args: 25 | mols: list of molecules 26 | n_jobs: Number of jobs for parallelization. Let to 1 for no 27 | parallelization. Set to -1 to use all available cores. 28 | squareform: Whether to return in square form (matrix) or in a condensed 29 | form (1D vector). 30 | **fp_args: list of args to pass to `to_fp()`. 31 | 32 | Returns: 33 | dist_mat 34 | """ 35 | 36 | fps = dm.parallelized( 37 | functools.partial(dm.to_fp, as_array=True, **fp_args), 38 | mols, 39 | n_jobs=n_jobs, 40 | ) 41 | 42 | fps_array = np.array(fps) 43 | 44 | dist_mat = distance.pdist(fps_array, metric="jaccard") 45 | 46 | if squareform: 47 | dist_mat = distance.squareform(dist_mat, force="tomatrix") 48 | 49 | return dist_mat 50 | 51 | 52 | def cdist( 53 | mols1: List[Union[str, dm.Mol]], 54 | mols2: List[Union[str, dm.Mol]], 55 | n_jobs: Optional[int] = 1, 56 | distances_chunk: bool = False, 57 | distances_chunk_memory: int = 1024, 58 | distances_n_jobs: int = -1, 59 | **fp_args: Any, 60 | ) -> np.ndarray: 61 | """Compute the tanimoto distance between the fingerprints of each pair of 62 | molecules of the two collections of inputs. 63 | 64 | Args: 65 | mols1: list of molecules. 66 | mols2: list of molecules. 67 | n_jobs: Number of jobs for fingerprint computation. Let to 1 for no 68 | parallelization. Set to -1 to use all available cores. 69 | distances_chunk: Whether to use chunked computation. 70 | distances_chunk_memory: Memory size in MB to use for chunked computation. 71 | distances_n_jobs: Number of jobs for parallelization. 72 | **fp_args: list of args to pass to `to_fp()`. 73 | 74 | Returns: 75 | distmat 76 | """ 77 | 78 | fps1 = dm.parallelized( 79 | functools.partial(dm.to_fp, as_array=True, **fp_args), 80 | mols1, 81 | n_jobs=n_jobs, 82 | ) 83 | 84 | fps2 = dm.parallelized( 85 | functools.partial(dm.to_fp, as_array=True, **fp_args), 86 | mols2, 87 | n_jobs=n_jobs, 88 | ) 89 | 90 | fps1_array = np.array(fps1).astype(bool) 91 | fps2_array = np.array(fps2).astype(bool) 92 | 93 | if distances_chunk: 94 | distances = pairwise_distances_chunked( 95 | fps1_array, 96 | fps2_array, 97 | metric="jaccard", 98 | n_jobs=distances_n_jobs, 99 | working_memory=distances_chunk_memory, 100 | ) 101 | distances_array = np.vstack(list(distances)) 102 | else: 103 | distances_array = distance.cdist(fps1_array, fps2_array, metric="jaccard") 104 | 105 | return distances_array 106 | -------------------------------------------------------------------------------- /datamol/types.py: -------------------------------------------------------------------------------- 1 | # NOTE(hadim): typing_extensions can be replaced by typing once we drop support for Python 3.9. 2 | from typing_extensions import TypeAlias 3 | from typing import Union 4 | from typing import Tuple 5 | 6 | from rdkit import Chem 7 | from rdkit.Chem import rdChemReactions 8 | 9 | Mol: TypeAlias = Chem.rdchem.Mol 10 | BondType: TypeAlias = Chem.rdchem.BondType 11 | ChemicalReaction: TypeAlias = rdChemReactions.ChemicalReaction 12 | Atom: TypeAlias = Chem.rdchem.Atom 13 | Bond: TypeAlias = Chem.rdchem.Bond 14 | 15 | RDKitColor = Union[Tuple[float, float, float, float], Tuple[float, float, float]] 16 | DatamolColor = Union[RDKitColor, str] 17 | -------------------------------------------------------------------------------- /datamol/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .jobs import JobRunner 2 | from .jobs import parallelized 3 | from .jobs import parallelized_with_batches 4 | 5 | from . 
import fs 6 | from . import perf 7 | 8 | from . import decorators 9 | -------------------------------------------------------------------------------- /datamol/utils/decorators.py: -------------------------------------------------------------------------------- 1 | from typing import Callable 2 | from typing import List 3 | from typing import Union 4 | 5 | import platform 6 | from functools import wraps 7 | 8 | 9 | def disable_on_os(os_names: Union[str, List[str]]): 10 | """A decorator to disable a function raising an error if the OS detected is not supported. 11 | 12 | Args: 13 | os_names: OS names to disable this function. Valid OS names are: `["linux", "osx", "win"]`. 14 | """ 15 | 16 | if isinstance(os_names, str): 17 | os_names = [os_names] 18 | 19 | valid_os_names = [] 20 | for os_name in os_names: 21 | if os_name == "linux": 22 | valid_os_names.append("Linux") 23 | elif os_name == "win": 24 | valid_os_names.append("Windows") 25 | elif os_name == "osx": 26 | valid_os_names.append("Darwin") 27 | else: 28 | valid_os_names.append(os_name) 29 | 30 | def real_decorator(function: Callable): 31 | @wraps(function) 32 | def wrapper(*args, **kwargs): 33 | if platform.system() not in valid_os_names: 34 | retval = function(*args, **kwargs) 35 | return retval 36 | else: 37 | raise NotImplementedError( 38 | f"The function {function.__name__} is not supported" 39 | f" for the platform '{platform.system()}'." 40 | ) 41 | 42 | return wrapper 43 | 44 | return real_decorator 45 | -------------------------------------------------------------------------------- /datamol/utils/perf.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | from loguru import logger 4 | 5 | 6 | duration_intervals = ( 7 | ("weeks", 604800), # 60 * 60 * 24 * 7 8 | ("days", 86400), # 60 * 60 * 24 9 | ("h", 3600), # 60 * 60 10 | ("min", 60), 11 | ("s", 1), 12 | ("ms", 1e-3), 13 | ("us", 1e-6), 14 | ) 15 | 16 | 17 | def human_duration(seconds: float, granularity: int = 1): 18 | # NOTE(hadim): far from being perfect. 19 | 20 | result = [] 21 | duration: float = seconds 22 | for name, count in duration_intervals: 23 | value = duration // count 24 | if value: 25 | duration -= value * count 26 | result.append(f"{value:.0f}{name}") 27 | return ", ".join(result[:granularity]) 28 | 29 | 30 | class watch_duration: 31 | """A Python decorator to measure execution time with logging capability. 32 | 33 | Args: 34 | log: Whether to log the measured duration. 35 | log_human_duration: Whether to log duration in a human way 36 | depending on the amount. 
37 | 38 | Example: 39 | 40 | ```python 41 | def fn(n): 42 | for i in range(n): 43 | print(i) 44 | time.sleep(0.2) 45 | 46 | with dm.utils.perf.watch_duration(log=True) as w: 47 | fn(5) 48 | 49 | print(w.duration) 50 | ``` 51 | """ 52 | 53 | def __init__(self, log: bool = True, log_human_duration: bool = True): 54 | self.log = log 55 | self.log_human_duration = log_human_duration 56 | 57 | self.start = None 58 | self.end = None 59 | self.duration = None 60 | self.duration_minutes = None 61 | 62 | def __enter__(self): 63 | self.start = time.time() 64 | return self 65 | 66 | def __exit__(self, *_): 67 | assert self.start is not None 68 | 69 | self.end = time.time() 70 | self.duration = self.end - self.start 71 | self.duration_minutes = self.duration / 60 72 | 73 | if self.log: 74 | if self.log_human_duration: 75 | logger.info(f"Duration {human_duration(self.duration)}.") 76 | else: 77 | logger.info(f"Duration {self.duration_minutes:.2f} minutes") 78 | -------------------------------------------------------------------------------- /datamol/utils/testing.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from typing import Optional 3 | from typing import Union 4 | 5 | import functools 6 | 7 | import numpy as np 8 | from scipy.spatial import distance 9 | 10 | from rdkit import Chem 11 | from rdkit.DataManip.Metric import GetTanimotoDistMat # type: ignore 12 | from rdkit.DataStructs.cDataStructs import TanimotoSimilarity 13 | 14 | import datamol as dm 15 | 16 | 17 | def pdist_rdkit( 18 | mols: List[Union[str, Chem.rdchem.Mol]], 19 | n_jobs: Optional[int] = 1, 20 | squareform: bool = True, 21 | **fp_args, 22 | ) -> np.ndarray: 23 | """Equivalent to `dm.similarity.pdist` but uses the RDKit API. 24 | 25 | Important: 26 | This function is only used for testing and shoult not be used in production. 27 | """ 28 | 29 | fps = dm.parallelized( 30 | functools.partial(dm.to_fp, as_array=False, **fp_args), 31 | mols, 32 | n_jobs=n_jobs, 33 | ) 34 | 35 | fps = list(fps) # type: ignore 36 | 37 | dist = GetTanimotoDistMat(fps) 38 | 39 | # Put in squareform: `scipy.spatial.distance.squareform` is incompatible with RDKit returned vector. 40 | dist_mat = np.zeros((len(fps), len(fps))) 41 | dist_mat[np.tril_indices_from(dist_mat, -1)] = dist 42 | dist_mat += dist_mat.T 43 | 44 | if not squareform: 45 | dist_mat = distance.squareform(dist_mat, force="tovector") 46 | 47 | return dist_mat 48 | 49 | 50 | def cdist_rdkit( 51 | mols1: List[Union[str, Chem.rdchem.Mol]], 52 | mols2: List[Union[str, Chem.rdchem.Mol]], 53 | n_jobs: Optional[int] = 1, 54 | **fp_args, 55 | ) -> np.ndarray: 56 | """Equivalent to `dm.similarity.cdist` but uses the RDKit API. 57 | 58 | Important: 59 | This function is only used for testing and shoult not be used in production. 
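Since these helpers exist to cross-check `datamol.similarity`, a typical test-style comparison might look like the sketch below. It assumes `datamol.utils.testing` is importable in the installed package; the molecules are illustrative.

```python
import numpy as np
import datamol as dm
from datamol.similarity import pdist
from datamol.utils.testing import pdist_rdkit

mols = [dm.to_mol(s) for s in ["CCO", "CCN", "c1ccccc1"]]

d_scipy = pdist(mols, squareform=True)
d_rdkit = pdist_rdkit(mols, squareform=True)

# Both code paths should produce the same Tanimoto/Jaccard distance matrix
assert np.allclose(d_scipy, d_rdkit)
```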
60 | """ 61 | 62 | fps1 = dm.parallelized( 63 | functools.partial(dm.to_fp, as_array=False, **fp_args), 64 | mols1, 65 | n_jobs=n_jobs, 66 | ) 67 | 68 | fps2 = dm.parallelized( 69 | functools.partial(dm.to_fp, as_array=False, **fp_args), 70 | mols2, 71 | n_jobs=n_jobs, 72 | ) 73 | 74 | fps1 = list(fps1) # type: ignore 75 | fps2 = list(fps2) # type: ignore 76 | 77 | dist_mat = np.zeros((len(fps1), len(fps2))) 78 | for i in range(len(fps1)): 79 | for j in range(len(fps2)): 80 | d = 1 - TanimotoSimilarity(fps1[i], fps2[j]) 81 | dist_mat[i, j] = d 82 | 83 | return dist_mat 84 | -------------------------------------------------------------------------------- /datamol/viz/__init__.py: -------------------------------------------------------------------------------- 1 | from . import utils 2 | 3 | from ._viz import to_image 4 | 5 | from ._substructure import match_substructure 6 | 7 | from ._conformers import conformers 8 | 9 | from ._circle_grid import circle_grid 10 | from ._circle_grid import MolsCircleGrid 11 | 12 | from ._lasso_highlight import lasso_highlight_image 13 | -------------------------------------------------------------------------------- /datamol/viz/_conformers.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | from typing import List 3 | from typing import Optional 4 | 5 | import copy 6 | import itertools 7 | 8 | from rdkit import Chem 9 | from rdkit.Chem import rdMolAlign 10 | 11 | 12 | def _get_nglview(): 13 | try: 14 | import nglview as nv 15 | 16 | return nv 17 | except ImportError: 18 | raise ImportError("You must install nglview from https://github.com/nglviewer/nglview.") 19 | 20 | 21 | def _get_ipywidgets(): 22 | try: 23 | import ipywidgets as widgets 24 | 25 | return widgets 26 | except ImportError: 27 | raise ImportError( 28 | "You must install ipywidgets from https://github.com/jupyter-widgets/ipywidgets/." 29 | ) 30 | 31 | 32 | def conformers( 33 | mol: Chem.rdchem.Mol, 34 | conf_id: int = -1, 35 | n_confs: Optional[Union[int, List[int]]] = None, 36 | align_conf: bool = True, 37 | n_cols: int = 3, 38 | sync_views: bool = True, 39 | remove_hs: bool = True, 40 | width: str = "auto", 41 | ): 42 | """Visualize the conformer(s) of a molecule. 43 | 44 | Args: 45 | mol: a molecule. 46 | conf_id: The ID of the conformer to show. -1 shows 47 | the first conformer. Only works if `n_confs` is None. 48 | n_confs: Can be a number of conformers 49 | to shows or a list of conformer indices. When None, only the first 50 | conformer is displayed. When -1, show all conformers. 51 | align_conf: Whether to align conformers together. 52 | n_cols: Number of columns. Defaults to 3. 53 | sync_views: Wether to sync the multiple views. 54 | remove_hs: Wether to remove the hydrogens of the conformers. 55 | width: The width of the returned view. Defaults to "auto". 56 | """ 57 | 58 | widgets = _get_ipywidgets() 59 | nv = _get_nglview() 60 | 61 | if mol.GetNumConformers() == 0: 62 | raise ValueError( 63 | "The molecule has 0 conformers. You can generate conformers with `dm.conformers.generate(mol)`." 
64 | ) 65 | 66 | # Clone the molecule 67 | mol = copy.deepcopy(mol) 68 | 69 | if remove_hs: 70 | mol = Chem.RemoveHs(mol) # type: ignore 71 | else: 72 | mol = Chem.AddHs(mol) # type: ignore 73 | 74 | if n_confs is None: 75 | return nv.show_rdkit(mol, conf_id=conf_id) 76 | 77 | # If n_confs is int, convert to list of conformer IDs 78 | if n_confs == -1: 79 | n_confs = [conf.GetId() for conf in mol.GetConformers()] 80 | elif isinstance(n_confs, int): 81 | if n_confs > mol.GetNumConformers(): 82 | n_confs = mol.GetNumConformers() 83 | n_confs = list(range(n_confs)) # type: ignore 84 | 85 | if align_conf: 86 | rdMolAlign.AlignMolConformers(mol, confIds=n_confs) 87 | 88 | # Get number of rows 89 | n_rows = len(n_confs) // n_cols 90 | n_rows += 1 if (len(n_confs) % n_cols) > 0 else 0 91 | 92 | # Create a grid 93 | grid = widgets.GridspecLayout(n_rows, n_cols) # type: ignore 94 | 95 | # Create and add views to the grid. 96 | widget_coords = itertools.product(range(n_rows), range(n_cols)) 97 | views = [] 98 | for i, (conf_id, (x, y)) in enumerate(zip(n_confs, widget_coords)): 99 | view = nv.show_rdkit(mol, conf_id=conf_id) 100 | view.layout.width = width 101 | view.layout.align_self = "stretch" 102 | grid[x, y] = view 103 | views.append(view) 104 | 105 | # Sync views 106 | if sync_views: 107 | for view in views: 108 | view._set_sync_camera(views) 109 | 110 | return grid 111 | -------------------------------------------------------------------------------- /datamol/viz/_substructure.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | from typing import Union 3 | from typing import List 4 | 5 | import datamol as dm 6 | 7 | from ._viz import to_image 8 | 9 | 10 | def match_substructure( 11 | mols: Union[List[dm.Mol], dm.Mol], 12 | queries: Union[List[dm.Mol], dm.Mol], 13 | highlight_bonds: bool = True, 14 | copy: bool = True, 15 | **kwargs: Any, 16 | ): 17 | """Generate an image of molecule(s) with substructure matches for a given 18 | pattern or substructure. 19 | 20 | Args: 21 | mols: One or more molecules. 22 | queries: One or more queries. 23 | highlight_bonds: Whether to also highlight the bonds matching the patterns. 24 | copy: Whether to copy the molecules and the queries. 25 | kwargs: Other kwargs passed to `dm.viz.to_image`. 26 | """ 27 | 28 | # NOTE(hadim): `MolsToGridImage` used in `to_image` can't use a list of list of indices 29 | # for every molecules so it's not really possible to have different colors for different 30 | # matches in the same molecules. 31 | # In the future, we will implement our custom `MolsToGridImage` in order to have more controls 32 | # on the colors used. 33 | # For the same reason, we don't bother about colors here. 
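For reference, a minimal sketch of calling `match_substructure`; the molecules, the SMARTS query, and the `mol_size` value are illustrative, and any extra keyword is forwarded to `to_image`.

```python
import datamol as dm
from datamol.viz import match_substructure

mols = [
    dm.to_mol("CC(=O)Nc1ccc(O)cc1"),
    dm.to_mol("CC(=O)Oc1ccccc1C(=O)O"),
]
query = dm.from_smarts("C(=O)")

# Returns an image (SVG by default) with matching atoms and bonds highlighted
image = match_substructure(mols, queries=query, mol_size=250)
```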
34 | 35 | if isinstance(mols, dm.Mol): 36 | mols = [mols] 37 | 38 | if isinstance(queries, dm.Mol): 39 | queries = [queries] 40 | 41 | # Copy mols and patterns 42 | if copy: 43 | mols = [dm.copy_mol(mol) for mol in mols] 44 | queries = [dm.copy_mol(mol) for mol in queries] 45 | 46 | all_atom_indices = [] 47 | all_bond_indices = [] 48 | 49 | for mol in mols: 50 | atom_indices = [] 51 | bond_indices = [] 52 | 53 | for query in queries: 54 | if highlight_bonds: 55 | atom_matches, bond_matches = dm.substructure_matching_bonds(mol, query) 56 | atom_indices += atom_matches 57 | bond_indices += bond_matches 58 | else: 59 | atom_indices += list(mol.GetSubstructMatches(query, uniquify=True)) # type: ignore 60 | bond_indices += [] 61 | 62 | # NOTE(hadim): we must flatten the atom/bond indices, since `MolsToGridImage` 63 | # don't accept multiple list of indices for every single molecule. 64 | bond_indices = [item for sublist in bond_indices for item in sublist] 65 | atom_indices = [item for sublist in atom_indices for item in sublist] 66 | 67 | all_atom_indices.append(atom_indices) 68 | all_bond_indices.append(bond_indices) 69 | 70 | image = to_image( 71 | mols, 72 | highlight_atom=all_atom_indices, 73 | highlight_bond=all_bond_indices, 74 | **kwargs, 75 | ) 76 | 77 | return image 78 | -------------------------------------------------------------------------------- /datamol/viz/_viz.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | from typing import List 3 | from typing import Tuple 4 | from typing import Optional 5 | from typing import Any 6 | from loguru import logger 7 | 8 | from rdkit.Chem import Draw 9 | 10 | import datamol as dm 11 | 12 | from .utils import prepare_mol_for_drawing 13 | from .utils import image_to_file 14 | 15 | 16 | def to_image( 17 | mols: Union[List[Union[dm.Mol, str]], dm.Mol, str], 18 | legends: Union[List[Union[str, None]], str, None] = None, 19 | n_cols: int = 4, 20 | use_svg: bool = True, 21 | mol_size: Union[Tuple[int, int], int] = (300, 300), 22 | highlight_atom: Optional[List[List[int]]] = None, 23 | highlight_bond: Optional[List[List[int]]] = None, 24 | outfile: Optional[str] = None, 25 | max_mols: int = 32, 26 | max_mols_ipython: int = 50, 27 | copy: bool = True, 28 | indices: bool = False, 29 | bond_indices: bool = False, 30 | bond_line_width: int = 2, 31 | stereo_annotations: bool = True, 32 | legend_fontsize: int = 16, 33 | kekulize: bool = True, 34 | align: Union[dm.Mol, str, bool] = False, 35 | **kwargs: Any, 36 | ): 37 | """Generate an image out of a molecule or a list of molecules. 38 | 39 | Args: 40 | mols: One or a list of molecules. 41 | legends: A string or a list of string as legend for every molecules. 42 | n_cols: Number of molecules per column. 43 | use_svg: Whether to ouput an SVG (or a PNG). 44 | mol_size: A int or a tuple of int defining the size per molecule. 45 | highlight_atom: the atoms to highlight. 46 | highlight_bond: The bonds to highlight. 47 | outfile: Path where to save the image (local or remote path). 48 | max_mols: The maximum number of molecules to display. 49 | max_mols_ipython: The maximum number of molecules to display when running within an IPython environment. 50 | copy: Whether to copy the molecules or not. 51 | indices: Whether to draw the atom indices. 52 | bond_indices: Whether to draw the bond indices. 53 | bond_line_width: The width of the bond lines. 54 | legend_fontsize: Font size for the legend. 55 | kekulize: Run kekulization routine on molecules. 
Skipped if fails. 56 | align: Whether to align the 2D coordinates of the molecules. 57 | - If set to True, align all molecules with `dm.align.auto_align_many()`. 58 | - If set to a molecule, it is used as a template for alignment with `dm.align.template_align()`. 59 | - If set to False, no alignment is performed. 60 | For a more custom alignment, we suggest using directly the module `dm.align` instead. 61 | **kwargs: Additional arguments to pass to the drawing function. See RDKit 62 | documentation related to `MolDrawOptions` for more details at 63 | https://www.rdkit.org/docs/source/rdkit.Chem.Draw.rdMolDraw2D.html. 64 | """ 65 | 66 | if isinstance(mol_size, int): 67 | mol_size = (mol_size, mol_size) 68 | 69 | if isinstance(mols, (dm.Mol, str)): 70 | mols = [mols] 71 | 72 | # Convert smiles to molecules if strings are provided as input for API consistency 73 | mols = mols[:] # avoid in place modification 74 | for i in range(len(mols)): 75 | if isinstance(mols[i], str): 76 | mols[i] = dm.to_mol(mols[i]) 77 | 78 | if isinstance(legends, str): 79 | legends = [legends] 80 | 81 | if copy: 82 | mols = [dm.copy_mol(mol) for mol in mols] 83 | 84 | if max_mols is not None: 85 | mols = mols[:max_mols] 86 | 87 | if legends is not None: 88 | legends = legends[:max_mols] 89 | 90 | # Whether to align the molecules 91 | if isinstance(align, (dm.Mol, str)): 92 | mols = [dm.align.template_align(mol, template=align) for mol in mols] 93 | elif align is True: 94 | mols = dm.align.auto_align_many(mols) 95 | 96 | # Prepare molecules before drawing 97 | mols = [prepare_mol_for_drawing(mol, kekulize=kekulize) for mol in mols] 98 | 99 | _highlight_atom = highlight_atom 100 | if highlight_atom is not None and isinstance(highlight_atom[0], int): 101 | _highlight_atom = [highlight_atom] 102 | 103 | _highlight_bond = highlight_bond 104 | if highlight_bond is not None and isinstance(highlight_bond[0], int): 105 | _highlight_bond = [highlight_bond] 106 | 107 | # Don't make the image bigger than it 108 | if len(mols) < n_cols: 109 | n_cols = len(mols) 110 | 111 | draw_options = Draw.rdMolDraw2D.MolDrawOptions() 112 | draw_options.legendFontSize = legend_fontsize 113 | draw_options.addAtomIndices = indices 114 | draw_options.addBondIndices = bond_indices 115 | draw_options.addStereoAnnotation = stereo_annotations 116 | draw_options.bondLineWidth = bond_line_width 117 | 118 | # Add the custom drawing options. 119 | _kwargs = {} 120 | for k, v in kwargs.items(): 121 | if hasattr(draw_options, k): 122 | setattr(draw_options, k, v) 123 | else: 124 | _kwargs[k] = v 125 | 126 | # Check if we are in a Jupyter notebook or IPython display context 127 | # If so, conditionally add the maxMols argument 128 | in_notebook = dm.viz.utils.is_ipython_session() 129 | 130 | if in_notebook: 131 | _kwargs["maxMols"] = max_mols_ipython 132 | if max_mols > max_mols_ipython: 133 | logger.warning( 134 | f"You have set max_mols to {max_mols}, which is higher than max_mols_ipython ({max_mols_ipython}). " 135 | "Consider increasing max_mols_ipython if you want to display all molecules in an IPython environment." 
136 | ) 137 | 138 | image = Draw.MolsToGridImage( 139 | mols, 140 | legends=legends, 141 | molsPerRow=n_cols, 142 | useSVG=use_svg, 143 | subImgSize=mol_size, 144 | highlightAtomLists=_highlight_atom, 145 | highlightBondLists=_highlight_bond, 146 | drawOptions=draw_options, 147 | **_kwargs, 148 | ) 149 | 150 | if outfile is not None: 151 | image_to_file(image, outfile, as_svg=use_svg) 152 | return image 153 | -------------------------------------------------------------------------------- /datamol/viz/utils.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | from typing import Union 3 | 4 | import io 5 | import fsspec 6 | 7 | from rdkit.Chem import Draw 8 | from matplotlib import colors as mcolors 9 | 10 | import PIL.Image 11 | import PIL.PngImagePlugin 12 | 13 | import datamol as dm 14 | 15 | from datamol.types import RDKitColor 16 | from datamol.types import DatamolColor 17 | 18 | 19 | def prepare_mol_for_drawing(mol: Optional[dm.Mol], kekulize: bool = True) -> Optional[dm.Mol]: 20 | """Prepare the molecule before drawing to avoid any error due to unsanitized molecule 21 | or incorrect valence or aromaticity. 22 | 23 | Code is inspired from `rdkit.Chem.Draw._moltoimg`. 24 | 25 | Args: 26 | mol: A molecule to prepare. If set to None, the function will return None. 27 | kekulize: Whether to kekulize the molecule. 28 | """ 29 | 30 | if mol is None: 31 | return None 32 | 33 | try: 34 | with dm.without_rdkit_log(): 35 | # Check for implicit and explicit valence 36 | if mol.NeedsUpdatePropertyCache(): # type: ignore 37 | mol.UpdatePropertyCache(False) # type: ignore 38 | 39 | # Check for aromaticity 40 | if dm.is_lower_than_current_rdkit_version("2022.09"): 41 | _kekulize = Draw._okToKekulizeMol(mol, kekulize) # type: ignore 42 | else: 43 | _kekulize = Draw.shouldKekulize(mol, kekulize) 44 | 45 | # Run the rdkit preparation procedure 46 | _mol = Draw.rdMolDraw2D.PrepareMolForDrawing(mol, kekulize=_kekulize) 47 | 48 | except ValueError: # <- can happen on a kekulization failure 49 | # Run the rdkit preparation procedure with kekulize set to `False` 50 | _mol = Draw.rdMolDraw2D.PrepareMolForDrawing(mol, kekulize=False) 51 | 52 | return _mol 53 | 54 | 55 | def is_ipython_session() -> bool: 56 | try: 57 | kernel_name = get_ipython().__class__.__name__ # noqa: F821 # type: ignore 58 | module_name = get_ipython().__class__.__module__ # noqa: F821 # type: ignore 59 | 60 | if kernel_name == "ZMQInteractiveShell" or module_name == "google.colab._shell": 61 | return True 62 | except Exception: 63 | pass 64 | 65 | return False 66 | 67 | 68 | def drawer_to_image(drawer: Draw.rdMolDraw2D.MolDraw2D): 69 | """Convert an RDkit drawer to an image. The image can be either a PNG or SVG depending on the 70 | drawer class. The returned image type will depends whether the Python session is an IPython one or not. 71 | 72 | This function matches the behavior of `datamol.to_image` and `rdkit.Chem.Draw.MolDraw2DToImage`. 73 | 74 | Args: 75 | drawer: An RDkit drawer. 76 | 77 | Returns: 78 | An image: either PNG or SVG depending on the drawer class. If within an IPython sessions, 79 | IPython display objects are returned. 
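Since `drawer_to_image` below mirrors the behavior of `datamol.to_image`, here is a minimal sketch of the latter; the SMILES, legends, and output path are illustrative.

```python
import datamol as dm

smiles = ["CCO", "c1ccccc1", "CC(=O)Oc1ccccc1C(=O)O"]
mols = [dm.to_mol(s) for s in smiles]

# Grid image with legends, auto-aligned 2D depictions, written to an SVG file
dm.to_image(
    mols,
    legends=smiles,
    n_cols=3,
    mol_size=200,
    align=True,
    outfile="molecules_grid.svg",
)
```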
80 | """ 81 | 82 | is_svg = isinstance(drawer, Draw.rdMolDraw2D.MolDraw2DSVG) 83 | 84 | if is_ipython_session(): 85 | if is_svg: 86 | from IPython.core.display import SVG 87 | 88 | return SVG(drawer.GetDrawingText()) 89 | else: 90 | from IPython.core.display import Image 91 | 92 | return Image(drawer.GetDrawingText()) 93 | else: 94 | if is_svg: 95 | return drawer.GetDrawingText() 96 | else: 97 | from PIL import Image 98 | 99 | return Image.open(io.BytesIO(drawer.GetDrawingText())) 100 | 101 | 102 | def image_to_file( 103 | image: Union[ 104 | str, 105 | PIL.PngImagePlugin.PngImageFile, 106 | bytes, 107 | PIL.Image.Image, 108 | ], 109 | outfile, 110 | as_svg: bool = False, 111 | ): 112 | """Save image to file. The image can be either a PNG or SVG depending 113 | 114 | Args: 115 | image: Image to save to a file 116 | outfile: Path to the output file where to save the image 117 | as_svg: Whether the image is an SVG or not 118 | """ 119 | 120 | with fsspec.open(outfile, "wb") as f: 121 | if as_svg: 122 | if isinstance(image, str): 123 | # in a terminal process 124 | f.write(image.encode()) # type: ignore 125 | else: 126 | # in a jupyter kernel process 127 | f.write(image.data.encode()) # type: ignore 128 | else: 129 | if isinstance(image, PIL.PngImagePlugin.PngImageFile): # type: ignore 130 | # in a terminal process 131 | image.save(f) # type: ignore 132 | else: 133 | # in a jupyter kernel process 134 | f.write(image.data) # type: ignore 135 | 136 | 137 | def to_rdkit_color(color: Optional[DatamolColor]) -> Optional[RDKitColor]: 138 | """If required convert a datamol color (rgb, rgba or hex string) to an RDKit 139 | color (rgb or rgba). 140 | 141 | Args: 142 | color: A datamol color: hex, rgb, rgba or None. 143 | """ 144 | if color is None: 145 | return None 146 | 147 | if isinstance(color, str): 148 | return mcolors.to_rgba(color) # type: ignore 149 | if isinstance(color, (tuple, list)) and len(color) in [3, 4] and any(x > 1 for x in color): 150 | return tuple(x / 255 if i < 3 else x for i, x in enumerate(color)) 151 | 152 | return color 153 | -------------------------------------------------------------------------------- /docs/CNAME: -------------------------------------------------------------------------------- 1 | docs.datamol.io 2 | -------------------------------------------------------------------------------- /docs/api/datamol.align.md: -------------------------------------------------------------------------------- 1 | # `datamol.align` 2 | 3 | ::: datamol.align 4 | -------------------------------------------------------------------------------- /docs/api/datamol.cluster.md: -------------------------------------------------------------------------------- 1 | # `datamol.cluster` 2 | 3 | ::: datamol.cluster 4 | -------------------------------------------------------------------------------- /docs/api/datamol.conformers.md: -------------------------------------------------------------------------------- 1 | # `datamol.conformers` 2 | 3 | ::: datamol.conformers._conformers 4 | ::: datamol.conformers._features 5 | -------------------------------------------------------------------------------- /docs/api/datamol.convert.md: -------------------------------------------------------------------------------- 1 | # `datamol.convert` 2 | 3 | ::: datamol.convert 4 | -------------------------------------------------------------------------------- /docs/api/datamol.data.md: -------------------------------------------------------------------------------- 1 | # `datamol.data` 2 | 3 | ::: datamol.data 
4 | -------------------------------------------------------------------------------- /docs/api/datamol.descriptors.md: -------------------------------------------------------------------------------- 1 | # `datamol.descriptors` 2 | 3 | ::: datamol.descriptors.descriptors 4 | ::: datamol.descriptors.compute 5 | -------------------------------------------------------------------------------- /docs/api/datamol.fp.md: -------------------------------------------------------------------------------- 1 | # `datamol.fp` 2 | 3 | ::: datamol.fp 4 | -------------------------------------------------------------------------------- /docs/api/datamol.fragment.md: -------------------------------------------------------------------------------- 1 | # `datamol.fragment` 2 | 3 | ::: datamol.fragment._fragment 4 | ::: datamol.fragment._assemble 5 | -------------------------------------------------------------------------------- /docs/api/datamol.graph.md: -------------------------------------------------------------------------------- 1 | # `datamol.graph` 2 | 3 | ::: datamol.graph 4 | -------------------------------------------------------------------------------- /docs/api/datamol.io.md: -------------------------------------------------------------------------------- 1 | # `datamol.io` 2 | 3 | ::: datamol.io 4 | -------------------------------------------------------------------------------- /docs/api/datamol.isomers.md: -------------------------------------------------------------------------------- 1 | # `datamol.isomers` 2 | 3 | ::: datamol.isomers._enumerate 4 | ::: datamol.isomers._structural 5 | -------------------------------------------------------------------------------- /docs/api/datamol.log.md: -------------------------------------------------------------------------------- 1 | # `datamol.log` 2 | 3 | ::: datamol.log 4 | -------------------------------------------------------------------------------- /docs/api/datamol.mol.md: -------------------------------------------------------------------------------- 1 | # `datamol.mol` 2 | 3 | ::: datamol.mol 4 | -------------------------------------------------------------------------------- /docs/api/datamol.molar.md: -------------------------------------------------------------------------------- 1 | # `datamol.molar` 2 | 3 | ::: datamol.molar 4 | -------------------------------------------------------------------------------- /docs/api/datamol.reactions.md: -------------------------------------------------------------------------------- 1 | # `datamol.reactions` 2 | 3 | ::: datamol.reactions._reactions 4 | ::: datamol.reactions._attachments 5 | -------------------------------------------------------------------------------- /docs/api/datamol.scaffold.md: -------------------------------------------------------------------------------- 1 | # `datamol.scaffold` 2 | 3 | ::: datamol.scaffold._fuzzy 4 | -------------------------------------------------------------------------------- /docs/api/datamol.similarity.md: -------------------------------------------------------------------------------- 1 | # `datamol.similarity` 2 | 3 | ::: datamol.similarity 4 | -------------------------------------------------------------------------------- /docs/api/datamol.utils.fs.md: -------------------------------------------------------------------------------- 1 | # `datamol.utils.fs` 2 | 3 | ::: datamol.utils.fs 4 | 5 | -------------------------------------------------------------------------------- /docs/api/datamol.utils.md: 
-------------------------------------------------------------------------------- 1 | # `datamol.utils` 2 | 3 | ::: datamol.utils.decorators 4 | ::: datamol.utils.jobs 5 | ::: datamol.utils.perf 6 | -------------------------------------------------------------------------------- /docs/api/datamol.viz.md: -------------------------------------------------------------------------------- 1 | # `datamol.viz` 2 | 3 | ## Vizualize molecule in 2D or 3D 4 | 5 | ::: datamol.viz.to_image 6 | ::: datamol.viz.conformers 7 | 8 | ## Specific plotting functions 9 | 10 | ::: datamol.viz.MolsCircleGrid 11 | ::: datamol.viz.circle_grid 12 | 13 | ## Vizualize 2D molecule with highlighted substructures 14 | 15 | ::: datamol.viz.lasso_highlight_image 16 | -------------------------------------------------------------------------------- /docs/assets/css/custom-datamol.css: -------------------------------------------------------------------------------- 1 | :root { 2 | --datamol-primary: #F89D4C; 3 | --datamol-secondary: #343a40; 4 | 5 | /* Primary color shades */ 6 | --md-primary-fg-color: var(--datamol-primary); 7 | --md-primary-fg-color--light: var(--datamol-primary); 8 | --md-primary-fg-color--dark: var(--datamol-primary); 9 | --md-primary-bg-color: var(--datamol-secondary); 10 | --md-primary-bg-color--light: var(--datamol-secondary); 11 | --md-text-link-color: var(--datamol-secondary); 12 | 13 | /* Accent color shades */ 14 | --md-accent-fg-color: var(--datamol-secondary); 15 | --md-accent-fg-color--transparent: var(--datamol-secondary); 16 | --md-accent-bg-color: var(--datamol-secondary); 17 | --md-accent-bg-color--light: var(--datamol-secondary); 18 | } 19 | 20 | :root>* { 21 | /* Code block color shades */ 22 | --md-code-bg-color: hsla(0, 0%, 96%, 1); 23 | --md-code-fg-color: hsla(200, 18%, 26%, 1); 24 | 25 | /* Footer */ 26 | --md-footer-bg-color: var(--datamol-primary); 27 | /* --md-footer-bg-color--dark: hsla(0, 0%, 0%, 0.32); */ 28 | --md-footer-fg-color: var(--datamol-secondary); 29 | --md-footer-fg-color--light: var(--datamol-secondary); 30 | --md-footer-fg-color--lighter: var(--datamol-secondary); 31 | 32 | } 33 | 34 | .md-header { 35 | background-image: linear-gradient(to right, #F89D4C, #E20000); 36 | } 37 | 38 | .md-footer { 39 | background-image: linear-gradient(to right, #F89D4C, #E20000); 40 | } 41 | 42 | .md-tabs { 43 | background-image: linear-gradient(to right, #F4F6F9, #E2CEC3); 44 | } 45 | 46 | .md-header__topic { 47 | color: rgb(255, 255, 255); 48 | } 49 | 50 | .md-source__repository, 51 | .md-source__icon, 52 | .md-search__input, 53 | .md-search__input::placeholder, 54 | .md-search__input~.md-search__icon, 55 | .md-footer__inner.md-grid, 56 | .md-copyright__highlight, 57 | .md-copyright, 58 | .md-footer-meta.md-typeset a, 59 | .md-version { 60 | color: rgb(255, 255, 255) !important; 61 | } 62 | 63 | .md-search__form { 64 | background-color: rgba(255, 255, 255, 0.2); 65 | } 66 | 67 | .md-search__input { 68 | color: #222222 !important; 69 | } 70 | 71 | .md-header__topic { 72 | color: rgb(255, 255, 255); 73 | font-size: 1.4em; 74 | } 75 | 76 | /* Increase the size of the logo */ 77 | .md-header__button.md-logo img, 78 | .md-header__button.md-logo svg { 79 | height: 2rem !important; 80 | } 81 | 82 | /* Reduce the margin around the logo */ 83 | .md-header__button.md-logo { 84 | margin: 0.4em; 85 | padding: 0.4em; 86 | } 87 | 88 | /* Remove the `In` and `Out` block in rendered Jupyter notebooks */ 89 | .md-container .jp-Cell-outputWrapper .jp-OutputPrompt.jp-OutputArea-prompt, 90 | 
.md-container .jp-Cell-inputWrapper .jp-InputPrompt.jp-InputArea-prompt { 91 | display: none !important; 92 | } 93 | -------------------------------------------------------------------------------- /docs/assets/css/custom.css: -------------------------------------------------------------------------------- 1 | /* Indentation. */ 2 | div.doc-contents:not(.first) { 3 | padding-left: 25px; 4 | border-left: 4px solid rgba(230, 230, 230); 5 | margin-bottom: 80px; 6 | } 7 | 8 | /* Don't capitalize names. */ 9 | h5.doc-heading { 10 | text-transform: none !important; 11 | } 12 | 13 | /* Don't use vertical space on hidden ToC entries. */ 14 | .hidden-toc::before { 15 | margin-top: 0 !important; 16 | padding-top: 0 !important; 17 | } 18 | 19 | /* Don't show permalink of hidden ToC entries. */ 20 | .hidden-toc a.headerlink { 21 | display: none; 22 | } 23 | 24 | /* Avoid breaking parameters name, etc. in table cells. */ 25 | td code { 26 | word-break: normal !important; 27 | } 28 | 29 | /* For pieces of Markdown rendered in table cells. */ 30 | td p { 31 | margin-top: 0 !important; 32 | margin-bottom: 0 !important; 33 | } 34 | -------------------------------------------------------------------------------- /docs/assets/css/tweak-width.css: -------------------------------------------------------------------------------- 1 | @media only screen and (min-width: 76.25em) { 2 | .md-main__inner { 3 | max-width: none; 4 | padding-left: 2em; 5 | padding-left: 2em; 6 | } 7 | .md-sidebar--primary { 8 | left: 0; 9 | } 10 | .md-sidebar--secondary { 11 | right: 0; 12 | margin-left: 0; 13 | -webkit-transform: none; 14 | transform: none; 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /docs/assets/js/google-analytics.js: -------------------------------------------------------------------------------- 1 | var gtag_id = "G-0L9PP26N2H"; 2 | 3 | var script = document.createElement("script"); 4 | script.src = "https://www.googletagmanager.com/gtag/js?id=" + gtag_id; 5 | document.head.appendChild(script); 6 | 7 | window.dataLayer = window.dataLayer || []; 8 | function gtag(){dataLayer.push(arguments);} 9 | gtag('js', new Date()); 10 | gtag('config', gtag_id); 11 | -------------------------------------------------------------------------------- /docs/contribute.md: -------------------------------------------------------------------------------- 1 | # Contribute 2 | 3 | The below documents the development lifecycle of Datamol. 4 | 5 | ## Setup a dev environment 6 | 7 | ```bash 8 | mamba env create -n datamol -f env.yml 9 | mamba activate datamol 10 | pip install -e . 11 | ``` 12 | 13 | ## Setup a dev environment with dev container 14 | 15 | This repository is setup to use [dev container](https://docs.github.com/en/codespaces/setting-up-your-project-for-codespaces/introduction-to-dev-containers). You can use it locally with VSCode or any editor supporting dev containers as well as on GitHub Codespaces. 16 | 17 | The env is based on the Micromamba Docker image. 18 | 19 | ## Continuous Integration 20 | 21 | Datamol uses Github Actions to: 22 | 23 | - **Build and test** `datamol`. 24 | - Multiple combinations of OS, Python and RDKit versions are tested. 25 | - **Check** the code: 26 | - Formatting with `black`. 27 | - Static type check with `mypy`. 28 | - **Documentation**: build and deploy the documentation on `main` and for every new git tag. 
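The CI described above exercises the test suite on several operating systems. Individual tests can opt out of a platform with the `skip_platform` marker registered in `tests/conftest.py` (shown later in this repository). Below is a minimal sketch of how a contributed test might use it; the test name and body are illustrative only.

```python
import pytest
import datamol as dm


# `skip_platform` is registered in `tests/conftest.py`; the recognized values
# are "linux", "osx" and "win".
@pytest.mark.skip_platform("win")
def test_runs_everywhere_but_windows():
    mol = dm.to_mol("CCO")
    assert mol is not None
```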
29 | 30 | ## Run tests 31 | 32 | ```bash 33 | pytest 34 | ``` 35 | 36 | ## Build the documentation 37 | 38 | You can build and serve the documentation locally with: 39 | 40 | ```bash 41 | # Build and serve the doc 42 | mike serve 43 | ``` 44 | 45 | ### Multi-versioning 46 | 47 | The doc is built for each push on `main` and for every git tag using [mike](https://github.com/jimporter/mike). Everything is automated using GitHub Actions. Please refer to the official mike documentation for the details. 48 | 49 | ## Release a new version 50 | 51 | The process is fully automated by executing the [`release` GH Action](https://github.com/datamol-io/datamol/actions/workflows/release.yml). 52 | -------------------------------------------------------------------------------- /docs/images/logo-black.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamol-io/datamol/0312388b956e2b4eeb72d791167cfdb873c7beab/docs/images/logo-black.png -------------------------------------------------------------------------------- /docs/images/logo-black.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /docs/images/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamol-io/datamol/0312388b956e2b4eeb72d791167cfdb873c7beab/docs/images/logo.png -------------------------------------------------------------------------------- /docs/images/logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | Datamol is a python library to work with molecules. It's a layer built on top of [RDKit](https://www.rdkit.org/) and aims to be as light as possible. 4 | 5 | - 🐍 Simple pythonic API 6 | - ⚗️ RDKit first: all you manipulate are `rdkit.Chem.Mol` objects. 7 | - ✅ Manipulating molecules often relies on many options; Datamol provides good defaults by design. 8 | - 🧠 Performance matters: built-in efficient parallelization when possible with optional progress bar. 9 | - 🕹️ Modern IO: out-of-the-box support for remote paths using `fsspec` to read and write multiple formats (sdf, xlsx, csv, etc). 10 | 11 | Visit our website at <https://datamol.io>. 12 | 13 | ## Installation 14 | 15 | Use conda: 16 | 17 | ```bash 18 | mamba install -c conda-forge datamol 19 | ``` 20 | 21 | _**Tips:** You can replace `mamba` with `conda`._ 22 | 23 | _**Note:** We highly recommend using a [Conda Python distribution](https://github.com/conda-forge/miniforge) to install Datamol.
The package is also pip installable if you need it: `pip install datamol`._ 24 | 25 | ## Quick API Tour 26 | 27 | ```python 28 | import datamol as dm 29 | 30 | # Common functions 31 | mol = dm.to_mol("O=C(C)Oc1ccccc1C(=O)O", sanitize=True) 32 | fp = dm.to_fp(mol) 33 | selfies = dm.to_selfies(mol) 34 | inchi = dm.to_inchi(mol) 35 | 36 | # Standardize and sanitize 37 | mol = dm.to_mol("O=C(C)Oc1ccccc1C(=O)O") 38 | mol = dm.fix_mol(mol) 39 | mol = dm.sanitize_mol(mol) 40 | mol = dm.standardize_mol(mol) 41 | 42 | # Dataframe manipulation 43 | df = dm.data.freesolv() 44 | mols = dm.from_df(df) 45 | 46 | # 2D viz 47 | legends = [dm.to_smiles(mol) for mol in mols[:10]] 48 | dm.viz.to_image(mols[:10], legends=legends) 49 | 50 | # Generate conformers 51 | smiles = "O=C(C)Oc1ccccc1C(=O)O" 52 | mol = dm.to_mol(smiles) 53 | mol_with_conformers = dm.conformers.generate(mol) 54 | 55 | # 3D viz (using nglview) 56 | dm.viz.conformers(mol, n_confs=10) 57 | 58 | # Compute SASA from conformers 59 | sasa = dm.conformers.sasa(mol_with_conformers) 60 | 61 | # Easy IO 62 | mols = dm.read_sdf("s3://my-awesome-data-lake/smiles.sdf", as_df=False) 63 | dm.to_sdf(mols, "gs://data-bucket/smiles.sdf") 64 | ``` 65 | 66 | ## How to cite 67 | 68 | Please cite Datamol if you use it in your research: [![DOI](https://zenodo.org/badge/341603042.svg)](https://zenodo.org/badge/latestdoi/341603042). 69 | 70 | ## Compatibilities 71 | 72 | Version compatibilities are an essential topic for production-software stacks. We are cautious about documenting compatibility between `datamol`, `python` and `rdkit`. 73 | 74 | See below the associated versions of Python and RDKit, for which a minor version of Datamol **has been tested** during its whole lifecycle. _It does not mean other combinations does not work but that those are not tested._ 75 | 76 | | `datamol` | `python` | `rdkit` | 77 | | --------- | ------------------- | ----------------------------- | 78 | | `0.12.x` | `[3.10, 3.11]` | `[2023.03, 2023.09]` | 79 | | `0.11.x` | `[3.9, 3.10, 3.11]` | `[2022.09, 2023.03]` | 80 | | `0.10.x` | `[3.9, 3.10, 3.11]` | `[2022.03, 2022.09]` | 81 | | `0.9.x` | `[3.9, 3.10, 3.11]` | `[2022.03, 2022.09]` | 82 | | `0.8.x` | `[3.8, 3.9, 3.10]` | `[2021.09, 2022.03, 2022.09]` | 83 | | `0.7.x` | `[3.8, 3.9]` | `[2021.09, 2022.03]` | 84 | | `0.6.x` | `[3.8, 3.9]` | `[2021.09]` | 85 | | `0.5.x` | `[3.8, 3.9]` | `[2021.03, 2021.09]` | 86 | | `0.4.x` | `[3.8, 3.9]` | `[2020.09, 2021.03]` | 87 | | `0.3.x` | `[3.8, 3.9]` | `[2020.09, 2021.03]` | 88 | -------------------------------------------------------------------------------- /docs/license.md: -------------------------------------------------------------------------------- 1 | ``` 2 | {!LICENSE!} 3 | ``` 4 | -------------------------------------------------------------------------------- /docs/tutorials/data/ReactionBlock.rxn: -------------------------------------------------------------------------------- 1 | $RXN 2 | 3 | ISIS 082120061354 4 | 5 | 2 1 6 | $MOL 7 | 8 | -ISIS- 08210613542D 9 | 10 | 3 2 0 0 0 0 0 0 0 0999 V2000 11 | -1.4340 -0.6042 0.0000 C 0 0 0 0 0 0 0 0 0 2 0 0 12 | -0.8639 -0.9333 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0 13 | -1.4340 0.0542 0.0000 O 0 0 0 0 0 0 0 0 0 1 0 0 14 | 1 2 1 0 0 0 0 15 | 1 3 2 0 0 0 0 16 | M END 17 | $MOL 18 | 19 | -ISIS- 08210613542D 20 | 21 | 1 0 0 0 0 0 0 0 0 0999 V2000 22 | 2.2125 -0.7833 0.0000 N 0 0 0 0 0 0 0 0 0 3 0 0 23 | M END 24 | $MOL 25 | 26 | -ISIS- 08210613542D 27 | 28 | 3 2 0 0 0 0 0 0 0 0999 V2000 29 | 9.5282 -0.8083 0.0000 N 0 0 0 0 0 0 0 0 0 3 0 0 
30 | 8.9579 -0.4792 0.0000 C 0 0 0 0 0 0 0 0 0 2 0 0 31 | 8.9579 0.1792 0.0000 O 0 0 0 0 0 0 0 0 0 1 0 0 32 | 1 2 1 0 0 0 0 33 | 2 3 2 0 0 0 0 34 | M END 35 | -------------------------------------------------------------------------------- /docs/tutorials/images/Aligning_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamol-io/datamol/0312388b956e2b4eeb72d791167cfdb873c7beab/docs/tutorials/images/Aligning_1.png -------------------------------------------------------------------------------- /docs/tutorials/images/Aligning_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamol-io/datamol/0312388b956e2b4eeb72d791167cfdb873c7beab/docs/tutorials/images/Aligning_2.png -------------------------------------------------------------------------------- /docs/tutorials/images/Conformers_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamol-io/datamol/0312388b956e2b4eeb72d791167cfdb873c7beab/docs/tutorials/images/Conformers_1.png -------------------------------------------------------------------------------- /docs/tutorials/images/Descriptors_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamol-io/datamol/0312388b956e2b4eeb72d791167cfdb873c7beab/docs/tutorials/images/Descriptors_1.png -------------------------------------------------------------------------------- /docs/tutorials/images/Fragment_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamol-io/datamol/0312388b956e2b4eeb72d791167cfdb873c7beab/docs/tutorials/images/Fragment_1.png -------------------------------------------------------------------------------- /docs/tutorials/images/Fragment_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamol-io/datamol/0312388b956e2b4eeb72d791167cfdb873c7beab/docs/tutorials/images/Fragment_2.png -------------------------------------------------------------------------------- /docs/tutorials/images/Fragment_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamol-io/datamol/0312388b956e2b4eeb72d791167cfdb873c7beab/docs/tutorials/images/Fragment_3.png -------------------------------------------------------------------------------- /docs/tutorials/images/Preprocess_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamol-io/datamol/0312388b956e2b4eeb72d791167cfdb873c7beab/docs/tutorials/images/Preprocess_1.png -------------------------------------------------------------------------------- /docs/tutorials/images/Scaffolds_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamol-io/datamol/0312388b956e2b4eeb72d791167cfdb873c7beab/docs/tutorials/images/Scaffolds_1.png -------------------------------------------------------------------------------- /docs/usage.md: -------------------------------------------------------------------------------- 1 | # Usage 2 | 3 | ## How to use 4 | 5 | Datamol has been designed to be used with a single import: 6 | 7 | ```python 8 | import datamol as dm 9 | ``` 10 | 11 | All `datamol` functions are available under `dm`. 
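To make the single-import convention above concrete, here is a minimal sketch; every call in it appears elsewhere in this repository's documentation or tests:

```python
import datamol as dm

# Top-level helpers and submodules are all reached through the same `dm` alias.
mol = dm.to_mol("CC(=O)Oc1ccccc1C(=O)O")

print(dm.to_smiles(mol))       # canonical SMILES string for the molecule
print(dm.descriptors.mw(mol))  # molecular weight via the descriptors submodule
```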
12 | 13 | ## Lazy loading 14 | 15 | datamol uses lazy loading to dynamically expose all its API without imposing a long import time during `import datamol as dm`. In case of trouble you can always disable lazy loading by setting the environment variable `DATAMOL_DISABLE_LAZY_LOADING` to `1`. Please report any issue [on the datamol repo](https://github.com/datamol-io/datamol/issues). 16 | -------------------------------------------------------------------------------- /env.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | 4 | dependencies: 5 | - python >=3.8 6 | - pip 7 | - tqdm 8 | - loguru 9 | - joblib 10 | - fsspec >=2021.9 11 | - s3fs >=2021.9 12 | - gcsfs >=2021.9 13 | - platformdirs 14 | - packaging 15 | - typing_extensions 16 | - importlib_resources 17 | 18 | # Scientific 19 | - pandas 20 | - numpy 21 | - scipy 22 | - pillow 23 | - matplotlib 24 | - scikit-learn 25 | 26 | # Chemistry 27 | - rdkit 28 | - selfies 29 | 30 | # Optional deps 31 | - openpyxl 32 | - networkx 33 | - nglview 34 | - xlsxwriter 35 | - pyarrow 36 | 37 | # Dev 38 | - pytest >=6.0 39 | - pytest-cov 40 | - pytest-xdist 41 | - black >=24 42 | - ruff 43 | - jupyterlab 44 | - mypy 45 | - codecov 46 | - nbconvert 47 | 48 | # Doc 49 | - mkdocs <1.6 50 | - mkdocs-material >=7.1.1 51 | - mkdocs-material-extensions 52 | - mkdocstrings 53 | - mkdocstrings-python 54 | - mkdocs-jupyter 55 | - markdown-include 56 | - mdx_truly_sane_lists 57 | - mike >=1.0.0 58 | - seaborn 59 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: "datamol" 2 | site_description: "A python library to work with molecules. Built on top of RDKit." 
3 | repo_url: "https://github.com/datamol-io/datamol" 4 | repo_name: "datamol-io/datamol" 5 | copyright: Copyright 2020 - 2023 datamol.io 6 | 7 | site_url: "" 8 | remote_branch: "gh-pages" 9 | use_directory_urls: false 10 | docs_dir: "docs" 11 | 12 | # Fail on warnings to detect issues with types and docstring 13 | strict: true 14 | 15 | nav: 16 | - Overview: index.md 17 | - Usage: usage.md 18 | - Tutorials: 19 | - The Basics: tutorials/The_Basics.ipynb 20 | - Preprocessing: tutorials/Preprocessing.ipynb 21 | - Descriptors: tutorials/Descriptors.ipynb 22 | - Chemical Reactions: tutorials/Reactions.ipynb 23 | - Scaffolds: tutorials/Scaffolds.ipynb 24 | - Aligning: tutorials/Aligning.ipynb 25 | - Fuzzy_Scaffolds: tutorials/Fuzzy_Scaffolds.ipynb 26 | - Clustering: tutorials/Clustering.ipynb 27 | - Fragment: tutorials/Fragment.ipynb 28 | - Conformers: tutorials/Conformers.ipynb 29 | - Visualization: tutorials/Visualization.ipynb 30 | - Datamol Filesystem Module: tutorials/Filesystem.ipynb 31 | - API: 32 | - datamol.align: api/datamol.align.md 33 | - datamol.cluster: api/datamol.cluster.md 34 | - datamol.conformers: api/datamol.conformers.md 35 | - datamol.convert: api/datamol.convert.md 36 | - datamol.data: api/datamol.data.md 37 | - datamol.descriptors: api/datamol.descriptors.md 38 | - datamol.fp: api/datamol.fp.md 39 | - datamol.fragment: api/datamol.fragment.md 40 | - datamol.graph: api/datamol.graph.md 41 | - datamol.io: api/datamol.io.md 42 | - datamol.isomers: api/datamol.isomers.md 43 | - datamol.log: api/datamol.log.md 44 | - datamol.molar: api/datamol.molar.md 45 | - datamol.mol: api/datamol.mol.md 46 | - datamol.reactions: api/datamol.reactions.md 47 | - datamol.scaffold: api/datamol.scaffold.md 48 | - datamol.similarity: api/datamol.similarity.md 49 | - datamol.utils: api/datamol.utils.md 50 | - datamol.utils.fs: api/datamol.utils.fs.md 51 | - datamol.viz: api/datamol.viz.md 52 | 53 | - Contribute: contribute.md 54 | - License: license.md 55 | 56 | theme: 57 | name: material 58 | # NOTE(hadim): to customize the material primary and secondary 59 | # color check `docs/assets/css/datamol-custom.css`. 
60 | features: 61 | - navigation.tabs 62 | - navigation.expand 63 | favicon: images/logo-black.png 64 | logo: images/logo.svg 65 | 66 | extra_css: 67 | - assets/css/custom.css 68 | - assets/css/custom-datamol.css 69 | - assets/css/tweak-width.css 70 | 71 | extra_javascript: 72 | - assets/js/google-analytics.js 73 | 74 | markdown_extensions: 75 | - admonition 76 | - markdown_include.include 77 | - pymdownx.emoji 78 | - pymdownx.magiclink 79 | - pymdownx.superfences 80 | - pymdownx.tabbed 81 | - pymdownx.tasklist 82 | # For `tab_length=2` in the markdown extension 83 | # See https://github.com/mkdocs/mkdocs/issues/545 84 | - mdx_truly_sane_lists 85 | - toc: 86 | permalink: true 87 | toc_depth: 4 88 | 89 | watch: 90 | - datamol/ 91 | 92 | plugins: 93 | - search 94 | 95 | - mkdocstrings: 96 | handlers: 97 | python: 98 | setup_commands: 99 | - import sys 100 | - sys.path.append("docs") 101 | - sys.path.append("datamol") 102 | options: 103 | new_path_syntax: true 104 | show_root_heading: false 105 | heading_level: 3 106 | show_root_full_path: false 107 | 108 | - mkdocs-jupyter: 109 | execute: false 110 | # kernel_name: python3 111 | 112 | - mike: 113 | version_selector: true 114 | 115 | extra: 116 | version: 117 | # Multi versioning provider for mkdocs-material (used for the JS selector) 118 | provider: mike 119 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "setuptools-scm"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "datamol" 7 | description = "A python library to work with molecules. Built on top of RDKit." 8 | authors = [{ name = "Hadrien Mary", email = "hadrien@valencediscovery.com" }] 9 | readme = "README.md" 10 | dynamic = ["version"] 11 | requires-python = ">=3.8" 12 | license = { text = "Apache" } 13 | classifiers = [ 14 | "Development Status :: 5 - Production/Stable", 15 | "Intended Audience :: Developers", 16 | "Intended Audience :: Healthcare Industry", 17 | "Intended Audience :: Science/Research", 18 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 19 | "Topic :: Scientific/Engineering :: Bio-Informatics", 20 | "Topic :: Scientific/Engineering :: Information Analysis", 21 | "Topic :: Scientific/Engineering :: Medical Science Apps.", 22 | "Natural Language :: English", 23 | "Operating System :: OS Independent", 24 | "Programming Language :: Python", 25 | "Programming Language :: Python :: 3", 26 | "Programming Language :: Python :: 3.8", 27 | "Programming Language :: Python :: 3.9", 28 | "Programming Language :: Python :: 3.10", 29 | "Programming Language :: Python :: 3.11", 30 | ] 31 | dependencies = [ 32 | "tqdm", 33 | "loguru", 34 | "joblib", 35 | "fsspec>=2021.9", 36 | "pandas", 37 | "numpy", 38 | "scipy", 39 | "matplotlib", 40 | "pillow", 41 | "selfies", 42 | "platformdirs", 43 | "scikit-learn", 44 | "packaging", 45 | "typing-extensions", 46 | "importlib-resources", 47 | "rdkit", 48 | ] 49 | 50 | [project.urls] 51 | Website = "https://datamol.io" 52 | "Source Code" = "https://github.com/datamol-io/datamol" 53 | "Bug Tracker" = "https://github.com/datamol-io/datamol/issues" 54 | Documentation = "https://docs.datamol.io" 55 | 56 | [tool.setuptools] 57 | include-package-data = true 58 | 59 | [tool.setuptools_scm] 60 | fallback_version = "dev" 61 | 62 | [tool.setuptools.packages.find] 63 | where = ["."] 64 | include = ["datamol", "datamol.*"] 65 | exclude = 
[] 66 | namespaces = true 67 | 68 | [tool.setuptools.package-data] 69 | "datamol.data" = ["*"] 70 | 71 | [tool.black] 72 | line-length = 100 73 | target-version = ['py39', 'py310'] 74 | include = '\.pyi?$' 75 | 76 | [tool.pytest.ini_options] 77 | minversion = "6.0" 78 | addopts = "--verbose --cov=datamol --cov-fail-under=85 --cov-report xml --cov-report term --durations=10 -n auto" 79 | testpaths = ["tests"] 80 | filterwarnings = [ 81 | "ignore::DeprecationWarning:rdkit.Chem.MolStandardize", 82 | "ignore::DeprecationWarning:jupyter_client", 83 | "ignore::DeprecationWarning:pkg_resources", 84 | "ignore::DeprecationWarning:joblib.externals.loky.backend", 85 | "ignore::DeprecationWarning:dateutil.tz.tz", 86 | "ignore::DeprecationWarning:joblib._utils", 87 | "ignore::DeprecationWarning:openpyxl.packaging.core", 88 | "ignore::DeprecationWarning:tqdm.std", 89 | ] 90 | 91 | [tool.coverage.run] 92 | source = ["datamol/"] 93 | disable_warnings = ["no-data-collected"] 94 | data_file = ".coverage/coverage" 95 | 96 | [tool.coverage.report] 97 | omit = ["datamol/__init__.py", "datamol/_version.py"] 98 | 99 | [tool.coverage.xml] 100 | output = "coverage.xml" 101 | 102 | [tool.mypy] 103 | exclude = [] 104 | ignore_missing_imports = true 105 | 106 | [tool.pyright] 107 | reportShadowedImports = false 108 | 109 | [tool.ruff] 110 | ignore = [ 111 | "E501", # Never enforce `E501` (line length violations). 112 | "E731", # Do not assign a lambda expression, use a def 113 | ] 114 | line-length = 110 115 | target-version = "py311" 116 | 117 | [tool.ruff.per-file-ignores] 118 | "__init__.py" = [ 119 | "F401", # imported but unused 120 | "E402", # Module level import not at top of file 121 | ] 122 | 123 | [tool.ruff.pycodestyle] 124 | max-doc-length = 150 125 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import platform 2 | import pathlib 3 | from loguru import logger 4 | 5 | import pytest 6 | 7 | 8 | DATA_DIR_PATH = pathlib.Path(__file__).parent.resolve() / "data" 9 | 10 | 11 | @pytest.fixture 12 | def current_platform(): 13 | if platform.system() == "Linux": 14 | return "linux" 15 | elif platform.system() == "Darwin": 16 | return "osx" 17 | elif platform.system() == "Windows": 18 | return "win" 19 | else: 20 | return platform.system() 21 | 22 | 23 | @pytest.fixture(autouse=True) 24 | def skip_by_platform(request, current_platform): 25 | if request.node.get_closest_marker("skip_platform"): 26 | if request.node.get_closest_marker("skip_platform").args[0] == current_platform: 27 | pytest.skip(f"skipped on this platform: {current_platform}") 28 | 29 | 30 | def pytest_configure(config): 31 | config.addinivalue_line( 32 | "markers", 33 | "skip_platform(current_platform): skip test for a given platform from `['linux', 'osx', 'win']`", 34 | ) 35 | 36 | 37 | @pytest.fixture 38 | def datadir(request): 39 | return DATA_DIR_PATH 40 | 41 | 42 | # Mandatory for the below monkeypatch function. 43 | from _pytest.logging import caplog as _caplog # noqa: E402, F401 44 | 45 | 46 | @pytest.fixture 47 | def caplog(_caplog): # noqa: F811 48 | """Monkeypatching the pytest caplog to work with loguru. 
49 | 50 | See https://loguru.readthedocs.io/en/latest/resources/migration.html#making-things-work-with-pytest-and-caplog 51 | """ 52 | import logging 53 | 54 | class PropogateHandler(logging.Handler): 55 | def emit(self, record): 56 | logging.getLogger(record.name).handle(record) 57 | 58 | handler_id = logger.add(PropogateHandler(), format="{message}") 59 | yield _caplog 60 | logger.remove(handler_id) 61 | -------------------------------------------------------------------------------- /tests/data/TUBB3-observations.sdf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamol-io/datamol/0312388b956e2b4eeb72d791167cfdb873c7beab/tests/data/TUBB3-observations.sdf.gz -------------------------------------------------------------------------------- /tests/data/freesolv.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamol-io/datamol/0312388b956e2b4eeb72d791167cfdb873c7beab/tests/data/freesolv.xlsx -------------------------------------------------------------------------------- /tests/data/test.mol2: -------------------------------------------------------------------------------- 1 | @MOLECULE 2 | mol_first 3 | 11 11 1 0 0 4 | SMALL 5 | AMBER ff14SB 6 | 7 | @ATOM 8 | 1 C1 -0.0167 1.3778 0.0096 C.ar 1 UNK 0.0267 9 | 2 C2 0.0021 -0.0041 0.0020 C.ar 1 UNK -0.0438 10 | 3 C3 1.2218 -0.6631 -0.0131 C.ar 1 UNK -0.0592 11 | 4 C4 2.3820 0.0960 -0.0201 C.ar 1 UNK -0.0438 12 | 5 C5 2.2849 1.4746 -0.0118 C.ar 1 UNK 0.0267 13 | 6 N6 1.1072 2.0677 0.0026 N.ar 1 UNK -0.2647 14 | 7 H7 -0.9627 1.8988 0.0169 H 1 UNK 0.0840 15 | 8 H8 -0.9217 -0.5635 0.0075 H 1 UNK 0.0639 16 | 9 H9 1.2671 -1.7422 -0.0190 H 1 UNK 0.0624 17 | 10 H10 3.3495 -0.3839 -0.0316 H 1 UNK 0.0639 18 | 11 H11 3.1838 2.0731 -0.0171 H 1 UNK 0.0840 19 | @BOND 20 | 1 1 6 ar 21 | 2 1 2 ar 22 | 3 1 7 1 23 | 4 2 3 ar 24 | 5 2 8 1 25 | 6 3 4 ar 26 | 7 3 9 1 27 | 8 4 5 ar 28 | 9 4 10 1 29 | 10 5 6 ar 30 | 11 5 11 1 31 | @SUBSTRUCTURE 32 | 1 UNK 1 RESIDUE 4 A UNK 0 ROOT 33 | 34 | @MOLECULE 35 | mol_sec 36 | 9 9 1 0 0 37 | SMALL 38 | AMBER ff14SB 39 | 40 | 41 | @ATOM 42 | 1 C1 1.2973 -0.3859 -0.0124 C.2 1 UNK 0.0838 43 | 2 N2 0.0021 -0.0041 0.0020 N.pl3 1 UNK -0.3106 44 | 3 H3 -0.7708 -0.5902 0.0062 H 1 UNK 0.1532 45 | 4 C4 -0.0165 1.3646 0.0095 C.2 1 UNK 0.0120 46 | 5 C5 1.2671 1.7717 -0.0005 C.2 1 UNK 0.0422 47 | 6 N6 2.0482 0.6814 -0.0138 N.2 1 UNK -0.2480 48 | 7 H7 1.6529 -1.4057 -0.0216 H 1 UNK 0.1014 49 | 8 H8 -0.8923 1.9965 0.0173 H 1 UNK 0.0806 50 | 9 H9 1.6079 2.7966 0.0017 H 1 UNK 0.0854 51 | @BOND 52 | 1 1 6 2 53 | 2 1 2 1 54 | 3 1 7 1 55 | 4 2 3 1 56 | 5 2 4 1 57 | 6 4 5 2 58 | 7 4 8 1 59 | 8 5 6 1 60 | 9 5 9 1 61 | @SUBSTRUCTURE 62 | 1 UNK 1 RESIDUE 4 A UNK 0 ROOT 63 | 64 | @MOLECULE 65 | mol_third 66 | 9 9 1 0 0 67 | SMALL 68 | AMBER ff14SB 69 | 70 | 71 | @ATOM 72 | 1 C1 1.2973 -0.3859 -0.0124 C 1 UNK 0.0838 73 | 2 N2 0.0021 -0.0041 0.0020 N 1 UNK -0.3106 74 | 3 H3 -0.7708 -0.5902 0.0062 H 1 UNK 0.1532 75 | 4 C4 -0.0165 1.3646 0.0095 C 1 UNK 0.0120 76 | 5 C5 1.2671 1.7717 -0.0005 C 1 UNK 0.0422 77 | 6 N6 2.0482 0.6814 -0.0138 N 1 UNK -0.2480 78 | 7 H7 1.6529 -1.4057 -0.0216 H 1 UNK 0.1014 79 | 8 H8 -0.8923 1.9965 0.0173 H 1 UNK 0.0806 80 | 9 H9 1.6079 2.7966 0.0017 H 1 UNK 0.0854 81 | @BOND 82 | 1 1 6 2 83 | 2 1 2 1 84 | 3 1 7 1 85 | 4 2 3 1 86 | 5 2 4 1 87 | 6 4 5 2 88 | 7 4 8 1 89 | 8 5 6 1 90 | 9 5 9 1 91 | @SUBSTRUCTURE 92 | 1 UNK 1 RESIDUE 4 A UNK 0 ROOT 93 | 94 | @MOLECULE 95 | mol_sec_f 96 | 9 
9 1 0 0 97 | SMALL 98 | AMBER ff14SB 99 | 100 | 101 | @ATOM 102 | 1 C1 1.2973 -0.3859 -0.0124 C.2 1 UNK 0.0838 103 | 2 N2 0.0021 -0.0041 0.0020 N.pl3 1 UNK -0.3106 104 | 3 H3 -0.7708 -0.5902 0.0062 H 1 UNK 0.1532 105 | 4 C4 -0.0165 1.3646 0.0095 C.2 1 UNK 0.0120 106 | 5 C5 1.2671 1.7717 -0.0005 C.2 1 UNK 0.0422 107 | 6 N6 2.0482 0.6814 -0.0138 N.2 1 UNK -0.2480 108 | 7 H7 1.6529 -1.4057 -0.0216 H 1 UNK 0.1014 109 | 8 H8 -0.8923 1.9965 0.0173 H 1 UNK 0.0806 110 | 9 H9 1.6079 2.7966 0.0017 H 1 UNK 0.0854 111 | 112 | 1 1 6 2 113 | 2 1 2 1 114 | 3 1 7 1 115 | 4 2 3 1 116 | 5 2 4 1 117 | 6 4 5 2 118 | 7 4 8 1 119 | 8 5 6 1 120 | 9 5 9 1 121 | @SUBSTRUCTURE 122 | 1 UNK 1 RESIDUE 4 A UNK 0 ROOT 123 | 124 | @MOLECULE 125 | mol_sec_f1 126 | 9 9 1 0 0 127 | SMALL 128 | AMBER ff14SB 129 | 130 | 131 | 132 | 1 C1 1.2973 -0.3859 -0.0124 C.2 1 UNK 0.0838 133 | 2 N2 0.0021 -0.0041 0.0020 N.pl3 1 UNK -0.3106 134 | 3 H3 -0.7708 -0.5902 0.0062 H 1 UNK 0.1532 135 | 4 C4 -0.0165 1.3646 0.0095 C.2 1 UNK 0.0120 136 | 5 C5 1.2671 1.7717 -0.0005 C.2 1 UNK 0.0422 137 | 6 N6 2.0482 0.6814 -0.0138 N.2 1 UNK -0.2480 138 | 7 H7 1.6529 -1.4057 -0.0216 H 1 UNK 0.1014 139 | 8 H8 -0.8923 1.9965 0.0173 H 1 UNK 0.0806 140 | 9 H9 1.6079 2.7966 0.0017 H 1 UNK 0.0854 141 | @BOND 142 | 1 1 6 2 143 | 2 1 2 1 144 | 3 1 7 1 145 | 4 2 3 1 146 | 5 2 4 1 147 | 6 4 5 2 148 | 7 4 8 1 149 | 8 5 6 1 150 | 9 5 9 1 151 | @SUBSTRUCTURE 152 | 1 UNK 1 RESIDUE 4 A UNK 0 ROOT 153 | 154 | @MOLECULE 155 | mol_sec_f3 156 | 9 9 1 0 0 157 | SMALL 158 | AMBER ff14SB 159 | 160 | @ATOM 161 | @BOND 162 | 1 1 6 2 163 | 2 1 2 1 164 | 3 1 7 1 165 | 4 2 3 1 166 | 5 2 4 1 167 | 6 4 5 2 168 | 7 4 8 1 169 | 8 5 6 1 170 | 9 5 9 1 171 | @SUBSTRUCTURE 172 | 1 UNK 1 RESIDUE 4 A UNK 0 ROOT 173 | 174 | @MOLECULE 175 | mol_sec_f4 176 | 9 9 1 0 0 177 | SMALL 178 | AMBER ff14SB 179 | 180 | @ATOM 181 | @BOND 182 | @SUBSTRUCTURE 183 | 1 UNK 1 RESIDUE 4 A UNK 0 ROOT 184 | 185 | 186 | 187 | @MOLECULE 188 | 189 | 190 | 191 | @ATOM 192 | 1 C1 1.2973 -0.3859 -0.0124 C.2 1 UNK 0.0838 193 | 2 N2 0.0021 -0.0041 0.0020 N.pl3 1 UNK -0.3106 194 | 3 H3 -0.7708 -0.5902 0.0062 H 1 UNK 0.1532 195 | 4 C4 -0.0165 1.3646 0.0095 C.2 1 UNK 0.0120 196 | 5 C5 1.2671 1.7717 -0.0005 C.2 1 UNK 0.0422 197 | 6 N6 2.0482 0.6814 -0.0138 N.2 1 UNK -0.2480 198 | 7 H7 1.6529 -1.4057 -0.0216 H 1 UNK 0.1014 199 | 8 H8 -0.8923 1.9965 0.0173 H 1 UNK 0.0806 200 | 9 H9 1.6079 2.7966 0.0017 H 1 UNK 0.0854 201 | 202 | 1 1 6 2 203 | 2 1 2 1 204 | 3 1 7 1 205 | 4 2 3 1 206 | 5 2 4 1 207 | 6 4 5 2 208 | 7 4 8 1 209 | 8 5 6 1 210 | 9 5 9 1 211 | @SUBSTRUCTURE 212 | 1 UNK 1 RESIDUE 4 A UNK 0 ROOT 213 | 214 | @MOLECULE 215 | mol_sec 216 | 9 9 1 0 0 217 | SMALL 218 | AMBER ff14SB 219 | 220 | 221 | @ATOM 222 | 1 C1 1.2973 -0.3859 -0.0124 C 1 UNK 0.0838 223 | 2 N2 0.0021 -0.0041 0.0020 N 1 UNK -0.3106 224 | 3 H3 -0.7708 -0.5902 0.0062 H 1 UNK 0.1532 225 | 4 C4 -0.0165 1.3646 0.0095 C 1 UNK 0.0120 226 | 5 C5 1.2671 1.7717 -0.0005 C 1 UNK 0.0422 227 | 6 N6 2.0482 0.6814 -0.0138 N 1 UNK -0.2480 228 | 7 H7 1.6529 -1.4057 -0.0216 H 1 UNK 0.1014 229 | 8 H8 -0.8923 1.9965 0.0173 H 1 UNK 0.0806 230 | 9 H9 1.6079 2.7966 0.0017 H 1 UNK 0.0854 231 | @BOND 232 | 1 1 6 2 233 | 2 1 2 1 234 | 3 1 7 1 235 | 4 2 3 1 236 | 5 2 4 1 237 | 6 4 5 2 238 | 7 4 8 1 239 | 8 5 6 1 240 | 9 5 9 1 241 | @SUBSTRUCTURE 242 | 1 UNK 1 RESIDUE 4 A UNK 0 ROOT 243 | 244 | -------------------------------------------------------------------------------- /tests/test_align.py: -------------------------------------------------------------------------------- 1 | 
import pytest 2 | 3 | import pandas as pd 4 | import datamol as dm 5 | 6 | 7 | def test_template_align(): 8 | data: pd.DataFrame = dm.cdk2(as_df=True) # type: ignore 9 | data = data.iloc[:6].copy() # type: ignore 10 | 11 | template = data.iloc[0]["mol"] 12 | data["aligned_mol"] = data["mol"].apply(lambda x: dm.align.template_align(x, template=template)) 13 | assert bool(data["aligned_mol"].apply(lambda x: isinstance(x, dm.Mol)).all()) is True 14 | 15 | template = data.iloc[0]["smiles"] 16 | data["aligned_mol"] = data["smiles"].apply( 17 | lambda x: dm.align.template_align(x, template=template) 18 | ) 19 | assert bool(data["aligned_mol"].apply(lambda x: isinstance(x, dm.Mol)).all()) is True 20 | 21 | template = data.iloc[0]["mol"] 22 | data["aligned_mol"] = data["mol"].apply( 23 | lambda x: dm.align.template_align(x, template=template, auto_select_coord_gen=True) 24 | ) 25 | assert bool(data["aligned_mol"].apply(lambda x: isinstance(x, dm.Mol)).all()) is True 26 | 27 | template = data.iloc[0]["mol"] 28 | data["aligned_mol"] = data["mol"].apply( 29 | lambda x: dm.align.template_align(x, template=template, use_depiction=False) 30 | ) 31 | assert bool(data["aligned_mol"].apply(lambda x: isinstance(x, dm.Mol)).all()) is True 32 | 33 | template = None 34 | data["aligned_mol"] = data["mol"].apply(lambda x: dm.align.template_align(x, template=template)) 35 | assert bool(data["aligned_mol"].apply(lambda x: isinstance(x, dm.Mol)).all()) is True 36 | 37 | template = None 38 | data["aligned_mol"] = data["mol"].apply( 39 | lambda x: dm.align.template_align(x, template=template, copy=False) 40 | ) 41 | assert bool(data["aligned_mol"].apply(lambda x: isinstance(x, dm.Mol)).all()) is True 42 | 43 | assert dm.align.template_align(None) is None 44 | 45 | 46 | def test_auto_align_many(): 47 | data: pd.DataFrame = dm.solubility(as_df=True) # type: ignore 48 | data = data.iloc[:16].copy() # type: ignore 49 | 50 | excepted_cluster_size = [8, 6, 5, 6, 6] 51 | 52 | for i, partition_method in enumerate( 53 | [ 54 | "cluster", 55 | "scaffold", 56 | "anongraph-scaffold", 57 | "anon-scaffold", 58 | "strip-scaffold", 59 | ] 60 | ): 61 | print(partition_method) 62 | 63 | data["aligned_mol"] = dm.align.auto_align_many( 64 | data["mol"], 65 | partition_method=partition_method, 66 | ) 67 | 68 | props = data["aligned_mol"].apply(lambda x: pd.Series(x.GetPropsAsDict())) 69 | 70 | assert "dm.auto_align_many.cluster_id" in props.columns 71 | assert "dm.auto_align_many.core" in props.columns 72 | assert props["dm.auto_align_many.cluster_id"].dtype.name == "int64" 73 | assert props["dm.auto_align_many.core"].dtype.name == "object" 74 | 75 | assert props["dm.auto_align_many.cluster_id"].unique().shape[0] == excepted_cluster_size[i] 76 | 77 | with pytest.raises(ValueError): 78 | dm.align.auto_align_many(data["mol"], partition_method="invalid") 79 | -------------------------------------------------------------------------------- /tests/test_cluster.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import datamol as dm 4 | 5 | 6 | def test_cluster_mols(): 7 | # Get some mols 8 | data = dm.data.freesolv() 9 | smiles = data["smiles"].iloc[:100].tolist() 10 | mols = [dm.to_mol(s) for s in smiles] 11 | 12 | _, mol_clusters = dm.cluster_mols(mols, cutoff=0.7) 13 | cluster_sizes = [11, 7, 5, 3, 3, 3, 2, 3, 2, 1, 2, 2, 1] 14 | assert [len(c) for c in mol_clusters[:13]] == cluster_sizes 15 | 16 | 17 | def test_pick_diverse(): 18 | # Get some mols 19 | data = 
dm.data.freesolv() 20 | smiles = data["smiles"].iloc[:100].tolist() 21 | mols = [dm.to_mol(s) for s in smiles] 22 | 23 | indices, _ = dm.pick_diverse(mols, npick=18, seed=19) 24 | 25 | excepted_indices = np.array( 26 | [9, 14, 47, 50, 56, 61, 67, 89, 83, 90, 94, 10, 0, 96, 15, 58, 71, 21] 27 | ) 28 | 29 | assert np.all(indices == excepted_indices) 30 | 31 | 32 | def test_pick_centroids(): 33 | data = dm.data.freesolv() 34 | smiles = data["smiles"].iloc[:100].tolist() 35 | mols = [dm.to_mol(s) for s in smiles] 36 | indices, centroids = dm.pick_centroids( 37 | mols, npick=18, threshold=0.7, method="sphere", n_jobs=-1 38 | ) 39 | excepted_indices = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 13, 14, 15, 16, 17, 18, 20]) 40 | 41 | assert np.all(indices == excepted_indices) 42 | 43 | 44 | def test_assign_to_centroids(): 45 | data = dm.data.freesolv() 46 | smiles = data["smiles"].iloc[:100].tolist() 47 | mols = [dm.to_mol(s) for s in smiles] 48 | indices, centroids = dm.pick_centroids( 49 | mols, npick=18, threshold=0.7, method="sphere", n_jobs=-1 50 | ) 51 | 52 | cluster_map, cluster_list = dm.assign_to_centroids(mols, centroids, n_jobs=-1) 53 | # expect centroid to be in centroid list 54 | assert indices[0] in cluster_map[0] 55 | # expect no intersection after assignment 56 | map_intersection = set.intersection(*map(set, cluster_map.values())) 57 | assert len(map_intersection) == 0 58 | # expect some similar molecule in a given cluster 59 | # assert 33 in cluster_map[0] 60 | -------------------------------------------------------------------------------- /tests/test_data.py: -------------------------------------------------------------------------------- 1 | import datamol as dm 2 | 3 | 4 | def test_freesolv(): 5 | data = dm.data.freesolv() 6 | assert data.shape == (642, 4) 7 | assert list(data.columns) == ["iupac", "smiles", "expt", "calc"] 8 | 9 | 10 | def test_cdk2(): 11 | data = dm.data.cdk2() 12 | assert data.shape == (47, 12) 13 | assert list(data.columns) == [ 14 | "smiles", 15 | "mol", 16 | "id", 17 | "Cluster", 18 | "MODEL.SOURCE", 19 | "MODEL.CCRATIO", 20 | "r_mmffld_Potential_Energy-OPLS_2005", 21 | "r_mmffld_RMS_Derivative-OPLS_2005", 22 | "b_mmffld_Minimization_Converged-OPLS_2005", 23 | "s_st_Chirality_1", 24 | "s_st_Chirality_2", 25 | "s_st_Chirality_3", 26 | ] 27 | 28 | 29 | def test_solubility(): 30 | data = dm.data.solubility() 31 | assert data.shape == (1282, 7) 32 | assert list(data.columns) == [ 33 | "mol", 34 | "ID", 35 | "NAME", 36 | "SOL", 37 | "SOL_classification", 38 | "smiles", 39 | "split", 40 | ] 41 | 42 | 43 | def test_chembl_drugs(): 44 | data = dm.data.chembl_drugs() 45 | assert data.shape == (2628, 5) 46 | assert list(data.columns) == [ 47 | "first_approval", 48 | "molecule_chembl_id", 49 | "molecule_type", 50 | "pref_name", 51 | "smiles", 52 | ] 53 | 54 | 55 | def test_chembl_samples(): 56 | data = dm.data.chembl_samples() 57 | assert data.shape == (2000, 1) 58 | assert list(data.columns) == ["smiles"] 59 | -------------------------------------------------------------------------------- /tests/test_descriptors.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import pandas as pd 4 | import datamol as dm 5 | 6 | 7 | def test_descriptors(): 8 | smiles_list = ["CC(=O)OC1=CC=CC=C1C(=O)O", "CCN(CC)CCCC(C)NC1=C2C=CC(=CC2=NC=C1)Cl"] 9 | 10 | for smiles in smiles_list: 11 | mol = dm.to_mol(smiles) 12 | 13 | dm.descriptors.mw(mol) 14 | dm.descriptors.fsp3(mol) 15 | dm.descriptors.n_hba(mol) 16 | 
dm.descriptors.n_hbd(mol) 17 | dm.descriptors.n_lipinski_hba(mol) 18 | dm.descriptors.n_lipinski_hbd(mol) 19 | dm.descriptors.n_rings(mol) 20 | dm.descriptors.n_hetero_atoms(mol) 21 | dm.descriptors.n_heavy_atoms(mol) 22 | dm.descriptors.n_rotatable_bonds(mol) 23 | dm.descriptors.n_aliphatic_rings(mol) 24 | dm.descriptors.n_aromatic_rings(mol) 25 | dm.descriptors.n_saturated_rings(mol) 26 | dm.descriptors.n_radical_electrons(mol) 27 | dm.descriptors.tpsa(mol) 28 | dm.descriptors.qed(mol) 29 | dm.descriptors.clogp(mol) 30 | dm.descriptors.sas(mol) 31 | dm.descriptors.sas(mol) 32 | dm.descriptors.n_stereo_centers_unspecified(mol) 33 | dm.descriptors.n_spiro_atoms(mol) 34 | 35 | dm.descriptors.n_aliphatic_carbocycles(mol) 36 | dm.descriptors.n_aliphatic_heterocyles(mol) 37 | dm.descriptors.n_aliphatic_rings(mol) 38 | dm.descriptors.n_aromatic_carbocycles(mol) 39 | dm.descriptors.n_aromatic_heterocyles(mol) 40 | dm.descriptors.n_aromatic_rings(mol) 41 | dm.descriptors.n_saturated_carbocycles(mol) 42 | dm.descriptors.n_saturated_heterocyles(mol) 43 | dm.descriptors.n_saturated_rings(mol) 44 | 45 | 46 | def test_compute_many_descriptors(): 47 | mol = dm.to_mol("CCN(CC)CCCC(C)NC1=C2C=CC(=CC2=NC=C1)Cl") 48 | 49 | true_values = pd.Series( 50 | { 51 | "mw": 319.181525512, 52 | "fsp3": 0.5, 53 | "n_lipinski_hba": 3.0, 54 | "n_lipinski_hbd": 1.0, 55 | "n_rings": 2.0, 56 | "n_hetero_atoms": 4.0, 57 | "n_heavy_atoms": 22.0, 58 | "n_rotatable_bonds": 8.0, 59 | "n_radical_electrons": 0.0, 60 | "tpsa": 28.16, 61 | "qed": 0.7564117572128701, 62 | "clogp": 4.810600000000004, 63 | "sas": 2.670786229594949, 64 | "n_aliphatic_carbocycles": 0.0, 65 | "n_aliphatic_heterocyles": 0.0, 66 | "n_aliphatic_rings": 0.0, 67 | "n_aromatic_carbocycles": 1.0, 68 | "n_aromatic_heterocyles": 1.0, 69 | "n_aromatic_rings": 2.0, 70 | "n_saturated_carbocycles": 0.0, 71 | "n_saturated_heterocyles": 0.0, 72 | "n_saturated_rings": 0.0, 73 | } 74 | ) 75 | 76 | # Scenario #1 77 | props = dm.descriptors.compute_many_descriptors(mol) 78 | props = pd.Series(props) 79 | 80 | assert props.equals(true_values) 81 | 82 | # Scenario #2 83 | props = dm.descriptors.compute_many_descriptors( 84 | mol, 85 | properties_fn={"hello": lambda x: 88}, 86 | add_properties=False, 87 | ) 88 | assert props == {"hello": 88} 89 | 90 | # Scenario #3 91 | props = dm.descriptors.compute_many_descriptors( 92 | mol, 93 | properties_fn={"hello": lambda x: 88}, 94 | add_properties=True, 95 | ) 96 | props = pd.Series(props) 97 | 98 | true_values_2 = true_values.copy() 99 | true_values_2["hello"] = 88 100 | true_values_2 = true_values_2[props.index] 101 | 102 | assert true_values_2.equals(props) 103 | 104 | 105 | def test_compute_many_descriptors_with_function_as_string(): 106 | mol = dm.to_mol("CC(=O)OC1=CC=CC=C1C(=O)O") 107 | 108 | results = dm.descriptors.compute_many_descriptors( 109 | mol, 110 | properties_fn={"max_partial_charge": "MaxPartialCharge"}, 111 | add_properties=False, 112 | ) 113 | 114 | assert "max_partial_charge" in results.keys() 115 | assert pytest.approx(0.33900378687731025) == results["max_partial_charge"] 116 | 117 | 118 | def test_batch_compute_many_descriptors(): 119 | data = dm.data.freesolv() 120 | data = data.iloc[:30] 121 | mols = data["smiles"].apply(dm.to_mol).tolist() 122 | 123 | props = dm.descriptors.batch_compute_many_descriptors( 124 | mols, 125 | batch_size=64, 126 | n_jobs=-1, 127 | progress=False, 128 | ) 129 | 130 | assert set(props.columns.tolist()) == { 131 | "mw", 132 | "fsp3", 133 | "n_lipinski_hba", 134 | "n_lipinski_hbd", 
135 | "n_rings", 136 | "n_hetero_atoms", 137 | "n_heavy_atoms", 138 | "n_rotatable_bonds", 139 | "n_radical_electrons", 140 | "tpsa", 141 | "qed", 142 | "clogp", 143 | "sas", 144 | "n_aliphatic_carbocycles", 145 | "n_aliphatic_heterocyles", 146 | "n_aliphatic_rings", 147 | "n_aromatic_carbocycles", 148 | "n_aromatic_heterocyles", 149 | "n_aromatic_rings", 150 | "n_saturated_carbocycles", 151 | "n_saturated_heterocyles", 152 | "n_saturated_rings", 153 | } 154 | assert props.shape == (30, 22) 155 | 156 | 157 | def test_any_rdkit_descriptor(): 158 | mol = dm.to_mol("CC(=O)OC1=CC=CC=C1C(=O)O") 159 | 160 | value = dm.descriptors.any_rdkit_descriptor("MaxPartialCharge")(mol) 161 | assert pytest.approx(value) == 0.33900378687731025 162 | 163 | value = dm.descriptors.any_rdkit_descriptor("CalcFractionCSP3")(mol) 164 | assert pytest.approx(value) == 0.1111111111111111 165 | 166 | with pytest.raises(ValueError): 167 | dm.descriptors.any_rdkit_descriptor("DOES NOT EXIST") 168 | 169 | 170 | def test_n_aromatic_atoms(): 171 | smiles = "Nc1cnn(-c2ccccc2)c(=O)c1Cl" 172 | mol = dm.to_mol(smiles) 173 | 174 | assert dm.descriptors.n_aromatic_atoms(mol) == 12 175 | assert dm.descriptors.n_aromatic_atoms_proportion(mol) == 0.8 176 | 177 | 178 | def test_formal_charge(): 179 | mol = dm.to_mol("CC(=O)NC1CCC2=CC(=C(C(=C2C3=CC=C(C(=O)C=C13)OC)OC)OC)OC") 180 | assert dm.descriptors.formal_charge(mol) == 0 181 | 182 | mol = dm.to_mol("C(CC(=O)[O-])C(C(=O)[O-])[NH3+]") 183 | assert dm.descriptors.formal_charge(mol) == -1 184 | 185 | 186 | def test_refractivity(): 187 | mol = dm.to_mol("CN1C(=O)CN=C(C2=C1C=CC(=C2)Cl)C3=CC=CC=C3") 188 | 189 | value = dm.descriptors.refractivity(mol) 190 | assert pytest.approx(value, rel=2) == 81.10 191 | 192 | 193 | def test_n_rigid_bonds(): 194 | mol = dm.to_mol("CC(=O)NC1CCC2=CC(=C(C(=C2C3=CC=C(C(=O)C=C13)OC)OC)OC)OC") 195 | assert dm.descriptors.n_rigid_bonds(mol) == 20 196 | 197 | mol = dm.to_mol("CN1C(=O)CN=C(C2=C1C=CC(=C2)Cl)C3=CC=CC=C3") 198 | assert dm.descriptors.n_rigid_bonds(mol) == 19 199 | 200 | 201 | def test_n_stereocenters(): 202 | mol = dm.to_mol("CC(=O)NC1CCC2=CC(=C(C(=C2C3=CC=C(C(=O)C=C13)OC)OC)OC)OC") 203 | 204 | assert dm.descriptors.n_stereo_centers(mol) == 1 205 | 206 | mol = dm.to_mol("CN1C(=O)CN=C(C2=C1C=CC(=C2)Cl)C3=CC=CC=C3") 207 | assert dm.descriptors.n_stereo_centers(mol) == 0 208 | 209 | 210 | def test_n_charged_atoms(): 211 | mol = dm.to_mol("C(CC(=O)[O-])C(C(=O)[O-])[NH3+]") 212 | assert dm.descriptors.n_charged_atoms(mol) == 3 213 | -------------------------------------------------------------------------------- /tests/test_fp.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import datamol as dm 4 | 5 | 6 | def test_to_fp(): 7 | smiles = "CC(=O)Oc1ccccc1C(=O)O" 8 | mol = dm.to_mol(smiles) 9 | 10 | assert dm.to_fp(mol).shape[0] == 2048 11 | assert dm.to_fp(mol).sum() == 31 12 | 13 | 14 | def test_list_fp(): 15 | assert set(dm.list_supported_fingerprints().keys()) == { 16 | "atompair", 17 | "atompair-count", 18 | "avalon-count", 19 | "ecfp", 20 | "fcfp", 21 | "ecfp-count", 22 | "erg", 23 | "estate", 24 | "fcfp-count", 25 | "layered", 26 | "maccs", 27 | "pattern", 28 | "rdkit", 29 | "topological", 30 | "topological-count", 31 | "rdkit-count", 32 | } 33 | 34 | 35 | def test_all_fps(): 36 | smiles = "CC(=O)Oc1ccccc1C(=O)O" 37 | mol = dm.to_mol(smiles) 38 | 39 | fp_infos = {} 40 | for fp_type in dm.list_supported_fingerprints(): 41 | fold_size = None 42 | if fp_type == "rdkit-count": 43 | 
fold_size = 2048 44 | 45 | print(fp_type) 46 | args = {} 47 | args["mol"] = mol 48 | args["as_array"] = True 49 | args["fp_type"] = fp_type 50 | args["fold_size"] = fold_size 51 | fp = dm.to_fp(**args) 52 | 53 | fp_infos[fp_type] = dict(size=len(fp), bits_sum=fp.sum()) 54 | 55 | print(fp_infos) 56 | 57 | assert fp_infos == { 58 | "maccs": {"size": 167, "bits_sum": 21}, 59 | "ecfp": {"size": 2048, "bits_sum": 31}, 60 | "fcfp": {"size": 2048, "bits_sum": 22}, 61 | "topological": {"size": 2048, "bits_sum": 18}, 62 | "atompair": {"size": 2048, "bits_sum": 68}, 63 | "rdkit": {"size": 2048, "bits_sum": 354}, 64 | "pattern": {"size": 2048, "bits_sum": 173}, 65 | "layered": {"size": 2048, "bits_sum": 335}, 66 | "erg": {"size": 315, "bits_sum": 23.4}, 67 | "estate": {"size": 79, "bits_sum": 13}, 68 | "avalon-count": {"size": 512, "bits_sum": 168}, 69 | "ecfp-count": {"size": 2048, "bits_sum": 42}, 70 | "fcfp-count": {"size": 2048, "bits_sum": 35}, 71 | "topological-count": {"size": 2048, "bits_sum": 19}, 72 | "atompair-count": {"size": 2048, "bits_sum": 78}, 73 | "rdkit-count": {"size": 2048, "bits_sum": 301}, 74 | } 75 | 76 | 77 | def test_fp_invalid_input(): 78 | args = {} 79 | args["mol"] = None 80 | args["radius"] = 3 81 | 82 | with pytest.raises(ValueError): 83 | dm.to_fp(**args) 84 | 85 | args["mol"] = "dsdsdsd" 86 | with pytest.raises(ValueError): 87 | dm.to_fp(**args) 88 | -------------------------------------------------------------------------------- /tests/test_fragment.py: -------------------------------------------------------------------------------- 1 | import datamol as dm 2 | 3 | 4 | def test_brics(): 5 | smiles = "CCCOCc1cc(c2ncccc2)ccc1" 6 | mol = dm.to_mol(smiles) 7 | frags = dm.fragment.brics(mol) 8 | assert len(frags) == 9 9 | 10 | 11 | def test_frag(): 12 | smiles = "CCCOCc1cc(c2ncccc2)ccc1" 13 | mol = dm.to_mol(smiles) 14 | frags = dm.fragment.frag(mol) 15 | assert len(frags) == 9 16 | 17 | 18 | def test_recap(): 19 | smiles = "CCCOCc1cc(c2ncccc2)ccc1" 20 | mol = dm.to_mol(smiles) 21 | frags = dm.fragment.recap(mol) 22 | assert len(frags) == 3 23 | 24 | 25 | def test_anybreak(): 26 | smiles = "CCCOCc1cc(c2ncccc2)ccc1" 27 | mol = dm.to_mol(smiles) 28 | frags = dm.fragment.anybreak(mol) 29 | assert len(frags) == 9 30 | 31 | 32 | def test_mmpa(): 33 | smiles = "CCCOCc1cc(c2ncccc2)ccc1" 34 | mol = dm.to_mol(smiles) 35 | 36 | frags = dm.fragment.mmpa_cut(mol) 37 | assert len(frags) == 39 38 | assert "CCCOCc1cccc(-c2ccccn2)c1,C(C[*:2])[*:1],C[*:1].c1ccc(-c2cccc(CO[*:2])c2)nc1\n" in frags 39 | 40 | 41 | def test_assemble(): 42 | # Fragment a molecule 43 | smiles = "CCCOCc1cc(c2ncccc2)ccc1" 44 | mol = dm.to_mol(smiles) 45 | frags = dm.fragment.brics(mol) 46 | 47 | # Limit the number of fragments to work with because 48 | # assembling is computationally intensive. 
49 | frags = frags[:2] 50 | 51 | # Assemble molecules from the list of fragments 52 | mols = list(dm.fragment.assemble_fragment_order(frags, max_n_mols=4)) 53 | 54 | assert len(mols) == 4 55 | 56 | 57 | def test_break_mol(): 58 | smiles = "CCCOCc1cc(c2ncccc2)ccc1" 59 | mol = dm.to_mol(smiles) 60 | fragments, *_, tree = dm.fragment.break_mol(mol, randomize=False, mode="brics", returnTree=True) 61 | 62 | assert fragments == ["CCC", "O", "C", "c1ccncc1", "c1ccccc1"] 63 | assert list(tree.nodes) == [0, 1, 2, 3, 4, 5, 6, 7, 8] 64 | assert list(tree.edges) == [(0, 1), (0, 2), (2, 3), (2, 4), (4, 5), (4, 6), (6, 7), (6, 8)] 65 | 66 | 67 | def test_assemble_build(): 68 | mols = [[dm.to_mol("CCCO"), dm.to_mol("CCCCCCCO")], [dm.to_mol("CCC"), dm.to_mol("CCCCCCC")]] 69 | 70 | results = list(dm.fragment.build(mols)) 71 | assert len(results) == 71 72 | 73 | results = list(dm.fragment.build(mols, mode="rxn")) 74 | assert len(results) == 0 75 | 76 | results = list(dm.fragment.build(mols, mode=None)) 77 | assert len(results) == 0 78 | -------------------------------------------------------------------------------- /tests/test_import.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import datamol as dm 4 | 5 | 6 | def test_datamol_import_fails(): 7 | with pytest.raises(AttributeError): 8 | dm.that_import_does_not_exist 9 | -------------------------------------------------------------------------------- /tests/test_isomers.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import datamol as dm 4 | 5 | 6 | def test_enumerate_tautomers(): 7 | mol = dm.to_mol("OC1=CC2CCCCC2[N:1]=C1") 8 | 9 | mols = dm.enumerate_tautomers(mol, n_variants=10) 10 | 11 | assert {dm.to_smiles(m) for m in mols} == {"O=C1C=[N:1]C2CCCCC2C1", "OC1=CC2CCCCC2[N:1]=C1"} 12 | 13 | 14 | def test_enumerate_stereo(): 15 | mol = dm.to_mol("OC1=CC2CCCCC2[N:1]=C1") 16 | 17 | mols = dm.enumerate_stereoisomers(mol, n_variants=10) 18 | 19 | assert {dm.to_smiles(m) for m in mols} == { 20 | "OC1=C[C@@H]2CCCC[C@@H]2[N:1]=C1", 21 | "OC1=C[C@@H]2CCCC[C@H]2[N:1]=C1", 22 | "OC1=C[C@H]2CCCC[C@@H]2[N:1]=C1", 23 | "OC1=C[C@H]2CCCC[C@H]2[N:1]=C1", 24 | } 25 | 26 | 27 | def test_enumerate_stereo_undefined_failure(): 28 | mol = dm.to_mol( 29 | "N=1C(NC2CC2)=C3C(=NC1)N(/C=C/C=4C=C(C=CC4C)C(=O)NC=5C=C(C=C(C5)N6CCN(CC6)C)C(F)(F)F)C=N3" 30 | ) 31 | with pytest.raises(RuntimeError): 32 | dm.enumerate_stereoisomers(mol, clean_it=True) 33 | 34 | mols = dm.enumerate_stereoisomers(mol, clean_it=False) 35 | assert len(mols) == 2 # only one double bond 36 | 37 | 38 | def test_enumerate_stereo_timeout(): 39 | mol = dm.to_mol("CCCCC") 40 | 41 | # NOTE(hadim): it's impossible to predict anything given a timeout for different 42 | # machines so we here we just check the code can run without errors 43 | dm.enumerate_stereoisomers(mol, n_variants=2, timeout_seconds=1) 44 | 45 | 46 | def test_count_stereoisomers(): 47 | num_isomers_1 = dm.count_stereoisomers(dm.to_mol("CC=CC"), undefined_only=True) 48 | num_isomers_2 = dm.count_stereoisomers(dm.to_mol("CC=CC"), undefined_only=False) 49 | assert num_isomers_1 == num_isomers_2 50 | 51 | assert dm.count_stereoisomers(dm.to_mol("Br/C=C\\Br"), undefined_only=True) == 1 52 | 53 | 54 | def test_enumerate_structural(): 55 | mol = dm.to_mol("CCCCC") # pentane has only three structural isomers 56 | 57 | mols_iso = dm.enumerate_structisomers( 58 | mol, 59 | n_variants=2, 60 | allow_cycle=False, 61 | depth=1, 62 | 
allow_double_bond=False, 63 | allow_triple_bond=False, 64 | ) 65 | 66 | assert {dm.to_smiles(m) for m in mols_iso} == {"CCC(C)C"} 67 | 68 | # NOTE(hadim): disable to reduce testing time 69 | # mols_cyclo_iso = dm.enumerate_structisomers(mol, n_variants=5, depth=2, allow_cycle=True) 70 | 71 | # # expect 3 molecules with cycles 72 | # assert sum([Chem.rdMolDescriptors.CalcNumRings(x) == 1 for x in mols_cyclo_iso]) == 3 # type: ignore 73 | 74 | # mols_cyclo_iso_double = dm.enumerate_structisomers( 75 | # mol, n_variants=10, allow_cycle=True, allow_double_bond=True 76 | # ) 77 | # should have mol with double link 78 | # assert sum(["=" in dm.to_smiles(x) for x in mols_cyclo_iso_double]) > 0 79 | 80 | 81 | @pytest.mark.skip_platform("win") 82 | def test_enumerate_structural_timeout(): 83 | mol = dm.to_mol("CCCCC") 84 | 85 | # NOTE(hadim): it's impossible to predict anything given a timeout for different 86 | # machines so we here we just check the code can run without errors 87 | dm.enumerate_structisomers(mol, n_variants=10, timeout_seconds=1) 88 | 89 | 90 | def test_canonical_tautomer(): 91 | smiles = "Oc1c(cccc3)c3nc2ccncc12" 92 | mol = dm.to_mol(smiles) 93 | 94 | canonical_mol = dm.canonical_tautomer(mol) 95 | 96 | assert dm.to_smiles(canonical_mol) == "O=c1c2ccccc2[nH]c2ccncc12" 97 | assert dm.to_inchikey(canonical_mol) == dm.to_inchikey(mol) 98 | 99 | 100 | def test_remove_stereochemistry(): 101 | mol = dm.to_mol("C[C@H]1CCC[C@@H](C)[C@@H]1Cl") 102 | mol_no_stereo = dm.remove_stereochemistry(mol) 103 | assert dm.to_smiles(mol_no_stereo) == "CC1CCCC(C)C1Cl" 104 | -------------------------------------------------------------------------------- /tests/test_log.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import datamol as dm 4 | 5 | 6 | @dm.no_rdkit_log 7 | def no_log_to_mol(smiles): 8 | return dm.to_mol(smiles) 9 | 10 | 11 | def check_logs_are_shown(capfd): 12 | smiles = "fake_smiles" 13 | dm.to_mol(smiles) 14 | _, err = capfd.readouterr() 15 | assert "SMILES Parse Error" in err 16 | 17 | 18 | def check_logs_are_not_shown(capfd): 19 | smiles = "fake_smiles" 20 | dm.to_mol(smiles) 21 | _, err = capfd.readouterr() 22 | assert err == "" 23 | 24 | 25 | def check_logs_are_not_shown_deco(capfd): 26 | smiles = "fake_smiles" 27 | no_log_to_mol(smiles) 28 | _, err = capfd.readouterr() 29 | assert err == "" 30 | 31 | 32 | @pytest.mark.skip_platform("win") 33 | def test_rdkit_log(capfd): 34 | """Test multiple rdkit log scenarios.""" 35 | 36 | check_logs_are_shown(capfd) 37 | check_logs_are_not_shown_deco(capfd) 38 | 39 | check_logs_are_shown(capfd) 40 | with dm.without_rdkit_log(): 41 | check_logs_are_not_shown(capfd) 42 | check_logs_are_shown(capfd) 43 | 44 | dm.disable_rdkit_log() 45 | check_logs_are_not_shown(capfd) 46 | 47 | dm.enable_rdkit_log() 48 | check_logs_are_shown(capfd) 49 | 50 | dm.disable_rdkit_log() 51 | with dm.without_rdkit_log(): 52 | check_logs_are_not_shown(capfd) 53 | check_logs_are_not_shown(capfd) 54 | 55 | 56 | @pytest.mark.skip_platform("win") 57 | def test_rdkit_log_enable(capfd): 58 | dm.enable_rdkit_log() 59 | 60 | with dm.without_rdkit_log(): 61 | check_logs_are_not_shown(capfd) 62 | 63 | with dm.without_rdkit_log(enable=False): 64 | check_logs_are_shown(capfd) 65 | 66 | check_logs_are_shown(capfd) 67 | -------------------------------------------------------------------------------- /tests/test_mcs.py: -------------------------------------------------------------------------------- 1 | import datamol as dm 
2 | 3 | 4 | def test_find_mcs(): 5 | smiles_list = [ 6 | "C=CC(=O)NCCOc1cc2ncnc(Nc3ccc(Br)cc3F)c2cc1NC(=O)C=C", 7 | "C=CC(=O)Nc1cc2c(Nc3ccc(F)c(Br)c3)ncnc2cc1OCCCN1CCOCC1", 8 | "C=CC(=O)Nc1cc2c(Nc3ccc(Br)cc3F)ncnc2cc1OCCCNC(=O)CN(C)C", 9 | "C=CC(=O)Nc1cc2c(Nc3ccc(Br)cc3F)ncnc2cc1OCCNC(=O)NCC", 10 | "C=CC(=O)Nc1cc2c(Nc3ccc(Br)cc3F)ncnc2cc1OCCNC(=O)CN(C)C", 11 | ] 12 | mols = [dm.to_mol(s) for s in smiles_list] 13 | smarts = dm.find_mcs(mols=mols, timeout=2) 14 | 15 | # NOTE(hadim): hash are different given different RDKit version 16 | expected_hashes = [ 17 | # RDKit >= 2023.09 18 | "762f483ac10cc0f45c5aa2c790f9ef52f8dfb337", 19 | # RDKit <= 2023.03 20 | "49eff32e405d17980fad428cf4063ec52e2c5fda", 21 | ] 22 | 23 | assert dm.hash_mol(dm.from_smarts(smarts)) in expected_hashes 24 | -------------------------------------------------------------------------------- /tests/test_molar.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import datamol as dm 4 | import numpy as np 5 | import pandas as pd 6 | 7 | 8 | MOLAR_TEST_VALUES = pd.DataFrame( 9 | [ 10 | (1, 6, "uM"), 11 | (0.059, 7.229147988357856, "uM"), 12 | (0.024, 7.61978876, "uM"), 13 | (0.187, 6.72815839, "uM"), 14 | (0.00154, 8.8124793, "uM"), 15 | (128, 6.892790, "nM"), 16 | (0.000128, 6.892790, "mM"), 17 | ], 18 | columns=["xc50", "pxc50", "unit"], 19 | ) 20 | 21 | 22 | def test_molar_to_log(): 23 | # test scalar 24 | value, log_value, unit = MOLAR_TEST_VALUES.iloc[0].values 25 | assert dm.molar.molar_to_log(value, unit=unit) == log_value 26 | 27 | # test arrays 28 | for unit in ["uM", "mM", "nM"]: 29 | mask = MOLAR_TEST_VALUES["unit"] == unit 30 | values = MOLAR_TEST_VALUES[mask]["xc50"].tolist() 31 | log_values = MOLAR_TEST_VALUES[mask]["pxc50"].tolist() 32 | np.testing.assert_almost_equal(dm.molar.molar_to_log(values, unit=unit), log_values) 33 | 34 | # test wrong unit 35 | with pytest.raises(ValueError): 36 | dm.molar.molar_to_log(0.000128, unit="kcal/mol") 37 | 38 | 39 | def test_log_to_molar(): 40 | # test scalar 41 | value, log_value, unit = MOLAR_TEST_VALUES.iloc[0].values 42 | np.testing.assert_almost_equal(dm.molar.log_to_molar(log_value, unit=unit), value) 43 | 44 | # test arrays 45 | for unit in ["uM", "mM", "nM"]: 46 | mask = MOLAR_TEST_VALUES["unit"] == unit 47 | values = MOLAR_TEST_VALUES[mask]["xc50"].tolist() 48 | log_values = MOLAR_TEST_VALUES[mask]["pxc50"].tolist() 49 | np.testing.assert_almost_equal( 50 | dm.molar.log_to_molar(log_values, unit=unit), values, decimal=5 51 | ) 52 | 53 | # test wrong unit 54 | with pytest.raises(ValueError): 55 | dm.molar.log_to_molar(7.214, unit="kcal/mol") 56 | 57 | 58 | def test_log_to_molar_with_integer(): 59 | dm.molar.log_to_molar(6, unit="uM") 60 | -------------------------------------------------------------------------------- /tests/test_notebooks.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import pathlib 3 | 4 | import nbformat 5 | import datamol as dm 6 | from nbconvert.preprocessors.execute import ExecutePreprocessor 7 | 8 | ROOT_DIR = pathlib.Path(__file__).parent.resolve() 9 | 10 | NOTEBOOK_DIR = ROOT_DIR.parent / "docs" / "tutorials" 11 | 12 | NOTEBOOK_PATHS = sorted(list(NOTEBOOK_DIR.glob("*.ipynb"))) 13 | 14 | # Discard `Filesystem.ipynb` because it takes too long to run. 
15 | NOTEBOOK_PATHS = list(filter(lambda x: "Filesystem.ipynb" != x.name, NOTEBOOK_PATHS)) 16 | 17 | 18 | @pytest.mark.skip_platform("win") 19 | @pytest.mark.parametrize("nb_path", NOTEBOOK_PATHS, ids=[str(n.name) for n in NOTEBOOK_PATHS]) 20 | def test_notebook(nb_path): 21 | # Setup and configure the processor to execute the notebook 22 | if "Visualization.ipynb" in nb_path.name and not dm.is_greater_than_current_rdkit_version( 23 | "2023.03" 24 | ): 25 | pytest.skip("Circle Grid requires rdkit>2022.09") 26 | ep = ExecutePreprocessor(timeout=600, kernel_name="python") 27 | 28 | # Open the notebook 29 | with open(nb_path) as f: 30 | nb = nbformat.read(f, as_version=nbformat.NO_CONVERT) 31 | 32 | # Execute the notebook 33 | ep.preprocess(nb, {"metadata": {"path": NOTEBOOK_DIR}}) 34 | -------------------------------------------------------------------------------- /tests/test_predictors.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import datamol as dm 4 | import numpy as np 5 | 6 | 7 | def test_esol(): 8 | smiles = "Nc1cnn(-c2ccccc2)c(=O)c1Cl" 9 | mol = dm.to_mol(smiles) 10 | 11 | assert np.allclose(dm.predictors.esol(mol), -2.627091966265316) 12 | 13 | 14 | def test_esol_from_data(): 15 | data = dm.freesolv() 16 | data = data.iloc[:20] 17 | 18 | with pytest.raises(KeyError): 19 | dm.predictors.esol_from_data(data) 20 | 21 | data["mol"] = data["smiles"].apply(dm.to_mol) 22 | data["clogp"] = data["mol"].apply(dm.descriptors.clogp) 23 | data["mw"] = data["mol"].apply(dm.descriptors.mw) 24 | data["n_rotatable_bonds"] = data["mol"].apply(dm.descriptors.n_rotatable_bonds) 25 | data["n_aromatic_atoms_proportion"] = data["mol"].apply( 26 | dm.descriptors.n_aromatic_atoms_proportion 27 | ) 28 | 29 | # dataframe 30 | esol_values = dm.predictors.esol_from_data(data) 31 | assert esol_values.dtype == float 32 | assert esol_values.shape == (20,) 33 | 34 | # series 35 | v = dm.predictors.esol_from_data(data.iloc[0]) 36 | v = float(v) 37 | assert isinstance(v, float) 38 | 39 | # dict 40 | v = dm.predictors.esol_from_data(data.iloc[0].to_dict()) 41 | v = float(v) 42 | assert isinstance(v, float) 43 | -------------------------------------------------------------------------------- /tests/test_scaffold.py: -------------------------------------------------------------------------------- 1 | import datamol as dm 2 | 3 | 4 | def test_fuzzy_scaffolding(): 5 | smiles = [ 6 | "Cc1ccc(NC(=O)Cn2cccn2)c(Br)c1", 7 | "COc1ccc(OC(C)C(=O)N=c2sccn2C)cc1", 8 | "CC(NC(=O)CSCc1cccs1)C1CCCO1", 9 | "CC1CCCCN1C(=O)CN1CCC[C@@H](N)C1", 10 | "CCC(CC)COC(=O)[C@H](C)N[P@](=O)(OC[C@H]1O[C@](C#N)([C@H](O)[C@@H]1O)C1=CC=C2N1N=CN=C2N)OC1=CC=CC=C1", # no way this one (Remdesivir) is in the db 11 | "COc1ccc(OC(C)C(=O)N=c2sccn2C)cc1", 12 | ] 13 | 14 | mols = [dm.to_mol(s) for s in smiles] 15 | 16 | # NOTE(hadim): different version of rdkit (2020.09 vs 2021.03) returns 17 | # different SMILES here. 
18 | # assert "O=C(CN1CCC[C@@H]([*:1])C1)N1CCCCC1[*:2]" in all_scaffolds 19 | # assert "O=C(CSCc1cccs1)NC(C1CCCO1)[*:1]" in all_scaffolds 20 | # assert "O=C(N=c1sccn1[*:1])C(Oc1ccc([*:3])cc1)[*:2]" in all_scaffolds 21 | 22 | all_scaffolds, df_scf2infos, df_scf2groups = dm.scaffold.fuzzy_scaffolding(mols) 23 | 24 | assert len(all_scaffolds) == 5 25 | assert len(df_scf2infos.columns) == 3 26 | 27 | # because we are returning the output for each scf 28 | # these should be the same 29 | assert len(df_scf2infos.index) == len(df_scf2groups.index) 30 | assert list(df_scf2infos["scf"]) == list(df_scf2groups["scf"]) 31 | 32 | # mere coincidence that scf2infos and scf2groups for the columns have the 33 | # the same length. the reason there are 3 not two is because it could have 34 | # extra columns where a cell may have none values. 35 | assert len(df_scf2groups.columns) == 3 36 | -------------------------------------------------------------------------------- /tests/test_similarity.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import numpy as np 4 | import datamol as dm 5 | import datamol.utils.testing 6 | 7 | 8 | def test_pdist(): 9 | smiles_list = ["CC(=O)Oc1ccccc1C(=O)O", "C1OC1CC", "c1cc2ccccc2cc1"] 10 | mols = [dm.to_mol(smiles) for smiles in smiles_list] 11 | 12 | dist_mat = dm.pdist(mols) 13 | 14 | assert dist_mat.shape == (3, 3) 15 | assert dist_mat.sum() == 5.6757105943152455 16 | 17 | dist_mat = dm.pdist(mols, n_jobs=None) 18 | 19 | assert dist_mat.shape == (3, 3) 20 | assert dist_mat.sum() == 5.6757105943152455 21 | 22 | 23 | def test_pdist_condensed(): 24 | smiles_list = ["CC(=O)Oc1ccccc1C(=O)O", "C1OC1CC", "c1cc2ccccc2cc1"] 25 | mols = [dm.to_mol(smiles) for smiles in smiles_list] 26 | 27 | dist_mat = dm.pdist(mols, squareform=False) 28 | 29 | assert dist_mat.shape == (3,) 30 | assert dist_mat.sum() == 2.8378552971576227 31 | 32 | 33 | def test_cdist(): 34 | smiles_list1 = ["CC(=O)Oc1ccccc1C(=O)O", "C1OC1CC", "c1cc2ccccc2cc1"] 35 | mols1 = [dm.to_mol(smiles) for smiles in smiles_list1] 36 | 37 | smiles_list2 = [ 38 | "COc1cc(Nc2ncc(Cl)c(-c3cccc(CC#N)c3)n2)ccc1N1CCN(C)CC1", 39 | "ON=C(O)CCCCCN=C(O)C=C1c2ccccc2-c2ccccc21", 40 | "COc1ccc(CCc2nnc(-c3ccc4nc[nH]c4c3)o2)cc1Cl", 41 | ] 42 | mols2 = [dm.to_mol(smiles) for smiles in smiles_list2] 43 | 44 | dist_mat = dm.cdist(mols1, mols2) 45 | 46 | assert dist_mat.shape == (3, 3) 47 | assert np.isclose(dist_mat.mean(), 0.9416270180919872) 48 | 49 | 50 | def test_cdist_chunked(): 51 | smiles_list1 = ["CC(=O)Oc1ccccc1C(=O)O", "C1OC1CC", "c1cc2ccccc2cc1"] 52 | mols1 = [dm.to_mol(smiles) for smiles in smiles_list1] 53 | 54 | smiles_list2 = [ 55 | "COc1cc(Nc2ncc(Cl)c(-c3cccc(CC#N)c3)n2)ccc1N1CCN(C)CC1", 56 | "ON=C(O)CCCCCN=C(O)C=C1c2ccccc2-c2ccccc21", 57 | "COc1ccc(CCc2nnc(-c3ccc4nc[nH]c4c3)o2)cc1Cl", 58 | ] 59 | mols2 = [dm.to_mol(smiles) for smiles in smiles_list2] 60 | 61 | d1 = dm.cdist(mols1, mols2, distances_chunk=True) 62 | d2 = dm.cdist(mols1, mols2, distances_chunk=False) 63 | 64 | assert d1.shape == d2.shape 65 | assert np.allclose(d1, d2) 66 | 67 | 68 | def test_cdist_pdist_consistent(): 69 | smiles_list1 = ["CC(=O)Oc1ccccc1C(=O)O", "C1OC1CC", "c1cc2ccccc2cc1"] 70 | mols1 = [dm.to_mol(smiles) for smiles in smiles_list1] 71 | 72 | dist_mat = dm.cdist(mols1, mols1) 73 | dist_mat2 = dm.pdist(mols1) 74 | 75 | assert np.isclose(dist_mat.mean(), dist_mat2.mean()) 76 | assert np.allclose(dist_mat, dist_mat2) 77 | 78 | 79 | def test_cdist_pdist_invalid_input(): 80 | smiles_list = 
["CC(=O)Oc1ccccc1C(=O)O", "C1OC1CC", "c1cc2ccccc2cc1", "dsdsdsd"] 81 | 82 | with pytest.raises(ValueError): 83 | dm.similarity.cdist(smiles_list, smiles_list) 84 | 85 | with pytest.raises(ValueError): 86 | dm.similarity.pdist(smiles_list) 87 | 88 | 89 | def test_datamol_pdist_same_as_rdkit(): 90 | smiles_list = [ 91 | "COc1cc(Nc2ncc(Cl)c(-c3cccc(CC#N)c3)n2)ccc1N1CCN(C)CC1", 92 | "ON=C(O)CCCCCN=C(O)C=C1c2ccccc2-c2ccccc21", 93 | "COc1ccc(CCc2nnc(-c3ccc4nc[nH]c4c3)o2)cc1Cl", 94 | ] 95 | 96 | dist_mat = dm.similarity.pdist(smiles_list) 97 | dist_mat_rdkit = datamol.utils.testing.pdist_rdkit(smiles_list) 98 | 99 | assert np.allclose(dist_mat, dist_mat_rdkit) 100 | 101 | 102 | def test_datamol_cdist_same_as_rdkit(): 103 | smiles_list = [ 104 | "COc1cc(Nc2ncc(Cl)c(-c3cccc(CC#N)c3)n2)ccc1N1CCN(C)CC1", 105 | "ON=C(O)CCCCCN=C(O)C=C1c2ccccc2-c2ccccc21", 106 | "COc1ccc(CCc2nnc(-c3ccc4nc[nH]c4c3)o2)cc1Cl", 107 | ] 108 | 109 | smiles_list2 = ["CC(=O)Oc1ccccc1C(=O)O", "C1OC1CC", "c1cc2ccccc2cc1"] 110 | 111 | dist_mat = dm.similarity.cdist(smiles_list, smiles_list2) 112 | dist_mat_rdkit = datamol.utils.testing.cdist_rdkit(smiles_list, smiles_list2) 113 | 114 | assert np.allclose(dist_mat, dist_mat_rdkit) 115 | -------------------------------------------------------------------------------- /tests/test_utils_fs.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import pathlib 4 | 5 | import fsspec 6 | import datamol as dm 7 | 8 | 9 | def test_copy_files(tmp_path): 10 | source_path = tmp_path / "source.txt" 11 | destination_path = tmp_path / "destination.txt" 12 | 13 | content = "hello this is a content" 14 | with open(source_path, "w") as f: 15 | f.write(content) 16 | 17 | dm.utils.fs.copy_file(source_path, destination_path) 18 | 19 | with open(destination_path) as f: 20 | assert f.read() == content 21 | 22 | 23 | def test_copy_dir(tmp_path): 24 | source_path = tmp_path / "source_dir" 25 | source_path_subdir = source_path / "a_subdir" 26 | destination_path = tmp_path / "destination_dir" 27 | destination_path_subdir = destination_path / "a_subdir" 28 | 29 | dm.utils.fs.mkdir(source_path) 30 | dm.utils.fs.mkdir(source_path_subdir) 31 | 32 | content = "hello this is a content" 33 | file1_path = source_path / "hello.txt" 34 | with open(file1_path, "w") as f: 35 | f.write(content) 36 | 37 | file2_path = source_path_subdir / "hello.txt" 38 | with open(file2_path, "w") as f: 39 | f.write(content) 40 | 41 | assert not dm.utils.fs.is_dir(destination_path_subdir) 42 | assert not dm.utils.fs.is_dir(destination_path) 43 | 44 | dm.utils.fs.copy_dir(source_path, destination_path) 45 | 46 | assert dm.utils.fs.is_dir(destination_path_subdir) 47 | assert dm.utils.fs.is_dir(destination_path) 48 | assert dm.utils.fs.is_file(file1_path) 49 | assert dm.utils.fs.is_file(file2_path) 50 | 51 | with open(file1_path) as f: 52 | assert f.read() == content 53 | 54 | with open(file2_path) as f: 55 | assert f.read() == content 56 | 57 | 58 | def test_mkdir(tmp_path): 59 | source_path = tmp_path / "source_dir" 60 | source_path_subdir = source_path / "a_subdir" 61 | 62 | dm.utils.fs.mkdir(source_path) 63 | 64 | assert dm.utils.fs.is_dir(source_path) 65 | assert not dm.utils.fs.is_dir(source_path_subdir) 66 | 67 | dm.utils.fs.mkdir(source_path_subdir) 68 | 69 | assert dm.utils.fs.is_dir(source_path) 70 | assert dm.utils.fs.is_dir(source_path_subdir) 71 | 72 | 73 | @pytest.mark.skip_platform("win") 74 | def test_cache_dir(): 75 | cache_dir = dm.utils.fs.get_cache_dir("my_app") 76 
| assert str(cache_dir).endswith("my_app") 77 | assert cache_dir.exists() 78 | assert cache_dir.is_dir() 79 | 80 | cache_dir = dm.utils.fs.get_cache_dir("my_app", suffix="likelydonotalreadyexist", create=False) 81 | assert str(cache_dir).endswith("likelydonotalreadyexist") 82 | assert not cache_dir.exists() 83 | assert not cache_dir.is_dir() 84 | 85 | cache_dir = dm.utils.fs.get_cache_dir("my_app", suffix="iamasuffix") 86 | assert str(cache_dir).endswith("iamasuffix") 87 | assert "my_app" in str(cache_dir) 88 | assert cache_dir.exists() 89 | assert cache_dir.is_dir() 90 | 91 | 92 | def test_get_mapper(tmp_path): 93 | fsmapper = dm.utils.fs.get_mapper(str(tmp_path / "test.txt")) 94 | 95 | # NOTE(hadim): depends the fsspec version 96 | assert fsmapper.fs.protocol in ["file", ("file", "local")] 97 | 98 | 99 | @pytest.mark.skip_platform("win") 100 | def test_get_basename(tmp_path): 101 | assert dm.utils.fs.get_basename(str(tmp_path / "test.txt")) == "test.txt" 102 | assert dm.utils.fs.get_basename("s3://a-bucket-that-likely-do-not-exist/test.txt") == "test.txt" 103 | 104 | 105 | def test_get_extension(tmp_path): 106 | assert dm.utils.fs.get_extension(str(tmp_path / "test.txt")) == "txt" 107 | assert dm.utils.fs.get_extension("s3://a-bucket-that-likely-do-not-exist/test.txt") == "txt" 108 | 109 | 110 | def test_exists(tmp_path): 111 | tmp_file = tmp_path / "test.txt" 112 | 113 | assert not dm.utils.fs.exists(tmp_file) 114 | assert not dm.utils.fs.is_file(tmp_file) 115 | 116 | assert dm.utils.fs.is_dir(tmp_path) 117 | assert not dm.utils.fs.is_dir(tmp_path / "likely-does-not-exist") 118 | 119 | with open(tmp_file, "w") as f: 120 | f.write("hello") 121 | 122 | assert dm.utils.fs.exists(tmp_file) 123 | assert dm.utils.fs.is_file(tmp_file) 124 | 125 | assert not dm.utils.fs.is_file(open(tmp_file)) 126 | assert not dm.utils.fs.is_dir(open(tmp_file)) 127 | 128 | 129 | def test_get_protocol(tmp_path): 130 | assert dm.utils.fs.get_protocol(tmp_path / "ahahah.txt") == "file" 131 | assert dm.utils.fs.get_protocol("s3://a-bucket-that-likely-do-not-exist/test.txt") == "s3" 132 | 133 | 134 | def test_is_local_path(tmp_path): 135 | assert dm.utils.fs.is_local_path(tmp_path / "ahahah.txt") 136 | assert not dm.utils.fs.is_local_path("s3://a-bucket-that-likely-do-not-exist/test.txt") 137 | 138 | 139 | @pytest.mark.skip_platform("win") 140 | def test_join(tmp_path): 141 | assert ( 142 | dm.utils.fs.join("s3://a-bucket-that-likely-do-not-exist", "test.txt") 143 | == "s3://a-bucket-that-likely-do-not-exist/test.txt" 144 | ) 145 | assert dm.utils.fs.join(tmp_path, "test.txt") == str(tmp_path / "test.txt") 146 | 147 | 148 | def test_get_size(tmp_path): 149 | tmp_file = tmp_path / "test.txt" 150 | 151 | with open(tmp_file, "w") as f: 152 | f.write("hello") 153 | 154 | assert dm.utils.fs.get_size(tmp_file) > 0 155 | assert dm.utils.fs.get_size(open(tmp_file)) > 0 156 | assert dm.utils.fs.get_size(fsspec.open(tmp_file)) > 0 157 | 158 | 159 | def test_md5(tmp_path): 160 | tmp_file = tmp_path / "test.txt" 161 | 162 | with open(tmp_file, "w") as f: 163 | f.write("hello") 164 | 165 | assert dm.utils.fs.md5(tmp_file) == "5d41402abc4b2a76b9719d911017c592" 166 | 167 | 168 | @pytest.mark.skip_platform("win") 169 | def test_glob(tmp_path): 170 | for i in range(5): 171 | tmp_file = tmp_path / f"test_{i}.txt" 172 | 173 | with open(tmp_file, "w") as f: 174 | f.write("hello") 175 | 176 | tmp_path_regex = tmp_path / "*.txt" 177 | assert len(dm.utils.fs.glob(tmp_path_regex)) == 5 178 | 179 | 180 | def test_copy_file(tmp_path): 181 | 
tmp_file = tmp_path / "test.txt" 182 | 183 | assert dm.utils.fs.is_dir(tmp_path) 184 | assert dm.utils.fs.is_dir(str(tmp_path)) 185 | assert dm.utils.fs.is_dir(pathlib.Path(str(tmp_path))) 186 | 187 | assert not dm.utils.fs.is_dir(tmp_path / "not_exist_dir") 188 | assert not dm.utils.fs.is_dir(str(tmp_path / "not_exist_dir")) 189 | assert not dm.utils.fs.is_dir(pathlib.Path(str(tmp_path / "not_exist_dir"))) 190 | 191 | with open(tmp_file, "w") as f: 192 | f.write("hello") 193 | 194 | tmp_file2 = tmp_path / "test2.txt" 195 | assert not dm.utils.fs.is_file(tmp_file2) 196 | assert not dm.utils.fs.is_file(str(tmp_file2)) 197 | assert not dm.utils.fs.is_file(pathlib.Path(str(tmp_file2))) 198 | 199 | dm.utils.fs.copy_file(tmp_file, tmp_file2) 200 | 201 | assert dm.utils.fs.is_file(tmp_file2) 202 | assert dm.utils.fs.is_file(str(tmp_file2)) 203 | assert dm.utils.fs.is_file(pathlib.Path(str(tmp_file2))) 204 | assert open(tmp_file2).read() == "hello" 205 | 206 | with pytest.raises(ValueError): 207 | dm.utils.fs.copy_file(tmp_file, tmp_file2) 208 | 209 | tmp_file3 = tmp_path / "test3.txt" 210 | dm.utils.fs.copy_file(tmp_file, tmp_file3, progress=True) 211 | assert dm.utils.fs.is_file(tmp_file3) 212 | assert dm.utils.fs.is_file(str(tmp_file3)) 213 | assert dm.utils.fs.is_file(pathlib.Path(str(tmp_file3))) 214 | assert open(tmp_file3).read() == "hello" 215 | -------------------------------------------------------------------------------- /tests/test_utils_jobs.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numbers 3 | import operator 4 | import unittest 5 | 6 | import numpy as np 7 | import pandas as pd 8 | 9 | from functools import reduce 10 | 11 | import datamol as dm 12 | 13 | 14 | def random_fn(*args, op="mul", **kwargs): 15 | """Perform random functions on a list""" 16 | all_values = [x for x in args if isinstance(x, numbers.Number)] 17 | all_values += [x for x in kwargs.values() if isinstance(x, numbers.Number)] 18 | op_fn = getattr(operator, op, None) 19 | if op_fn is None: 20 | op_fn = getattr(math, op) 21 | return op_fn(all_values[0]) 22 | return reduce(op_fn, all_values) 23 | 24 | 25 | class TestJobs(unittest.TestCase): 26 | def test_sequential(self): 27 | jobrunner = dm.JobRunner(n_jobs=None, progress=False) 28 | # practically do nothing (add a single value with nothing) 29 | o1 = jobrunner(random_fn, [9, 25, 1024], op="add") 30 | self.assertEqual(o1, [9, 25, 1024]) 31 | 32 | # take the sqrt 33 | o2 = jobrunner(random_fn, [9, 25, 1024], op="sqrt") 34 | self.assertEqual(o2, [3, 5, 32]) 35 | 36 | # multiply all inputs 37 | o3 = jobrunner(random_fn, [(1, 2, 3), (4, 5, 6), (3, 4, 0)], arg_type="args", op="mul") 38 | self.assertEqual(o3, [6, 4 * 5 * 6, 0]) 39 | 40 | # do the same thing but with kwargs 41 | o4 = jobrunner( 42 | random_fn, 43 | iter([dict(a=1, b=2, c=3), dict(a=4, b=5, c=6), dict(a=3, b=4, c=0)]), 44 | arg_type="kwargs", 45 | op="mul", 46 | ) 47 | self.assertEqual(o4, [6, 4 * 5 * 6, 0]) 48 | 49 | o5 = jobrunner(random_fn, np.asarray([9, 25, 1024]), op="add") 50 | self.assertEqual(o5, [9, 25, 1024]) 51 | 52 | def test_parallel(self): 53 | jobrunner1 = dm.JobRunner(n_jobs=4, progress=True) # use loky backend 54 | o1 = jobrunner1(random_fn, [9, 25, 1024], op="add") 55 | self.assertEqual(o1, [9, 25, 1024]) 56 | 57 | o5 = jobrunner1(random_fn, np.asarray([9, 25, 1024]), op="add") 58 | self.assertEqual(o5, [9, 25, 1024]) 59 | 60 | o3 = jobrunner1(random_fn, [(1, 2, 3), (4, 5, 6), (3, 4, 0)], arg_type="args", op="mul") 61 | 
self.assertEqual(o3, [6, 4 * 5 * 6, 0]) 62 | 63 | # use threads instead, no progress 64 | jobrunner2 = dm.JobRunner(n_jobs=2, progress=False, prefer="threads") 65 | o2 = jobrunner2(random_fn, [9, 25, 1024], op="sqrt") 66 | self.assertEqual(o2, [3, 5, 32]) 67 | 68 | o4 = jobrunner2( 69 | random_fn, 70 | iter([dict(a=1, b=2, c=3), dict(a=4, b=5, c=6), dict(a=3, b=4, c=0)]), 71 | arg_type="kwargs", 72 | op="mul", 73 | ) 74 | self.assertEqual(o4, [6, 4 * 5 * 6, 0]) 75 | 76 | def test_seq_vs_parallel(self): 77 | # test parallel vs sequential 78 | jobrunner = dm.JobRunner(n_jobs=4, progress=False) # use loky backend 79 | o_seq = jobrunner.sequential( 80 | random_fn, [(1, 2, 3), (4, 5, 6), (3, 4, 0)], arg_type="args", op="mul" 81 | ) 82 | o_par = jobrunner.parallel( 83 | random_fn, [(1, 2, 3), (4, 5, 6), (3, 4, 0)], arg_type="args", op="mul" 84 | ) 85 | self.assertEqual(o_seq, o_par) 86 | 87 | def test_parallelized(self): 88 | def fn(x): 89 | return x**2 90 | 91 | results = dm.parallelized( 92 | fn, 93 | [{"x": i} for i in range(10)], 94 | scheduler="processes", 95 | n_jobs=None, 96 | arg_type="kwargs", 97 | progress=True, 98 | ) 99 | assert results == [0, 1, 4, 9, 16, 25, 36, 49, 64, 81] 100 | 101 | results = dm.parallelized( 102 | fn, 103 | [[i] for i in range(10)], 104 | scheduler="processes", 105 | n_jobs=None, 106 | arg_type="args", 107 | progress=True, 108 | ) 109 | assert results == [0, 1, 4, 9, 16, 25, 36, 49, 64, 81] 110 | 111 | results = dm.parallelized( 112 | fn, 113 | range(10), 114 | scheduler="processes", 115 | n_jobs=None, 116 | progress=False, 117 | ) 118 | assert results == [0, 1, 4, 9, 16, 25, 36, 49, 64, 81] 119 | 120 | def test_job_kwargs(self): 121 | def fn(x): 122 | return x**2 123 | 124 | results = dm.parallelized( 125 | fn, 126 | [{"x": i} for i in range(10)], 127 | scheduler="processes", 128 | n_jobs=None, 129 | arg_type="kwargs", 130 | progress=True, 131 | verbose=100, 132 | ) 133 | assert results == [0, 1, 4, 9, 16, 25, 36, 49, 64, 81] 134 | 135 | def test_tqdm_kwargs(self): 136 | def fn(x): 137 | return x**2 138 | 139 | results = dm.parallelized( 140 | fn, 141 | [{"x": i} for i in range(10)], 142 | scheduler="processes", 143 | n_jobs=None, 144 | arg_type="kwargs", 145 | progress=True, 146 | tqdm_kwargs=dict(desc="My progress bar"), 147 | ) 148 | assert results == [0, 1, 4, 9, 16, 25, 36, 49, 64, 81] 149 | 150 | def test_with_batch_size(self): 151 | def _fn(n): 152 | return n * 3 153 | 154 | def _fn_return_none(n): 155 | return None 156 | 157 | results = dm.utils.parallelized( 158 | _fn, 159 | range(997), 160 | n_jobs=-1, 161 | progress=True, 162 | batch_size=10, 163 | ) 164 | assert len(results) == 997 165 | 166 | results = dm.utils.parallelized( 167 | _fn_return_none, 168 | range(997), 169 | n_jobs=-1, 170 | progress=True, 171 | batch_size=10, 172 | ) 173 | assert len(results) == 997 174 | 175 | def test_with_total(self): 176 | def _fn_process_fn(_, row): 177 | datum = {} 178 | datum["smiles"] = row["smiles"] 179 | return pd.Series(datum) 180 | 181 | data = dm.freesolv() 182 | data = data.iloc[:50] 183 | 184 | # parallel mode 185 | 186 | ## check the `total` arg is ok 187 | dm.parallelized( 188 | _fn_process_fn, 189 | data.iterrows(), 190 | n_jobs=-1, 191 | progress=True, 192 | arg_type="args", 193 | total=50, 194 | ) 195 | 196 | ## check collision between guessed total and provided one 197 | dm.parallelized( 198 | _fn_process_fn, 199 | list(data.iterrows()), 200 | n_jobs=-1, 201 | progress=True, 202 | arg_type="args", 203 | total=50, 204 | ) 205 | 206 | # sequential 
mode 207 | 208 | ## check the `total` arg is ok 209 | dm.parallelized( 210 | _fn_process_fn, 211 | data.iterrows(), 212 | n_jobs=1, 213 | progress=True, 214 | arg_type="args", 215 | total=50, 216 | ) 217 | 218 | ## check collision between guessed total and provided one 219 | dm.parallelized( 220 | _fn_process_fn, 221 | list(data.iterrows()), 222 | n_jobs=1, 223 | progress=True, 224 | arg_type="args", 225 | total=50, 226 | ) 227 | 228 | 229 | def test_parallelized_with_batches(): 230 | data = dm.freesolv() 231 | data = data.iloc[:10] 232 | 233 | def _fn1(smiles): 234 | return len(smiles) 235 | 236 | results1 = dm.parallelized( 237 | _fn1, 238 | data["smiles"], 239 | progress=False, 240 | n_jobs=-1, 241 | ) 242 | 243 | def _fn2(smiles_list): 244 | return [len(s) for s in smiles_list] 245 | 246 | results2 = dm.parallelized_with_batches( 247 | _fn2, 248 | data["smiles"], 249 | batch_size=2, 250 | progress=False, 251 | n_jobs=-1, 252 | ) 253 | 254 | assert results1 == results2 255 | -------------------------------------------------------------------------------- /tests/test_utils_perf.py: -------------------------------------------------------------------------------- 1 | import datamol as dm 2 | 3 | 4 | def test_watch_duration(): 5 | def fn(n): 6 | for i in range(n): 7 | print(i) 8 | 9 | with dm.utils.perf.watch_duration(log=True) as w: 10 | fn(5) 11 | 12 | assert isinstance(w.duration, float) 13 | -------------------------------------------------------------------------------- /tests/test_viz.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import base64 4 | import io 5 | 6 | import numpy as np 7 | import ipywidgets as widgets 8 | 9 | import PIL 10 | from PIL import Image 11 | 12 | import datamol as dm 13 | 14 | 15 | # NOTE(hadim): rdkit returns different image objects 16 | # according to the Python process context (Jupyter notebook vs terminal). 17 | # In consequence, those tests will fail if they are executed within a 18 | # Jupyter notebook. 
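# The grid-size assertions in this file are consistent with a simple layout rule:
# with use_svg=False, dm.viz.to_image(mols, n_cols=c, mol_size=(w, h)) lays the
# molecules out on ceil(len(mols) / c) rows, so the decoded PNG array has shape
# (rows * h, c * w, 3). The helper below is only an illustrative sketch of that
# rule and is not used by the tests; its name is arbitrary.
def _expected_grid_shape(n_mols, n_cols, mol_size=(200, 200)):
    import math

    n_rows = math.ceil(n_mols / n_cols)
    return (n_rows * mol_size[1], n_cols * mol_size[0], 3)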
19 | 20 | 21 | def _convert_ipython_to_array(image): 22 | """convert ipython image to numpy array""" 23 | image_obj = base64.b64decode(str(image._repr_png_())) 24 | try: 25 | image_obj = Image.open(io.BytesIO(image_obj)) 26 | return np.array(image_obj) 27 | except Exception: 28 | return np.array(image) 29 | 30 | 31 | def test_to_image(): 32 | # Get a list of molecules 33 | data = dm.data.freesolv() 34 | mols = dm.from_df(data) # type: ignore 35 | mols = mols[:8] 36 | 37 | # With multiple molecules 38 | legends = [dm.to_smiles(mol) for mol in mols] 39 | image = dm.viz.to_image(mols, legends=legends, n_cols=4, mol_size=(200, 200), use_svg=False) 40 | image = _convert_ipython_to_array(image) 41 | 42 | print(type(image)) 43 | 44 | image = np.array(image) 45 | 46 | assert image.dtype == np.uint8 47 | assert image.shape == (400, 800, 3) 48 | assert image.shape[1] == 200 * 4 49 | 50 | # With a single molecule 51 | mol = mols[0] 52 | legends = dm.to_smiles(mol) 53 | image = dm.viz.to_image(mol, legends=legends, mol_size=(200, 200), use_svg=False) 54 | image = _convert_ipython_to_array(image) 55 | image = np.array(image) 56 | 57 | assert image.dtype == np.uint8 58 | assert image.shape == (200, 200, 3) 59 | 60 | dm.viz.to_image(mol, indices=True, mol_size=400) 61 | 62 | # With input smiles 63 | mol = "CCCOCc1cc(c2ncccc2)ccc1" 64 | legends = mol 65 | image = dm.viz.to_image(mol, legends=legends, mol_size=(200, 200), use_svg=False) 66 | image = _convert_ipython_to_array(image) 67 | image = np.array(image) 68 | 69 | assert image.dtype == np.uint8 70 | assert image.shape == (200, 200, 3) 71 | 72 | 73 | def test_to_image_incorrect_aromaticity(): 74 | query = "C-c1cn(-C-2-[N,O:3]-[#6@H](-C-[#6,#8:1]-[*:2])-C(-[#8])-C-2-[#1,#8,#9:4])c2ncnc(-C)c12" 75 | mol = dm.from_smarts(query) 76 | dm.to_image( 77 | mol, 78 | mol_size=300, 79 | use_svg=False, 80 | legends="a legend", 81 | legend_fontsize=40, 82 | stereo_annotations=False, 83 | ) 84 | 85 | 86 | def test_to_image_save_file(tmpdir): 87 | smiles = "CCCOCc1cc(c2ncccc2)ccc1" 88 | mol = dm.to_mol(smiles) 89 | 90 | image_path = str(tmpdir.join("mol.png")) 91 | dm.viz.to_image(mol, outfile=image_path, use_svg=False) 92 | 93 | # check whether the png is valid 94 | try: 95 | img = Image.open(image_path) 96 | img.verify() 97 | except PIL.UnidentifiedImageError: 98 | pytest.fail(f"The image {image_path} is invalid.") 99 | 100 | image_path = str(tmpdir.join("mol.svg")) 101 | dm.viz.to_image(mol, outfile=image_path, use_svg=True) 102 | 103 | # check whether the svg looks valid 104 | with open(image_path) as f: 105 | content = f.read().strip() 106 | assert content.startswith("<?xml") or content.startswith("<svg") 107 | assert content.endswith("</svg>") 108 | 109 | 110 | def test_conformers(): 111 | import nglview as nv 112 | 113 | smiles = "CCCC=O" 114 | mol = dm.to_mol(smiles) 115 | mol = dm.conformers.generate(mol) 116 | 117 | # one conformer 118 | view = dm.viz.conformers(mol) 119 | assert type(view) == nv.widget.NGLWidget 120 | 121 | # multiple conformers 122 | view = dm.viz.conformers(mol, n_confs=12) 123 | assert type(view) == widgets.GridspecLayout 124 | 125 | 126 | @pytest.mark.skipif( 127 | not dm.is_greater_than_current_rdkit_version("2023.03"), 128 | reason="Circle Grid requires rdkit>2022.09", 129 | ) 130 | def test_circle_grid(tmp_path): 131 | mol = dm.to_mol("CC(=O)OC1=CC=CC=C1C(=O)O") 132 | dm.viz.circle_grid( 133 | mol, 134 | [ 135 | [dm.to_mol("CCC"), dm.to_mol("CCCCCCC")], 136 | [dm.to_mol("CCCO"), dm.to_mol("CCCCCCCO")], 137 | ], 138 | outfile=str(tmp_path / "image.png"), 139 | ) 140 | 141 | 142 | @pytest.mark.skipif( 143 |
not dm.is_greater_than_current_rdkit_version("2023.03"), 144 | reason="Circle Grid requires rdkit>2022.09", 145 | ) 146 | def test_circle_grid_with_hex_color(tmp_path): 147 | mol = dm.to_mol("CC(=O)OC1=CC=CC=C1C(=O)O") 148 | dm.viz.circle_grid( 149 | mol, 150 | [ 151 | [dm.to_mol("CCC"), dm.to_mol("CCCCCCC")], 152 | [dm.to_mol("CCCO"), dm.to_mol("CCCCCCCO")], 153 | ], 154 | ring_color="#ff1472", 155 | layout_random_seed=None, 156 | ) 157 | 158 | 159 | @pytest.mark.skipif( 160 | not dm.is_greater_than_current_rdkit_version("2023.03"), 161 | reason="Circle Grid requires rdkit>2022.09", 162 | ) 163 | def test_circle_grid_with_angle_start(tmp_path): 164 | mol = dm.to_mol("CC(=O)OC1=CC=CC=C1C(=O)O") 165 | dm.viz.circle_grid( 166 | mol, 167 | [ 168 | [dm.to_mol("CCC"), dm.to_mol("CCCCCCC"), dm.to_mol("CCCCCO")], 169 | [ 170 | dm.to_mol("CCCO"), 171 | ], 172 | ], 173 | # ring_color=(0, 0, 0, 0.5), 174 | ring_color="#ff1472aa", 175 | layout_random_seed=19, 176 | ring_mol_start_angles_degrees=[90, 90], 177 | ) 178 | 179 | 180 | def test_to_image_align(): 181 | # Get a list of molecules 182 | data = dm.data.freesolv() 183 | mols = dm.from_df(data) # type: ignore 184 | mols = mols[:8] 185 | 186 | # With multiple molecules 187 | dm.viz.to_image(mols, align=True) 188 | 189 | 190 | def test_to_image_align_template(): 191 | # Get a list of molecules 192 | data = dm.data.freesolv() 193 | mols = dm.from_df(data) # type: ignore 194 | mols = mols[:8] 195 | 196 | dm.viz.to_image(mols, align=mols[0]) 197 | -------------------------------------------------------------------------------- /tests/test_viz_lasso_highlight.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import datamol as dm 3 | 4 | 5 | # The following tests are supposed to work and should not raise any errors 6 | def test_original_working_solution_str(): 7 | smi = "CO[C@@H](O)C1=C(O[C@H](F)Cl)C(C#N)=C1ONNC[NH3+]" 8 | smarts_list = "CONN" 9 | assert dm.lasso_highlight_image(smi, smarts_list) 10 | 11 | 12 | # The following tests are supposed to work and should not raise any errors 13 | def test_from_mol(): 14 | smi = "CO[C@@H](O)C1=C(O[C@H](F)Cl)C(C#N)=C1ONNC[NH3+]" 15 | mol = dm.to_mol(smi) 16 | smarts_list = "CONN" 17 | assert dm.lasso_highlight_image(mol, smarts_list) 18 | 19 | 20 | def test_with_highlight(): 21 | smi = "CO[C@@H](O)C1=C(O[C@H](F)Cl)C(C#N)=C1ONNC[NH3+]" 22 | mol = dm.to_mol(smi) 23 | smarts_list = "CONN" 24 | highlight_atoms = [4, 5, 6] 25 | highlight_bonds = [1, 2, 3, 4] 26 | highlight_atom_colors = {4: (230, 230, 250), 5: (230, 230, 250), 6: (230, 230, 250)} 27 | highlight_bond_colors = { 28 | 1: (230, 230, 250), 29 | 2: (230, 230, 250), 30 | 3: (230, 230, 250), 31 | 4: (230, 230, 250), 32 | } 33 | assert dm.lasso_highlight_image( 34 | mol, 35 | smarts_list, 36 | highlight_atoms=highlight_atoms, 37 | highlight_bonds=highlight_bonds, 38 | highlight_atom_colors=highlight_atom_colors, 39 | highlight_bond_colors=highlight_bond_colors, 40 | continuousHighlight=False, 41 | ) 42 | 43 | 44 | def test_original_working_solution_list_single_str(): 45 | smi = "CO[C@@H](O)C1=C(O[C@H](F)Cl)C(C#N)=C1ONNC[NH3+]" 46 | smarts_list = ["CONN"] 47 | assert dm.lasso_highlight_image(smi, smarts_list) 48 | 49 | 50 | def test_original_working_solution_list_str(): 51 | smi = "CO[C@@H](O)C1=C(O[C@H](F)Cl)C(C#N)=C1ONNC[NH3+]" 52 | smarts_list = ["CONN", "N#CC~CO", "C=CON", "CONNCN"] 53 | assert dm.lasso_highlight_image(smi, smarts_list) 54 | 55 | 56 | def test_original_working_solution_mol(): 57 | 
smi = "CO[C@@H](O)C1=C(O[C@H](F)Cl)C(C#N)=C1ONNC[NH3+]" 58 | smarts_list = dm.to_mol("CONN") 59 | assert dm.lasso_highlight_image(smi, smarts_list) 60 | 61 | 62 | def test_original_working_solution_list_single_mol(): 63 | smi = "CO[C@@H](O)C1=C(O[C@H](F)Cl)C(C#N)=C1ONNC[NH3+]" 64 | smarts_list = [dm.to_mol("CONN")] 65 | assert dm.lasso_highlight_image(smi, smarts_list) 66 | 67 | 68 | def test_original_working_solution_List_mol(): 69 | smi = "CO[C@@H](O)C1=C(O[C@H](F)Cl)C(C#N)=C1ONNC[NH3+]" 70 | smarts_list = [dm.to_mol("CONN"), dm.to_mol("N#CC~CO"), dm.to_mol("C=CON"), dm.to_mol("CONNCN")] 71 | assert dm.lasso_highlight_image(smi, smarts_list) 72 | 73 | 74 | def test_wokring_solution_with_more_structures_than_colors(): 75 | smi = "CO[C@@H](O)C1=C(O[C@H](F)Cl)C(C#N)=C1ONNC[NH3+]" 76 | smarts_list = ["CONN", "N#CC~CO", "C=CON", "CONNCN", "FCCl", "OCO", "N#C", "N#CC", "CC#N"] 77 | assert dm.lasso_highlight_image(smi, smarts_list) 78 | 79 | 80 | def test_drawing_options(): 81 | smi = "CO[C@@H](O)C1=C(O[C@H](F)Cl)C(C#N)=C1ONNC[NH3+]" 82 | smarts_list = ["CONN", "N#CC~CO", "C=CON", "CONNCN", "FCCl", "OCO", "N#C", "N#CC", "CC#N"] 83 | assert dm.lasso_highlight_image(smi, smarts_list, bondLineWidth=15) 84 | 85 | 86 | def test_wrong_drawing_options(): 87 | smi = "CO[C@@H](O)C1=C(O[C@H](F)Cl)C(C#N)=C1ONNC[NH3+]" 88 | smarts_list = ["CONN", "N#CC~CO", "C=CON", "CONNCN", "FCCl", "OCO", "N#C", "N#CC", "CC#N"] 89 | 90 | with pytest.raises(ValueError): 91 | dm.lasso_highlight_image(smi, smarts_list, bondLineWidthXXXXXXX=15) 92 | 93 | 94 | def test_input_mol_is_none(): 95 | smarts_list = ["CONN", "N#CC~CO", "C=CON", "CONNCN", "FCCl", "OCO", "N#C", "N#CC", "CC#N"] 96 | 97 | with pytest.raises(ValueError): 98 | dm.lasso_highlight_image(None, smarts_list) 99 | 100 | 101 | def test_search_input_error_empty_list(): 102 | # should still go through but just print out the structure without any highlights 103 | smi = "CO[C@@H](O)C1=C(O[C@H](F)Cl)C(C#N)=C1ONNC[NH3+]" 104 | smarts_list = [] 105 | assert dm.lasso_highlight_image(smi, smarts_list) 106 | 107 | 108 | def test_target_input_error_empty_str(): 109 | with pytest.raises(ValueError): 110 | smi = "" 111 | smarts_list = ["CONN", "N#CC~CO", "C=CON", "CONNCN"] 112 | dm.lasso_highlight_image(smi, smarts_list) 113 | 114 | 115 | def test_target_input_error_None(): 116 | with pytest.raises(ValueError): 117 | smi = None 118 | smarts_list = ["CONN", "N#CC~CO", "C=CON", "CONNCN"] 119 | dm.lasso_highlight_image(smi, smarts_list) 120 | 121 | 122 | def test_search_input_error_smarts_no_substructure(): 123 | # This test should still continue but will just print out a structure without any highlights and a warning 124 | smi = "CO[C@@H](O)C1=C(O[C@H](F)Cl)C(C#N)=C1ONNC[NH3+]" 125 | smarts_list = ["CCCCCC"] 126 | assert dm.lasso_highlight_image(smi, smarts_list) 127 | 128 | 129 | # testing using " == str(type(img)) so to not bring in IPython 130 | # as a dependency for the tests 131 | def test_SVG_is_returned_explicit(): 132 | smi = "CO[C@@H](O)C1=C(O[C@H](F)Cl)C(C#N)=C1ONNC[NH3+]" 133 | smarts_list = ["CC"] 134 | img = dm.lasso_highlight_image(smi, smarts_list, use_svg=True) 135 | assert isinstance(img, str) 136 | 137 | 138 | def test_SVG_is_returned_implict(): 139 | smi = "CO[C@@H](O)C1=C(O[C@H](F)Cl)C(C#N)=C1ONNC[NH3+]" 140 | smarts_list = ["CC"] 141 | img = dm.lasso_highlight_image(smi, smarts_list) 142 | assert isinstance(img, str) 143 | 144 | 145 | def test_PNG_is_returned(): 146 | smi = "CO[C@@H](O)C1=C(O[C@H](F)Cl)C(C#N)=C1ONNC[NH3+]" 147 | smarts_list = ["CC"] 148 | 
img = dm.lasso_highlight_image(smi, smarts_list, use_svg=False) 149 | 150 | from PIL import Image 151 | 152 | assert isinstance(img, Image.Image) 153 | 154 | 155 | def test_aromatic_query_work(): 156 | smi = "CC(N)Cc1c[nH]c2ccc3c(c12)CCCO3" 157 | smarts_list = ["c1ccccc1"] 158 | assert dm.lasso_highlight_image(smi, smarts_list) 159 | 160 | 161 | def test_smarts_query(): 162 | smi = "CC(N)Cc1c[nH]c2ccc3c(c12)CCCO3" 163 | smarts_list = "[#6]" 164 | assert dm.lasso_highlight_image(smi, smarts_list) 165 | 166 | 167 | def test_query_and_atom_indices_list(): 168 | dm.viz.lasso_highlight_image( 169 | "CC(N)Cc1c[nH]c2ccc3c(c12)CCCO3", 170 | search_molecules="c1ccccc1", 171 | atom_indices=[[4, 5, 6], [1, 2, 3, 4]], 172 | ) 173 | 174 | 175 | def test_multiple_mol_lasso(): 176 | img = dm.viz.lasso_highlight_image( 177 | ["CC(N)Cc1c[nH]c2ccc3c(c12)CCCO3", "c1ccccc1"], 178 | search_molecules="c1ccccc1", 179 | ) 180 | assert isinstance(img, str) 181 | 182 | img = dm.viz.lasso_highlight_image( 183 | ["CC(N)Cc1c[nH]c2ccc3c(c12)CCCO3", "c1ccccc1"], 184 | search_molecules="c1ccccc1", 185 | mol_size=(200, 200), 186 | n_cols=1, 187 | use_svg=False, 188 | ) 189 | from PIL import Image 190 | 191 | assert isinstance(img, Image.Image) 192 | assert img.size == (200, 400)  # (width, height) for a single column of two 200x200 panels 193 | 194 | 195 | def test_multiple_mol_lasso_different_scale_legends(): 196 | dm.viz.lasso_highlight_image( 197 | ["CC(N)Cc1c[nH]c2ccc3c(c12)CCCO3", "c1ccccc1"], 198 | legends=["Mol1", "Mol2"], 199 | search_molecules="c1ccccc1", 200 | n_cols=1, 201 | draw_mols_same_scale=False, 202 | ) 203 | 204 | 205 | def test_atom_indices_list_of_list(): 206 | dm.viz.lasso_highlight_image( 207 | "CC(N)Cc1c[nH]c2ccc3c(c12)CCCO3", 208 | search_molecules=None, 209 | atom_indices=[[4, 5, 6], [1, 2, 3, 4]], 210 | ) 211 | 212 | 213 | def test_atom_indices_list(): 214 | dm.viz.lasso_highlight_image( 215 | "CC(N)Cc1c[nH]c2ccc3c(c12)CCCO3", 216 | search_molecules=None, 217 | atom_indices=[4, 5, 6], 218 | ) 219 | 220 | 221 | def test_with_hex_color(): 222 | dm.viz.lasso_highlight_image( 223 | "CC(N)Cc1c[nH]c2ccc3c(c12)CCCO3", 224 | search_molecules=None, 225 | atom_indices=[4, 5, 6], 226 | color_list=["#ff1472"], 227 | ) 228 | -------------------------------------------------------------------------------- /tests/test_viz_substrcture.py: -------------------------------------------------------------------------------- 1 | import datamol as dm 2 | 3 | 4 | def test_match_substructure(): 5 | mol1 = dm.to_mol("CC(=O)OC1=CC=CC=C1C(=O)O") 6 | mol2 = dm.to_mol("CCN(CC)CC(=O)CC(C)NC1=C2C=CC(=CC2=NC=C1)Cl") 7 | 8 | query1 = dm.from_smarts("[C;H0](=O)") 9 | query2 = dm.to_mol("CN(C)") 10 | 11 | # Test multiple scenarios 12 | 13 | dm.viz.match_substructure( 14 | mols=[mol1, mol2], 15 | queries=[query1, query2], 16 | highlight_bonds=True, 17 | use_svg=True, 18 | ) 19 | dm.viz.match_substructure( 20 | mols=mol1, 21 | queries=[query1, query2], 22 | highlight_bonds=True, 23 | use_svg=True, 24 | ) 25 | dm.viz.match_substructure( 26 | mols=[mol1, mol2], 27 | queries=query1, 28 | highlight_bonds=False, 29 | use_svg=False, 30 | ) 31 | dm.viz.match_substructure( 32 | mols=mol1, 33 | queries=query2, 34 | highlight_bonds=True, 35 | use_svg=False, 36 | ) 37 | --------------------------------------------------------------------------------
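For quick reference, the snippet below is a minimal end-to-end sketch of the visualization APIs exercised by tests/test_viz.py, tests/test_viz_lasso_highlight.py and tests/test_viz_substrcture.py. It is illustrative only: it reuses the arguments shown in those tests, and the molecules and output filename are arbitrary examples.

import datamol as dm

# Build two molecules and a substructure query (same calls as in the tests).
mols = [dm.to_mol("CC(=O)OC1=CC=CC=C1C(=O)O"), dm.to_mol("CCCOCc1cc(c2ncccc2)ccc1")]
query = dm.from_smarts("[C;H0](=O)")

# Render a grid with legends, as in test_to_image.
image = dm.viz.to_image(mols, legends=[dm.to_smiles(m) for m in mols], n_cols=2, mol_size=(200, 200), use_svg=False)

# Highlight substructure matches, as in test_match_substructure.
dm.viz.match_substructure(mols=mols, queries=query, highlight_bonds=True, use_svg=True)

# Lasso-style highlighting from a SMARTS/SMILES query, as in the lasso tests.
svg = dm.lasso_highlight_image("CC(N)Cc1c[nH]c2ccc3c(c12)CCCO3", ["c1ccccc1"])

# Write a rendering to disk, as in test_to_image_save_file (the path is arbitrary).
dm.viz.to_image(mols[0], outfile="aspirin.png", use_svg=False)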