├── .github ├── CODEOWNERS ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── PULL_REQUEST_TEMPLATE.md ├── SECURITY.md └── workflows │ ├── code-check.yml │ ├── doc.yml │ ├── release.yml │ └── test.yml ├── .gitignore ├── CHANGELOG.md ├── LICENSE ├── README.md ├── binder ├── environment.yml └── postBuild ├── codecov.yml ├── datamol ├── __init__.py ├── _sanifix4.py ├── _version.py ├── align.py ├── cluster.py ├── conformers │ ├── __init__.py │ ├── _conformers.py │ └── _features.py ├── convert.py ├── data │ ├── __init__.py │ ├── cdk2.sdf │ ├── chembl_approved_drugs.parquet │ ├── chembl_drugs.csv │ ├── chembl_samples.csv │ ├── freesolv.csv │ ├── reactions.json │ ├── salts_solvents.smi │ ├── solubility.test.sdf │ └── solubility.train.sdf ├── descriptors │ ├── __init__.py │ ├── compute.py │ └── descriptors.py ├── fp.py ├── fragment │ ├── __init__.py │ ├── _assemble.py │ └── _fragment.py ├── graph.py ├── io.py ├── isomers │ ├── __init__.py │ ├── _enumerate.py │ └── _structural.py ├── log.py ├── mcs.py ├── mol.py ├── molar.py ├── predictors │ ├── __init__.py │ └── esol.py ├── reactions │ ├── __init__.py │ ├── _attachments.py │ └── _reactions.py ├── scaffold │ ├── __init__.py │ └── _fuzzy.py ├── similarity.py ├── types.py ├── utils │ ├── __init__.py │ ├── decorators.py │ ├── fs.py │ ├── jobs.py │ ├── perf.py │ └── testing.py └── viz │ ├── __init__.py │ ├── _circle_grid.py │ ├── _conformers.py │ ├── _lasso_highlight.py │ ├── _substructure.py │ ├── _viz.py │ └── utils.py ├── docs ├── CNAME ├── api │ ├── datamol.align.md │ ├── datamol.cluster.md │ ├── datamol.conformers.md │ ├── datamol.convert.md │ ├── datamol.data.md │ ├── datamol.descriptors.md │ ├── datamol.fp.md │ ├── datamol.fragment.md │ ├── datamol.graph.md │ ├── datamol.io.md │ ├── datamol.isomers.md │ ├── datamol.log.md │ ├── datamol.mol.md │ ├── datamol.molar.md │ ├── datamol.reactions.md │ ├── datamol.scaffold.md │ ├── datamol.similarity.md │ ├── datamol.utils.fs.md │ ├── datamol.utils.md │ └── datamol.viz.md ├── assets │ ├── css │ │ ├── custom-datamol.css │ │ ├── custom.css │ │ └── tweak-width.css │ └── js │ │ └── google-analytics.js ├── contribute.md ├── images │ ├── logo-black.png │ ├── logo-black.svg │ ├── logo-title.svg │ ├── logo.png │ └── logo.svg ├── index.md ├── license.md ├── tutorials │ ├── Aligning.ipynb │ ├── Clustering.ipynb │ ├── Conformers.ipynb │ ├── Descriptors.ipynb │ ├── Filesystem.ipynb │ ├── Fragment.ipynb │ ├── Fuzzy_Scaffolds.ipynb │ ├── Preprocessing.ipynb │ ├── Reactions.ipynb │ ├── Scaffolds.ipynb │ ├── The_Basics.ipynb │ ├── Visualization.ipynb │ ├── data │ │ ├── Enamine_DNA_Libary_5530cmpds_20200831_SMALL.sdf │ │ └── ReactionBlock.rxn │ └── images │ │ ├── Aligning_1.png │ │ ├── Aligning_2.png │ │ ├── Conformers_1.png │ │ ├── Descriptors_1.png │ │ ├── Fragment_1.png │ │ ├── Fragment_2.png │ │ ├── Fragment_3.png │ │ ├── Preprocess_1.png │ │ └── Scaffolds_1.png └── usage.md ├── env.yml ├── mkdocs.yml ├── notebooks └── Get_ChEMBL_Approved_Drugs.ipynb ├── pyproject.toml └── tests ├── conftest.py ├── data ├── TUBB3-observations-last-broken.sdf ├── TUBB3-observations.sdf ├── TUBB3-observations.sdf.gz ├── freesolv.csv ├── freesolv.xlsx └── test.mol2 ├── test_align.py ├── test_cluster.py ├── test_conformers.py ├── test_convert.py ├── test_data.py ├── test_descriptors.py ├── test_fp.py ├── test_fragment.py ├── test_graph.py ├── test_import.py ├── test_io.py ├── test_isomers.py ├── test_log.py ├── test_mcs.py ├── test_mol.py ├── test_molar.py ├── test_notebooks.py ├── test_predictors.py ├── test_reactions.py ├── 
test_scaffold.py ├── test_similarity.py ├── test_utils_fs.py ├── test_utils_jobs.py ├── test_utils_perf.py ├── test_viz.py ├── test_viz_lasso_highlight.py └── test_viz_substrcture.py /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @hadim 2 | -------------------------------------------------------------------------------- /.github/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, religion, or sexual identity 10 | and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the 26 | overall community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or 31 | advances of any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email 35 | address, without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at 63 | . 64 | All complaints will be reviewed and investigated promptly and fairly. 
65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series 86 | of actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. Violating these terms may lead to a temporary or 93 | permanent ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within 113 | the community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.0, available at 119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 120 | 121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 122 | enforcement ladder](https://github.com/mozilla/diversity). 123 | 124 | [homepage]: https://www.contributor-covenant.org 125 | 126 | For answers to common questions about this code of conduct, see the FAQ at 127 | https://www.contributor-covenant.org/faq. Translations are available at 128 | https://www.contributor-covenant.org/translations. 129 | -------------------------------------------------------------------------------- /.github/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | see documentation directly at https://docs.datamol.io/stable/contribute.html 2 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## Changelogs 2 | 3 | - _enumerate the changes of that PR._ 4 | 5 | --- 6 | 7 | _Checklist:_ 8 | 9 | - [ ] _Was this PR discussed in an issue? 
It is recommended to first discuss a new feature into a GitHub issue before opening a PR._ 10 | - [ ] _Add tests to cover the fixed bug(s) or the new introduced feature(s) (if appropriate)._ 11 | - [ ] _Update the API documentation is a new function is added, or an existing one is deleted._ 12 | - [ ] _Write concise and explanatory changelogs below._ 13 | - [ ] _If possible, assign one of the following labels to the PR: `feature`, `fix` or `test` (or ask a maintainer to do it for you)._ 14 | 15 | --- 16 | 17 | _discussion related to that PR_ 18 | -------------------------------------------------------------------------------- /.github/SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | Please report any security-related issues directly to hadrien@valencediscovery.com. 4 | -------------------------------------------------------------------------------- /.github/workflows/code-check.yml: -------------------------------------------------------------------------------- 1 | name: code-check 2 | 3 | on: 4 | push: 5 | branches: ["main"] 6 | tags: ["*"] 7 | pull_request: 8 | branches: 9 | - "*" 10 | - "!gh-pages" 11 | 12 | jobs: 13 | python-format-black: 14 | name: Python lint [black] 15 | runs-on: ubuntu-latest 16 | steps: 17 | - name: Checkout the code 18 | uses: actions/checkout@v4 19 | 20 | - name: Set up Python 21 | uses: actions/setup-python@v4 22 | with: 23 | python-version: "3.10" 24 | 25 | - name: Install black 26 | run: | 27 | pip install black>=24 28 | 29 | - name: Lint 30 | run: black --check . 31 | 32 | python-lint-ruff: 33 | name: Python lint [ruff] 34 | runs-on: ubuntu-latest 35 | steps: 36 | - name: Checkout the code 37 | uses: actions/checkout@v4 38 | 39 | - name: Set up Python 40 | uses: actions/setup-python@v4 41 | with: 42 | python-version: "3.10" 43 | 44 | - name: Install ruff 45 | run: | 46 | pip install ruff 47 | 48 | - name: Lint 49 | run: ruff . 50 | -------------------------------------------------------------------------------- /.github/workflows/doc.yml: -------------------------------------------------------------------------------- 1 | name: doc 2 | 3 | on: 4 | push: 5 | branches: ["main"] 6 | 7 | # Prevent doc action on `main` to conflict with each others. 8 | concurrency: 9 | group: doc-${{ github.ref }} 10 | cancel-in-progress: true 11 | 12 | jobs: 13 | doc: 14 | runs-on: "ubuntu-latest" 15 | timeout-minutes: 30 16 | 17 | defaults: 18 | run: 19 | shell: bash -l {0} 20 | 21 | steps: 22 | - name: Checkout the code 23 | uses: actions/checkout@v4 24 | 25 | - name: Setup mamba 26 | uses: mamba-org/setup-micromamba@v1 27 | with: 28 | environment-file: env.yml 29 | environment-name: my_env 30 | cache-environment: true 31 | cache-downloads: true 32 | 33 | - name: Install library 34 | run: python -m pip install --no-deps . 
35 | 36 | - name: Configure git 37 | run: | 38 | git config --global user.name "${GITHUB_ACTOR}" 39 | git config --global user.email "${GITHUB_ACTOR}@users.noreply.github.com" 40 | 41 | - name: Deploy the doc 42 | run: | 43 | echo "Get the gh-pages branch" 44 | git fetch origin gh-pages 45 | 46 | echo "Build and deploy the doc on main" 47 | mike deploy --push main 48 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: release 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | release-version: 7 | description: "A valid Semver version string" 8 | required: true 9 | 10 | permissions: 11 | contents: write 12 | pull-requests: write 13 | 14 | jobs: 15 | release: 16 | # Do not release if not triggered from the default branch 17 | if: github.ref == format('refs/heads/{0}', github.event.repository.default_branch) 18 | 19 | runs-on: ubuntu-latest 20 | timeout-minutes: 30 21 | 22 | defaults: 23 | run: 24 | shell: bash -l {0} 25 | 26 | steps: 27 | - name: Checkout the code 28 | uses: actions/checkout@v4 29 | 30 | - name: Setup mamba 31 | uses: mamba-org/setup-micromamba@v1 32 | with: 33 | environment-file: env.yml 34 | environment-name: my_env 35 | cache-environment: true 36 | cache-downloads: true 37 | create-args: >- 38 | pip 39 | semver 40 | python-build 41 | setuptools_scm 42 | 43 | - name: Check the version is valid semver 44 | run: | 45 | RELEASE_VERSION="${{ inputs.release-version }}" 46 | 47 | { 48 | pysemver check $RELEASE_VERSION 49 | } || { 50 | echo "The version '$RELEASE_VERSION' is not a valid Semver version string." 51 | echo "Please use a valid semver version string. More details at https://semver.org/" 52 | echo "The release process is aborted." 53 | exit 1 54 | } 55 | 56 | - name: Check the version is higher than the latest one 57 | run: | 58 | # Retrieve the git tags first 59 | git fetch --prune --unshallow --tags &> /dev/null 60 | 61 | RELEASE_VERSION="${{ inputs.release-version }}" 62 | LATEST_VERSION=$(git describe --abbrev=0 --tags) 63 | 64 | IS_HIGHER_VERSION=$(pysemver compare $RELEASE_VERSION $LATEST_VERSION) 65 | 66 | if [ "$IS_HIGHER_VERSION" != "1" ]; then 67 | echo "The version '$RELEASE_VERSION' is not higher than the latest version '$LATEST_VERSION'." 68 | echo "The release process is aborted." 69 | exit 1 70 | fi 71 | 72 | - name: Build Changelog 73 | id: github_release 74 | uses: mikepenz/release-changelog-builder-action@v4 75 | env: 76 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 77 | with: 78 | toTag: "main" 79 | 80 | - name: Configure git 81 | run: | 82 | git config --global user.name "${GITHUB_ACTOR}" 83 | git config --global user.email "${GITHUB_ACTOR}@users.noreply.github.com" 84 | 85 | - name: Create and push git tag 86 | env: 87 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 88 | run: | 89 | # Tag the release 90 | git tag -a "${{ inputs.release-version }}" -m "Release version ${{ inputs.release-version }}" 91 | 92 | # Checkout the git tag 93 | git checkout "${{ inputs.release-version }}" 94 | 95 | # Push the modified changelogs 96 | git push origin main 97 | 98 | # Push the tags 99 | git push origin "${{ inputs.release-version }}" 100 | 101 | - name: Install library 102 | run: python -m pip install --no-deps . 
103 | 104 | - name: Build the wheel and sdist 105 | run: python -m build --no-isolation 106 | 107 | - name: Publish package to PyPI 108 | uses: pypa/gh-action-pypi-publish@release/v1 109 | with: 110 | password: ${{ secrets.PYPI_API_TOKEN }} 111 | packages-dir: dist/ 112 | 113 | - name: Create GitHub Release 114 | uses: softprops/action-gh-release@de2c0eb89ae2a093876385947365aca7b0e5f844 115 | with: 116 | tag_name: ${{ inputs.release-version }} 117 | body: ${{steps.github_release.outputs.changelog}} 118 | 119 | - name: Deploy the doc 120 | run: | 121 | echo "Get the gh-pages branch" 122 | git fetch origin gh-pages 123 | 124 | echo "Build and deploy the doc on ${{ inputs.release-version }}" 125 | mike deploy --push stable 126 | mike deploy --push ${{ inputs.release-version }} 127 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: test 2 | 3 | on: 4 | push: 5 | branches: ["main"] 6 | tags: ["*"] 7 | pull_request: 8 | branches: 9 | - "*" 10 | - "!gh-pages" 11 | schedule: 12 | - cron: "0 4 * * MON" 13 | 14 | jobs: 15 | test: 16 | strategy: 17 | fail-fast: false 18 | matrix: 19 | python-version: ["3.10", "3.11"] 20 | os: ["ubuntu-latest", "macos-latest", "windows-latest"] 21 | rdkit-version: ["2023.09", "2024.03"] 22 | 23 | runs-on: ${{ matrix.os }} 24 | timeout-minutes: 30 25 | 26 | defaults: 27 | run: 28 | shell: bash -l {0} 29 | 30 | name: | 31 | os=${{ matrix.os }} 32 | - python=${{ matrix.python-version }} 33 | - rdkit=${{ matrix.rdkit-version }} 34 | 35 | steps: 36 | - name: Checkout the code 37 | uses: actions/checkout@v4 38 | 39 | - name: Setup mamba 40 | uses: mamba-org/setup-micromamba@v1 41 | with: 42 | environment-file: env.yml 43 | environment-name: my_env 44 | cache-environment: true 45 | cache-downloads: true 46 | create-args: >- 47 | python=${{ matrix.python-version }} 48 | rdkit=${{ matrix.rdkit-version }} 49 | 50 | - name: Install library 51 | run: python -m pip install --no-deps -e . # `-e` required for correct `coverage` run. 
52 | 53 | - name: Run tests 54 | run: pytest 55 | 56 | - name: Codecov Upload 57 | uses: codecov/codecov-action@v4 58 | with: 59 | files: ./coverage.xml 60 | flags: unittests 61 | name: codecov-umbrella 62 | fail_ci_if_error: false 63 | verbose: false 64 | env_vars: ${{ matrix.os }},${{ matrix.python-version }},${{ matrix.rdkit-version }} 65 | 66 | - name: Test building the doc 67 | run: mkdocs build 68 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.env 2 | cov.xml 3 | coverage.xml 4 | 5 | .vscode/ 6 | 7 | .ipynb_checkpoints/ 8 | 9 | *.py[cod] 10 | 11 | # C extensions 12 | *.so 13 | 14 | # Packages 15 | *.egg 16 | *.egg-info 17 | dist 18 | build 19 | eggs 20 | parts 21 | bin 22 | var 23 | sdist 24 | develop-eggs 25 | .installed.cfg 26 | lib 27 | lib64 28 | 29 | # Installer logs 30 | pip-log.txt 31 | 32 | # Unit test / coverage reports 33 | .coverage* 34 | .tox 35 | nosetests.xml 36 | htmlcov 37 | 38 | # Translations 39 | *.mo 40 | 41 | # Mr Developer 42 | .mr.developer.cfg 43 | .project 44 | .pydevproject 45 | 46 | # Complexity 47 | output/*.html 48 | output/*/index.html 49 | 50 | # Sphinx 51 | docs/_build 52 | 53 | MANIFEST 54 | 55 | *.tif 56 | 57 | # Rever 58 | rever/ 59 | 60 | # Dev notebook 61 | dev.ipynb 62 | 63 | # MkDocs 64 | site/ 65 | 66 | .idea/ 67 | __pycache__ 68 | .DS_Store 69 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | <div align="center">
3 |     <h3>datamol - molecular processing made easy</h3>
4 |     <p>Docs | Homepage</p>
5 | </div>
16 | 17 | --- 18 | 19 | [![DOI](https://zenodo.org/badge/341603042.svg)](https://zenodo.org/badge/latestdoi/341603042) 20 | [![Binder](http://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/datamol-io/datamol/main?urlpath=lab/tree/docs/tutorials/The_Basics.ipynb) 21 | [![PyPI](https://img.shields.io/pypi/v/datamol)](https://pypi.org/project/datamol/) 22 | [![Conda](https://img.shields.io/conda/v/conda-forge/datamol?label=conda&color=success)](https://anaconda.org/conda-forge/datamol) 23 | [![PyPI - Downloads](https://img.shields.io/pypi/dm/datamol)](https://pypi.org/project/datamol/) 24 | [![Conda](https://img.shields.io/conda/dn/conda-forge/datamol)](https://anaconda.org/conda-forge/datamol) 25 | [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/datamol)](https://pypi.org/project/datamol/) 26 | [![license](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://github.com/datamol-io/datamol/blob/main/LICENSE) 27 | [![GitHub Repo stars](https://img.shields.io/github/stars/datamol-io/datamol)](https://github.com/datamol-io/datamol/stargazers) 28 | [![GitHub Repo stars](https://img.shields.io/github/forks/datamol-io/datamol)](https://github.com/datamol-io/datamol/network/members) 29 | [![Codecov](https://codecov.io/gh/datamol-io/datamol/branch/main/graph/badge.svg?token=2ETG8SA7IG)](https://codecov.io/gh/datamol-io/datamol) 30 | 31 | Datamol is a python library to work with molecules. It's a layer built on top of [RDKit](https://www.rdkit.org/) and aims to be as light as possible. 32 | 33 | - 🐍 Simple pythonic API 34 | - ⚗️ RDKit first: all you manipulate are `rdkit.Chem.Mol` objects. 35 | - ✅ Manipulating molecules often relies on many options; Datamol provides good defaults by design. 36 | - 🧠 Performance matters: built-in efficient parallelization when possible with an optional progress bar. 37 | - 🕹️ Modern IO: out-of-the-box support for remote paths using `fsspec` to read and write multiple formats (sdf, xlsx, csv, etc). 38 | 39 | ## Try Online 40 | 41 | Visit [![Binder](http://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/datamol-io/datamol/main?urlpath=lab/tree/docs/tutorials/The_Basics.ipynb) and try Datamol online. 42 | 43 | ## Documentation 44 | 45 | Visit . 
46 | 47 | ## Installation 48 | 49 | Use conda: 50 | 51 | ```bash 52 | mamba install -c conda-forge datamol 53 | ``` 54 | 55 | ## Quick API Tour 56 | 57 | ```python 58 | import datamol as dm 59 | 60 | # Common functions 61 | mol = dm.to_mol("O=C(C)Oc1ccccc1C(=O)O", sanitize=True) 62 | fp = dm.to_fp(mol) 63 | selfies = dm.to_selfies(mol) 64 | inchi = dm.to_inchi(mol) 65 | 66 | # Standardize and sanitize 67 | mol = dm.to_mol("O=C(C)Oc1ccccc1C(=O)O") 68 | mol = dm.fix_mol(mol) 69 | mol = dm.sanitize_mol(mol) 70 | mol = dm.standardize_mol(mol) 71 | 72 | # Dataframe manipulation 73 | df = dm.data.freesolv() 74 | mols = dm.from_df(df) 75 | 76 | # 2D viz 77 | legends = [dm.to_smiles(mol) for mol in mols[:10]] 78 | dm.viz.to_image(mols[:10], legends=legends) 79 | 80 | # Generate conformers 81 | smiles = "O=C(C)Oc1ccccc1C(=O)O" 82 | mol = dm.to_mol(smiles) 83 | mol_with_conformers = dm.conformers.generate(mol) 84 | 85 | # 3D viz (using nglview) 86 | dm.viz.conformers(mol, n_confs=10) 87 | 88 | # Compute SASA from conformers 89 | sasa = dm.conformers.sasa(mol_with_conformers) 90 | 91 | # Easy IO 92 | mols = dm.read_sdf("s3://my-awesome-data-lake/smiles.sdf", as_df=False) 93 | dm.to_sdf(mols, "gs://data-bucket/smiles.sdf") 94 | ``` 95 | 96 | ## How to cite 97 | 98 | Please cite Datamol if you use it in your research: [![DOI](https://zenodo.org/badge/341603042.svg)](https://zenodo.org/badge/latestdoi/341603042). 99 | 100 | ## Compatibilities 101 | 102 | Version compatibilities are an essential topic for production-software stacks. We are cautious about documenting compatibility between `datamol`, `python` and `rdkit`. 103 | 104 | See below the associated versions of Python and RDKit, for which a minor version of Datamol **has been tested** during its whole lifecycle. _It does not mean other combinations does not work but that those are not tested._ 105 | 106 | | `datamol` | `python` | `rdkit` | 107 | | --------- | ------------------- | ----------------------------- | 108 | | `0.12.x` | `[3.10, 3.11]` | `[2023.03, 2023.09]` | 109 | | `0.11.x` | `[3.9, 3.10, 3.11]` | `[2022.09, 2023.03]` | 110 | | `0.10.x` | `[3.9, 3.10, 3.11]` | `[2022.03, 2022.09]` | 111 | | `0.9.x` | `[3.9, 3.10, 3.11]` | `[2022.03, 2022.09]` | 112 | | `0.8.x` | `[3.8, 3.9, 3.10]` | `[2021.09, 2022.03, 2022.09]` | 113 | | `0.7.x` | `[3.8, 3.9]` | `[2021.09, 2022.03]` | 114 | | `0.6.x` | `[3.8, 3.9]` | `[2021.09]` | 115 | | `0.5.x` | `[3.8, 3.9]` | `[2021.03, 2021.09]` | 116 | | `0.4.x` | `[3.8, 3.9]` | `[2020.09, 2021.03]` | 117 | | `0.3.x` | `[3.8, 3.9]` | `[2020.09, 2021.03]` | 118 | 119 | ## CI Status 120 | 121 | The CI runs tests and performs code quality checks for the following combinations: 122 | 123 | - The three major platforms: Windows, OSX and Linux. 124 | - The two latest Python versions. 125 | - The two latest RDKit versions. 
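When reproducing one of the CI combinations above locally, a quick way to confirm the environment is to print the versions in use. A minimal sketch (it only assumes `datamol` is importable; `__version__` is defined in `datamol/_version.py`):

```python
import sys

import rdkit
from datamol._version import __version__ as datamol_version

print(f"datamol {datamol_version}")
print(f"python  {sys.version_info.major}.{sys.version_info.minor}")
print(f"rdkit   {rdkit.__version__}")
```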
126 | 127 | | | `main` | 128 | | --------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | 129 | | Lib build & Testing | [![test](https://github.com/datamol-io/datamol/actions/workflows/test.yml/badge.svg)](https://github.com/datamol-io/datamol/actions/workflows/test.yml) | 130 | | Code Sanity (linting and type analysis) | [![code-check](https://github.com/datamol-io/datamol/actions/workflows/code-check.yml/badge.svg)](https://github.com/datamol-io/datamol/actions/workflows/code-check.yml) | 131 | | Documentation Build | [![doc](https://github.com/datamol-io/datamol/actions/workflows/doc.yml/badge.svg)](https://github.com/datamol-io/datamol/actions/workflows/doc.yml) | 132 | 133 | ## License 134 | 135 | Under the Apache-2.0 license. See [LICENSE](LICENSE). 136 | -------------------------------------------------------------------------------- /binder/environment.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | 4 | dependencies: 5 | - python >=3.8 6 | - pip 7 | - tqdm 8 | - loguru 9 | - joblib 10 | - fsspec >=2021.9 11 | - s3fs >=2021.9 12 | - gcsfs >=2021.9 13 | - platformdirs 14 | - packaging 15 | - typing_extensions 16 | - importlib_resources 17 | 18 | # Scientific 19 | - pandas 20 | - numpy 21 | - scipy 22 | - pillow 23 | - matplotlib 24 | - scikit-learn 25 | 26 | # Chemistry 27 | - rdkit >=2021.03 28 | - selfies 29 | 30 | # Optional deps 31 | - openpyxl 32 | - networkx 33 | - nglview 34 | - xlsxwriter 35 | - pyarrow 36 | 37 | # Dev 38 | - pytest >=6.0 39 | - pytest-cov 40 | - pytest-xdist 41 | - black >=24 42 | - jupyterlab 43 | - mypy 44 | - codecov 45 | - nbconvert 46 | 47 | # Doc 48 | - mkdocs 49 | - mkdocs-material >=7.1.1 50 | - mkdocs-material-extensions 51 | - mkdocstrings 52 | - mkdocstrings-python 53 | - mkdocs-jupyter 54 | - markdown-include 55 | - mdx_truly_sane_lists 56 | - mike >=1.0.0 57 | - seaborn 58 | -------------------------------------------------------------------------------- /binder/postBuild: -------------------------------------------------------------------------------- 1 | pip install -e . 
2 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | coverage: 2 | range: "50...80" 3 | status: 4 | project: 5 | default: 6 | threshold: 1% 7 | patch: false 8 | -------------------------------------------------------------------------------- /datamol/_sanifix4.py: -------------------------------------------------------------------------------- 1 | """ 2 | sanifix4.py 3 | Original code from rdkit [James Davidson] 4 | """ 5 | 6 | from rdkit import Chem, RDLogger 7 | 8 | 9 | logger = RDLogger.logger() 10 | 11 | 12 | def _FragIndicesToMol(oMol, indices): 13 | em = Chem.EditableMol(Chem.Mol()) 14 | 15 | newIndices = {} 16 | for i, idx in enumerate(indices): 17 | em.AddAtom(oMol.GetAtomWithIdx(idx)) 18 | newIndices[idx] = i 19 | 20 | for i, idx in enumerate(indices): 21 | at = oMol.GetAtomWithIdx(idx) 22 | for bond in at.GetBonds(): 23 | if bond.GetBeginAtomIdx() == idx: 24 | oidx = bond.GetEndAtomIdx() 25 | else: 26 | oidx = bond.GetBeginAtomIdx() 27 | # make sure every bond only gets added once: 28 | if oidx < idx: 29 | continue 30 | em.AddBond(newIndices[idx], newIndices[oidx], bond.GetBondType()) 31 | res = em.GetMol() 32 | res.ClearComputedProps() 33 | Chem.GetSymmSSSR(res) 34 | res.UpdatePropertyCache(False) 35 | res._idxMap = newIndices 36 | return res 37 | 38 | 39 | def _recursivelyModifyNs(mol, matches, indices=None): 40 | if indices is None: 41 | indices = [] 42 | res = None 43 | while len(matches) and res is None: 44 | tIndices = indices[:] 45 | nextIdx = matches.pop(0) 46 | tIndices.append(nextIdx) 47 | nm = Chem.Mol(mol.ToBinary()) 48 | nm.GetAtomWithIdx(nextIdx).SetNoImplicit(True) 49 | nm.GetAtomWithIdx(nextIdx).SetNumExplicitHs(1) 50 | cp = Chem.Mol(nm.ToBinary()) 51 | try: 52 | Chem.SanitizeMol(cp) 53 | except ValueError: 54 | res, indices = _recursivelyModifyNs(nm, matches, indices=tIndices) 55 | else: 56 | indices = tIndices 57 | res = cp 58 | return res, indices 59 | 60 | 61 | def AdjustAromaticNs(m, nitrogenPattern="[n&D2&H0;r5,r6]"): 62 | """ 63 | default nitrogen pattern matches Ns in 5 rings and 6 rings in order to be able 64 | to fix: O=c1ccncc1 65 | """ 66 | Chem.GetSymmSSSR(m) 67 | m.UpdatePropertyCache(False) 68 | 69 | # break non-ring bonds linking rings: 70 | em = Chem.EditableMol(m) 71 | linkers = m.GetSubstructMatches(Chem.MolFromSmarts("[r]!@[r]")) 72 | plsFix = set() 73 | for a, b in linkers: 74 | em.RemoveBond(a, b) 75 | plsFix.add(a) 76 | plsFix.add(b) 77 | nm = em.GetMol() 78 | for at in plsFix: 79 | at = nm.GetAtomWithIdx(at) 80 | if at.GetIsAromatic() and at.GetAtomicNum() == 7: 81 | at.SetNumExplicitHs(1) 82 | at.SetNoImplicit(True) 83 | 84 | # build molecules from the fragments: 85 | fragLists = Chem.GetMolFrags(nm) 86 | frags = [_FragIndicesToMol(nm, x) for x in fragLists] 87 | 88 | # loop through the fragments in turn and try to aromatize them: 89 | ok = True 90 | for i, frag in enumerate(frags): 91 | cp = Chem.Mol(frag) 92 | try: 93 | Chem.SanitizeMol(cp) 94 | except ValueError: 95 | matches = [x[0] for x in frag.GetSubstructMatches(Chem.MolFromSmarts(nitrogenPattern))] 96 | lres, indices = _recursivelyModifyNs(frag, matches) 97 | if not lres: 98 | # print 'frag %d failed (%s)'%(i,str(fragLists[i])) 99 | ok = False 100 | break 101 | else: 102 | revMap = {} 103 | for k, v in frag._idxMap.items(): 104 | revMap[v] = k 105 | for idx in indices: 106 | oatom = m.GetAtomWithIdx(revMap[idx]) 107 | oatom.SetNoImplicit(True) 108 | 
oatom.SetNumExplicitHs(1) 109 | if not ok: 110 | return None 111 | return m 112 | 113 | 114 | def sanifix(m): 115 | if m is None: 116 | return None 117 | try: 118 | m.UpdatePropertyCache(False) 119 | cp = Chem.Mol(m.ToBinary()) 120 | Chem.SanitizeMol(cp) 121 | return cp 122 | except ValueError as e: 123 | logger.debug(f"{Chem.MolToSmiles(m)} failed due to {e}") 124 | try: 125 | m = AdjustAromaticNs(m) 126 | if m is not None: 127 | Chem.SanitizeMol(m) 128 | return m 129 | except Exception as ee: 130 | logger.debug(f"{Chem.MolToSmiles(m)} failed due to {ee}") 131 | return None 132 | except RuntimeError as e: 133 | logger.debug(f"{Chem.MolToSmiles(m)} failed due to {e}") 134 | logger.info(f"The faulty smiles is: {Chem.MolToSmiles(m)}") 135 | raise e 136 | -------------------------------------------------------------------------------- /datamol/_version.py: -------------------------------------------------------------------------------- 1 | try: 2 | from importlib.metadata import version 3 | from importlib.metadata import PackageNotFoundError 4 | except ModuleNotFoundError: 5 | # Try backported to PY<38 `importlib_metadata`. 6 | from importlib_metadata import version 7 | from importlib_metadata import PackageNotFoundError 8 | 9 | 10 | import rdkit 11 | import packaging.version 12 | 13 | 14 | try: 15 | __version__ = version("datamol") 16 | except PackageNotFoundError: 17 | # package is not installed 18 | __version__ = "dev" 19 | 20 | CURRENT_RDKIT_VERSION = rdkit.__version__ 21 | CURRENT_RDKIT_VERSION_OBJ = packaging.version.parse(CURRENT_RDKIT_VERSION) 22 | 23 | 24 | def is_lower_than_current_rdkit_version(rdkit_version: str): 25 | return CURRENT_RDKIT_VERSION_OBJ < packaging.version.parse(rdkit_version) 26 | 27 | 28 | def is_greater_than_current_rdkit_version(rdkit_version: str): 29 | return CURRENT_RDKIT_VERSION_OBJ > packaging.version.parse(rdkit_version) 30 | 31 | 32 | def is_lower_eq_than_current_rdkit_version(rdkit_version: str): 33 | return CURRENT_RDKIT_VERSION_OBJ <= packaging.version.parse(rdkit_version) 34 | 35 | 36 | def is_greater_eq_than_current_rdkit_version(rdkit_version: str): 37 | return CURRENT_RDKIT_VERSION_OBJ >= packaging.version.parse(rdkit_version) 38 | -------------------------------------------------------------------------------- /datamol/conformers/__init__.py: -------------------------------------------------------------------------------- 1 | from ._conformers import generate 2 | from ._conformers import cluster 3 | from ._conformers import rmsd 4 | from ._conformers import return_centroids 5 | from ._conformers import translate 6 | from ._conformers import align_conformers 7 | 8 | from ._features import sasa 9 | from ._features import get_coords 10 | from ._features import center_of_mass 11 | from ._features import keep_conformers 12 | -------------------------------------------------------------------------------- /datamol/conformers/_features.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | from typing import List 3 | from typing import Optional 4 | 5 | import numpy as np 6 | 7 | from ..types import Mol 8 | from ..utils.jobs import JobRunner 9 | from ..utils import decorators 10 | from ..mol import PERIODIC_TABLE 11 | from ..mol import copy_mol 12 | 13 | 14 | @decorators.disable_on_os("win") 15 | def sasa( 16 | mol: Mol, 17 | conf_id: Optional[Union[int, List[int]]] = None, 18 | n_jobs: int = 1, 19 | ) -> np.ndarray: 20 | """Compute Solvent Accessible Surface Area of all the conformers 21 | 
using FreeSASA (https://freesasa.github.io/). Values are returned 22 | as an array and also stored within each conformer as a property 23 | called `rdkit_free_sasa`. 24 | 25 | Example: 26 | 27 | ```python 28 | smiles = "O=C(C)Oc1ccccc1C(=O)O" 29 | mol = dm.to_mol(smiles) 30 | mol = dm.conformers.generate(mol) 31 | 32 | # Compute SASA for all the conformers without parallelization 33 | sasa_values = dm.conformers.sasa(mol, conf_id=None, n_jobs=1) 34 | 35 | # If minimization has been enabled (default to True) 36 | # you can access the computed energy. 37 | conf = mol.GetConformer(0) 38 | props = conf.GetPropsAsDict() 39 | print(props) 40 | # {'rdkit_uff_energy': 1.7649408317784008} 41 | ``` 42 | 43 | Args: 44 | mol: a molecule 45 | conf_id: Id of the conformers to compute. If None, compute all. 46 | n_jobs: Number of jobs for parallelization. Set to 1 to disable 47 | and -1 to use all cores. 48 | 49 | Returns: 50 | mol: the molecule with the conformers. 51 | """ 52 | from rdkit.Chem import rdFreeSASA 53 | 54 | if mol.GetNumConformers() == 0: 55 | raise ValueError( 56 | "The molecule has 0 conformers. You can generate conformers with `dm.conformers.generate(mol)`." 57 | ) 58 | 59 | # Get Van der Waals radii (angstrom) 60 | radii = [PERIODIC_TABLE.GetRvdw(atom.GetAtomicNum()) for atom in mol.GetAtoms()] 61 | 62 | # Which conformers to compute 63 | conf_ids = [] 64 | if conf_id is None: 65 | # If None compute for all the conformers 66 | conf_ids = list(range(mol.GetNumConformers())) # type: ignore 67 | elif isinstance(conf_id, int): 68 | conf_ids = [conf_id] 69 | else: 70 | conf_ids = conf_id 71 | 72 | # Compute solvent accessible surface area 73 | def _get_sasa(i): 74 | conf = mol.GetConformer(i) 75 | sasa = rdFreeSASA.CalcSASA(mol, radii, confIdx=conf.GetId()) 76 | conf.SetDoubleProp("rdkit_free_sasa", sasa) 77 | return sasa 78 | 79 | runner = JobRunner(n_jobs=n_jobs) 80 | sasa_values = runner(_get_sasa, conf_ids) 81 | return np.array(sasa_values) 82 | 83 | 84 | def get_coords(mol: Mol, conf_id: int = -1): 85 | """Get the coordinate of a conformer of a molecule. 86 | 87 | Args: 88 | mol: a molecule. 89 | conf_id: a conformer id. 90 | """ 91 | 92 | if mol.GetNumConformers() == 0: 93 | raise ValueError("Molecule does not have any conformers.") 94 | 95 | conf = mol.GetConformer(id=conf_id) 96 | return conf.GetPositions() 97 | 98 | 99 | def center_of_mass( 100 | mol: Mol, 101 | use_atoms: bool = True, 102 | digits: Optional[int] = None, 103 | conf_id: int = -1, 104 | ) -> np.ndarray: 105 | """Compute the center of mass of a conformer of a molecule. 106 | 107 | Args: 108 | mol: a molecule 109 | use_atoms: Whether to compute the true center of mass or the geometrical center. 110 | digits: Number of digits to round to. 111 | conf_id: the conformer id. 112 | 113 | Returns 114 | cm: Center of mass or geometrical center 115 | """ 116 | coords = get_coords(mol, conf_id=conf_id) 117 | atom_weight = np.ones((coords.shape[0])) 118 | 119 | if use_atoms: 120 | atom_weight = np.array([atom.GetMass() for atom in mol.GetAtoms()]) 121 | 122 | atom_weight = atom_weight[:, None] 123 | atom_weight /= atom_weight.sum() 124 | center = (coords * atom_weight).sum(axis=0) 125 | 126 | if digits is not None: 127 | center = center.round(digits) 128 | 129 | return center 130 | 131 | 132 | def keep_conformers( 133 | mol: Mol, 134 | indices_to_keep: Union[int, List[int]] = -1, 135 | assign_id: bool = True, 136 | copy: bool = True, 137 | ): 138 | """Keep on the specified conformer(s) in `indices_to_keep`. 
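
    Example:

        A minimal usage sketch; it assumes conformers were generated first
        (e.g. with `dm.conformers.generate`), so that indices 0 and 1 exist:

        ```python
        mol = dm.to_mol("O=C(C)Oc1ccccc1C(=O)O")
        mol = dm.conformers.generate(mol)
        mol = dm.conformers.keep_conformers(mol, indices_to_keep=[0, 1])
        mol.GetNumConformers()  # -> 2
        ```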
139 | 140 | Args: 141 | mol: A molecule. 142 | indices_to_keep: A indice or a least of indices of conformers to keep. 143 | assign_id: Whether to assign the kept conformers an id or keep the original one. 144 | copy: Whether to copy the molecule or not. 145 | """ 146 | 147 | if copy: 148 | mol = copy_mol(mol) 149 | 150 | if not isinstance(indices_to_keep, list): 151 | indices_to_keep = [indices_to_keep] 152 | 153 | # Extract conformers to keep 154 | confs_to_keep = [mol.GetConformer(conf_id) for conf_id in indices_to_keep] 155 | 156 | # Copy current mol and remove all conformers 157 | mol2 = copy_mol(mol) 158 | mol2.RemoveAllConformers() 159 | 160 | # Add conformers 161 | _ = [mol2.AddConformer(conf, assignId=assign_id) for conf in confs_to_keep] 162 | 163 | # Cleanup 164 | mol = mol2 165 | 166 | return mol 167 | -------------------------------------------------------------------------------- /datamol/data/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The data module aims to provide a fast and convenient access to various molecular datasets. 3 | 4 | --- 5 | """ 6 | 7 | from typing import Optional 8 | from typing import cast 9 | from typing import Union 10 | from typing import List 11 | from typing import overload 12 | from typing import Literal 13 | 14 | import sys 15 | import io 16 | import functools 17 | 18 | try: 19 | import importlib.resources as importlib_resources 20 | except ImportError: 21 | import importlib_resources 22 | 23 | import pandas as pd 24 | 25 | from ..types import Mol 26 | from ..io import read_sdf 27 | from ..convert import from_df 28 | from ..convert import render_mol_df 29 | 30 | 31 | @functools.lru_cache() 32 | def datamol_data_file_path(filename: str, dm_module: str = "datamol.data") -> str: 33 | if sys.version_info < (3, 9, 0): 34 | with importlib_resources.path(dm_module, filename) as p: 35 | data_path = p 36 | else: 37 | data_path = importlib_resources.files(dm_module).joinpath(filename) 38 | 39 | return str(data_path) 40 | 41 | 42 | def open_datamol_data_file( 43 | filename: str, 44 | open_binary: bool = False, 45 | dm_module: str = "datamol.data", 46 | ): 47 | if sys.version_info < (3, 9, 0): 48 | if open_binary: 49 | file_context_manager = importlib_resources.open_binary(dm_module, filename) 50 | else: 51 | file_context_manager = importlib_resources.open_text(dm_module, filename) 52 | else: 53 | if open_binary: 54 | mode = "rb" 55 | else: 56 | mode = "r" 57 | 58 | file_context_manager = ( 59 | importlib_resources.files(dm_module).joinpath(filename).open(mode=mode) 60 | ) 61 | 62 | # NOTE(hadim): we assume the file always exists 63 | file_context_manager = cast(io.TextIOWrapper, file_context_manager) 64 | 65 | return file_context_manager 66 | 67 | 68 | @overload 69 | def freesolv(as_df: Literal[True] = True) -> pd.DataFrame: ... 70 | 71 | 72 | @overload 73 | def freesolv(as_df: Literal[False] = False) -> List[Mol]: ... 74 | 75 | 76 | @overload 77 | def freesolv(as_df: bool = True) -> Union[List[Mol], pd.DataFrame]: ... 78 | 79 | 80 | def freesolv(as_df: bool = True) -> Union[List[Mol], pd.DataFrame]: 81 | """Return the FreeSolv dataset as a dataframe. 82 | 83 | The dataset contains 642 molecules and the following columns: 84 | `['iupac', 'smiles', 'expt', 'calc']`. 85 | 86 | Warning: 87 | This dataset is only meant to be used as a toy dataset for pedagogic and 88 | testing purposes. **It is not** a dataset for benchmarking, analysis or 89 | model training. 
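
    Example:

        A short usage sketch; the column names are the ones listed above:

        ```python
        df = dm.data.freesolv()
        df.columns.tolist()
        # ['iupac', 'smiles', 'expt', 'calc']
        ```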
90 | """ 91 | 92 | with open_datamol_data_file("freesolv.csv") as f: 93 | data = pd.read_csv(f) 94 | 95 | if not as_df: 96 | data = from_df(data) 97 | 98 | return data 99 | 100 | 101 | @overload 102 | def cdk2(as_df: Literal[True] = True, mol_column: Optional[str] = "mol") -> pd.DataFrame: ... 103 | 104 | 105 | @overload 106 | def cdk2(as_df: Literal[False] = False, mol_column: Optional[str] = "mol") -> List[Mol]: ... 107 | 108 | 109 | @overload 110 | def cdk2( 111 | as_df: bool = True, mol_column: Optional[str] = "mol" 112 | ) -> Union[List[Mol], pd.DataFrame]: ... 113 | 114 | 115 | def cdk2(as_df: bool = True, mol_column: Optional[str] = "mol"): 116 | """Return the RDKit CDK2 dataset from `RDConfig.RDDocsDir, 'Book/data/cdk2.sdf'`. 117 | 118 | Args: 119 | as_df: Whether to return a list mol or a pandas DataFrame. 120 | mol_column: Name of the mol column. Only relevant if `as_df` is True. 121 | """ 122 | 123 | with open_datamol_data_file("cdk2.sdf", open_binary=True) as f: 124 | data = read_sdf(f, as_df=as_df, mol_column=mol_column) 125 | return data 126 | 127 | 128 | @overload 129 | def solubility(as_df: Literal[True] = True, mol_column: Optional[str] = "mol") -> pd.DataFrame: ... 130 | 131 | 132 | @overload 133 | def solubility(as_df: Literal[False] = False, mol_column: Optional[str] = "mol") -> List[Mol]: ... 134 | 135 | 136 | @overload 137 | def solubility( 138 | as_df: bool = True, mol_column: Optional[str] = "mol" 139 | ) -> Union[List[Mol], pd.DataFrame]: ... 140 | 141 | 142 | def solubility(as_df: bool = True, mol_column: Optional[str] = "mol"): 143 | """Return the RDKit solubility dataset from `RDConfig.RDDocsDir, 'Book/data/solubility.{train|test}.sdf'`. 144 | 145 | The dataframe or the list of molecules with contain a `split` column, either `train` or `test`. 146 | 147 | Args: 148 | as_df: Whether to return a list mol or a pandas DataFrame. 149 | mol_column: Name of the mol column. Only relevant if `as_df` is True. 150 | """ 151 | 152 | with open_datamol_data_file("solubility.train.sdf", open_binary=True) as f: 153 | train = read_sdf(f, as_df=True, mol_column="mol", smiles_column=None) 154 | 155 | with open_datamol_data_file("solubility.test.sdf", open_binary=True) as f: 156 | test = read_sdf(f, as_df=True, mol_column="mol", smiles_column=None) 157 | 158 | train = cast(pd.DataFrame, train) 159 | test = cast(pd.DataFrame, test) 160 | 161 | train["split"] = "train" 162 | test["split"] = "test" 163 | 164 | # NOTE(hadim): LMAO RDkit consistency xD 165 | test = test.rename(columns={"SMILES": "smiles"}) 166 | 167 | data = pd.concat([train, test], ignore_index=True) 168 | 169 | if as_df: 170 | if mol_column is None: 171 | data = data.drop(columns=["mol"]) 172 | 173 | render_mol_df(data) 174 | return data 175 | 176 | return from_df(data, mol_column=mol_column) 177 | 178 | 179 | @overload 180 | def chembl_drugs(as_df: Literal[True] = True) -> pd.DataFrame: ... 181 | 182 | 183 | @overload 184 | def chembl_drugs(as_df: Literal[False] = False) -> List[Mol]: ... 185 | 186 | 187 | def chembl_drugs(as_df: bool = True) -> Union[List[Mol], pd.DataFrame]: 188 | """A list of ~2.5k molecules from ChEMBL (all approved drugs) in SMILES format. 189 | Includes metadata indicating year of first approval, molecule chembl id, molecule type and pref_name. 190 | 191 | List was generated with ['Get_ChEMBL_Approved_Drugs.ipynb'](https://github.com/datamol-io/datamol/notebooks/Get_ChEMBL_Approved_Drugs.ipynb) on 2023-10-18. 
192 | The notebook works with the chembl_webresource_client api to collect chembl IDs and metadata, then focuses on small molecules with valid SMILES and first approval date. 193 | """ 194 | with open_datamol_data_file("chembl_approved_drugs.parquet", open_binary=True) as f: 195 | data = pd.read_parquet(f) 196 | 197 | if not as_df: 198 | data = from_df(data) 199 | 200 | return data 201 | 202 | 203 | @overload 204 | def chembl_samples(as_df: Literal[True] = True) -> pd.DataFrame: ... 205 | 206 | 207 | @overload 208 | def chembl_samples(as_df: Literal[False] = False) -> List[Mol]: ... 209 | 210 | 211 | def chembl_samples(as_df: bool = True) -> Union[List[Mol], pd.DataFrame]: 212 | """A list of ~2k molecules from ChEMBL. 213 | 214 | Originally, proposed by Patrick Walters at . 215 | """ 216 | 217 | with open_datamol_data_file("chembl_samples.csv") as f: 218 | data = pd.read_csv(f) 219 | 220 | if not as_df: 221 | data = from_df(data) 222 | 223 | return data 224 | -------------------------------------------------------------------------------- /datamol/data/chembl_approved_drugs.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamol-io/datamol/0312388b956e2b4eeb72d791167cfdb873c7beab/datamol/data/chembl_approved_drugs.parquet -------------------------------------------------------------------------------- /datamol/descriptors/__init__.py: -------------------------------------------------------------------------------- 1 | from .descriptors import mw 2 | from .descriptors import fsp3 3 | from .descriptors import n_hba 4 | from .descriptors import n_hbd 5 | from .descriptors import n_lipinski_hba 6 | from .descriptors import n_lipinski_hbd 7 | from .descriptors import n_rings 8 | from .descriptors import n_hetero_atoms 9 | from .descriptors import n_heavy_atoms 10 | from .descriptors import n_rotatable_bonds 11 | from .descriptors import n_radical_electrons 12 | from .descriptors import tpsa 13 | from .descriptors import qed 14 | from .descriptors import clogp 15 | from .descriptors import sas 16 | from .descriptors import n_NHOH 17 | from .descriptors import n_NO 18 | from .descriptors import formal_charge 19 | from .descriptors import n_aliphatic_carbocycles 20 | from .descriptors import n_aliphatic_heterocyles 21 | from .descriptors import n_aliphatic_rings 22 | from .descriptors import n_aromatic_carbocycles 23 | from .descriptors import n_aromatic_heterocyles 24 | from .descriptors import n_aromatic_rings 25 | from .descriptors import n_saturated_carbocycles 26 | from .descriptors import n_saturated_heterocyles 27 | from .descriptors import n_saturated_rings 28 | from .descriptors import n_aromatic_atoms 29 | from .descriptors import n_aromatic_atoms_proportion 30 | from .descriptors import refractivity 31 | from .descriptors import n_rigid_bonds 32 | from .descriptors import n_stereo_centers 33 | from .descriptors import n_charged_atoms 34 | from .descriptors import n_stereo_centers_unspecified 35 | from .descriptors import n_spiro_atoms 36 | 37 | from .compute import any_rdkit_descriptor 38 | from .compute import compute_many_descriptors 39 | from .compute import batch_compute_many_descriptors 40 | -------------------------------------------------------------------------------- /datamol/descriptors/compute.py: -------------------------------------------------------------------------------- 1 | from typing import Callable 2 | from typing import Dict 3 | from typing import List 4 | from typing import Union 5 | from 
typing import Optional 6 | 7 | import functools 8 | 9 | import pandas as pd 10 | 11 | from rdkit.Chem import Descriptors 12 | from rdkit.Chem import rdMolDescriptors 13 | 14 | from .. import Mol 15 | from ..utils.jobs import parallelized 16 | 17 | from .descriptors import mw 18 | from .descriptors import fsp3 19 | from .descriptors import n_lipinski_hba 20 | from .descriptors import n_lipinski_hbd 21 | from .descriptors import n_rings 22 | from .descriptors import n_hetero_atoms 23 | from .descriptors import n_heavy_atoms 24 | from .descriptors import n_rotatable_bonds 25 | from .descriptors import n_radical_electrons 26 | from .descriptors import tpsa 27 | from .descriptors import qed 28 | from .descriptors import clogp 29 | from .descriptors import sas 30 | from .descriptors import n_aliphatic_carbocycles 31 | from .descriptors import n_aliphatic_heterocyles 32 | from .descriptors import n_aliphatic_rings 33 | from .descriptors import n_aromatic_carbocycles 34 | from .descriptors import n_aromatic_heterocyles 35 | from .descriptors import n_aromatic_rings 36 | from .descriptors import n_saturated_carbocycles 37 | from .descriptors import n_saturated_heterocyles 38 | from .descriptors import n_saturated_rings 39 | 40 | 41 | def any_rdkit_descriptor(name: str) -> Callable: 42 | """Return a descriptor function by name either from 43 | `rdkit.Chem import Descriptors` or `rdkit.Chem.rdMolDescriptors`. 44 | 45 | Args: 46 | name: Descriptor name. 47 | """ 48 | fn = getattr(Descriptors, name, None) 49 | 50 | if fn is None: 51 | fn = getattr(rdMolDescriptors, name, None) 52 | 53 | if fn is None: 54 | raise ValueError(f"Descriptor {name} not found.") 55 | 56 | return fn 57 | 58 | 59 | _DEFAULT_PROPERTIES_FN = { 60 | "mw": mw, 61 | "fsp3": fsp3, 62 | "n_lipinski_hba": n_lipinski_hba, 63 | "n_lipinski_hbd": n_lipinski_hbd, 64 | "n_rings": n_rings, 65 | "n_hetero_atoms": n_hetero_atoms, 66 | "n_heavy_atoms": n_heavy_atoms, 67 | "n_rotatable_bonds": n_rotatable_bonds, 68 | "n_radical_electrons": n_radical_electrons, 69 | "tpsa": tpsa, 70 | "qed": qed, 71 | "clogp": clogp, 72 | "sas": sas, 73 | "n_aliphatic_carbocycles": n_aliphatic_carbocycles, 74 | "n_aliphatic_heterocyles": n_aliphatic_heterocyles, 75 | "n_aliphatic_rings": n_aliphatic_rings, 76 | "n_aromatic_carbocycles": n_aromatic_carbocycles, 77 | "n_aromatic_heterocyles": n_aromatic_heterocyles, 78 | "n_aromatic_rings": n_aromatic_rings, 79 | "n_saturated_carbocycles": n_saturated_carbocycles, 80 | "n_saturated_heterocyles": n_saturated_heterocyles, 81 | "n_saturated_rings": n_saturated_rings, 82 | } 83 | 84 | 85 | def compute_many_descriptors( 86 | mol: Mol, 87 | properties_fn: Optional[Dict[str, Union[Callable, str]]] = None, 88 | add_properties: bool = True, 89 | ) -> dict: 90 | """Compute a list of opiniated molecular properties. 91 | 92 | Args: 93 | mol: A molecule. 94 | properties_fn: A list of functions that compute properties. If None, 95 | a default list of properties is used. If the function is a string, 96 | `dm.descriptors.any_descriptor()` is used to retrieve the descriptor 97 | function. 98 | add_properties: Whether to add the computed properties to the default list. 99 | 100 | Returns: 101 | Computed properties as a dict. 
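
    Example:

        A minimal sketch using the default descriptor set (keys such as
        `mw`, `tpsa` and `clogp` come from the default property list):

        ```python
        mol = dm.to_mol("CCO")
        props = dm.descriptors.compute_many_descriptors(mol)
        round(props["mw"], 2)  # -> 46.04 (exact molecular weight of ethanol)
        ```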
102 | """ 103 | 104 | if properties_fn is None: 105 | properties_fn = _DEFAULT_PROPERTIES_FN 106 | elif add_properties: 107 | [properties_fn.setdefault(k, v) for k, v in _DEFAULT_PROPERTIES_FN.items()] 108 | 109 | props = {} 110 | for k, v in properties_fn.items(): 111 | if isinstance(v, str): 112 | v = any_rdkit_descriptor(v) 113 | 114 | props[k] = v(mol) 115 | 116 | return props 117 | 118 | 119 | def batch_compute_many_descriptors( 120 | mols: List[Mol], 121 | properties_fn: Optional[Dict[str, Union[Callable, str]]] = None, 122 | add_properties: bool = True, 123 | n_jobs: int = 1, 124 | batch_size: Optional[int] = None, 125 | progress: bool = False, 126 | progress_leave: bool = True, 127 | ) -> pd.DataFrame: 128 | """Compute a list of opiniated molecular properties on a list of molecules. 129 | 130 | Args: 131 | mols: A list of molecules. 132 | properties_fn: A list of functions that compute properties. If None, 133 | a default list of properties is used. If the function is a string, 134 | `dm.descriptors.any_descriptor()` is used to retrieve the descriptor 135 | function. 136 | add_properties: Whether to add the computed properties to the default list. 137 | 138 | Returns: 139 | A dataframe of computed properties with one row per input molecules. 140 | """ 141 | 142 | compute_fn = functools.partial( 143 | compute_many_descriptors, 144 | properties_fn=properties_fn, 145 | add_properties=add_properties, 146 | ) 147 | 148 | props = parallelized( 149 | compute_fn, 150 | mols, 151 | batch_size=batch_size, 152 | progress=progress, 153 | n_jobs=n_jobs, 154 | tqdm_kwargs=dict(leave=progress_leave), 155 | ) 156 | return pd.DataFrame(props) 157 | -------------------------------------------------------------------------------- /datamol/descriptors/descriptors.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | from rdkit.Chem import Descriptors 5 | from rdkit.Chem import rdMolDescriptors 6 | from rdkit.Chem import RDConfig 7 | from rdkit.Chem import Lipinski 8 | from rdkit.Chem import rdmolops 9 | from rdkit.Chem import Crippen 10 | 11 | 12 | from .. import Mol 13 | from ..convert import from_smarts 14 | from ..log import no_rdkit_log 15 | from .._version import is_lower_than_current_rdkit_version 16 | 17 | 18 | @no_rdkit_log 19 | def _sasscorer(mol: Mol): 20 | sys.path.append(os.path.join(RDConfig.RDContribDir, "SA_Score")) 21 | try: 22 | import sascorer # type:ignore 23 | except ImportError: 24 | raise ImportError( 25 | "Could not import sascorer. If you installed rdkit-pypi with `pip`, please uninstall it and reinstall rdkit with `conda` or `mamba`." 
26 | ) 27 | 28 | return sascorer.calculateScore(mol) 29 | 30 | 31 | _AROMATIC_QUERY = from_smarts("a") 32 | 33 | mw = rdMolDescriptors.CalcExactMolWt 34 | fsp3 = rdMolDescriptors.CalcFractionCSP3 35 | tpsa = rdMolDescriptors.CalcTPSA 36 | qed = Descriptors.qed 37 | clogp = Descriptors.MolLogP # type: ignore 38 | sas = _sasscorer 39 | formal_charge = rdmolops.GetFormalCharge 40 | refractivity = Crippen.MolMR 41 | 42 | n_hba = rdMolDescriptors.CalcNumHBA 43 | n_hbd = rdMolDescriptors.CalcNumHBD 44 | n_lipinski_hba = rdMolDescriptors.CalcNumLipinskiHBA 45 | n_lipinski_hbd = rdMolDescriptors.CalcNumLipinskiHBD 46 | n_rings = rdMolDescriptors.CalcNumRings 47 | n_hetero_atoms = rdMolDescriptors.CalcNumHeteroatoms 48 | 49 | 50 | if is_lower_than_current_rdkit_version("2021.09"): 51 | n_heavy_atoms = Descriptors.HeavyAtomCount # type: ignore 52 | else: 53 | n_heavy_atoms = rdMolDescriptors.CalcNumHeavyAtoms 54 | 55 | n_rotatable_bonds = rdMolDescriptors.CalcNumRotatableBonds 56 | n_radical_electrons = Descriptors.NumRadicalElectrons 57 | n_NHOH = Lipinski.NHOHCount 58 | n_NO = Lipinski.NOCount 59 | n_spiro_atoms = rdMolDescriptors.CalcNumSpiroAtoms 60 | 61 | n_aliphatic_carbocycles = rdMolDescriptors.CalcNumAliphaticCarbocycles 62 | n_aliphatic_heterocyles = rdMolDescriptors.CalcNumAliphaticHeterocycles 63 | n_aliphatic_rings = rdMolDescriptors.CalcNumAliphaticRings 64 | 65 | n_aromatic_carbocycles = rdMolDescriptors.CalcNumAromaticCarbocycles 66 | n_aromatic_heterocyles = rdMolDescriptors.CalcNumAromaticHeterocycles 67 | n_aromatic_rings = rdMolDescriptors.CalcNumAromaticRings 68 | 69 | n_saturated_carbocycles = rdMolDescriptors.CalcNumSaturatedCarbocycles 70 | n_saturated_heterocyles = rdMolDescriptors.CalcNumSaturatedHeterocycles 71 | n_saturated_rings = rdMolDescriptors.CalcNumSaturatedRings 72 | 73 | 74 | def n_rigid_bonds(mol: Mol) -> int: 75 | """Compute the number of rigid bonds in a molecule. 76 | 77 | Rigid bonds are bonds that are not single and not in rings. 78 | 79 | Args: 80 | mol: A molecule. 81 | 82 | Returns: 83 | n_rigid_bonds: number of rigid bonds in the molecule 84 | """ 85 | non_rigid_bonds_count = from_smarts("*-&!@*") 86 | n_rigid_bonds = mol.GetNumBonds() - len(mol.GetSubstructMatches(non_rigid_bonds_count)) 87 | return n_rigid_bonds 88 | 89 | 90 | def n_aromatic_atoms(mol: Mol) -> int: 91 | """Calculate the number of aromatic atoms.""" 92 | matches = mol.GetSubstructMatches(_AROMATIC_QUERY) 93 | return len(matches) 94 | 95 | 96 | def n_aromatic_atoms_proportion(mol: Mol) -> int: 97 | """Calculate the aromatic proportion: # aromatic atoms/#atoms total. 98 | 99 | Args: 100 | mol: A molecule. 101 | 102 | Only heavy atoms are considered. 103 | """ 104 | return n_aromatic_atoms(mol) / mol.GetNumHeavyAtoms() 105 | 106 | 107 | def n_stereo_centers(mol: Mol) -> int: 108 | """Compute the number of stereocenters in a molecule. 109 | 110 | Args: 111 | mol: A molecule. 112 | 113 | Returns: 114 | n_stero_center: number of stereocenters in the molecule 115 | """ 116 | n = 0 117 | try: 118 | rdmolops.FindPotentialStereo(mol, cleanIt=False) 119 | n = rdMolDescriptors.CalcNumAtomStereoCenters(mol) 120 | except Exception: 121 | pass 122 | return n 123 | 124 | 125 | def n_stereo_centers_unspecified(mol: Mol) -> int: 126 | """Compute the number of unspecified stereocenters in a molecule. 127 | 128 | Args: 129 | mol: A molecule. 
130 | 131 | Returns: 132 | n_stereo_centers_unspecified: number of unspecified stereocenters in the molecule 133 | """ 134 | n = 0 135 | try: 136 | rdmolops.FindPotentialStereo(mol, cleanIt=False) 137 | n = rdMolDescriptors.CalcNumUnspecifiedAtomStereoCenters(mol) 138 | except Exception: 139 | pass 140 | return n 141 | 142 | 143 | def n_charged_atoms(mol: Mol) -> int: 144 | """Compute the number of charged atoms in a molecule. 145 | 146 | Args: 147 | mol: A molecule. 148 | 149 | Returns: 150 | n_charged_atoms: number of charged atoms in the molecule 151 | """ 152 | return sum([at.GetFormalCharge() != 0 for at in mol.GetAtoms()]) 153 | -------------------------------------------------------------------------------- /datamol/fragment/__init__.py: -------------------------------------------------------------------------------- 1 | from ._fragment import brics 2 | from ._fragment import frag 3 | from ._fragment import recap 4 | from ._fragment import anybreak 5 | from ._fragment import mmpa_frag 6 | from ._fragment import mmpa_cut 7 | 8 | from ._assemble import assemble_fragment_order 9 | from ._assemble import break_mol 10 | from ._assemble import build 11 | -------------------------------------------------------------------------------- /datamol/fragment/_fragment.py: -------------------------------------------------------------------------------- 1 | from typing import Set 2 | from typing import Optional 3 | from typing import Any 4 | 5 | from rdkit import Chem 6 | from rdkit.Chem import BRICS 7 | from rdkit.Chem import Recap 8 | from rdkit.Chem import rdMMPA 9 | 10 | from rdkit.Chem.Fraggle import FraggleSim 11 | 12 | import datamol as dm 13 | 14 | 15 | def brics( 16 | mol: Chem.rdchem.Mol, 17 | singlepass: bool = True, 18 | remove_parent: bool = False, 19 | sanitize: bool = True, 20 | fix: bool = True, 21 | ): 22 | """Run BRICS on the molecule and potentially fix dummy atoms. 23 | 24 | Args: 25 | mol: a molecule. 26 | singlepass: Single pass for `BRICSDecompose`. 27 | remove_parent: Remove parent from the fragments. 28 | sanitize: Whether to sanitize the fragments. 29 | fix: Whether to fix the fragments. 30 | """ 31 | frags = BRICS.BRICSDecompose(mol, returnMols=True, singlePass=singlepass) 32 | frags = list(frags) 33 | 34 | if fix: 35 | frags = [dm.fix_mol(x) for x in frags] 36 | if sanitize: 37 | frags = [dm.sanitize_mol(x) for x in frags] 38 | if remove_parent: 39 | frags.pop(0) 40 | 41 | frags = [x for x in frags if x is not None] 42 | 43 | return frags 44 | 45 | 46 | def frag( 47 | mol: Chem.rdchem.Mol, 48 | remove_parent: bool = False, 49 | sanitize: bool = True, 50 | fix: bool = True, 51 | ): 52 | """Generate all possible fragmentations of a molecule using the Fraggle algorithm. 53 | 54 | Args: 55 | mol: a molecule. 56 | remove_parent: Remove parent from the fragments. 57 | sanitize: Whether to sanitize the fragments. 58 | fix: Whether to fix the fragments.
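To make the fragmentation API above concrete, here is a small usage sketch. It assumes the functions are used through the `datamol.fragment` namespace re-exported in the `__init__.py` above; the input SMILES is illustrative.

```python
import datamol as dm
from datamol.fragment import brics, anybreak

mol = dm.to_mol("CC(=O)Nc1ccc(O)cc1")  # paracetamol (illustrative)

# BRICS decomposition, with dummy-atom fixing and sanitization enabled by default
frags = brics(mol)
print(len(frags), [dm.to_smiles(f) for f in frags[:3]])

# `anybreak` tries BRICS first and falls back to the Fraggle-based `frag`
frags = anybreak(mol, remove_parent=True)
```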
59 | """ 60 | frags = FraggleSim.generate_fraggle_fragmentation(mol) 61 | 62 | smiles = set([]) 63 | for seq in frags: 64 | smiles |= {s.strip() for s in seq.split(".")} 65 | 66 | smiles = list(sorted(smiles, reverse=True)) 67 | frags = [dm.to_mol(s) for s in smiles] 68 | 69 | if fix: 70 | frags = [dm.fix_mol(x) for x in frags] 71 | if sanitize: 72 | frags = [dm.sanitize_mol(x) for x in frags] 73 | 74 | frags = [x for x in frags if x is not None] 75 | 76 | if remove_parent: 77 | return frags 78 | return [mol] + frags 79 | 80 | 81 | def recap( 82 | mol: Chem.rdchem.Mol, 83 | remove_parent: bool = False, 84 | sanitize: bool = True, 85 | fix: bool = True, 86 | ): 87 | """Fragment the molecule using the recap algorithm. 88 | 89 | Args: 90 | mol: a molecule. 91 | remove_parent: Remove parent from the fragments. 92 | sanitize: Wether to sanitize the fragments. 93 | fix: Wether to fix the fragments. 94 | """ 95 | res = Recap.RecapDecompose(mol) 96 | frags = [dm.to_mol(x) for x in res.GetAllChildren().keys()] 97 | 98 | if fix: 99 | frags = [dm.fix_mol(x) for x in frags] 100 | if sanitize: 101 | frags = [dm.sanitize_mol(x) for x in frags] 102 | 103 | frags = [x for x in frags if x is not None] 104 | 105 | if remove_parent: 106 | return frags 107 | return [mol] + frags 108 | 109 | 110 | def anybreak( 111 | mol: Chem.rdchem.Mol, 112 | remove_parent: bool = False, 113 | sanitize: bool = True, 114 | fix: bool = True, 115 | ): 116 | """Fragment molecule by applying brics first, then fall back to frag. 117 | 118 | Args: 119 | mol: a molecule. 120 | remove_parent: Remove parent from the fragments. 121 | sanitize: Wether to sanitize the fragments. 122 | fix: Wether to fix the fragments. 123 | """ 124 | frags = [] 125 | try: 126 | frags = brics(mol, fix=fix, remove_parent=remove_parent, sanitize=sanitize) 127 | except Exception: 128 | pass 129 | 130 | if len(frags) == 0: 131 | frags = frag(mol, remove_parent=remove_parent, sanitize=sanitize, fix=fix) 132 | 133 | return frags 134 | 135 | 136 | def mmpa_frag( 137 | mol: dm.Mol, 138 | pattern: Optional[str] = None, 139 | max_cut: int = 1, 140 | max_bond_cut: int = 20, 141 | h_split: bool = False, 142 | ) -> Optional[Set[dm.Mol]]: 143 | """Fragment molecule on specific bonds suitable for a MMPA analysis. 144 | 145 | Args: 146 | mol: Molecule to fragment. 147 | pattern: Bond pattern to split on. Will use default rdkit pattern 148 | '[#6+0;!$(*=,#[!#6])]!@!=!#[*]' if not provided. 149 | max_cut: Number of cuts. 150 | max_bond_cut: Maximum number of bond to cut. Default to 20. 151 | h_split: Whether to split at hydrogen position too. 152 | This is equivalent to enabling the addition of new fragments. 153 | 154 | Returns: 155 | List of fragments. 156 | """ 157 | 158 | frags = [] 159 | if pattern is None: 160 | frags = rdMMPA.FragmentMol( 161 | mol, 162 | maxCuts=max_cut, 163 | resultsAsMols=False, 164 | maxCutBonds=max_bond_cut, 165 | ) 166 | elif pattern: 167 | frags = rdMMPA.FragmentMol( 168 | mol, 169 | pattern=pattern, 170 | maxCuts=max_cut, 171 | resultsAsMols=False, 172 | maxCutBonds=max_bond_cut, 173 | ) 174 | 175 | if h_split: 176 | mol = dm.add_hs(mol) 177 | frags += rdMMPA.FragmentMol( 178 | mol, 179 | pattern="[#1]!@!=!#[!#1]", 180 | maxCuts=1, 181 | resultsAsMols=False, 182 | maxCutBonds=max_bond_cut, 183 | ) 184 | return set(frags) 185 | 186 | 187 | def mmpa_cut(mol: dm.Mol, rdkit_pattern: bool = False) -> Optional[Set[Any]]: 188 | """Cut molecules to perform mmpa analysis later 189 | 190 | Args: 191 | mol: Molecule to fragment. 
192 | rdkit_pattern: Whether to perform the fragmentation 193 | using the default rdkit pattern: [#6+0;!$(*=, #[!#6])]!@!=!#[*]" 194 | 195 | Returns: 196 | List of 'smiles,core,chains' 197 | """ 198 | 199 | if mol is None: 200 | return mol 201 | 202 | outlines = set() 203 | 204 | smiles = dm.to_smiles(mol) 205 | 206 | if rdkit_pattern: 207 | frags = mmpa_frag(mol, max_cut=3, max_bond_cut=30) 208 | else: 209 | # heavy atoms 210 | frags = mmpa_frag(mol, pattern="[!#1]!@!=!#[!#1]", max_cut=4, max_bond_cut=30) 211 | frags.update(mmpa_frag(mol, pattern="[!#1]!@!=!#[!#1]", max_cut=3, max_bond_cut=30)) 212 | 213 | frags = set(frags) 214 | for core, chains in frags: 215 | output = f"{smiles},{core},{chains}\n" 216 | outlines.add(output) 217 | 218 | # hydrogen splitting 219 | mol = dm.add_hs(mol) 220 | smiles = dm.to_smiles(mol) 221 | 222 | n = mol.GetNumHeavyAtoms() 223 | if n < 60: 224 | frags = mmpa_frag(mol, pattern=None, max_cut=1, max_bond_cut=100, h_split=True) 225 | for core, chains in frags: 226 | output = f"{smiles},{core},{chains}\n" 227 | outlines.add(output) 228 | 229 | return outlines 230 | -------------------------------------------------------------------------------- /datamol/isomers/__init__.py: -------------------------------------------------------------------------------- 1 | from ._structural import IsomerEnumerator 2 | 3 | from ._enumerate import enumerate_stereoisomers 4 | from ._enumerate import enumerate_tautomers 5 | from ._enumerate import enumerate_structisomers 6 | from ._enumerate import count_stereoisomers 7 | from ._enumerate import remove_stereochemistry 8 | from ._enumerate import canonical_tautomer 9 | -------------------------------------------------------------------------------- /datamol/log.py: -------------------------------------------------------------------------------- 1 | from rdkit import RDLogger 2 | from rdkit import rdBase 3 | from functools import wraps 4 | 5 | 6 | class without_rdkit_log: 7 | """Context manager to disable RDKit logs. By default all logs are disabled. 
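The `datamol.isomers` package listed above exposes enumeration helpers; a short sketch of how they are typically called follows. It assumes the default keyword arguments of `enumerate_stereoisomers`, `enumerate_tautomers`, and `count_stereoisomers` are sufficient, and the SMILES are illustrative.

```python
import datamol as dm
from datamol.isomers import (
    count_stereoisomers,
    enumerate_stereoisomers,
    enumerate_tautomers,
)

mol = dm.to_mol("CC(O)C(N)C=O")  # two unspecified stereocenters (illustrative)

print(count_stereoisomers(mol))
stereo = enumerate_stereoisomers(mol)
print([dm.to_smiles(m) for m in stereo])

# Tautomer enumeration works the same way
tautomers = enumerate_tautomers(dm.to_mol("O=C1CCCCC1"))
print(len(tautomers))
```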
8 | 9 | Example: 10 | 11 | ```python 12 | import datamol as dm 13 | 14 | with dm.without_rdkit_log(): 15 | mol = dm.to_mol("CCCCO") # potential RDKit logs won't show 16 | ``` 17 | """ 18 | 19 | def __init__( 20 | self, 21 | mute_errors: bool = True, 22 | mute_warning: bool = True, 23 | mute_info: bool = True, 24 | mute_debug: bool = True, 25 | enable: bool = True, 26 | ): 27 | if enable is False: 28 | mute_errors = False 29 | mute_warning = False 30 | mute_info = False 31 | mute_debug = False 32 | 33 | # Get current log state 34 | self.previous_status = self._get_log_status() 35 | 36 | # Init the desired log state to apply during in the context 37 | self.desired_status = {} 38 | self.desired_status["rdApp.error"] = not mute_errors 39 | self.desired_status["rdApp.warning"] = not mute_warning 40 | self.desired_status["rdApp.debug"] = not mute_debug 41 | self.desired_status["rdApp.info"] = not mute_info 42 | 43 | def _get_log_status(self): 44 | """Get the current log status of RDKit logs.""" 45 | log_status = rdBase.LogStatus() 46 | log_status = {st.split(":")[0]: st.split(":")[1] for st in log_status.split("\n")} 47 | log_status = {k: True if v == "enabled" else False for k, v in log_status.items()} 48 | return log_status 49 | 50 | def _apply_log_status(self, log_status): 51 | """Apply an RDKit log status.""" 52 | for k, v in log_status.items(): 53 | if v is True: 54 | rdBase.EnableLog(k) 55 | else: 56 | rdBase.DisableLog(k) 57 | 58 | def __enter__(self): 59 | self._apply_log_status(self.desired_status) 60 | 61 | def __exit__(self, *args, **kwargs): 62 | self._apply_log_status(self.previous_status) 63 | 64 | 65 | def disable_rdkit_log(): 66 | """Disable all rdkit logs.""" 67 | for log_level in RDLogger._levels: 68 | rdBase.DisableLog(log_level) 69 | 70 | 71 | def enable_rdkit_log(): 72 | """Enable all rdkit logs.""" 73 | for log_level in RDLogger._levels: 74 | rdBase.EnableLog(log_level) 75 | 76 | 77 | def no_rdkit_log( 78 | func=None, 79 | *, 80 | mute_errors: bool = True, 81 | mute_warning: bool = True, 82 | mute_info: bool = True, 83 | mute_debug: bool = True, 84 | enable: bool = True, 85 | ): 86 | """Decorator to disable RDKit logs. 87 | 88 | This decorator can be used to suppress RDKit logs when executing a specific function. 89 | By default, all log levels (error, warning, info, and debug) are muted. 90 | 91 | Args: 92 | mute_errors : Whether to mute error logs (default is True). 93 | mute_warning : Whether to mute warning logs (default is True). 94 | mute_info : Whether to mute info logs (default is True). 95 | mute_debug : Whether to mute debug logs (default is True). 96 | enable: Whether to enable the log muting (default is True). If set to False, no logs will be muted. 
97 | 98 | Example: 99 | ```python 100 | @no_rdkit_log() 101 | def example_function(): 102 | # Your function code here 103 | pass 104 | 105 | example_function() # RDKit logs won't show during this function's execution 106 | ``` 107 | """ 108 | 109 | if func is None: 110 | return lambda f: no_rdkit_log( 111 | f, 112 | mute_errors=mute_errors, 113 | mute_warning=mute_warning, 114 | mute_info=mute_info, 115 | mute_debug=mute_debug, 116 | enable=enable, 117 | ) 118 | 119 | @wraps(func) 120 | def wrapper(*args, **kwargs): 121 | with without_rdkit_log(mute_errors, mute_warning, mute_info, mute_debug, enable): 122 | return func(*args, **kwargs) 123 | 124 | return wrapper 125 | -------------------------------------------------------------------------------- /datamol/mcs.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from typing import Any 3 | 4 | from rdkit.Chem import rdFMCS 5 | 6 | import datamol as dm 7 | 8 | ALLOWED_ATOM_COMPARE = ["CompareAny", "CompareAnyHeavyAtom", "CompareElements", "CompareIsotopes"] 9 | ALLOWED_BOND_COMPARE = ["CompareAny", "CompareOrder", "CompareOrderExact"] 10 | ALLOWED_RING_COMPARE = ["IgnoreRingFusion", "PermissiveRingFusion", "StrictRingFusion"] 11 | 12 | 13 | def find_mcs( 14 | mols: List[dm.Mol], 15 | maximize_bonds: bool = True, 16 | threshold: float = 0.0, 17 | timeout: int = 5, 18 | verbose: bool = False, 19 | match_valences: bool = False, 20 | ring_matches_ring_only: bool = True, 21 | complete_rings_only: bool = False, 22 | match_chiral_tag: bool = False, 23 | seed_smarts: str = "", 24 | atom_compare: str = "CompareElements", 25 | bond_compare: str = "CompareOrder", 26 | ring_compare: str = "IgnoreRingFusion", 27 | with_details: bool = False, 28 | **kwargs: Any, 29 | ): 30 | """Find the maximum common substructure from a list of molecules. 31 | 32 | Args: 33 | mols: List of molecules. 34 | maximize_bonds: Maximize the number of bonds in the substructure. 35 | threshold: The threshold for the MCS (between 0 and 1). 36 | timeout: The timeout for the MCS. 37 | verbose: Whether to enable verbose mode. 38 | match_valences: Whether to match valences. 39 | ring_matches_ring_only: Whether to match rings only. 40 | complete_rings_only: Whether to match complete rings only. 41 | match_chiral_tag: Whether to match chiral tags. 42 | seed_smarts: The seed SMARTS. 43 | atom_compare: One of "CompareAny", "CompareAnyHeavyAtom", "CompareElements", 44 | "CompareIsotopes". 45 | bond_compare: One of "CompareAny", "CompareOrder", "CompareOrderExact". 46 | ring_compare: One of "IgnoreRingFusion", "PermissiveRingFusion", "StrictRingFusion". 47 | with_details: Whether to return the RDKit MCS object or just the SMARTS string. 48 | **kwargs: Additional arguments for the MCS. 
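For reference, a minimal sketch of how `find_mcs` is typically called. The import goes through `datamol.mcs` as defined in this file; the molecules and the `timeout` value are illustrative.

```python
import datamol as dm
from datamol.mcs import find_mcs

mols = [
    dm.to_mol("c1ccccc1CC(=O)O"),
    dm.to_mol("c1ccccc1CCC(=O)O"),
    dm.to_mol("c1ccccc1OCC(=O)O"),
]

# Returns the MCS as a SMARTS string (or None if nothing is found)
smarts = find_mcs(mols, timeout=2)
print(smarts)

# With `with_details=True` the full RDKit result object is returned instead
result = find_mcs(mols, with_details=True)
print(result.numAtoms, result.numBonds)
```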
49 | """ 50 | 51 | if atom_compare not in ALLOWED_ATOM_COMPARE: 52 | raise ValueError(f"atom_compare must be one of {ALLOWED_ATOM_COMPARE}") 53 | 54 | if bond_compare not in ALLOWED_BOND_COMPARE: 55 | raise ValueError(f"bond_compare must be one of {ALLOWED_BOND_COMPARE}") 56 | 57 | if ring_compare not in ALLOWED_RING_COMPARE: 58 | raise ValueError(f"ring_compare must be one of {ALLOWED_RING_COMPARE}") 59 | 60 | args = {} 61 | args["maximizeBonds"] = maximize_bonds 62 | args["threshold"] = threshold 63 | args["timeout"] = timeout 64 | args["verbose"] = verbose 65 | args["matchValences"] = match_valences 66 | args["ringMatchesRingOnly"] = ring_matches_ring_only 67 | args["completeRingsOnly"] = complete_rings_only 68 | args["matchChiralTag"] = match_chiral_tag 69 | args["seedSmarts"] = seed_smarts 70 | args["atomCompare"] = rdFMCS.AtomCompare.names[atom_compare] 71 | args["bondCompare"] = rdFMCS.BondCompare.names[bond_compare] 72 | args["ringCompare"] = rdFMCS.RingCompare.names[ring_compare] 73 | 74 | args.update(kwargs) 75 | 76 | mcs = rdFMCS.FindMCS(mols, **args) 77 | 78 | if with_details: 79 | return mcs 80 | 81 | smarts = mcs.smartsString 82 | if smarts == "": 83 | smarts = None 84 | return smarts 85 | -------------------------------------------------------------------------------- /datamol/molar.py: -------------------------------------------------------------------------------- 1 | """A set of utility functions to convert between various units and formats used in drug discovery. 2 | """ 3 | 4 | from typing import Union 5 | from typing import Iterable 6 | 7 | import numpy as np 8 | 9 | 10 | _MOLAR_SCALES = {"M": 1, "mM": 1e-3, "uM": 1e-6, "nM": 1e-9, "pM": 1e-12, "fM": 1e-15} 11 | 12 | 13 | def molar_to_log( 14 | values: Union[float, Iterable[float], np.ndarray], 15 | unit: str, 16 | ) -> Union[float, Iterable[float], np.ndarray]: 17 | """Convert a molar concentration (XC50 for example) to its log scaled value (pXC50). 18 | 19 | Args: 20 | values: A molar concentration (can be a scalar, a list or an array). 21 | unit: The unit of the input concentration. Choose from: 22 | `{'M', 'fM', 'mM', 'nM', 'pM', 'uM'}`. 23 | """ 24 | 25 | if unit not in _MOLAR_SCALES: 26 | raise ValueError( 27 | f"The unit '{unit}' is not supported. Choose from {set(_MOLAR_SCALES.keys())}." 28 | ) 29 | 30 | return -1 * np.log10(np.array(values) * _MOLAR_SCALES[unit]) 31 | 32 | 33 | def log_to_molar( 34 | values: Union[float, Iterable[float], np.ndarray], 35 | unit: str, 36 | ) -> Union[float, Iterable[float], np.ndarray]: 37 | """Convert a log-scaled molar concentration (pXC50 for example) to its unscaled value (XC50). 38 | 39 | Args: 40 | values: A log-scaled molar concentration (can be a scalar, a list or an array). 41 | unit: The unit of the input concentration. Choose from: 42 | `{'M', 'fM', 'mM', 'nM', 'pM', 'uM'}`. 43 | """ 44 | 45 | if unit not in _MOLAR_SCALES: 46 | raise ValueError( 47 | f"The unit '{unit}' is not supported. Choose from {set(_MOLAR_SCALES.keys())}." 
48 | ) 49 | 50 | return 10 ** (-1 * np.array(values, dtype="float")) / _MOLAR_SCALES[unit] 51 | -------------------------------------------------------------------------------- /datamol/predictors/__init__.py: -------------------------------------------------------------------------------- 1 | from .esol import esol 2 | from .esol import esol_from_data 3 | -------------------------------------------------------------------------------- /datamol/predictors/esol.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | import pandas as pd 4 | 5 | 6 | from .. import Mol 7 | 8 | from ..descriptors.descriptors import clogp 9 | from ..descriptors.descriptors import mw 10 | from ..descriptors.descriptors import n_rotatable_bonds 11 | from ..descriptors.descriptors import n_aromatic_atoms_proportion 12 | 13 | 14 | _ESOL_INTERCEPT = 0.26121066137801696 15 | _ESOL_COEF = { 16 | "mw": -0.0066138847738667125, 17 | "clogp": -0.7416739523408995, 18 | "n_rotatable_bonds": 0.003451545565957996, 19 | "n_aromatic_atoms_proportion": -0.42624840441316975, 20 | } 21 | 22 | 23 | def esol(mol: Mol): 24 | """Compute the solubility descriptor ESOL. 25 | 26 | Note that the intermediate descriptors will be computed on-the-fly. If you prefer 27 | precomputing those then you can use `esol_from_data`. 28 | 29 | Source: https://github.com/PatWalters/solubility/blob/d1536c58afe5e0e7ac4c96e2ffef496d5b98664b/esol.py 30 | """ 31 | 32 | esol = ( 33 | _ESOL_INTERCEPT 34 | + _ESOL_COEF["clogp"] * clogp(mol) 35 | + _ESOL_COEF["mw"] * mw(mol) 36 | + _ESOL_COEF["n_rotatable_bonds"] * n_rotatable_bonds(mol) 37 | + _ESOL_COEF["n_aromatic_atoms_proportion"] * n_aromatic_atoms_proportion(mol) 38 | ) 39 | 40 | return esol 41 | 42 | 43 | def esol_from_data(data: Union[pd.Series, pd.DataFrame, dict]): 44 | """Compute the solubility descriptor ESOL. 45 | 46 | `data` must contains the following intermediate descriptors: 47 | 48 | - `clogp`: `dm.descriptors.clogp` 49 | - `mw`: `dm.descriptors.mw` 50 | - `n_rotatable_bonds`: `dm.descriptors.n_rotatable_bonds` 51 | - `n_aromatic_atoms_proportion`: `dm.descriptors.n_aromatic_atoms_proportion` 52 | 53 | Source: https://github.com/PatWalters/solubility/blob/d1536c58afe5e0e7ac4c96e2ffef496d5b98664b/esol.py 54 | 55 | Args: 56 | data: A dataframe or series containing the intermediate descriptors. 
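To illustrate the two entry points of the ESOL predictor defined in this file, a minimal sketch follows. The descriptor imports mirror the ones used by `esol.py` itself; the molecule is illustrative.

```python
import datamol as dm
from datamol.predictors import esol, esol_from_data
from datamol.descriptors.descriptors import (
    clogp,
    mw,
    n_rotatable_bonds,
    n_aromatic_atoms_proportion,
)

mol = dm.to_mol("CC(=O)Oc1ccccc1C(=O)O")  # aspirin (illustrative)

# Intermediate descriptors computed on the fly
print(esol(mol))

# Or from precomputed intermediate descriptors
data = {
    "clogp": clogp(mol),
    "mw": mw(mol),
    "n_rotatable_bonds": n_rotatable_bonds(mol),
    "n_aromatic_atoms_proportion": n_aromatic_atoms_proportion(mol),
}
print(esol_from_data(data))
```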
57 | """ 58 | 59 | esol = ( 60 | _ESOL_INTERCEPT 61 | + _ESOL_COEF["clogp"] * data["clogp"] 62 | + _ESOL_COEF["mw"] * data["mw"] 63 | + _ESOL_COEF["n_rotatable_bonds"] * data["n_rotatable_bonds"] 64 | + _ESOL_COEF["n_aromatic_atoms_proportion"] * data["n_aromatic_atoms_proportion"] 65 | ) 66 | 67 | return esol 68 | -------------------------------------------------------------------------------- /datamol/reactions/__init__.py: -------------------------------------------------------------------------------- 1 | from ._reactions import is_reaction_ok 2 | from ._reactions import select_reaction_output 3 | from ._reactions import apply_reaction 4 | from ._reactions import can_react 5 | from ._reactions import inverse_reaction 6 | from ._reactions import find_reactant_position 7 | from ._reactions import ATTACHING_RXN 8 | from ._reactions import rxn_from_smarts 9 | from ._reactions import rxn_to_smarts 10 | from ._reactions import rxn_from_block 11 | from ._reactions import rxn_from_block_file 12 | from ._reactions import rxn_to_block 13 | from ._reactions import rxn_to_block_file 14 | 15 | from ._attachments import add_brackets_to_attachment_points 16 | from ._attachments import convert_attach_to_isotope 17 | from ._attachments import num_attachment_points 18 | from ._attachments import open_attach_points 19 | -------------------------------------------------------------------------------- /datamol/reactions/_attachments.py: -------------------------------------------------------------------------------- 1 | from typing import cast 2 | from typing import Union 3 | 4 | import re 5 | import operator 6 | 7 | import datamol as dm 8 | from rdkit import Chem 9 | 10 | ATTACHMENT_POINT_TOKEN = "*" 11 | ATTACHMENT_POINT_NUM_REGEXP = r"\[{}:?(\d*)\]".format(re.escape(ATTACHMENT_POINT_TOKEN)) 12 | ATTACHMENT_POINT_REGEXP = r"(?:{0}|\[{0}[^\]]*\])".format(re.escape(ATTACHMENT_POINT_TOKEN)) 13 | ATTACHMENT_POINT_NO_BRACKETS_REGEXP = r"(? str: 20 | """ 21 | Adds brackets to the attachment points (if they don't have them). 22 | Example: "CC(C)CO*" to "CC(C)CO[*]" 23 | 24 | Args: 25 | smiles: A smiles string. 26 | 27 | Returns: 28 | A smiles string with brackets. 29 | """ 30 | return re.sub( 31 | ATTACHMENT_POINT_NO_BRACKETS_REGEXP, 32 | "[{}]".format(ATTACHMENT_POINT_TOKEN), 33 | smiles, 34 | ) 35 | 36 | 37 | def convert_attach_to_isotope( 38 | mol_or_smiles: Union[dm.Mol, str], 39 | same_isotope: bool = False, 40 | as_smiles: bool = False, 41 | ) -> Union[dm.Mol, str]: 42 | """Convert attachment to isotope mapping. 43 | 44 | Examples: "O=C(NCc1cnc([*])c1)[*]" to "O=C(NCc1cnc([1*])c1)[2*]" 45 | 46 | Args: 47 | mol_or_smiles: A Mol object or a smiles to be converted 48 | same_isotope: Whether convert to the same isotope. 49 | Example: "O=C(NCc1cnc([*])c1)[*]" to "O=C(NCc1cnc([1*])c1)[1*]" 50 | 51 | Returns: 52 | Converted Mol object or SMILES. 
53 | """ 54 | mol = dm.to_mol(mol_or_smiles) 55 | smiles = dm.to_smiles(mol) 56 | smiles = cast(str, smiles) 57 | 58 | smiles = add_brackets_to_attachment_points(smiles) 59 | 60 | # reg matching seems to be the most effective 61 | subs_reg = r"[\g<1>{}]" 62 | if same_isotope: 63 | subs_reg = "[1{}]" 64 | 65 | smiles = re.sub(ATTACHMENT_POINT_NUM_REGEXP, subs_reg.format(ATTACHMENT_POINT_TOKEN), smiles) 66 | 67 | if as_smiles: 68 | return smiles 69 | return dm.to_mol(smiles) 70 | 71 | 72 | def num_attachment_points(mol_or_smiles: Union[dm.Mol, str]) -> int: 73 | """ 74 | Get the number of attachment point in the 75 | 76 | Args: 77 | mol_or_smiles: A Mol object or a smiles to be converted 78 | 79 | Returns: 80 | Number of attachment points of the given molecule. 81 | """ 82 | if isinstance(mol_or_smiles, dm.Mol): 83 | mol = cast(dm.Mol, mol_or_smiles) 84 | n_points = len( 85 | [atom for atom in mol.GetAtoms() if atom.GetSymbol() == ATTACHMENT_POINT_TOKEN] 86 | ) 87 | else: 88 | n_points = len(re.findall(ATTACHMENT_POINT_REGEXP, mol_or_smiles)) 89 | 90 | return n_points 91 | 92 | 93 | def open_attach_points( 94 | mol: dm.Mol, 95 | fix_atom_map: bool = False, 96 | bond_type: dm.BondType = dm.SINGLE_BOND, 97 | ) -> dm.Mol: 98 | """Compute attachment points on a molecule. 99 | This will highlight all valid attachment point on the current molecule instead. 100 | 101 | Args: 102 | mol: A Mol object to be processed. 103 | fix_atom_map: Whether fix the atom mapping of the molecule. 104 | bond_type: The bond type to be opened. 105 | 106 | Returns: 107 | Molecule with open attachment points 108 | """ 109 | 110 | emol = Chem.rdchem.RWMol(dm.to_mol(mol)) 111 | with dm.log.without_rdkit_log(): 112 | atoms = [ 113 | (a.GetIdx(), a) 114 | for a in emol.GetAtoms() 115 | if a.GetSymbol() != ATTACHMENT_POINT_TOKEN 116 | and a.GetImplicitValence() > 0 117 | and (not a.HasProp("_protected") or a.GetProp("_protected") != "1") 118 | ] 119 | atoms.sort(reverse=True, key=operator.itemgetter(0)) 120 | 121 | for atom in atoms: 122 | new_atom = Chem.rdchem.Atom(ATTACHMENT_POINT_TOKEN) 123 | new_atom.SetAtomMapNum(1 if fix_atom_map else atom[0]) 124 | new_index = emol.AddAtom(new_atom) 125 | emol.UpdatePropertyCache(strict=False) 126 | if bond_type is not None: 127 | emol.AddBond(atom[0], new_index, bond_type) 128 | else: 129 | emol.AddBond(atom[0], new_index) 130 | 131 | mol = dm.sanitize_mol(emol) 132 | return mol 133 | -------------------------------------------------------------------------------- /datamol/scaffold/__init__.py: -------------------------------------------------------------------------------- 1 | from ._fuzzy import trim_side_chain 2 | from ._fuzzy import fuzzy_scaffolding 3 | -------------------------------------------------------------------------------- /datamol/similarity.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from typing import Optional 3 | from typing import Union 4 | from typing import Any 5 | 6 | import functools 7 | 8 | import numpy as np 9 | from sklearn.metrics import pairwise_distances_chunked 10 | from scipy.spatial import distance 11 | 12 | import datamol as dm 13 | 14 | 15 | def pdist( 16 | mols: List[Union[str, dm.Mol]], 17 | n_jobs: Optional[int] = 1, 18 | squareform: bool = True, 19 | **fp_args: Any, 20 | ) -> np.ndarray: 21 | """Compute the pairwise tanimoto distance between the fingerprints of all the 22 | molecules in the input set. 
23 | 24 | Args: 25 | mols: list of molecules 26 | n_jobs: Number of jobs for parallelization. Let to 1 for no 27 | parallelization. Set to -1 to use all available cores. 28 | squareform: Whether to return in square form (matrix) or in a condensed 29 | form (1D vector). 30 | **fp_args: list of args to pass to `to_fp()`. 31 | 32 | Returns: 33 | dist_mat 34 | """ 35 | 36 | fps = dm.parallelized( 37 | functools.partial(dm.to_fp, as_array=True, **fp_args), 38 | mols, 39 | n_jobs=n_jobs, 40 | ) 41 | 42 | fps_array = np.array(fps) 43 | 44 | dist_mat = distance.pdist(fps_array, metric="jaccard") 45 | 46 | if squareform: 47 | dist_mat = distance.squareform(dist_mat, force="tomatrix") 48 | 49 | return dist_mat 50 | 51 | 52 | def cdist( 53 | mols1: List[Union[str, dm.Mol]], 54 | mols2: List[Union[str, dm.Mol]], 55 | n_jobs: Optional[int] = 1, 56 | distances_chunk: bool = False, 57 | distances_chunk_memory: int = 1024, 58 | distances_n_jobs: int = -1, 59 | **fp_args: Any, 60 | ) -> np.ndarray: 61 | """Compute the tanimoto distance between the fingerprints of each pair of 62 | molecules of the two collections of inputs. 63 | 64 | Args: 65 | mols1: list of molecules. 66 | mols2: list of molecules. 67 | n_jobs: Number of jobs for fingerprint computation. Let to 1 for no 68 | parallelization. Set to -1 to use all available cores. 69 | distances_chunk: Whether to use chunked computation. 70 | distances_chunk_memory: Memory size in MB to use for chunked computation. 71 | distances_n_jobs: Number of jobs for parallelization. 72 | **fp_args: list of args to pass to `to_fp()`. 73 | 74 | Returns: 75 | distmat 76 | """ 77 | 78 | fps1 = dm.parallelized( 79 | functools.partial(dm.to_fp, as_array=True, **fp_args), 80 | mols1, 81 | n_jobs=n_jobs, 82 | ) 83 | 84 | fps2 = dm.parallelized( 85 | functools.partial(dm.to_fp, as_array=True, **fp_args), 86 | mols2, 87 | n_jobs=n_jobs, 88 | ) 89 | 90 | fps1_array = np.array(fps1).astype(bool) 91 | fps2_array = np.array(fps2).astype(bool) 92 | 93 | if distances_chunk: 94 | distances = pairwise_distances_chunked( 95 | fps1_array, 96 | fps2_array, 97 | metric="jaccard", 98 | n_jobs=distances_n_jobs, 99 | working_memory=distances_chunk_memory, 100 | ) 101 | distances_array = np.vstack(list(distances)) 102 | else: 103 | distances_array = distance.cdist(fps1_array, fps2_array, metric="jaccard") 104 | 105 | return distances_array 106 | -------------------------------------------------------------------------------- /datamol/types.py: -------------------------------------------------------------------------------- 1 | # NOTE(hadim): typing_extensions can be replaced by typing once we drop support for Python 3.9. 2 | from typing_extensions import TypeAlias 3 | from typing import Union 4 | from typing import Tuple 5 | 6 | from rdkit import Chem 7 | from rdkit.Chem import rdChemReactions 8 | 9 | Mol: TypeAlias = Chem.rdchem.Mol 10 | BondType: TypeAlias = Chem.rdchem.BondType 11 | ChemicalReaction: TypeAlias = rdChemReactions.ChemicalReaction 12 | Atom: TypeAlias = Chem.rdchem.Atom 13 | Bond: TypeAlias = Chem.rdchem.Bond 14 | 15 | RDKitColor = Union[Tuple[float, float, float, float], Tuple[float, float, float]] 16 | DatamolColor = Union[RDKitColor, str] 17 | -------------------------------------------------------------------------------- /datamol/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .jobs import JobRunner 2 | from .jobs import parallelized 3 | from .jobs import parallelized_with_batches 4 | 5 | from . 
import fs 6 | from . import perf 7 | 8 | from . import decorators 9 | -------------------------------------------------------------------------------- /datamol/utils/decorators.py: -------------------------------------------------------------------------------- 1 | from typing import Callable 2 | from typing import List 3 | from typing import Union 4 | 5 | import platform 6 | from functools import wraps 7 | 8 | 9 | def disable_on_os(os_names: Union[str, List[str]]): 10 | """A decorator to disable a function raising an error if the OS detected is not supported. 11 | 12 | Args: 13 | os_names: OS names to disable this function. Valid OS names are: `["linux", "osx", "win"]`. 14 | """ 15 | 16 | if isinstance(os_names, str): 17 | os_names = [os_names] 18 | 19 | valid_os_names = [] 20 | for os_name in os_names: 21 | if os_name == "linux": 22 | valid_os_names.append("Linux") 23 | elif os_name == "win": 24 | valid_os_names.append("Windows") 25 | elif os_name == "osx": 26 | valid_os_names.append("Darwin") 27 | else: 28 | valid_os_names.append(os_name) 29 | 30 | def real_decorator(function: Callable): 31 | @wraps(function) 32 | def wrapper(*args, **kwargs): 33 | if platform.system() not in valid_os_names: 34 | retval = function(*args, **kwargs) 35 | return retval 36 | else: 37 | raise NotImplementedError( 38 | f"The function {function.__name__} is not supported" 39 | f" for the platform '{platform.system()}'." 40 | ) 41 | 42 | return wrapper 43 | 44 | return real_decorator 45 | -------------------------------------------------------------------------------- /datamol/utils/perf.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | from loguru import logger 4 | 5 | 6 | duration_intervals = ( 7 | ("weeks", 604800), # 60 * 60 * 24 * 7 8 | ("days", 86400), # 60 * 60 * 24 9 | ("h", 3600), # 60 * 60 10 | ("min", 60), 11 | ("s", 1), 12 | ("ms", 1e-3), 13 | ("us", 1e-6), 14 | ) 15 | 16 | 17 | def human_duration(seconds: float, granularity: int = 1): 18 | # NOTE(hadim): far from being perfect. 19 | 20 | result = [] 21 | duration: float = seconds 22 | for name, count in duration_intervals: 23 | value = duration // count 24 | if value: 25 | duration -= value * count 26 | result.append(f"{value:.0f}{name}") 27 | return ", ".join(result[:granularity]) 28 | 29 | 30 | class watch_duration: 31 | """A Python decorator to measure execution time with logging capability. 32 | 33 | Args: 34 | log: Whether to log the measured duration. 35 | log_human_duration: Whether to log duration in a human way 36 | depending on the amount. 
37 | 38 | Example: 39 | 40 | ```python 41 | def fn(n): 42 | for i in range(n): 43 | print(i) 44 | time.sleep(0.2) 45 | 46 | with dm.utils.perf.watch_duration(log=True) as w: 47 | fn(5) 48 | 49 | print(w.duration) 50 | ``` 51 | """ 52 | 53 | def __init__(self, log: bool = True, log_human_duration: bool = True): 54 | self.log = log 55 | self.log_human_duration = log_human_duration 56 | 57 | self.start = None 58 | self.end = None 59 | self.duration = None 60 | self.duration_minutes = None 61 | 62 | def __enter__(self): 63 | self.start = time.time() 64 | return self 65 | 66 | def __exit__(self, *_): 67 | assert self.start is not None 68 | 69 | self.end = time.time() 70 | self.duration = self.end - self.start 71 | self.duration_minutes = self.duration / 60 72 | 73 | if self.log: 74 | if self.log_human_duration: 75 | logger.info(f"Duration {human_duration(self.duration)}.") 76 | else: 77 | logger.info(f"Duration {self.duration_minutes:.2f} minutes") 78 | -------------------------------------------------------------------------------- /datamol/utils/testing.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from typing import Optional 3 | from typing import Union 4 | 5 | import functools 6 | 7 | import numpy as np 8 | from scipy.spatial import distance 9 | 10 | from rdkit import Chem 11 | from rdkit.DataManip.Metric import GetTanimotoDistMat # type: ignore 12 | from rdkit.DataStructs.cDataStructs import TanimotoSimilarity 13 | 14 | import datamol as dm 15 | 16 | 17 | def pdist_rdkit( 18 | mols: List[Union[str, Chem.rdchem.Mol]], 19 | n_jobs: Optional[int] = 1, 20 | squareform: bool = True, 21 | **fp_args, 22 | ) -> np.ndarray: 23 | """Equivalent to `dm.similarity.pdist` but uses the RDKit API. 24 | 25 | Important: 26 | This function is only used for testing and shoult not be used in production. 27 | """ 28 | 29 | fps = dm.parallelized( 30 | functools.partial(dm.to_fp, as_array=False, **fp_args), 31 | mols, 32 | n_jobs=n_jobs, 33 | ) 34 | 35 | fps = list(fps) # type: ignore 36 | 37 | dist = GetTanimotoDistMat(fps) 38 | 39 | # Put in squareform: `scipy.spatial.distance.squareform` is incompatible with RDKit returned vector. 40 | dist_mat = np.zeros((len(fps), len(fps))) 41 | dist_mat[np.tril_indices_from(dist_mat, -1)] = dist 42 | dist_mat += dist_mat.T 43 | 44 | if not squareform: 45 | dist_mat = distance.squareform(dist_mat, force="tovector") 46 | 47 | return dist_mat 48 | 49 | 50 | def cdist_rdkit( 51 | mols1: List[Union[str, Chem.rdchem.Mol]], 52 | mols2: List[Union[str, Chem.rdchem.Mol]], 53 | n_jobs: Optional[int] = 1, 54 | **fp_args, 55 | ) -> np.ndarray: 56 | """Equivalent to `dm.similarity.cdist` but uses the RDKit API. 57 | 58 | Important: 59 | This function is only used for testing and shoult not be used in production. 
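Since these helpers exist to cross-check `datamol.similarity`, a typical test-style comparison might look like the sketch below. It assumes `datamol.utils.testing` is importable in the installed package; the molecules are illustrative.

```python
import numpy as np
import datamol as dm
from datamol.similarity import pdist
from datamol.utils.testing import pdist_rdkit

mols = [dm.to_mol(s) for s in ["CCO", "CCN", "c1ccccc1"]]

d_scipy = pdist(mols, squareform=True)
d_rdkit = pdist_rdkit(mols, squareform=True)

# Both code paths should produce the same Tanimoto/Jaccard distance matrix
assert np.allclose(d_scipy, d_rdkit)
```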
60 | """ 61 | 62 | fps1 = dm.parallelized( 63 | functools.partial(dm.to_fp, as_array=False, **fp_args), 64 | mols1, 65 | n_jobs=n_jobs, 66 | ) 67 | 68 | fps2 = dm.parallelized( 69 | functools.partial(dm.to_fp, as_array=False, **fp_args), 70 | mols2, 71 | n_jobs=n_jobs, 72 | ) 73 | 74 | fps1 = list(fps1) # type: ignore 75 | fps2 = list(fps2) # type: ignore 76 | 77 | dist_mat = np.zeros((len(fps1), len(fps2))) 78 | for i in range(len(fps1)): 79 | for j in range(len(fps2)): 80 | d = 1 - TanimotoSimilarity(fps1[i], fps2[j]) 81 | dist_mat[i, j] = d 82 | 83 | return dist_mat 84 | -------------------------------------------------------------------------------- /datamol/viz/__init__.py: -------------------------------------------------------------------------------- 1 | from . import utils 2 | 3 | from ._viz import to_image 4 | 5 | from ._substructure import match_substructure 6 | 7 | from ._conformers import conformers 8 | 9 | from ._circle_grid import circle_grid 10 | from ._circle_grid import MolsCircleGrid 11 | 12 | from ._lasso_highlight import lasso_highlight_image 13 | -------------------------------------------------------------------------------- /datamol/viz/_conformers.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | from typing import List 3 | from typing import Optional 4 | 5 | import copy 6 | import itertools 7 | 8 | from rdkit import Chem 9 | from rdkit.Chem import rdMolAlign 10 | 11 | 12 | def _get_nglview(): 13 | try: 14 | import nglview as nv 15 | 16 | return nv 17 | except ImportError: 18 | raise ImportError("You must install nglview from https://github.com/nglviewer/nglview.") 19 | 20 | 21 | def _get_ipywidgets(): 22 | try: 23 | import ipywidgets as widgets 24 | 25 | return widgets 26 | except ImportError: 27 | raise ImportError( 28 | "You must install ipywidgets from https://github.com/jupyter-widgets/ipywidgets/." 29 | ) 30 | 31 | 32 | def conformers( 33 | mol: Chem.rdchem.Mol, 34 | conf_id: int = -1, 35 | n_confs: Optional[Union[int, List[int]]] = None, 36 | align_conf: bool = True, 37 | n_cols: int = 3, 38 | sync_views: bool = True, 39 | remove_hs: bool = True, 40 | width: str = "auto", 41 | ): 42 | """Visualize the conformer(s) of a molecule. 43 | 44 | Args: 45 | mol: a molecule. 46 | conf_id: The ID of the conformer to show. -1 shows 47 | the first conformer. Only works if `n_confs` is None. 48 | n_confs: Can be a number of conformers 49 | to shows or a list of conformer indices. When None, only the first 50 | conformer is displayed. When -1, show all conformers. 51 | align_conf: Whether to align conformers together. 52 | n_cols: Number of columns. Defaults to 3. 53 | sync_views: Wether to sync the multiple views. 54 | remove_hs: Wether to remove the hydrogens of the conformers. 55 | width: The width of the returned view. Defaults to "auto". 56 | """ 57 | 58 | widgets = _get_ipywidgets() 59 | nv = _get_nglview() 60 | 61 | if mol.GetNumConformers() == 0: 62 | raise ValueError( 63 | "The molecule has 0 conformers. You can generate conformers with `dm.conformers.generate(mol)`." 
64 | ) 65 | 66 | # Clone the molecule 67 | mol = copy.deepcopy(mol) 68 | 69 | if remove_hs: 70 | mol = Chem.RemoveHs(mol) # type: ignore 71 | else: 72 | mol = Chem.AddHs(mol) # type: ignore 73 | 74 | if n_confs is None: 75 | return nv.show_rdkit(mol, conf_id=conf_id) 76 | 77 | # If n_confs is int, convert to list of conformer IDs 78 | if n_confs == -1: 79 | n_confs = [conf.GetId() for conf in mol.GetConformers()] 80 | elif isinstance(n_confs, int): 81 | if n_confs > mol.GetNumConformers(): 82 | n_confs = mol.GetNumConformers() 83 | n_confs = list(range(n_confs)) # type: ignore 84 | 85 | if align_conf: 86 | rdMolAlign.AlignMolConformers(mol, confIds=n_confs) 87 | 88 | # Get number of rows 89 | n_rows = len(n_confs) // n_cols 90 | n_rows += 1 if (len(n_confs) % n_cols) > 0 else 0 91 | 92 | # Create a grid 93 | grid = widgets.GridspecLayout(n_rows, n_cols) # type: ignore 94 | 95 | # Create and add views to the grid. 96 | widget_coords = itertools.product(range(n_rows), range(n_cols)) 97 | views = [] 98 | for i, (conf_id, (x, y)) in enumerate(zip(n_confs, widget_coords)): 99 | view = nv.show_rdkit(mol, conf_id=conf_id) 100 | view.layout.width = width 101 | view.layout.align_self = "stretch" 102 | grid[x, y] = view 103 | views.append(view) 104 | 105 | # Sync views 106 | if sync_views: 107 | for view in views: 108 | view._set_sync_camera(views) 109 | 110 | return grid 111 | -------------------------------------------------------------------------------- /datamol/viz/_substructure.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | from typing import Union 3 | from typing import List 4 | 5 | import datamol as dm 6 | 7 | from ._viz import to_image 8 | 9 | 10 | def match_substructure( 11 | mols: Union[List[dm.Mol], dm.Mol], 12 | queries: Union[List[dm.Mol], dm.Mol], 13 | highlight_bonds: bool = True, 14 | copy: bool = True, 15 | **kwargs: Any, 16 | ): 17 | """Generate an image of molecule(s) with substructure matches for a given 18 | pattern or substructure. 19 | 20 | Args: 21 | mols: One or more molecules. 22 | queries: One or more queries. 23 | highlight_bonds: Whether to also highlight the bonds matching the patterns. 24 | copy: Whether to copy the molecules and the queries. 25 | kwargs: Other kwargs passed to `dm.viz.to_image`. 26 | """ 27 | 28 | # NOTE(hadim): `MolsToGridImage` used in `to_image` can't use a list of list of indices 29 | # for every molecules so it's not really possible to have different colors for different 30 | # matches in the same molecules. 31 | # In the future, we will implement our custom `MolsToGridImage` in order to have more controls 32 | # on the colors used. 33 | # For the same reason, we don't bother about colors here. 
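For reference, a minimal sketch of calling `match_substructure`; the molecules, the SMARTS query, and the `mol_size` value are illustrative, and any extra keyword is forwarded to `to_image`.

```python
import datamol as dm
from datamol.viz import match_substructure

mols = [
    dm.to_mol("CC(=O)Nc1ccc(O)cc1"),
    dm.to_mol("CC(=O)Oc1ccccc1C(=O)O"),
]
query = dm.from_smarts("C(=O)")

# Returns an image (SVG by default) with matching atoms and bonds highlighted
image = match_substructure(mols, queries=query, mol_size=250)
```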
34 | 35 | if isinstance(mols, dm.Mol): 36 | mols = [mols] 37 | 38 | if isinstance(queries, dm.Mol): 39 | queries = [queries] 40 | 41 | # Copy mols and patterns 42 | if copy: 43 | mols = [dm.copy_mol(mol) for mol in mols] 44 | queries = [dm.copy_mol(mol) for mol in queries] 45 | 46 | all_atom_indices = [] 47 | all_bond_indices = [] 48 | 49 | for mol in mols: 50 | atom_indices = [] 51 | bond_indices = [] 52 | 53 | for query in queries: 54 | if highlight_bonds: 55 | atom_matches, bond_matches = dm.substructure_matching_bonds(mol, query) 56 | atom_indices += atom_matches 57 | bond_indices += bond_matches 58 | else: 59 | atom_indices += list(mol.GetSubstructMatches(query, uniquify=True)) # type: ignore 60 | bond_indices += [] 61 | 62 | # NOTE(hadim): we must flatten the atom/bond indices, since `MolsToGridImage` 63 | # don't accept multiple list of indices for every single molecule. 64 | bond_indices = [item for sublist in bond_indices for item in sublist] 65 | atom_indices = [item for sublist in atom_indices for item in sublist] 66 | 67 | all_atom_indices.append(atom_indices) 68 | all_bond_indices.append(bond_indices) 69 | 70 | image = to_image( 71 | mols, 72 | highlight_atom=all_atom_indices, 73 | highlight_bond=all_bond_indices, 74 | **kwargs, 75 | ) 76 | 77 | return image 78 | -------------------------------------------------------------------------------- /datamol/viz/_viz.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | from typing import List 3 | from typing import Tuple 4 | from typing import Optional 5 | from typing import Any 6 | from loguru import logger 7 | 8 | from rdkit.Chem import Draw 9 | 10 | import datamol as dm 11 | 12 | from .utils import prepare_mol_for_drawing 13 | from .utils import image_to_file 14 | 15 | 16 | def to_image( 17 | mols: Union[List[Union[dm.Mol, str]], dm.Mol, str], 18 | legends: Union[List[Union[str, None]], str, None] = None, 19 | n_cols: int = 4, 20 | use_svg: bool = True, 21 | mol_size: Union[Tuple[int, int], int] = (300, 300), 22 | highlight_atom: Optional[List[List[int]]] = None, 23 | highlight_bond: Optional[List[List[int]]] = None, 24 | outfile: Optional[str] = None, 25 | max_mols: int = 32, 26 | max_mols_ipython: int = 50, 27 | copy: bool = True, 28 | indices: bool = False, 29 | bond_indices: bool = False, 30 | bond_line_width: int = 2, 31 | stereo_annotations: bool = True, 32 | legend_fontsize: int = 16, 33 | kekulize: bool = True, 34 | align: Union[dm.Mol, str, bool] = False, 35 | **kwargs: Any, 36 | ): 37 | """Generate an image out of a molecule or a list of molecules. 38 | 39 | Args: 40 | mols: One or a list of molecules. 41 | legends: A string or a list of string as legend for every molecules. 42 | n_cols: Number of molecules per column. 43 | use_svg: Whether to ouput an SVG (or a PNG). 44 | mol_size: A int or a tuple of int defining the size per molecule. 45 | highlight_atom: the atoms to highlight. 46 | highlight_bond: The bonds to highlight. 47 | outfile: Path where to save the image (local or remote path). 48 | max_mols: The maximum number of molecules to display. 49 | max_mols_ipython: The maximum number of molecules to display when running within an IPython environment. 50 | copy: Whether to copy the molecules or not. 51 | indices: Whether to draw the atom indices. 52 | bond_indices: Whether to draw the bond indices. 53 | bond_line_width: The width of the bond lines. 54 | legend_fontsize: Font size for the legend. 55 | kekulize: Run kekulization routine on molecules. 
Skipped if fails. 56 | align: Whether to align the 2D coordinates of the molecules. 57 | - If set to True, align all molecules with `dm.align.auto_align_many()`. 58 | - If set to a molecule, it is used as a template for alignment with `dm.align.template_align()`. 59 | - If set to False, no alignment is performed. 60 | For a more custom alignment, we suggest using directly the module `dm.align` instead. 61 | **kwargs: Additional arguments to pass to the drawing function. See RDKit 62 | documentation related to `MolDrawOptions` for more details at 63 | https://www.rdkit.org/docs/source/rdkit.Chem.Draw.rdMolDraw2D.html. 64 | """ 65 | 66 | if isinstance(mol_size, int): 67 | mol_size = (mol_size, mol_size) 68 | 69 | if isinstance(mols, (dm.Mol, str)): 70 | mols = [mols] 71 | 72 | # Convert smiles to molecules if strings are provided as input for API consistency 73 | mols = mols[:] # avoid in place modification 74 | for i in range(len(mols)): 75 | if isinstance(mols[i], str): 76 | mols[i] = dm.to_mol(mols[i]) 77 | 78 | if isinstance(legends, str): 79 | legends = [legends] 80 | 81 | if copy: 82 | mols = [dm.copy_mol(mol) for mol in mols] 83 | 84 | if max_mols is not None: 85 | mols = mols[:max_mols] 86 | 87 | if legends is not None: 88 | legends = legends[:max_mols] 89 | 90 | # Whether to align the molecules 91 | if isinstance(align, (dm.Mol, str)): 92 | mols = [dm.align.template_align(mol, template=align) for mol in mols] 93 | elif align is True: 94 | mols = dm.align.auto_align_many(mols) 95 | 96 | # Prepare molecules before drawing 97 | mols = [prepare_mol_for_drawing(mol, kekulize=kekulize) for mol in mols] 98 | 99 | _highlight_atom = highlight_atom 100 | if highlight_atom is not None and isinstance(highlight_atom[0], int): 101 | _highlight_atom = [highlight_atom] 102 | 103 | _highlight_bond = highlight_bond 104 | if highlight_bond is not None and isinstance(highlight_bond[0], int): 105 | _highlight_bond = [highlight_bond] 106 | 107 | # Don't make the image bigger than it 108 | if len(mols) < n_cols: 109 | n_cols = len(mols) 110 | 111 | draw_options = Draw.rdMolDraw2D.MolDrawOptions() 112 | draw_options.legendFontSize = legend_fontsize 113 | draw_options.addAtomIndices = indices 114 | draw_options.addBondIndices = bond_indices 115 | draw_options.addStereoAnnotation = stereo_annotations 116 | draw_options.bondLineWidth = bond_line_width 117 | 118 | # Add the custom drawing options. 119 | _kwargs = {} 120 | for k, v in kwargs.items(): 121 | if hasattr(draw_options, k): 122 | setattr(draw_options, k, v) 123 | else: 124 | _kwargs[k] = v 125 | 126 | # Check if we are in a Jupyter notebook or IPython display context 127 | # If so, conditionally add the maxMols argument 128 | in_notebook = dm.viz.utils.is_ipython_session() 129 | 130 | if in_notebook: 131 | _kwargs["maxMols"] = max_mols_ipython 132 | if max_mols > max_mols_ipython: 133 | logger.warning( 134 | f"You have set max_mols to {max_mols}, which is higher than max_mols_ipython ({max_mols_ipython}). " 135 | "Consider increasing max_mols_ipython if you want to display all molecules in an IPython environment." 
136 | ) 137 | 138 | image = Draw.MolsToGridImage( 139 | mols, 140 | legends=legends, 141 | molsPerRow=n_cols, 142 | useSVG=use_svg, 143 | subImgSize=mol_size, 144 | highlightAtomLists=_highlight_atom, 145 | highlightBondLists=_highlight_bond, 146 | drawOptions=draw_options, 147 | **_kwargs, 148 | ) 149 | 150 | if outfile is not None: 151 | image_to_file(image, outfile, as_svg=use_svg) 152 | return image 153 | -------------------------------------------------------------------------------- /datamol/viz/utils.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | from typing import Union 3 | 4 | import io 5 | import fsspec 6 | 7 | from rdkit.Chem import Draw 8 | from matplotlib import colors as mcolors 9 | 10 | import PIL.Image 11 | import PIL.PngImagePlugin 12 | 13 | import datamol as dm 14 | 15 | from datamol.types import RDKitColor 16 | from datamol.types import DatamolColor 17 | 18 | 19 | def prepare_mol_for_drawing(mol: Optional[dm.Mol], kekulize: bool = True) -> Optional[dm.Mol]: 20 | """Prepare the molecule before drawing to avoid any error due to unsanitized molecule 21 | or incorrect valence or aromaticity. 22 | 23 | Code is inspired from `rdkit.Chem.Draw._moltoimg`. 24 | 25 | Args: 26 | mol: A molecule to prepare. If set to None, the function will return None. 27 | kekulize: Whether to kekulize the molecule. 28 | """ 29 | 30 | if mol is None: 31 | return None 32 | 33 | try: 34 | with dm.without_rdkit_log(): 35 | # Check for implicit and explicit valence 36 | if mol.NeedsUpdatePropertyCache(): # type: ignore 37 | mol.UpdatePropertyCache(False) # type: ignore 38 | 39 | # Check for aromaticity 40 | if dm.is_lower_than_current_rdkit_version("2022.09"): 41 | _kekulize = Draw._okToKekulizeMol(mol, kekulize) # type: ignore 42 | else: 43 | _kekulize = Draw.shouldKekulize(mol, kekulize) 44 | 45 | # Run the rdkit preparation procedure 46 | _mol = Draw.rdMolDraw2D.PrepareMolForDrawing(mol, kekulize=_kekulize) 47 | 48 | except ValueError: # <- can happen on a kekulization failure 49 | # Run the rdkit preparation procedure with kekulize set to `False` 50 | _mol = Draw.rdMolDraw2D.PrepareMolForDrawing(mol, kekulize=False) 51 | 52 | return _mol 53 | 54 | 55 | def is_ipython_session() -> bool: 56 | try: 57 | kernel_name = get_ipython().__class__.__name__ # noqa: F821 # type: ignore 58 | module_name = get_ipython().__class__.__module__ # noqa: F821 # type: ignore 59 | 60 | if kernel_name == "ZMQInteractiveShell" or module_name == "google.colab._shell": 61 | return True 62 | except Exception: 63 | pass 64 | 65 | return False 66 | 67 | 68 | def drawer_to_image(drawer: Draw.rdMolDraw2D.MolDraw2D): 69 | """Convert an RDkit drawer to an image. The image can be either a PNG or SVG depending on the 70 | drawer class. The returned image type will depends whether the Python session is an IPython one or not. 71 | 72 | This function matches the behavior of `datamol.to_image` and `rdkit.Chem.Draw.MolDraw2DToImage`. 73 | 74 | Args: 75 | drawer: An RDkit drawer. 76 | 77 | Returns: 78 | An image: either PNG or SVG depending on the drawer class. If within an IPython sessions, 79 | IPython display objects are returned. 
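Since `drawer_to_image` below mirrors the behavior of `datamol.to_image`, here is a minimal sketch of the latter; the SMILES, legends, and output path are illustrative.

```python
import datamol as dm

smiles = ["CCO", "c1ccccc1", "CC(=O)Oc1ccccc1C(=O)O"]
mols = [dm.to_mol(s) for s in smiles]

# Grid image with legends, auto-aligned 2D depictions, written to an SVG file
dm.to_image(
    mols,
    legends=smiles,
    n_cols=3,
    mol_size=200,
    align=True,
    outfile="molecules_grid.svg",
)
```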
80 | """ 81 | 82 | is_svg = isinstance(drawer, Draw.rdMolDraw2D.MolDraw2DSVG) 83 | 84 | if is_ipython_session(): 85 | if is_svg: 86 | from IPython.core.display import SVG 87 | 88 | return SVG(drawer.GetDrawingText()) 89 | else: 90 | from IPython.core.display import Image 91 | 92 | return Image(drawer.GetDrawingText()) 93 | else: 94 | if is_svg: 95 | return drawer.GetDrawingText() 96 | else: 97 | from PIL import Image 98 | 99 | return Image.open(io.BytesIO(drawer.GetDrawingText())) 100 | 101 | 102 | def image_to_file( 103 | image: Union[ 104 | str, 105 | PIL.PngImagePlugin.PngImageFile, 106 | bytes, 107 | PIL.Image.Image, 108 | ], 109 | outfile, 110 | as_svg: bool = False, 111 | ): 112 | """Save image to file. The image can be either a PNG or SVG depending 113 | 114 | Args: 115 | image: Image to save to a file 116 | outfile: Path to the output file where to save the image 117 | as_svg: Whether the image is an SVG or not 118 | """ 119 | 120 | with fsspec.open(outfile, "wb") as f: 121 | if as_svg: 122 | if isinstance(image, str): 123 | # in a terminal process 124 | f.write(image.encode()) # type: ignore 125 | else: 126 | # in a jupyter kernel process 127 | f.write(image.data.encode()) # type: ignore 128 | else: 129 | if isinstance(image, PIL.PngImagePlugin.PngImageFile): # type: ignore 130 | # in a terminal process 131 | image.save(f) # type: ignore 132 | else: 133 | # in a jupyter kernel process 134 | f.write(image.data) # type: ignore 135 | 136 | 137 | def to_rdkit_color(color: Optional[DatamolColor]) -> Optional[RDKitColor]: 138 | """If required convert a datamol color (rgb, rgba or hex string) to an RDKit 139 | color (rgb or rgba). 140 | 141 | Args: 142 | color: A datamol color: hex, rgb, rgba or None. 143 | """ 144 | if color is None: 145 | return None 146 | 147 | if isinstance(color, str): 148 | return mcolors.to_rgba(color) # type: ignore 149 | if isinstance(color, (tuple, list)) and len(color) in [3, 4] and any(x > 1 for x in color): 150 | return tuple(x / 255 if i < 3 else x for i, x in enumerate(color)) 151 | 152 | return color 153 | -------------------------------------------------------------------------------- /docs/CNAME: -------------------------------------------------------------------------------- 1 | docs.datamol.io 2 | -------------------------------------------------------------------------------- /docs/api/datamol.align.md: -------------------------------------------------------------------------------- 1 | # `datamol.align` 2 | 3 | ::: datamol.align 4 | -------------------------------------------------------------------------------- /docs/api/datamol.cluster.md: -------------------------------------------------------------------------------- 1 | # `datamol.cluster` 2 | 3 | ::: datamol.cluster 4 | -------------------------------------------------------------------------------- /docs/api/datamol.conformers.md: -------------------------------------------------------------------------------- 1 | # `datamol.conformers` 2 | 3 | ::: datamol.conformers._conformers 4 | ::: datamol.conformers._features 5 | -------------------------------------------------------------------------------- /docs/api/datamol.convert.md: -------------------------------------------------------------------------------- 1 | # `datamol.convert` 2 | 3 | ::: datamol.convert 4 | -------------------------------------------------------------------------------- /docs/api/datamol.data.md: -------------------------------------------------------------------------------- 1 | # `datamol.data` 2 | 3 | ::: datamol.data 
4 | -------------------------------------------------------------------------------- /docs/api/datamol.descriptors.md: -------------------------------------------------------------------------------- 1 | # `datamol.descriptors` 2 | 3 | ::: datamol.descriptors.descriptors 4 | ::: datamol.descriptors.compute 5 | -------------------------------------------------------------------------------- /docs/api/datamol.fp.md: -------------------------------------------------------------------------------- 1 | # `datamol.fp` 2 | 3 | ::: datamol.fp 4 | -------------------------------------------------------------------------------- /docs/api/datamol.fragment.md: -------------------------------------------------------------------------------- 1 | # `datamol.fragment` 2 | 3 | ::: datamol.fragment._fragment 4 | ::: datamol.fragment._assemble 5 | -------------------------------------------------------------------------------- /docs/api/datamol.graph.md: -------------------------------------------------------------------------------- 1 | # `datamol.graph` 2 | 3 | ::: datamol.graph 4 | -------------------------------------------------------------------------------- /docs/api/datamol.io.md: -------------------------------------------------------------------------------- 1 | # `datamol.io` 2 | 3 | ::: datamol.io 4 | -------------------------------------------------------------------------------- /docs/api/datamol.isomers.md: -------------------------------------------------------------------------------- 1 | # `datamol.isomers` 2 | 3 | ::: datamol.isomers._enumerate 4 | ::: datamol.isomers._structural 5 | -------------------------------------------------------------------------------- /docs/api/datamol.log.md: -------------------------------------------------------------------------------- 1 | # `datamol.log` 2 | 3 | ::: datamol.log 4 | -------------------------------------------------------------------------------- /docs/api/datamol.mol.md: -------------------------------------------------------------------------------- 1 | # `datamol.mol` 2 | 3 | ::: datamol.mol 4 | -------------------------------------------------------------------------------- /docs/api/datamol.molar.md: -------------------------------------------------------------------------------- 1 | # `datamol.molar` 2 | 3 | ::: datamol.molar 4 | -------------------------------------------------------------------------------- /docs/api/datamol.reactions.md: -------------------------------------------------------------------------------- 1 | # `datamol.reactions` 2 | 3 | ::: datamol.reactions._reactions 4 | ::: datamol.reactions._attachments 5 | -------------------------------------------------------------------------------- /docs/api/datamol.scaffold.md: -------------------------------------------------------------------------------- 1 | # `datamol.scaffold` 2 | 3 | ::: datamol.scaffold._fuzzy 4 | -------------------------------------------------------------------------------- /docs/api/datamol.similarity.md: -------------------------------------------------------------------------------- 1 | # `datamol.similarity` 2 | 3 | ::: datamol.similarity 4 | -------------------------------------------------------------------------------- /docs/api/datamol.utils.fs.md: -------------------------------------------------------------------------------- 1 | # `datamol.utils.fs` 2 | 3 | ::: datamol.utils.fs 4 | 5 | -------------------------------------------------------------------------------- /docs/api/datamol.utils.md: 
-------------------------------------------------------------------------------- 1 | # `datamol.utils` 2 | 3 | ::: datamol.utils.decorators 4 | ::: datamol.utils.jobs 5 | ::: datamol.utils.perf 6 | -------------------------------------------------------------------------------- /docs/api/datamol.viz.md: -------------------------------------------------------------------------------- 1 | # `datamol.viz` 2 | 3 | ## Vizualize molecule in 2D or 3D 4 | 5 | ::: datamol.viz.to_image 6 | ::: datamol.viz.conformers 7 | 8 | ## Specific plotting functions 9 | 10 | ::: datamol.viz.MolsCircleGrid 11 | ::: datamol.viz.circle_grid 12 | 13 | ## Vizualize 2D molecule with highlighted substructures 14 | 15 | ::: datamol.viz.lasso_highlight_image 16 | -------------------------------------------------------------------------------- /docs/assets/css/custom-datamol.css: -------------------------------------------------------------------------------- 1 | :root { 2 | --datamol-primary: #F89D4C; 3 | --datamol-secondary: #343a40; 4 | 5 | /* Primary color shades */ 6 | --md-primary-fg-color: var(--datamol-primary); 7 | --md-primary-fg-color--light: var(--datamol-primary); 8 | --md-primary-fg-color--dark: var(--datamol-primary); 9 | --md-primary-bg-color: var(--datamol-secondary); 10 | --md-primary-bg-color--light: var(--datamol-secondary); 11 | --md-text-link-color: var(--datamol-secondary); 12 | 13 | /* Accent color shades */ 14 | --md-accent-fg-color: var(--datamol-secondary); 15 | --md-accent-fg-color--transparent: var(--datamol-secondary); 16 | --md-accent-bg-color: var(--datamol-secondary); 17 | --md-accent-bg-color--light: var(--datamol-secondary); 18 | } 19 | 20 | :root>* { 21 | /* Code block color shades */ 22 | --md-code-bg-color: hsla(0, 0%, 96%, 1); 23 | --md-code-fg-color: hsla(200, 18%, 26%, 1); 24 | 25 | /* Footer */ 26 | --md-footer-bg-color: var(--datamol-primary); 27 | /* --md-footer-bg-color--dark: hsla(0, 0%, 0%, 0.32); */ 28 | --md-footer-fg-color: var(--datamol-secondary); 29 | --md-footer-fg-color--light: var(--datamol-secondary); 30 | --md-footer-fg-color--lighter: var(--datamol-secondary); 31 | 32 | } 33 | 34 | .md-header { 35 | background-image: linear-gradient(to right, #F89D4C, #E20000); 36 | } 37 | 38 | .md-footer { 39 | background-image: linear-gradient(to right, #F89D4C, #E20000); 40 | } 41 | 42 | .md-tabs { 43 | background-image: linear-gradient(to right, #F4F6F9, #E2CEC3); 44 | } 45 | 46 | .md-header__topic { 47 | color: rgb(255, 255, 255); 48 | } 49 | 50 | .md-source__repository, 51 | .md-source__icon, 52 | .md-search__input, 53 | .md-search__input::placeholder, 54 | .md-search__input~.md-search__icon, 55 | .md-footer__inner.md-grid, 56 | .md-copyright__highlight, 57 | .md-copyright, 58 | .md-footer-meta.md-typeset a, 59 | .md-version { 60 | color: rgb(255, 255, 255) !important; 61 | } 62 | 63 | .md-search__form { 64 | background-color: rgba(255, 255, 255, 0.2); 65 | } 66 | 67 | .md-search__input { 68 | color: #222222 !important; 69 | } 70 | 71 | .md-header__topic { 72 | color: rgb(255, 255, 255); 73 | font-size: 1.4em; 74 | } 75 | 76 | /* Increase the size of the logo */ 77 | .md-header__button.md-logo img, 78 | .md-header__button.md-logo svg { 79 | height: 2rem !important; 80 | } 81 | 82 | /* Reduce the margin around the logo */ 83 | .md-header__button.md-logo { 84 | margin: 0.4em; 85 | padding: 0.4em; 86 | } 87 | 88 | /* Remove the `In` and `Out` block in rendered Jupyter notebooks */ 89 | .md-container .jp-Cell-outputWrapper .jp-OutputPrompt.jp-OutputArea-prompt, 90 | 
.md-container .jp-Cell-inputWrapper .jp-InputPrompt.jp-InputArea-prompt { 91 | display: none !important; 92 | } 93 | -------------------------------------------------------------------------------- /docs/assets/css/custom.css: -------------------------------------------------------------------------------- 1 | /* Indentation. */ 2 | div.doc-contents:not(.first) { 3 | padding-left: 25px; 4 | border-left: 4px solid rgba(230, 230, 230); 5 | margin-bottom: 80px; 6 | } 7 | 8 | /* Don't capitalize names. */ 9 | h5.doc-heading { 10 | text-transform: none !important; 11 | } 12 | 13 | /* Don't use vertical space on hidden ToC entries. */ 14 | .hidden-toc::before { 15 | margin-top: 0 !important; 16 | padding-top: 0 !important; 17 | } 18 | 19 | /* Don't show permalink of hidden ToC entries. */ 20 | .hidden-toc a.headerlink { 21 | display: none; 22 | } 23 | 24 | /* Avoid breaking parameters name, etc. in table cells. */ 25 | td code { 26 | word-break: normal !important; 27 | } 28 | 29 | /* For pieces of Markdown rendered in table cells. */ 30 | td p { 31 | margin-top: 0 !important; 32 | margin-bottom: 0 !important; 33 | } 34 | -------------------------------------------------------------------------------- /docs/assets/css/tweak-width.css: -------------------------------------------------------------------------------- 1 | @media only screen and (min-width: 76.25em) { 2 | .md-main__inner { 3 | max-width: none; 4 | padding-left: 2em; 5 | padding-left: 2em; 6 | } 7 | .md-sidebar--primary { 8 | left: 0; 9 | } 10 | .md-sidebar--secondary { 11 | right: 0; 12 | margin-left: 0; 13 | -webkit-transform: none; 14 | transform: none; 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /docs/assets/js/google-analytics.js: -------------------------------------------------------------------------------- 1 | var gtag_id = "G-0L9PP26N2H"; 2 | 3 | var script = document.createElement("script"); 4 | script.src = "https://www.googletagmanager.com/gtag/js?id=" + gtag_id; 5 | document.head.appendChild(script); 6 | 7 | window.dataLayer = window.dataLayer || []; 8 | function gtag(){dataLayer.push(arguments);} 9 | gtag('js', new Date()); 10 | gtag('config', gtag_id); 11 | -------------------------------------------------------------------------------- /docs/contribute.md: -------------------------------------------------------------------------------- 1 | # Contribute 2 | 3 | The below documents the development lifecycle of Datamol. 4 | 5 | ## Setup a dev environment 6 | 7 | ```bash 8 | mamba env create -n datamol -f env.yml 9 | mamba activate datamol 10 | pip install -e . 11 | ``` 12 | 13 | ## Setup a dev environment with dev container 14 | 15 | This repository is setup to use [dev container](https://docs.github.com/en/codespaces/setting-up-your-project-for-codespaces/introduction-to-dev-containers). You can use it locally with VSCode or any editor supporting dev containers as well as on GitHub Codespaces. 16 | 17 | The env is based on the Micromamba Docker image. 18 | 19 | ## Continuous Integration 20 | 21 | Datamol uses Github Actions to: 22 | 23 | - **Build and test** `datamol`. 24 | - Multiple combinations of OS, Python and RDKit versions are tested. 25 | - **Check** the code: 26 | - Formatting with `black`. 27 | - Static type check with `mypy`. 28 | - **Documentation**: build and deploy the documentation on `main` and for every new git tag. 
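The CI described above exercises the test suite on several operating systems. Individual tests can opt out of a platform with the `skip_platform` marker registered in `tests/conftest.py` (shown later in this repository). Below is a minimal sketch of how a contributed test might use it; the test name and body are illustrative only.

```python
import pytest
import datamol as dm


# `skip_platform` is registered in `tests/conftest.py`; the recognized values
# are "linux", "osx" and "win".
@pytest.mark.skip_platform("win")
def test_runs_everywhere_but_windows():
    mol = dm.to_mol("CCO")
    assert mol is not None
```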
29 | 30 | ## Run tests 31 | 32 | ```bash 33 | pytest 34 | ``` 35 | 36 | ## Build the documentation 37 | 38 | You can build and serve the documentation locally with: 39 | 40 | ```bash 41 | # Build and serve the doc 42 | mike serve 43 | ``` 44 | 45 | ### Multi-versioning 46 | 47 | The doc is built for each push on `main` and for every git tag using [mike](https://github.com/jimporter/mike). Everything is automated using GitHub Actions. Please refer to the official mike documentation for the details. 48 | 49 | ## Release a new version 50 | 51 | The process is fully automated by executing the [`release` GH Action](https://github.com/datamol-io/datamol/actions/workflows/release.yml). 52 | -------------------------------------------------------------------------------- /docs/images/logo-black.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamol-io/datamol/0312388b956e2b4eeb72d791167cfdb873c7beab/docs/images/logo-black.png -------------------------------------------------------------------------------- /docs/images/logo-black.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /docs/images/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamol-io/datamol/0312388b956e2b4eeb72d791167cfdb873c7beab/docs/images/logo.png -------------------------------------------------------------------------------- /docs/images/logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | Datamol is a python library to work with molecules. It's a layer built on top of [RDKit](https://www.rdkit.org/) and aims to be as light as possible. 4 | 5 | - 🐍 Simple pythonic API 6 | - ⚗️ RDKit first: all you manipulate are `rdkit.Chem.Mol` objects. 7 | - ✅ Manipulating molecules often relies on many options; Datamol provides good defaults by design. 8 | - 🧠 Performance matters: built-in efficient parallelization when possible with optional progress bar. 9 | - 🕹️ Modern IO: out-of-the-box support for remote paths using `fsspec` to read and write multiple formats (sdf, xlsx, csv, etc). 10 | 11 | Visit our website at <https://datamol.io>. 12 | 13 | ## Installation 14 | 15 | Use conda: 16 | 17 | ```bash 18 | mamba install -c conda-forge datamol 19 | ``` 20 | 21 | _**Tips:** You can replace `mamba` with `conda`._ 22 | 23 | _**Note:** We highly recommend using a [Conda Python distribution](https://github.com/conda-forge/miniforge) to install Datamol.
The package is also pip installable if you need it: `pip install datamol`._ 24 | 25 | ## Quick API Tour 26 | 27 | ```python 28 | import datamol as dm 29 | 30 | # Common functions 31 | mol = dm.to_mol("O=C(C)Oc1ccccc1C(=O)O", sanitize=True) 32 | fp = dm.to_fp(mol) 33 | selfies = dm.to_selfies(mol) 34 | inchi = dm.to_inchi(mol) 35 | 36 | # Standardize and sanitize 37 | mol = dm.to_mol("O=C(C)Oc1ccccc1C(=O)O") 38 | mol = dm.fix_mol(mol) 39 | mol = dm.sanitize_mol(mol) 40 | mol = dm.standardize_mol(mol) 41 | 42 | # Dataframe manipulation 43 | df = dm.data.freesolv() 44 | mols = dm.from_df(df) 45 | 46 | # 2D viz 47 | legends = [dm.to_smiles(mol) for mol in mols[:10]] 48 | dm.viz.to_image(mols[:10], legends=legends) 49 | 50 | # Generate conformers 51 | smiles = "O=C(C)Oc1ccccc1C(=O)O" 52 | mol = dm.to_mol(smiles) 53 | mol_with_conformers = dm.conformers.generate(mol) 54 | 55 | # 3D viz (using nglview) 56 | dm.viz.conformers(mol, n_confs=10) 57 | 58 | # Compute SASA from conformers 59 | sasa = dm.conformers.sasa(mol_with_conformers) 60 | 61 | # Easy IO 62 | mols = dm.read_sdf("s3://my-awesome-data-lake/smiles.sdf", as_df=False) 63 | dm.to_sdf(mols, "gs://data-bucket/smiles.sdf") 64 | ``` 65 | 66 | ## How to cite 67 | 68 | Please cite Datamol if you use it in your research: [![DOI](https://zenodo.org/badge/341603042.svg)](https://zenodo.org/badge/latestdoi/341603042). 69 | 70 | ## Compatibilities 71 | 72 | Version compatibilities are an essential topic for production-software stacks. We are cautious about documenting compatibility between `datamol`, `python` and `rdkit`. 73 | 74 | See below the associated versions of Python and RDKit, for which a minor version of Datamol **has been tested** during its whole lifecycle. _It does not mean other combinations does not work but that those are not tested._ 75 | 76 | | `datamol` | `python` | `rdkit` | 77 | | --------- | ------------------- | ----------------------------- | 78 | | `0.12.x` | `[3.10, 3.11]` | `[2023.03, 2023.09]` | 79 | | `0.11.x` | `[3.9, 3.10, 3.11]` | `[2022.09, 2023.03]` | 80 | | `0.10.x` | `[3.9, 3.10, 3.11]` | `[2022.03, 2022.09]` | 81 | | `0.9.x` | `[3.9, 3.10, 3.11]` | `[2022.03, 2022.09]` | 82 | | `0.8.x` | `[3.8, 3.9, 3.10]` | `[2021.09, 2022.03, 2022.09]` | 83 | | `0.7.x` | `[3.8, 3.9]` | `[2021.09, 2022.03]` | 84 | | `0.6.x` | `[3.8, 3.9]` | `[2021.09]` | 85 | | `0.5.x` | `[3.8, 3.9]` | `[2021.03, 2021.09]` | 86 | | `0.4.x` | `[3.8, 3.9]` | `[2020.09, 2021.03]` | 87 | | `0.3.x` | `[3.8, 3.9]` | `[2020.09, 2021.03]` | 88 | -------------------------------------------------------------------------------- /docs/license.md: -------------------------------------------------------------------------------- 1 | ``` 2 | {!LICENSE!} 3 | ``` 4 | -------------------------------------------------------------------------------- /docs/tutorials/data/ReactionBlock.rxn: -------------------------------------------------------------------------------- 1 | $RXN 2 | 3 | ISIS 082120061354 4 | 5 | 2 1 6 | $MOL 7 | 8 | -ISIS- 08210613542D 9 | 10 | 3 2 0 0 0 0 0 0 0 0999 V2000 11 | -1.4340 -0.6042 0.0000 C 0 0 0 0 0 0 0 0 0 2 0 0 12 | -0.8639 -0.9333 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0 13 | -1.4340 0.0542 0.0000 O 0 0 0 0 0 0 0 0 0 1 0 0 14 | 1 2 1 0 0 0 0 15 | 1 3 2 0 0 0 0 16 | M END 17 | $MOL 18 | 19 | -ISIS- 08210613542D 20 | 21 | 1 0 0 0 0 0 0 0 0 0999 V2000 22 | 2.2125 -0.7833 0.0000 N 0 0 0 0 0 0 0 0 0 3 0 0 23 | M END 24 | $MOL 25 | 26 | -ISIS- 08210613542D 27 | 28 | 3 2 0 0 0 0 0 0 0 0999 V2000 29 | 9.5282 -0.8083 0.0000 N 0 0 0 0 0 0 0 0 0 3 0 0 
30 | 8.9579 -0.4792 0.0000 C 0 0 0 0 0 0 0 0 0 2 0 0 31 | 8.9579 0.1792 0.0000 O 0 0 0 0 0 0 0 0 0 1 0 0 32 | 1 2 1 0 0 0 0 33 | 2 3 2 0 0 0 0 34 | M END 35 | -------------------------------------------------------------------------------- /docs/tutorials/images/Aligning_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamol-io/datamol/0312388b956e2b4eeb72d791167cfdb873c7beab/docs/tutorials/images/Aligning_1.png -------------------------------------------------------------------------------- /docs/tutorials/images/Aligning_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamol-io/datamol/0312388b956e2b4eeb72d791167cfdb873c7beab/docs/tutorials/images/Aligning_2.png -------------------------------------------------------------------------------- /docs/tutorials/images/Conformers_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamol-io/datamol/0312388b956e2b4eeb72d791167cfdb873c7beab/docs/tutorials/images/Conformers_1.png -------------------------------------------------------------------------------- /docs/tutorials/images/Descriptors_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamol-io/datamol/0312388b956e2b4eeb72d791167cfdb873c7beab/docs/tutorials/images/Descriptors_1.png -------------------------------------------------------------------------------- /docs/tutorials/images/Fragment_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamol-io/datamol/0312388b956e2b4eeb72d791167cfdb873c7beab/docs/tutorials/images/Fragment_1.png -------------------------------------------------------------------------------- /docs/tutorials/images/Fragment_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamol-io/datamol/0312388b956e2b4eeb72d791167cfdb873c7beab/docs/tutorials/images/Fragment_2.png -------------------------------------------------------------------------------- /docs/tutorials/images/Fragment_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamol-io/datamol/0312388b956e2b4eeb72d791167cfdb873c7beab/docs/tutorials/images/Fragment_3.png -------------------------------------------------------------------------------- /docs/tutorials/images/Preprocess_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamol-io/datamol/0312388b956e2b4eeb72d791167cfdb873c7beab/docs/tutorials/images/Preprocess_1.png -------------------------------------------------------------------------------- /docs/tutorials/images/Scaffolds_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamol-io/datamol/0312388b956e2b4eeb72d791167cfdb873c7beab/docs/tutorials/images/Scaffolds_1.png -------------------------------------------------------------------------------- /docs/usage.md: -------------------------------------------------------------------------------- 1 | # Usage 2 | 3 | ## How to use 4 | 5 | Datamol has been designed to be used with a single import: 6 | 7 | ```python 8 | import datamol as dm 9 | ``` 10 | 11 | All `datamol` functions are available under `dm`. 
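To make the single-import convention above concrete, here is a minimal sketch; every call in it appears elsewhere in this repository's documentation or tests:

```python
import datamol as dm

# Top-level helpers and submodules are all reached through the same `dm` alias.
mol = dm.to_mol("CC(=O)Oc1ccccc1C(=O)O")

print(dm.to_smiles(mol))       # canonical SMILES string for the molecule
print(dm.descriptors.mw(mol))  # molecular weight via the descriptors submodule
```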
12 | 13 | ## Lazy loading 14 | 15 | datamol uses lazy loading to dynamically expose all its API without imposing a long import time during `import datamol as dm`. In case of trouble you can always disable lazy loading by setting the environment variable `DATAMOL_DISABLE_LAZY_LOADING` to `1`. Please report any issue [on the datamol repo](https://github.com/datamol-io/datamol/issues). 16 | -------------------------------------------------------------------------------- /env.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | 4 | dependencies: 5 | - python >=3.8 6 | - pip 7 | - tqdm 8 | - loguru 9 | - joblib 10 | - fsspec >=2021.9 11 | - s3fs >=2021.9 12 | - gcsfs >=2021.9 13 | - platformdirs 14 | - packaging 15 | - typing_extensions 16 | - importlib_resources 17 | 18 | # Scientific 19 | - pandas 20 | - numpy 21 | - scipy 22 | - pillow 23 | - matplotlib 24 | - scikit-learn 25 | 26 | # Chemistry 27 | - rdkit 28 | - selfies 29 | 30 | # Optional deps 31 | - openpyxl 32 | - networkx 33 | - nglview 34 | - xlsxwriter 35 | - pyarrow 36 | 37 | # Dev 38 | - pytest >=6.0 39 | - pytest-cov 40 | - pytest-xdist 41 | - black >=24 42 | - ruff 43 | - jupyterlab 44 | - mypy 45 | - codecov 46 | - nbconvert 47 | 48 | # Doc 49 | - mkdocs <1.6 50 | - mkdocs-material >=7.1.1 51 | - mkdocs-material-extensions 52 | - mkdocstrings 53 | - mkdocstrings-python 54 | - mkdocs-jupyter 55 | - markdown-include 56 | - mdx_truly_sane_lists 57 | - mike >=1.0.0 58 | - seaborn 59 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: "datamol" 2 | site_description: "A python library to work with molecules. Built on top of RDKit." 
3 | repo_url: "https://github.com/datamol-io/datamol" 4 | repo_name: "datamol-io/datamol" 5 | copyright: Copyright 2020 - 2023 datamol.io 6 | 7 | site_url: "" 8 | remote_branch: "gh-pages" 9 | use_directory_urls: false 10 | docs_dir: "docs" 11 | 12 | # Fail on warnings to detect issues with types and docstring 13 | strict: true 14 | 15 | nav: 16 | - Overview: index.md 17 | - Usage: usage.md 18 | - Tutorials: 19 | - The Basics: tutorials/The_Basics.ipynb 20 | - Preprocessing: tutorials/Preprocessing.ipynb 21 | - Descriptors: tutorials/Descriptors.ipynb 22 | - Chemical Reactions: tutorials/Reactions.ipynb 23 | - Scaffolds: tutorials/Scaffolds.ipynb 24 | - Aligning: tutorials/Aligning.ipynb 25 | - Fuzzy_Scaffolds: tutorials/Fuzzy_Scaffolds.ipynb 26 | - Clustering: tutorials/Clustering.ipynb 27 | - Fragment: tutorials/Fragment.ipynb 28 | - Conformers: tutorials/Conformers.ipynb 29 | - Visualization: tutorials/Visualization.ipynb 30 | - Datamol Filesystem Module: tutorials/Filesystem.ipynb 31 | - API: 32 | - datamol.align: api/datamol.align.md 33 | - datamol.cluster: api/datamol.cluster.md 34 | - datamol.conformers: api/datamol.conformers.md 35 | - datamol.convert: api/datamol.convert.md 36 | - datamol.data: api/datamol.data.md 37 | - datamol.descriptors: api/datamol.descriptors.md 38 | - datamol.fp: api/datamol.fp.md 39 | - datamol.fragment: api/datamol.fragment.md 40 | - datamol.graph: api/datamol.graph.md 41 | - datamol.io: api/datamol.io.md 42 | - datamol.isomers: api/datamol.isomers.md 43 | - datamol.log: api/datamol.log.md 44 | - datamol.molar: api/datamol.molar.md 45 | - datamol.mol: api/datamol.mol.md 46 | - datamol.reactions: api/datamol.reactions.md 47 | - datamol.scaffold: api/datamol.scaffold.md 48 | - datamol.similarity: api/datamol.similarity.md 49 | - datamol.utils: api/datamol.utils.md 50 | - datamol.utils.fs: api/datamol.utils.fs.md 51 | - datamol.viz: api/datamol.viz.md 52 | 53 | - Contribute: contribute.md 54 | - License: license.md 55 | 56 | theme: 57 | name: material 58 | # NOTE(hadim): to customize the material primary and secondary 59 | # color check `docs/assets/css/datamol-custom.css`. 
60 | features: 61 | - navigation.tabs 62 | - navigation.expand 63 | favicon: images/logo-black.png 64 | logo: images/logo.svg 65 | 66 | extra_css: 67 | - assets/css/custom.css 68 | - assets/css/custom-datamol.css 69 | - assets/css/tweak-width.css 70 | 71 | extra_javascript: 72 | - assets/js/google-analytics.js 73 | 74 | markdown_extensions: 75 | - admonition 76 | - markdown_include.include 77 | - pymdownx.emoji 78 | - pymdownx.magiclink 79 | - pymdownx.superfences 80 | - pymdownx.tabbed 81 | - pymdownx.tasklist 82 | # For `tab_length=2` in the markdown extension 83 | # See https://github.com/mkdocs/mkdocs/issues/545 84 | - mdx_truly_sane_lists 85 | - toc: 86 | permalink: true 87 | toc_depth: 4 88 | 89 | watch: 90 | - datamol/ 91 | 92 | plugins: 93 | - search 94 | 95 | - mkdocstrings: 96 | handlers: 97 | python: 98 | setup_commands: 99 | - import sys 100 | - sys.path.append("docs") 101 | - sys.path.append("datamol") 102 | options: 103 | new_path_syntax: true 104 | show_root_heading: false 105 | heading_level: 3 106 | show_root_full_path: false 107 | 108 | - mkdocs-jupyter: 109 | execute: false 110 | # kernel_name: python3 111 | 112 | - mike: 113 | version_selector: true 114 | 115 | extra: 116 | version: 117 | # Multi versioning provider for mkdocs-material (used for the JS selector) 118 | provider: mike 119 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "setuptools-scm"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "datamol" 7 | description = "A python library to work with molecules. Built on top of RDKit." 8 | authors = [{ name = "Hadrien Mary", email = "hadrien@valencediscovery.com" }] 9 | readme = "README.md" 10 | dynamic = ["version"] 11 | requires-python = ">=3.8" 12 | license = { text = "Apache" } 13 | classifiers = [ 14 | "Development Status :: 5 - Production/Stable", 15 | "Intended Audience :: Developers", 16 | "Intended Audience :: Healthcare Industry", 17 | "Intended Audience :: Science/Research", 18 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 19 | "Topic :: Scientific/Engineering :: Bio-Informatics", 20 | "Topic :: Scientific/Engineering :: Information Analysis", 21 | "Topic :: Scientific/Engineering :: Medical Science Apps.", 22 | "Natural Language :: English", 23 | "Operating System :: OS Independent", 24 | "Programming Language :: Python", 25 | "Programming Language :: Python :: 3", 26 | "Programming Language :: Python :: 3.8", 27 | "Programming Language :: Python :: 3.9", 28 | "Programming Language :: Python :: 3.10", 29 | "Programming Language :: Python :: 3.11", 30 | ] 31 | dependencies = [ 32 | "tqdm", 33 | "loguru", 34 | "joblib", 35 | "fsspec>=2021.9", 36 | "pandas", 37 | "numpy", 38 | "scipy", 39 | "matplotlib", 40 | "pillow", 41 | "selfies", 42 | "platformdirs", 43 | "scikit-learn", 44 | "packaging", 45 | "typing-extensions", 46 | "importlib-resources", 47 | "rdkit", 48 | ] 49 | 50 | [project.urls] 51 | Website = "https://datamol.io" 52 | "Source Code" = "https://github.com/datamol-io/datamol" 53 | "Bug Tracker" = "https://github.com/datamol-io/datamol/issues" 54 | Documentation = "https://docs.datamol.io" 55 | 56 | [tool.setuptools] 57 | include-package-data = true 58 | 59 | [tool.setuptools_scm] 60 | fallback_version = "dev" 61 | 62 | [tool.setuptools.packages.find] 63 | where = ["."] 64 | include = ["datamol", "datamol.*"] 65 | exclude = 
[] 66 | namespaces = true 67 | 68 | [tool.setuptools.package-data] 69 | "datamol.data" = ["*"] 70 | 71 | [tool.black] 72 | line-length = 100 73 | target-version = ['py39', 'py310'] 74 | include = '\.pyi?$' 75 | 76 | [tool.pytest.ini_options] 77 | minversion = "6.0" 78 | addopts = "--verbose --cov=datamol --cov-fail-under=85 --cov-report xml --cov-report term --durations=10 -n auto" 79 | testpaths = ["tests"] 80 | filterwarnings = [ 81 | "ignore::DeprecationWarning:rdkit.Chem.MolStandardize", 82 | "ignore::DeprecationWarning:jupyter_client", 83 | "ignore::DeprecationWarning:pkg_resources", 84 | "ignore::DeprecationWarning:joblib.externals.loky.backend", 85 | "ignore::DeprecationWarning:dateutil.tz.tz", 86 | "ignore::DeprecationWarning:joblib._utils", 87 | "ignore::DeprecationWarning:openpyxl.packaging.core", 88 | "ignore::DeprecationWarning:tqdm.std", 89 | ] 90 | 91 | [tool.coverage.run] 92 | source = ["datamol/"] 93 | disable_warnings = ["no-data-collected"] 94 | data_file = ".coverage/coverage" 95 | 96 | [tool.coverage.report] 97 | omit = ["datamol/__init__.py", "datamol/_version.py"] 98 | 99 | [tool.coverage.xml] 100 | output = "coverage.xml" 101 | 102 | [tool.mypy] 103 | exclude = [] 104 | ignore_missing_imports = true 105 | 106 | [tool.pyright] 107 | reportShadowedImports = false 108 | 109 | [tool.ruff] 110 | ignore = [ 111 | "E501", # Never enforce `E501` (line length violations). 112 | "E731", # Do not assign a lambda expression, use a def 113 | ] 114 | line-length = 110 115 | target-version = "py311" 116 | 117 | [tool.ruff.per-file-ignores] 118 | "__init__.py" = [ 119 | "F401", # imported but unused 120 | "E402", # Module level import not at top of file 121 | ] 122 | 123 | [tool.ruff.pycodestyle] 124 | max-doc-length = 150 125 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import platform 2 | import pathlib 3 | from loguru import logger 4 | 5 | import pytest 6 | 7 | 8 | DATA_DIR_PATH = pathlib.Path(__file__).parent.resolve() / "data" 9 | 10 | 11 | @pytest.fixture 12 | def current_platform(): 13 | if platform.system() == "Linux": 14 | return "linux" 15 | elif platform.system() == "Darwin": 16 | return "osx" 17 | elif platform.system() == "Windows": 18 | return "win" 19 | else: 20 | return platform.system() 21 | 22 | 23 | @pytest.fixture(autouse=True) 24 | def skip_by_platform(request, current_platform): 25 | if request.node.get_closest_marker("skip_platform"): 26 | if request.node.get_closest_marker("skip_platform").args[0] == current_platform: 27 | pytest.skip(f"skipped on this platform: {current_platform}") 28 | 29 | 30 | def pytest_configure(config): 31 | config.addinivalue_line( 32 | "markers", 33 | "skip_platform(current_platform): skip test for a given platform from `['linux', 'osx', 'win']`", 34 | ) 35 | 36 | 37 | @pytest.fixture 38 | def datadir(request): 39 | return DATA_DIR_PATH 40 | 41 | 42 | # Mandatory for the below monkeypatch function. 43 | from _pytest.logging import caplog as _caplog # noqa: E402, F401 44 | 45 | 46 | @pytest.fixture 47 | def caplog(_caplog): # noqa: F811 48 | """Monkeypatching the pytest caplog to work with loguru. 
49 | 50 | See https://loguru.readthedocs.io/en/latest/resources/migration.html#making-things-work-with-pytest-and-caplog 51 | """ 52 | import logging 53 | 54 | class PropogateHandler(logging.Handler): 55 | def emit(self, record): 56 | logging.getLogger(record.name).handle(record) 57 | 58 | handler_id = logger.add(PropogateHandler(), format="{message}") 59 | yield _caplog 60 | logger.remove(handler_id) 61 | -------------------------------------------------------------------------------- /tests/data/TUBB3-observations.sdf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamol-io/datamol/0312388b956e2b4eeb72d791167cfdb873c7beab/tests/data/TUBB3-observations.sdf.gz -------------------------------------------------------------------------------- /tests/data/freesolv.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamol-io/datamol/0312388b956e2b4eeb72d791167cfdb873c7beab/tests/data/freesolv.xlsx -------------------------------------------------------------------------------- /tests/data/test.mol2: -------------------------------------------------------------------------------- 1 | @MOLECULE 2 | mol_first 3 | 11 11 1 0 0 4 | SMALL 5 | AMBER ff14SB 6 | 7 | @ATOM 8 | 1 C1 -0.0167 1.3778 0.0096 C.ar 1 UNK 0.0267 9 | 2 C2 0.0021 -0.0041 0.0020 C.ar 1 UNK -0.0438 10 | 3 C3 1.2218 -0.6631 -0.0131 C.ar 1 UNK -0.0592 11 | 4 C4 2.3820 0.0960 -0.0201 C.ar 1 UNK -0.0438 12 | 5 C5 2.2849 1.4746 -0.0118 C.ar 1 UNK 0.0267 13 | 6 N6 1.1072 2.0677 0.0026 N.ar 1 UNK -0.2647 14 | 7 H7 -0.9627 1.8988 0.0169 H 1 UNK 0.0840 15 | 8 H8 -0.9217 -0.5635 0.0075 H 1 UNK 0.0639 16 | 9 H9 1.2671 -1.7422 -0.0190 H 1 UNK 0.0624 17 | 10 H10 3.3495 -0.3839 -0.0316 H 1 UNK 0.0639 18 | 11 H11 3.1838 2.0731 -0.0171 H 1 UNK 0.0840 19 | @BOND 20 | 1 1 6 ar 21 | 2 1 2 ar 22 | 3 1 7 1 23 | 4 2 3 ar 24 | 5 2 8 1 25 | 6 3 4 ar 26 | 7 3 9 1 27 | 8 4 5 ar 28 | 9 4 10 1 29 | 10 5 6 ar 30 | 11 5 11 1 31 | @SUBSTRUCTURE 32 | 1 UNK 1 RESIDUE 4 A UNK 0 ROOT 33 | 34 | @MOLECULE 35 | mol_sec 36 | 9 9 1 0 0 37 | SMALL 38 | AMBER ff14SB 39 | 40 | 41 | @ATOM 42 | 1 C1 1.2973 -0.3859 -0.0124 C.2 1 UNK 0.0838 43 | 2 N2 0.0021 -0.0041 0.0020 N.pl3 1 UNK -0.3106 44 | 3 H3 -0.7708 -0.5902 0.0062 H 1 UNK 0.1532 45 | 4 C4 -0.0165 1.3646 0.0095 C.2 1 UNK 0.0120 46 | 5 C5 1.2671 1.7717 -0.0005 C.2 1 UNK 0.0422 47 | 6 N6 2.0482 0.6814 -0.0138 N.2 1 UNK -0.2480 48 | 7 H7 1.6529 -1.4057 -0.0216 H 1 UNK 0.1014 49 | 8 H8 -0.8923 1.9965 0.0173 H 1 UNK 0.0806 50 | 9 H9 1.6079 2.7966 0.0017 H 1 UNK 0.0854 51 | @BOND 52 | 1 1 6 2 53 | 2 1 2 1 54 | 3 1 7 1 55 | 4 2 3 1 56 | 5 2 4 1 57 | 6 4 5 2 58 | 7 4 8 1 59 | 8 5 6 1 60 | 9 5 9 1 61 | @SUBSTRUCTURE 62 | 1 UNK 1 RESIDUE 4 A UNK 0 ROOT 63 | 64 | @MOLECULE 65 | mol_third 66 | 9 9 1 0 0 67 | SMALL 68 | AMBER ff14SB 69 | 70 | 71 | @ATOM 72 | 1 C1 1.2973 -0.3859 -0.0124 C 1 UNK 0.0838 73 | 2 N2 0.0021 -0.0041 0.0020 N 1 UNK -0.3106 74 | 3 H3 -0.7708 -0.5902 0.0062 H 1 UNK 0.1532 75 | 4 C4 -0.0165 1.3646 0.0095 C 1 UNK 0.0120 76 | 5 C5 1.2671 1.7717 -0.0005 C 1 UNK 0.0422 77 | 6 N6 2.0482 0.6814 -0.0138 N 1 UNK -0.2480 78 | 7 H7 1.6529 -1.4057 -0.0216 H 1 UNK 0.1014 79 | 8 H8 -0.8923 1.9965 0.0173 H 1 UNK 0.0806 80 | 9 H9 1.6079 2.7966 0.0017 H 1 UNK 0.0854 81 | @BOND 82 | 1 1 6 2 83 | 2 1 2 1 84 | 3 1 7 1 85 | 4 2 3 1 86 | 5 2 4 1 87 | 6 4 5 2 88 | 7 4 8 1 89 | 8 5 6 1 90 | 9 5 9 1 91 | @SUBSTRUCTURE 92 | 1 UNK 1 RESIDUE 4 A UNK 0 ROOT 93 | 94 | @MOLECULE 95 | mol_sec_f 96 | 9 
9 1 0 0 97 | SMALL 98 | AMBER ff14SB 99 | 100 | 101 | @ATOM 102 | 1 C1 1.2973 -0.3859 -0.0124 C.2 1 UNK 0.0838 103 | 2 N2 0.0021 -0.0041 0.0020 N.pl3 1 UNK -0.3106 104 | 3 H3 -0.7708 -0.5902 0.0062 H 1 UNK 0.1532 105 | 4 C4 -0.0165 1.3646 0.0095 C.2 1 UNK 0.0120 106 | 5 C5 1.2671 1.7717 -0.0005 C.2 1 UNK 0.0422 107 | 6 N6 2.0482 0.6814 -0.0138 N.2 1 UNK -0.2480 108 | 7 H7 1.6529 -1.4057 -0.0216 H 1 UNK 0.1014 109 | 8 H8 -0.8923 1.9965 0.0173 H 1 UNK 0.0806 110 | 9 H9 1.6079 2.7966 0.0017 H 1 UNK 0.0854 111 | 112 | 1 1 6 2 113 | 2 1 2 1 114 | 3 1 7 1 115 | 4 2 3 1 116 | 5 2 4 1 117 | 6 4 5 2 118 | 7 4 8 1 119 | 8 5 6 1 120 | 9 5 9 1 121 | @SUBSTRUCTURE 122 | 1 UNK 1 RESIDUE 4 A UNK 0 ROOT 123 | 124 | @MOLECULE 125 | mol_sec_f1 126 | 9 9 1 0 0 127 | SMALL 128 | AMBER ff14SB 129 | 130 | 131 | 132 | 1 C1 1.2973 -0.3859 -0.0124 C.2 1 UNK 0.0838 133 | 2 N2 0.0021 -0.0041 0.0020 N.pl3 1 UNK -0.3106 134 | 3 H3 -0.7708 -0.5902 0.0062 H 1 UNK 0.1532 135 | 4 C4 -0.0165 1.3646 0.0095 C.2 1 UNK 0.0120 136 | 5 C5 1.2671 1.7717 -0.0005 C.2 1 UNK 0.0422 137 | 6 N6 2.0482 0.6814 -0.0138 N.2 1 UNK -0.2480 138 | 7 H7 1.6529 -1.4057 -0.0216 H 1 UNK 0.1014 139 | 8 H8 -0.8923 1.9965 0.0173 H 1 UNK 0.0806 140 | 9 H9 1.6079 2.7966 0.0017 H 1 UNK 0.0854 141 | @BOND 142 | 1 1 6 2 143 | 2 1 2 1 144 | 3 1 7 1 145 | 4 2 3 1 146 | 5 2 4 1 147 | 6 4 5 2 148 | 7 4 8 1 149 | 8 5 6 1 150 | 9 5 9 1 151 | @SUBSTRUCTURE 152 | 1 UNK 1 RESIDUE 4 A UNK 0 ROOT 153 | 154 | @MOLECULE 155 | mol_sec_f3 156 | 9 9 1 0 0 157 | SMALL 158 | AMBER ff14SB 159 | 160 | @ATOM 161 | @BOND 162 | 1 1 6 2 163 | 2 1 2 1 164 | 3 1 7 1 165 | 4 2 3 1 166 | 5 2 4 1 167 | 6 4 5 2 168 | 7 4 8 1 169 | 8 5 6 1 170 | 9 5 9 1 171 | @SUBSTRUCTURE 172 | 1 UNK 1 RESIDUE 4 A UNK 0 ROOT 173 | 174 | @MOLECULE 175 | mol_sec_f4 176 | 9 9 1 0 0 177 | SMALL 178 | AMBER ff14SB 179 | 180 | @ATOM 181 | @BOND 182 | @SUBSTRUCTURE 183 | 1 UNK 1 RESIDUE 4 A UNK 0 ROOT 184 | 185 | 186 | 187 | @MOLECULE 188 | 189 | 190 | 191 | @ATOM 192 | 1 C1 1.2973 -0.3859 -0.0124 C.2 1 UNK 0.0838 193 | 2 N2 0.0021 -0.0041 0.0020 N.pl3 1 UNK -0.3106 194 | 3 H3 -0.7708 -0.5902 0.0062 H 1 UNK 0.1532 195 | 4 C4 -0.0165 1.3646 0.0095 C.2 1 UNK 0.0120 196 | 5 C5 1.2671 1.7717 -0.0005 C.2 1 UNK 0.0422 197 | 6 N6 2.0482 0.6814 -0.0138 N.2 1 UNK -0.2480 198 | 7 H7 1.6529 -1.4057 -0.0216 H 1 UNK 0.1014 199 | 8 H8 -0.8923 1.9965 0.0173 H 1 UNK 0.0806 200 | 9 H9 1.6079 2.7966 0.0017 H 1 UNK 0.0854 201 | 202 | 1 1 6 2 203 | 2 1 2 1 204 | 3 1 7 1 205 | 4 2 3 1 206 | 5 2 4 1 207 | 6 4 5 2 208 | 7 4 8 1 209 | 8 5 6 1 210 | 9 5 9 1 211 | @SUBSTRUCTURE 212 | 1 UNK 1 RESIDUE 4 A UNK 0 ROOT 213 | 214 | @MOLECULE 215 | mol_sec 216 | 9 9 1 0 0 217 | SMALL 218 | AMBER ff14SB 219 | 220 | 221 | @ATOM 222 | 1 C1 1.2973 -0.3859 -0.0124 C 1 UNK 0.0838 223 | 2 N2 0.0021 -0.0041 0.0020 N 1 UNK -0.3106 224 | 3 H3 -0.7708 -0.5902 0.0062 H 1 UNK 0.1532 225 | 4 C4 -0.0165 1.3646 0.0095 C 1 UNK 0.0120 226 | 5 C5 1.2671 1.7717 -0.0005 C 1 UNK 0.0422 227 | 6 N6 2.0482 0.6814 -0.0138 N 1 UNK -0.2480 228 | 7 H7 1.6529 -1.4057 -0.0216 H 1 UNK 0.1014 229 | 8 H8 -0.8923 1.9965 0.0173 H 1 UNK 0.0806 230 | 9 H9 1.6079 2.7966 0.0017 H 1 UNK 0.0854 231 | @BOND 232 | 1 1 6 2 233 | 2 1 2 1 234 | 3 1 7 1 235 | 4 2 3 1 236 | 5 2 4 1 237 | 6 4 5 2 238 | 7 4 8 1 239 | 8 5 6 1 240 | 9 5 9 1 241 | @SUBSTRUCTURE 242 | 1 UNK 1 RESIDUE 4 A UNK 0 ROOT 243 | 244 | -------------------------------------------------------------------------------- /tests/test_align.py: -------------------------------------------------------------------------------- 1 | 
import pytest 2 | 3 | import pandas as pd 4 | import datamol as dm 5 | 6 | 7 | def test_template_align(): 8 | data: pd.DataFrame = dm.cdk2(as_df=True) # type: ignore 9 | data = data.iloc[:6].copy() # type: ignore 10 | 11 | template = data.iloc[0]["mol"] 12 | data["aligned_mol"] = data["mol"].apply(lambda x: dm.align.template_align(x, template=template)) 13 | assert bool(data["aligned_mol"].apply(lambda x: isinstance(x, dm.Mol)).all()) is True 14 | 15 | template = data.iloc[0]["smiles"] 16 | data["aligned_mol"] = data["smiles"].apply( 17 | lambda x: dm.align.template_align(x, template=template) 18 | ) 19 | assert bool(data["aligned_mol"].apply(lambda x: isinstance(x, dm.Mol)).all()) is True 20 | 21 | template = data.iloc[0]["mol"] 22 | data["aligned_mol"] = data["mol"].apply( 23 | lambda x: dm.align.template_align(x, template=template, auto_select_coord_gen=True) 24 | ) 25 | assert bool(data["aligned_mol"].apply(lambda x: isinstance(x, dm.Mol)).all()) is True 26 | 27 | template = data.iloc[0]["mol"] 28 | data["aligned_mol"] = data["mol"].apply( 29 | lambda x: dm.align.template_align(x, template=template, use_depiction=False) 30 | ) 31 | assert bool(data["aligned_mol"].apply(lambda x: isinstance(x, dm.Mol)).all()) is True 32 | 33 | template = None 34 | data["aligned_mol"] = data["mol"].apply(lambda x: dm.align.template_align(x, template=template)) 35 | assert bool(data["aligned_mol"].apply(lambda x: isinstance(x, dm.Mol)).all()) is True 36 | 37 | template = None 38 | data["aligned_mol"] = data["mol"].apply( 39 | lambda x: dm.align.template_align(x, template=template, copy=False) 40 | ) 41 | assert bool(data["aligned_mol"].apply(lambda x: isinstance(x, dm.Mol)).all()) is True 42 | 43 | assert dm.align.template_align(None) is None 44 | 45 | 46 | def test_auto_align_many(): 47 | data: pd.DataFrame = dm.solubility(as_df=True) # type: ignore 48 | data = data.iloc[:16].copy() # type: ignore 49 | 50 | excepted_cluster_size = [8, 6, 5, 6, 6] 51 | 52 | for i, partition_method in enumerate( 53 | [ 54 | "cluster", 55 | "scaffold", 56 | "anongraph-scaffold", 57 | "anon-scaffold", 58 | "strip-scaffold", 59 | ] 60 | ): 61 | print(partition_method) 62 | 63 | data["aligned_mol"] = dm.align.auto_align_many( 64 | data["mol"], 65 | partition_method=partition_method, 66 | ) 67 | 68 | props = data["aligned_mol"].apply(lambda x: pd.Series(x.GetPropsAsDict())) 69 | 70 | assert "dm.auto_align_many.cluster_id" in props.columns 71 | assert "dm.auto_align_many.core" in props.columns 72 | assert props["dm.auto_align_many.cluster_id"].dtype.name == "int64" 73 | assert props["dm.auto_align_many.core"].dtype.name == "object" 74 | 75 | assert props["dm.auto_align_many.cluster_id"].unique().shape[0] == excepted_cluster_size[i] 76 | 77 | with pytest.raises(ValueError): 78 | dm.align.auto_align_many(data["mol"], partition_method="invalid") 79 | -------------------------------------------------------------------------------- /tests/test_cluster.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import datamol as dm 4 | 5 | 6 | def test_cluster_mols(): 7 | # Get some mols 8 | data = dm.data.freesolv() 9 | smiles = data["smiles"].iloc[:100].tolist() 10 | mols = [dm.to_mol(s) for s in smiles] 11 | 12 | _, mol_clusters = dm.cluster_mols(mols, cutoff=0.7) 13 | cluster_sizes = [11, 7, 5, 3, 3, 3, 2, 3, 2, 1, 2, 2, 1] 14 | assert [len(c) for c in mol_clusters[:13]] == cluster_sizes 15 | 16 | 17 | def test_pick_diverse(): 18 | # Get some mols 19 | data = 
dm.data.freesolv() 20 | smiles = data["smiles"].iloc[:100].tolist() 21 | mols = [dm.to_mol(s) for s in smiles] 22 | 23 | indices, _ = dm.pick_diverse(mols, npick=18, seed=19) 24 | 25 | excepted_indices = np.array( 26 | [9, 14, 47, 50, 56, 61, 67, 89, 83, 90, 94, 10, 0, 96, 15, 58, 71, 21] 27 | ) 28 | 29 | assert np.all(indices == excepted_indices) 30 | 31 | 32 | def test_pick_centroids(): 33 | data = dm.data.freesolv() 34 | smiles = data["smiles"].iloc[:100].tolist() 35 | mols = [dm.to_mol(s) for s in smiles] 36 | indices, centroids = dm.pick_centroids( 37 | mols, npick=18, threshold=0.7, method="sphere", n_jobs=-1 38 | ) 39 | excepted_indices = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 13, 14, 15, 16, 17, 18, 20]) 40 | 41 | assert np.all(indices == excepted_indices) 42 | 43 | 44 | def test_assign_to_centroids(): 45 | data = dm.data.freesolv() 46 | smiles = data["smiles"].iloc[:100].tolist() 47 | mols = [dm.to_mol(s) for s in smiles] 48 | indices, centroids = dm.pick_centroids( 49 | mols, npick=18, threshold=0.7, method="sphere", n_jobs=-1 50 | ) 51 | 52 | cluster_map, cluster_list = dm.assign_to_centroids(mols, centroids, n_jobs=-1) 53 | # expect centroid to be in centroid list 54 | assert indices[0] in cluster_map[0] 55 | # expect no intersection after assignment 56 | map_intersection = set.intersection(*map(set, cluster_map.values())) 57 | assert len(map_intersection) == 0 58 | # expect some similar molecule in a given cluster 59 | # assert 33 in cluster_map[0] 60 | -------------------------------------------------------------------------------- /tests/test_data.py: -------------------------------------------------------------------------------- 1 | import datamol as dm 2 | 3 | 4 | def test_freesolv(): 5 | data = dm.data.freesolv() 6 | assert data.shape == (642, 4) 7 | assert list(data.columns) == ["iupac", "smiles", "expt", "calc"] 8 | 9 | 10 | def test_cdk2(): 11 | data = dm.data.cdk2() 12 | assert data.shape == (47, 12) 13 | assert list(data.columns) == [ 14 | "smiles", 15 | "mol", 16 | "id", 17 | "Cluster", 18 | "MODEL.SOURCE", 19 | "MODEL.CCRATIO", 20 | "r_mmffld_Potential_Energy-OPLS_2005", 21 | "r_mmffld_RMS_Derivative-OPLS_2005", 22 | "b_mmffld_Minimization_Converged-OPLS_2005", 23 | "s_st_Chirality_1", 24 | "s_st_Chirality_2", 25 | "s_st_Chirality_3", 26 | ] 27 | 28 | 29 | def test_solubility(): 30 | data = dm.data.solubility() 31 | assert data.shape == (1282, 7) 32 | assert list(data.columns) == [ 33 | "mol", 34 | "ID", 35 | "NAME", 36 | "SOL", 37 | "SOL_classification", 38 | "smiles", 39 | "split", 40 | ] 41 | 42 | 43 | def test_chembl_drugs(): 44 | data = dm.data.chembl_drugs() 45 | assert data.shape == (2628, 5) 46 | assert list(data.columns) == [ 47 | "first_approval", 48 | "molecule_chembl_id", 49 | "molecule_type", 50 | "pref_name", 51 | "smiles", 52 | ] 53 | 54 | 55 | def test_chembl_samples(): 56 | data = dm.data.chembl_samples() 57 | assert data.shape == (2000, 1) 58 | assert list(data.columns) == ["smiles"] 59 | -------------------------------------------------------------------------------- /tests/test_descriptors.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import pandas as pd 4 | import datamol as dm 5 | 6 | 7 | def test_descriptors(): 8 | smiles_list = ["CC(=O)OC1=CC=CC=C1C(=O)O", "CCN(CC)CCCC(C)NC1=C2C=CC(=CC2=NC=C1)Cl"] 9 | 10 | for smiles in smiles_list: 11 | mol = dm.to_mol(smiles) 12 | 13 | dm.descriptors.mw(mol) 14 | dm.descriptors.fsp3(mol) 15 | dm.descriptors.n_hba(mol) 16 | 
dm.descriptors.n_hbd(mol) 17 | dm.descriptors.n_lipinski_hba(mol) 18 | dm.descriptors.n_lipinski_hbd(mol) 19 | dm.descriptors.n_rings(mol) 20 | dm.descriptors.n_hetero_atoms(mol) 21 | dm.descriptors.n_heavy_atoms(mol) 22 | dm.descriptors.n_rotatable_bonds(mol) 23 | dm.descriptors.n_aliphatic_rings(mol) 24 | dm.descriptors.n_aromatic_rings(mol) 25 | dm.descriptors.n_saturated_rings(mol) 26 | dm.descriptors.n_radical_electrons(mol) 27 | dm.descriptors.tpsa(mol) 28 | dm.descriptors.qed(mol) 29 | dm.descriptors.clogp(mol) 30 | dm.descriptors.sas(mol) 31 | dm.descriptors.sas(mol) 32 | dm.descriptors.n_stereo_centers_unspecified(mol) 33 | dm.descriptors.n_spiro_atoms(mol) 34 | 35 | dm.descriptors.n_aliphatic_carbocycles(mol) 36 | dm.descriptors.n_aliphatic_heterocyles(mol) 37 | dm.descriptors.n_aliphatic_rings(mol) 38 | dm.descriptors.n_aromatic_carbocycles(mol) 39 | dm.descriptors.n_aromatic_heterocyles(mol) 40 | dm.descriptors.n_aromatic_rings(mol) 41 | dm.descriptors.n_saturated_carbocycles(mol) 42 | dm.descriptors.n_saturated_heterocyles(mol) 43 | dm.descriptors.n_saturated_rings(mol) 44 | 45 | 46 | def test_compute_many_descriptors(): 47 | mol = dm.to_mol("CCN(CC)CCCC(C)NC1=C2C=CC(=CC2=NC=C1)Cl") 48 | 49 | true_values = pd.Series( 50 | { 51 | "mw": 319.181525512, 52 | "fsp3": 0.5, 53 | "n_lipinski_hba": 3.0, 54 | "n_lipinski_hbd": 1.0, 55 | "n_rings": 2.0, 56 | "n_hetero_atoms": 4.0, 57 | "n_heavy_atoms": 22.0, 58 | "n_rotatable_bonds": 8.0, 59 | "n_radical_electrons": 0.0, 60 | "tpsa": 28.16, 61 | "qed": 0.7564117572128701, 62 | "clogp": 4.810600000000004, 63 | "sas": 2.670786229594949, 64 | "n_aliphatic_carbocycles": 0.0, 65 | "n_aliphatic_heterocyles": 0.0, 66 | "n_aliphatic_rings": 0.0, 67 | "n_aromatic_carbocycles": 1.0, 68 | "n_aromatic_heterocyles": 1.0, 69 | "n_aromatic_rings": 2.0, 70 | "n_saturated_carbocycles": 0.0, 71 | "n_saturated_heterocyles": 0.0, 72 | "n_saturated_rings": 0.0, 73 | } 74 | ) 75 | 76 | # Scenario #1 77 | props = dm.descriptors.compute_many_descriptors(mol) 78 | props = pd.Series(props) 79 | 80 | assert props.equals(true_values) 81 | 82 | # Scenario #2 83 | props = dm.descriptors.compute_many_descriptors( 84 | mol, 85 | properties_fn={"hello": lambda x: 88}, 86 | add_properties=False, 87 | ) 88 | assert props == {"hello": 88} 89 | 90 | # Scenario #3 91 | props = dm.descriptors.compute_many_descriptors( 92 | mol, 93 | properties_fn={"hello": lambda x: 88}, 94 | add_properties=True, 95 | ) 96 | props = pd.Series(props) 97 | 98 | true_values_2 = true_values.copy() 99 | true_values_2["hello"] = 88 100 | true_values_2 = true_values_2[props.index] 101 | 102 | assert true_values_2.equals(props) 103 | 104 | 105 | def test_compute_many_descriptors_with_function_as_string(): 106 | mol = dm.to_mol("CC(=O)OC1=CC=CC=C1C(=O)O") 107 | 108 | results = dm.descriptors.compute_many_descriptors( 109 | mol, 110 | properties_fn={"max_partial_charge": "MaxPartialCharge"}, 111 | add_properties=False, 112 | ) 113 | 114 | assert "max_partial_charge" in results.keys() 115 | assert pytest.approx(0.33900378687731025) == results["max_partial_charge"] 116 | 117 | 118 | def test_batch_compute_many_descriptors(): 119 | data = dm.data.freesolv() 120 | data = data.iloc[:30] 121 | mols = data["smiles"].apply(dm.to_mol).tolist() 122 | 123 | props = dm.descriptors.batch_compute_many_descriptors( 124 | mols, 125 | batch_size=64, 126 | n_jobs=-1, 127 | progress=False, 128 | ) 129 | 130 | assert set(props.columns.tolist()) == { 131 | "mw", 132 | "fsp3", 133 | "n_lipinski_hba", 134 | "n_lipinski_hbd", 
135 | "n_rings", 136 | "n_hetero_atoms", 137 | "n_heavy_atoms", 138 | "n_rotatable_bonds", 139 | "n_radical_electrons", 140 | "tpsa", 141 | "qed", 142 | "clogp", 143 | "sas", 144 | "n_aliphatic_carbocycles", 145 | "n_aliphatic_heterocyles", 146 | "n_aliphatic_rings", 147 | "n_aromatic_carbocycles", 148 | "n_aromatic_heterocyles", 149 | "n_aromatic_rings", 150 | "n_saturated_carbocycles", 151 | "n_saturated_heterocyles", 152 | "n_saturated_rings", 153 | } 154 | assert props.shape == (30, 22) 155 | 156 | 157 | def test_any_rdkit_descriptor(): 158 | mol = dm.to_mol("CC(=O)OC1=CC=CC=C1C(=O)O") 159 | 160 | value = dm.descriptors.any_rdkit_descriptor("MaxPartialCharge")(mol) 161 | assert pytest.approx(value) == 0.33900378687731025 162 | 163 | value = dm.descriptors.any_rdkit_descriptor("CalcFractionCSP3")(mol) 164 | assert pytest.approx(value) == 0.1111111111111111 165 | 166 | with pytest.raises(ValueError): 167 | dm.descriptors.any_rdkit_descriptor("DOES NOT EXIST") 168 | 169 | 170 | def test_n_aromatic_atoms(): 171 | smiles = "Nc1cnn(-c2ccccc2)c(=O)c1Cl" 172 | mol = dm.to_mol(smiles) 173 | 174 | assert dm.descriptors.n_aromatic_atoms(mol) == 12 175 | assert dm.descriptors.n_aromatic_atoms_proportion(mol) == 0.8 176 | 177 | 178 | def test_formal_charge(): 179 | mol = dm.to_mol("CC(=O)NC1CCC2=CC(=C(C(=C2C3=CC=C(C(=O)C=C13)OC)OC)OC)OC") 180 | assert dm.descriptors.formal_charge(mol) == 0 181 | 182 | mol = dm.to_mol("C(CC(=O)[O-])C(C(=O)[O-])[NH3+]") 183 | assert dm.descriptors.formal_charge(mol) == -1 184 | 185 | 186 | def test_refractivity(): 187 | mol = dm.to_mol("CN1C(=O)CN=C(C2=C1C=CC(=C2)Cl)C3=CC=CC=C3") 188 | 189 | value = dm.descriptors.refractivity(mol) 190 | assert pytest.approx(value, rel=2) == 81.10 191 | 192 | 193 | def test_n_rigid_bonds(): 194 | mol = dm.to_mol("CC(=O)NC1CCC2=CC(=C(C(=C2C3=CC=C(C(=O)C=C13)OC)OC)OC)OC") 195 | assert dm.descriptors.n_rigid_bonds(mol) == 20 196 | 197 | mol = dm.to_mol("CN1C(=O)CN=C(C2=C1C=CC(=C2)Cl)C3=CC=CC=C3") 198 | assert dm.descriptors.n_rigid_bonds(mol) == 19 199 | 200 | 201 | def test_n_stereocenters(): 202 | mol = dm.to_mol("CC(=O)NC1CCC2=CC(=C(C(=C2C3=CC=C(C(=O)C=C13)OC)OC)OC)OC") 203 | 204 | assert dm.descriptors.n_stereo_centers(mol) == 1 205 | 206 | mol = dm.to_mol("CN1C(=O)CN=C(C2=C1C=CC(=C2)Cl)C3=CC=CC=C3") 207 | assert dm.descriptors.n_stereo_centers(mol) == 0 208 | 209 | 210 | def test_n_charged_atoms(): 211 | mol = dm.to_mol("C(CC(=O)[O-])C(C(=O)[O-])[NH3+]") 212 | assert dm.descriptors.n_charged_atoms(mol) == 3 213 | -------------------------------------------------------------------------------- /tests/test_fp.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import datamol as dm 4 | 5 | 6 | def test_to_fp(): 7 | smiles = "CC(=O)Oc1ccccc1C(=O)O" 8 | mol = dm.to_mol(smiles) 9 | 10 | assert dm.to_fp(mol).shape[0] == 2048 11 | assert dm.to_fp(mol).sum() == 31 12 | 13 | 14 | def test_list_fp(): 15 | assert set(dm.list_supported_fingerprints().keys()) == { 16 | "atompair", 17 | "atompair-count", 18 | "avalon-count", 19 | "ecfp", 20 | "fcfp", 21 | "ecfp-count", 22 | "erg", 23 | "estate", 24 | "fcfp-count", 25 | "layered", 26 | "maccs", 27 | "pattern", 28 | "rdkit", 29 | "topological", 30 | "topological-count", 31 | "rdkit-count", 32 | } 33 | 34 | 35 | def test_all_fps(): 36 | smiles = "CC(=O)Oc1ccccc1C(=O)O" 37 | mol = dm.to_mol(smiles) 38 | 39 | fp_infos = {} 40 | for fp_type in dm.list_supported_fingerprints(): 41 | fold_size = None 42 | if fp_type == "rdkit-count": 43 | 
fold_size = 2048 44 | 45 | print(fp_type) 46 | args = {} 47 | args["mol"] = mol 48 | args["as_array"] = True 49 | args["fp_type"] = fp_type 50 | args["fold_size"] = fold_size 51 | fp = dm.to_fp(**args) 52 | 53 | fp_infos[fp_type] = dict(size=len(fp), bits_sum=fp.sum()) 54 | 55 | print(fp_infos) 56 | 57 | assert fp_infos == { 58 | "maccs": {"size": 167, "bits_sum": 21}, 59 | "ecfp": {"size": 2048, "bits_sum": 31}, 60 | "fcfp": {"size": 2048, "bits_sum": 22}, 61 | "topological": {"size": 2048, "bits_sum": 18}, 62 | "atompair": {"size": 2048, "bits_sum": 68}, 63 | "rdkit": {"size": 2048, "bits_sum": 354}, 64 | "pattern": {"size": 2048, "bits_sum": 173}, 65 | "layered": {"size": 2048, "bits_sum": 335}, 66 | "erg": {"size": 315, "bits_sum": 23.4}, 67 | "estate": {"size": 79, "bits_sum": 13}, 68 | "avalon-count": {"size": 512, "bits_sum": 168}, 69 | "ecfp-count": {"size": 2048, "bits_sum": 42}, 70 | "fcfp-count": {"size": 2048, "bits_sum": 35}, 71 | "topological-count": {"size": 2048, "bits_sum": 19}, 72 | "atompair-count": {"size": 2048, "bits_sum": 78}, 73 | "rdkit-count": {"size": 2048, "bits_sum": 301}, 74 | } 75 | 76 | 77 | def test_fp_invalid_input(): 78 | args = {} 79 | args["mol"] = None 80 | args["radius"] = 3 81 | 82 | with pytest.raises(ValueError): 83 | dm.to_fp(**args) 84 | 85 | args["mol"] = "dsdsdsd" 86 | with pytest.raises(ValueError): 87 | dm.to_fp(**args) 88 | -------------------------------------------------------------------------------- /tests/test_fragment.py: -------------------------------------------------------------------------------- 1 | import datamol as dm 2 | 3 | 4 | def test_brics(): 5 | smiles = "CCCOCc1cc(c2ncccc2)ccc1" 6 | mol = dm.to_mol(smiles) 7 | frags = dm.fragment.brics(mol) 8 | assert len(frags) == 9 9 | 10 | 11 | def test_frag(): 12 | smiles = "CCCOCc1cc(c2ncccc2)ccc1" 13 | mol = dm.to_mol(smiles) 14 | frags = dm.fragment.frag(mol) 15 | assert len(frags) == 9 16 | 17 | 18 | def test_recap(): 19 | smiles = "CCCOCc1cc(c2ncccc2)ccc1" 20 | mol = dm.to_mol(smiles) 21 | frags = dm.fragment.recap(mol) 22 | assert len(frags) == 3 23 | 24 | 25 | def test_anybreak(): 26 | smiles = "CCCOCc1cc(c2ncccc2)ccc1" 27 | mol = dm.to_mol(smiles) 28 | frags = dm.fragment.anybreak(mol) 29 | assert len(frags) == 9 30 | 31 | 32 | def test_mmpa(): 33 | smiles = "CCCOCc1cc(c2ncccc2)ccc1" 34 | mol = dm.to_mol(smiles) 35 | 36 | frags = dm.fragment.mmpa_cut(mol) 37 | assert len(frags) == 39 38 | assert "CCCOCc1cccc(-c2ccccn2)c1,C(C[*:2])[*:1],C[*:1].c1ccc(-c2cccc(CO[*:2])c2)nc1\n" in frags 39 | 40 | 41 | def test_assemble(): 42 | # Fragment a molecule 43 | smiles = "CCCOCc1cc(c2ncccc2)ccc1" 44 | mol = dm.to_mol(smiles) 45 | frags = dm.fragment.brics(mol) 46 | 47 | # Limit the number of fragments to work with because 48 | # assembling is computationally intensive. 
49 | frags = frags[:2] 50 | 51 | # Assemble molecules from the list of fragments 52 | mols = list(dm.fragment.assemble_fragment_order(frags, max_n_mols=4)) 53 | 54 | assert len(mols) == 4 55 | 56 | 57 | def test_break_mol(): 58 | smiles = "CCCOCc1cc(c2ncccc2)ccc1" 59 | mol = dm.to_mol(smiles) 60 | fragments, *_, tree = dm.fragment.break_mol(mol, randomize=False, mode="brics", returnTree=True) 61 | 62 | assert fragments == ["CCC", "O", "C", "c1ccncc1", "c1ccccc1"] 63 | assert list(tree.nodes) == [0, 1, 2, 3, 4, 5, 6, 7, 8] 64 | assert list(tree.edges) == [(0, 1), (0, 2), (2, 3), (2, 4), (4, 5), (4, 6), (6, 7), (6, 8)] 65 | 66 | 67 | def test_assemble_build(): 68 | mols = [[dm.to_mol("CCCO"), dm.to_mol("CCCCCCCO")], [dm.to_mol("CCC"), dm.to_mol("CCCCCCC")]] 69 | 70 | results = list(dm.fragment.build(mols)) 71 | assert len(results) == 71 72 | 73 | results = list(dm.fragment.build(mols, mode="rxn")) 74 | assert len(results) == 0 75 | 76 | results = list(dm.fragment.build(mols, mode=None)) 77 | assert len(results) == 0 78 | -------------------------------------------------------------------------------- /tests/test_import.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import datamol as dm 4 | 5 | 6 | def test_datamol_import_fails(): 7 | with pytest.raises(AttributeError): 8 | dm.that_import_does_not_exist 9 | -------------------------------------------------------------------------------- /tests/test_isomers.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import datamol as dm 4 | 5 | 6 | def test_enumerate_tautomers(): 7 | mol = dm.to_mol("OC1=CC2CCCCC2[N:1]=C1") 8 | 9 | mols = dm.enumerate_tautomers(mol, n_variants=10) 10 | 11 | assert {dm.to_smiles(m) for m in mols} == {"O=C1C=[N:1]C2CCCCC2C1", "OC1=CC2CCCCC2[N:1]=C1"} 12 | 13 | 14 | def test_enumerate_stereo(): 15 | mol = dm.to_mol("OC1=CC2CCCCC2[N:1]=C1") 16 | 17 | mols = dm.enumerate_stereoisomers(mol, n_variants=10) 18 | 19 | assert {dm.to_smiles(m) for m in mols} == { 20 | "OC1=C[C@@H]2CCCC[C@@H]2[N:1]=C1", 21 | "OC1=C[C@@H]2CCCC[C@H]2[N:1]=C1", 22 | "OC1=C[C@H]2CCCC[C@@H]2[N:1]=C1", 23 | "OC1=C[C@H]2CCCC[C@H]2[N:1]=C1", 24 | } 25 | 26 | 27 | def test_enumerate_stereo_undefined_failure(): 28 | mol = dm.to_mol( 29 | "N=1C(NC2CC2)=C3C(=NC1)N(/C=C/C=4C=C(C=CC4C)C(=O)NC=5C=C(C=C(C5)N6CCN(CC6)C)C(F)(F)F)C=N3" 30 | ) 31 | with pytest.raises(RuntimeError): 32 | dm.enumerate_stereoisomers(mol, clean_it=True) 33 | 34 | mols = dm.enumerate_stereoisomers(mol, clean_it=False) 35 | assert len(mols) == 2 # only one double bond 36 | 37 | 38 | def test_enumerate_stereo_timeout(): 39 | mol = dm.to_mol("CCCCC") 40 | 41 | # NOTE(hadim): it's impossible to predict anything given a timeout for different 42 | # machines so we here we just check the code can run without errors 43 | dm.enumerate_stereoisomers(mol, n_variants=2, timeout_seconds=1) 44 | 45 | 46 | def test_count_stereoisomers(): 47 | num_isomers_1 = dm.count_stereoisomers(dm.to_mol("CC=CC"), undefined_only=True) 48 | num_isomers_2 = dm.count_stereoisomers(dm.to_mol("CC=CC"), undefined_only=False) 49 | assert num_isomers_1 == num_isomers_2 50 | 51 | assert dm.count_stereoisomers(dm.to_mol("Br/C=C\\Br"), undefined_only=True) == 1 52 | 53 | 54 | def test_enumerate_structural(): 55 | mol = dm.to_mol("CCCCC") # pentane has only three structural isomers 56 | 57 | mols_iso = dm.enumerate_structisomers( 58 | mol, 59 | n_variants=2, 60 | allow_cycle=False, 61 | depth=1, 62 | 
allow_double_bond=False, 63 | allow_triple_bond=False, 64 | ) 65 | 66 | assert {dm.to_smiles(m) for m in mols_iso} == {"CCC(C)C"} 67 | 68 | # NOTE(hadim): disable to reduce testing time 69 | # mols_cyclo_iso = dm.enumerate_structisomers(mol, n_variants=5, depth=2, allow_cycle=True) 70 | 71 | # # expect 3 molecules with cycles 72 | # assert sum([Chem.rdMolDescriptors.CalcNumRings(x) == 1 for x in mols_cyclo_iso]) == 3 # type: ignore 73 | 74 | # mols_cyclo_iso_double = dm.enumerate_structisomers( 75 | # mol, n_variants=10, allow_cycle=True, allow_double_bond=True 76 | # ) 77 | # should have mol with double link 78 | # assert sum(["=" in dm.to_smiles(x) for x in mols_cyclo_iso_double]) > 0 79 | 80 | 81 | @pytest.mark.skip_platform("win") 82 | def test_enumerate_structural_timeout(): 83 | mol = dm.to_mol("CCCCC") 84 | 85 | # NOTE(hadim): it's impossible to predict anything given a timeout for different 86 | # machines so we here we just check the code can run without errors 87 | dm.enumerate_structisomers(mol, n_variants=10, timeout_seconds=1) 88 | 89 | 90 | def test_canonical_tautomer(): 91 | smiles = "Oc1c(cccc3)c3nc2ccncc12" 92 | mol = dm.to_mol(smiles) 93 | 94 | canonical_mol = dm.canonical_tautomer(mol) 95 | 96 | assert dm.to_smiles(canonical_mol) == "O=c1c2ccccc2[nH]c2ccncc12" 97 | assert dm.to_inchikey(canonical_mol) == dm.to_inchikey(mol) 98 | 99 | 100 | def test_remove_stereochemistry(): 101 | mol = dm.to_mol("C[C@H]1CCC[C@@H](C)[C@@H]1Cl") 102 | mol_no_stereo = dm.remove_stereochemistry(mol) 103 | assert dm.to_smiles(mol_no_stereo) == "CC1CCCC(C)C1Cl" 104 | -------------------------------------------------------------------------------- /tests/test_log.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import datamol as dm 4 | 5 | 6 | @dm.no_rdkit_log 7 | def no_log_to_mol(smiles): 8 | return dm.to_mol(smiles) 9 | 10 | 11 | def check_logs_are_shown(capfd): 12 | smiles = "fake_smiles" 13 | dm.to_mol(smiles) 14 | _, err = capfd.readouterr() 15 | assert "SMILES Parse Error" in err 16 | 17 | 18 | def check_logs_are_not_shown(capfd): 19 | smiles = "fake_smiles" 20 | dm.to_mol(smiles) 21 | _, err = capfd.readouterr() 22 | assert err == "" 23 | 24 | 25 | def check_logs_are_not_shown_deco(capfd): 26 | smiles = "fake_smiles" 27 | no_log_to_mol(smiles) 28 | _, err = capfd.readouterr() 29 | assert err == "" 30 | 31 | 32 | @pytest.mark.skip_platform("win") 33 | def test_rdkit_log(capfd): 34 | """Test multiple rdkit log scenarios.""" 35 | 36 | check_logs_are_shown(capfd) 37 | check_logs_are_not_shown_deco(capfd) 38 | 39 | check_logs_are_shown(capfd) 40 | with dm.without_rdkit_log(): 41 | check_logs_are_not_shown(capfd) 42 | check_logs_are_shown(capfd) 43 | 44 | dm.disable_rdkit_log() 45 | check_logs_are_not_shown(capfd) 46 | 47 | dm.enable_rdkit_log() 48 | check_logs_are_shown(capfd) 49 | 50 | dm.disable_rdkit_log() 51 | with dm.without_rdkit_log(): 52 | check_logs_are_not_shown(capfd) 53 | check_logs_are_not_shown(capfd) 54 | 55 | 56 | @pytest.mark.skip_platform("win") 57 | def test_rdkit_log_enable(capfd): 58 | dm.enable_rdkit_log() 59 | 60 | with dm.without_rdkit_log(): 61 | check_logs_are_not_shown(capfd) 62 | 63 | with dm.without_rdkit_log(enable=False): 64 | check_logs_are_shown(capfd) 65 | 66 | check_logs_are_shown(capfd) 67 | -------------------------------------------------------------------------------- /tests/test_mcs.py: -------------------------------------------------------------------------------- 1 | import datamol as dm 
2 | 3 | 4 | def test_find_mcs(): 5 | smiles_list = [ 6 | "C=CC(=O)NCCOc1cc2ncnc(Nc3ccc(Br)cc3F)c2cc1NC(=O)C=C", 7 | "C=CC(=O)Nc1cc2c(Nc3ccc(F)c(Br)c3)ncnc2cc1OCCCN1CCOCC1", 8 | "C=CC(=O)Nc1cc2c(Nc3ccc(Br)cc3F)ncnc2cc1OCCCNC(=O)CN(C)C", 9 | "C=CC(=O)Nc1cc2c(Nc3ccc(Br)cc3F)ncnc2cc1OCCNC(=O)NCC", 10 | "C=CC(=O)Nc1cc2c(Nc3ccc(Br)cc3F)ncnc2cc1OCCNC(=O)CN(C)C", 11 | ] 12 | mols = [dm.to_mol(s) for s in smiles_list] 13 | smarts = dm.find_mcs(mols=mols, timeout=2) 14 | 15 | # NOTE(hadim): hash are different given different RDKit version 16 | expected_hashes = [ 17 | # RDKit >= 2023.09 18 | "762f483ac10cc0f45c5aa2c790f9ef52f8dfb337", 19 | # RDKit <= 2023.03 20 | "49eff32e405d17980fad428cf4063ec52e2c5fda", 21 | ] 22 | 23 | assert dm.hash_mol(dm.from_smarts(smarts)) in expected_hashes 24 | -------------------------------------------------------------------------------- /tests/test_molar.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import datamol as dm 4 | import numpy as np 5 | import pandas as pd 6 | 7 | 8 | MOLAR_TEST_VALUES = pd.DataFrame( 9 | [ 10 | (1, 6, "uM"), 11 | (0.059, 7.229147988357856, "uM"), 12 | (0.024, 7.61978876, "uM"), 13 | (0.187, 6.72815839, "uM"), 14 | (0.00154, 8.8124793, "uM"), 15 | (128, 6.892790, "nM"), 16 | (0.000128, 6.892790, "mM"), 17 | ], 18 | columns=["xc50", "pxc50", "unit"], 19 | ) 20 | 21 | 22 | def test_molar_to_log(): 23 | # test scalar 24 | value, log_value, unit = MOLAR_TEST_VALUES.iloc[0].values 25 | assert dm.molar.molar_to_log(value, unit=unit) == log_value 26 | 27 | # test arrays 28 | for unit in ["uM", "mM", "nM"]: 29 | mask = MOLAR_TEST_VALUES["unit"] == unit 30 | values = MOLAR_TEST_VALUES[mask]["xc50"].tolist() 31 | log_values = MOLAR_TEST_VALUES[mask]["pxc50"].tolist() 32 | np.testing.assert_almost_equal(dm.molar.molar_to_log(values, unit=unit), log_values) 33 | 34 | # test wrong unit 35 | with pytest.raises(ValueError): 36 | dm.molar.molar_to_log(0.000128, unit="kcal/mol") 37 | 38 | 39 | def test_log_to_molar(): 40 | # test scalar 41 | value, log_value, unit = MOLAR_TEST_VALUES.iloc[0].values 42 | np.testing.assert_almost_equal(dm.molar.log_to_molar(log_value, unit=unit), value) 43 | 44 | # test arrays 45 | for unit in ["uM", "mM", "nM"]: 46 | mask = MOLAR_TEST_VALUES["unit"] == unit 47 | values = MOLAR_TEST_VALUES[mask]["xc50"].tolist() 48 | log_values = MOLAR_TEST_VALUES[mask]["pxc50"].tolist() 49 | np.testing.assert_almost_equal( 50 | dm.molar.log_to_molar(log_values, unit=unit), values, decimal=5 51 | ) 52 | 53 | # test wrong unit 54 | with pytest.raises(ValueError): 55 | dm.molar.log_to_molar(7.214, unit="kcal/mol") 56 | 57 | 58 | def test_log_to_molar_with_integer(): 59 | dm.molar.log_to_molar(6, unit="uM") 60 | -------------------------------------------------------------------------------- /tests/test_notebooks.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import pathlib 3 | 4 | import nbformat 5 | import datamol as dm 6 | from nbconvert.preprocessors.execute import ExecutePreprocessor 7 | 8 | ROOT_DIR = pathlib.Path(__file__).parent.resolve() 9 | 10 | NOTEBOOK_DIR = ROOT_DIR.parent / "docs" / "tutorials" 11 | 12 | NOTEBOOK_PATHS = sorted(list(NOTEBOOK_DIR.glob("*.ipynb"))) 13 | 14 | # Discard `Filesystem.ipynb` because it takes too long to run. 
15 | NOTEBOOK_PATHS = list(filter(lambda x: "Filesystem.ipynb" != x.name, NOTEBOOK_PATHS)) 16 | 17 | 18 | @pytest.mark.skip_platform("win") 19 | @pytest.mark.parametrize("nb_path", NOTEBOOK_PATHS, ids=[str(n.name) for n in NOTEBOOK_PATHS]) 20 | def test_notebook(nb_path): 21 | # Setup and configure the processor to execute the notebook 22 | if "Visualization.ipynb" in nb_path.name and not dm.is_greater_than_current_rdkit_version( 23 | "2023.03" 24 | ): 25 | pytest.skip("Circle Grid requires rdkit>2022.09") 26 | ep = ExecutePreprocessor(timeout=600, kernel_name="python") 27 | 28 | # Open the notebook 29 | with open(nb_path) as f: 30 | nb = nbformat.read(f, as_version=nbformat.NO_CONVERT) 31 | 32 | # Execute the notebook 33 | ep.preprocess(nb, {"metadata": {"path": NOTEBOOK_DIR}}) 34 | -------------------------------------------------------------------------------- /tests/test_predictors.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import datamol as dm 4 | import numpy as np 5 | 6 | 7 | def test_esol(): 8 | smiles = "Nc1cnn(-c2ccccc2)c(=O)c1Cl" 9 | mol = dm.to_mol(smiles) 10 | 11 | assert np.allclose(dm.predictors.esol(mol), -2.627091966265316) 12 | 13 | 14 | def test_esol_from_data(): 15 | data = dm.freesolv() 16 | data = data.iloc[:20] 17 | 18 | with pytest.raises(KeyError): 19 | dm.predictors.esol_from_data(data) 20 | 21 | data["mol"] = data["smiles"].apply(dm.to_mol) 22 | data["clogp"] = data["mol"].apply(dm.descriptors.clogp) 23 | data["mw"] = data["mol"].apply(dm.descriptors.mw) 24 | data["n_rotatable_bonds"] = data["mol"].apply(dm.descriptors.n_rotatable_bonds) 25 | data["n_aromatic_atoms_proportion"] = data["mol"].apply( 26 | dm.descriptors.n_aromatic_atoms_proportion 27 | ) 28 | 29 | # dataframe 30 | esol_values = dm.predictors.esol_from_data(data) 31 | assert esol_values.dtype == float 32 | assert esol_values.shape == (20,) 33 | 34 | # series 35 | v = dm.predictors.esol_from_data(data.iloc[0]) 36 | v = float(v) 37 | assert isinstance(v, float) 38 | 39 | # dict 40 | v = dm.predictors.esol_from_data(data.iloc[0].to_dict()) 41 | v = float(v) 42 | assert isinstance(v, float) 43 | -------------------------------------------------------------------------------- /tests/test_scaffold.py: -------------------------------------------------------------------------------- 1 | import datamol as dm 2 | 3 | 4 | def test_fuzzy_scaffolding(): 5 | smiles = [ 6 | "Cc1ccc(NC(=O)Cn2cccn2)c(Br)c1", 7 | "COc1ccc(OC(C)C(=O)N=c2sccn2C)cc1", 8 | "CC(NC(=O)CSCc1cccs1)C1CCCO1", 9 | "CC1CCCCN1C(=O)CN1CCC[C@@H](N)C1", 10 | "CCC(CC)COC(=O)[C@H](C)N[P@](=O)(OC[C@H]1O[C@](C#N)([C@H](O)[C@@H]1O)C1=CC=C2N1N=CN=C2N)OC1=CC=CC=C1", # no way this one (Remdesivir) is in the db 11 | "COc1ccc(OC(C)C(=O)N=c2sccn2C)cc1", 12 | ] 13 | 14 | mols = [dm.to_mol(s) for s in smiles] 15 | 16 | # NOTE(hadim): different version of rdkit (2020.09 vs 2021.03) returns 17 | # different SMILES here. 
18 | # assert "O=C(CN1CCC[C@@H]([*:1])C1)N1CCCCC1[*:2]" in all_scaffolds 19 | # assert "O=C(CSCc1cccs1)NC(C1CCCO1)[*:1]" in all_scaffolds 20 | # assert "O=C(N=c1sccn1[*:1])C(Oc1ccc([*:3])cc1)[*:2]" in all_scaffolds 21 | 22 | all_scaffolds, df_scf2infos, df_scf2groups = dm.scaffold.fuzzy_scaffolding(mols) 23 | 24 | assert len(all_scaffolds) == 5 25 | assert len(df_scf2infos.columns) == 3 26 | 27 | # because we are returning the output for each scf 28 | # these should be the same 29 | assert len(df_scf2infos.index) == len(df_scf2groups.index) 30 | assert list(df_scf2infos["scf"]) == list(df_scf2groups["scf"]) 31 | 32 | # mere coincidence that scf2infos and scf2groups for the columns have the 33 | # the same length. the reason there are 3 not two is because it could have 34 | # extra columns where a cell may have none values. 35 | assert len(df_scf2groups.columns) == 3 36 | -------------------------------------------------------------------------------- /tests/test_similarity.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import numpy as np 4 | import datamol as dm 5 | import datamol.utils.testing 6 | 7 | 8 | def test_pdist(): 9 | smiles_list = ["CC(=O)Oc1ccccc1C(=O)O", "C1OC1CC", "c1cc2ccccc2cc1"] 10 | mols = [dm.to_mol(smiles) for smiles in smiles_list] 11 | 12 | dist_mat = dm.pdist(mols) 13 | 14 | assert dist_mat.shape == (3, 3) 15 | assert dist_mat.sum() == 5.6757105943152455 16 | 17 | dist_mat = dm.pdist(mols, n_jobs=None) 18 | 19 | assert dist_mat.shape == (3, 3) 20 | assert dist_mat.sum() == 5.6757105943152455 21 | 22 | 23 | def test_pdist_condensed(): 24 | smiles_list = ["CC(=O)Oc1ccccc1C(=O)O", "C1OC1CC", "c1cc2ccccc2cc1"] 25 | mols = [dm.to_mol(smiles) for smiles in smiles_list] 26 | 27 | dist_mat = dm.pdist(mols, squareform=False) 28 | 29 | assert dist_mat.shape == (3,) 30 | assert dist_mat.sum() == 2.8378552971576227 31 | 32 | 33 | def test_cdist(): 34 | smiles_list1 = ["CC(=O)Oc1ccccc1C(=O)O", "C1OC1CC", "c1cc2ccccc2cc1"] 35 | mols1 = [dm.to_mol(smiles) for smiles in smiles_list1] 36 | 37 | smiles_list2 = [ 38 | "COc1cc(Nc2ncc(Cl)c(-c3cccc(CC#N)c3)n2)ccc1N1CCN(C)CC1", 39 | "ON=C(O)CCCCCN=C(O)C=C1c2ccccc2-c2ccccc21", 40 | "COc1ccc(CCc2nnc(-c3ccc4nc[nH]c4c3)o2)cc1Cl", 41 | ] 42 | mols2 = [dm.to_mol(smiles) for smiles in smiles_list2] 43 | 44 | dist_mat = dm.cdist(mols1, mols2) 45 | 46 | assert dist_mat.shape == (3, 3) 47 | assert np.isclose(dist_mat.mean(), 0.9416270180919872) 48 | 49 | 50 | def test_cdist_chunked(): 51 | smiles_list1 = ["CC(=O)Oc1ccccc1C(=O)O", "C1OC1CC", "c1cc2ccccc2cc1"] 52 | mols1 = [dm.to_mol(smiles) for smiles in smiles_list1] 53 | 54 | smiles_list2 = [ 55 | "COc1cc(Nc2ncc(Cl)c(-c3cccc(CC#N)c3)n2)ccc1N1CCN(C)CC1", 56 | "ON=C(O)CCCCCN=C(O)C=C1c2ccccc2-c2ccccc21", 57 | "COc1ccc(CCc2nnc(-c3ccc4nc[nH]c4c3)o2)cc1Cl", 58 | ] 59 | mols2 = [dm.to_mol(smiles) for smiles in smiles_list2] 60 | 61 | d1 = dm.cdist(mols1, mols2, distances_chunk=True) 62 | d2 = dm.cdist(mols1, mols2, distances_chunk=False) 63 | 64 | assert d1.shape == d2.shape 65 | assert np.allclose(d1, d2) 66 | 67 | 68 | def test_cdist_pdist_consistent(): 69 | smiles_list1 = ["CC(=O)Oc1ccccc1C(=O)O", "C1OC1CC", "c1cc2ccccc2cc1"] 70 | mols1 = [dm.to_mol(smiles) for smiles in smiles_list1] 71 | 72 | dist_mat = dm.cdist(mols1, mols1) 73 | dist_mat2 = dm.pdist(mols1) 74 | 75 | assert np.isclose(dist_mat.mean(), dist_mat2.mean()) 76 | assert np.allclose(dist_mat, dist_mat2) 77 | 78 | 79 | def test_cdist_pdist_invalid_input(): 80 | smiles_list = 
["CC(=O)Oc1ccccc1C(=O)O", "C1OC1CC", "c1cc2ccccc2cc1", "dsdsdsd"] 81 | 82 | with pytest.raises(ValueError): 83 | dm.similarity.cdist(smiles_list, smiles_list) 84 | 85 | with pytest.raises(ValueError): 86 | dm.similarity.pdist(smiles_list) 87 | 88 | 89 | def test_datamol_pdist_same_as_rdkit(): 90 | smiles_list = [ 91 | "COc1cc(Nc2ncc(Cl)c(-c3cccc(CC#N)c3)n2)ccc1N1CCN(C)CC1", 92 | "ON=C(O)CCCCCN=C(O)C=C1c2ccccc2-c2ccccc21", 93 | "COc1ccc(CCc2nnc(-c3ccc4nc[nH]c4c3)o2)cc1Cl", 94 | ] 95 | 96 | dist_mat = dm.similarity.pdist(smiles_list) 97 | dist_mat_rdkit = datamol.utils.testing.pdist_rdkit(smiles_list) 98 | 99 | assert np.allclose(dist_mat, dist_mat_rdkit) 100 | 101 | 102 | def test_datamol_cdist_same_as_rdkit(): 103 | smiles_list = [ 104 | "COc1cc(Nc2ncc(Cl)c(-c3cccc(CC#N)c3)n2)ccc1N1CCN(C)CC1", 105 | "ON=C(O)CCCCCN=C(O)C=C1c2ccccc2-c2ccccc21", 106 | "COc1ccc(CCc2nnc(-c3ccc4nc[nH]c4c3)o2)cc1Cl", 107 | ] 108 | 109 | smiles_list2 = ["CC(=O)Oc1ccccc1C(=O)O", "C1OC1CC", "c1cc2ccccc2cc1"] 110 | 111 | dist_mat = dm.similarity.cdist(smiles_list, smiles_list2) 112 | dist_mat_rdkit = datamol.utils.testing.cdist_rdkit(smiles_list, smiles_list2) 113 | 114 | assert np.allclose(dist_mat, dist_mat_rdkit) 115 | -------------------------------------------------------------------------------- /tests/test_utils_fs.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import pathlib 4 | 5 | import fsspec 6 | import datamol as dm 7 | 8 | 9 | def test_copy_files(tmp_path): 10 | source_path = tmp_path / "source.txt" 11 | destination_path = tmp_path / "destination.txt" 12 | 13 | content = "hello this is a content" 14 | with open(source_path, "w") as f: 15 | f.write(content) 16 | 17 | dm.utils.fs.copy_file(source_path, destination_path) 18 | 19 | with open(destination_path) as f: 20 | assert f.read() == content 21 | 22 | 23 | def test_copy_dir(tmp_path): 24 | source_path = tmp_path / "source_dir" 25 | source_path_subdir = source_path / "a_subdir" 26 | destination_path = tmp_path / "destination_dir" 27 | destination_path_subdir = destination_path / "a_subdir" 28 | 29 | dm.utils.fs.mkdir(source_path) 30 | dm.utils.fs.mkdir(source_path_subdir) 31 | 32 | content = "hello this is a content" 33 | file1_path = source_path / "hello.txt" 34 | with open(file1_path, "w") as f: 35 | f.write(content) 36 | 37 | file2_path = source_path_subdir / "hello.txt" 38 | with open(file2_path, "w") as f: 39 | f.write(content) 40 | 41 | assert not dm.utils.fs.is_dir(destination_path_subdir) 42 | assert not dm.utils.fs.is_dir(destination_path) 43 | 44 | dm.utils.fs.copy_dir(source_path, destination_path) 45 | 46 | assert dm.utils.fs.is_dir(destination_path_subdir) 47 | assert dm.utils.fs.is_dir(destination_path) 48 | assert dm.utils.fs.is_file(file1_path) 49 | assert dm.utils.fs.is_file(file2_path) 50 | 51 | with open(file1_path) as f: 52 | assert f.read() == content 53 | 54 | with open(file2_path) as f: 55 | assert f.read() == content 56 | 57 | 58 | def test_mkdir(tmp_path): 59 | source_path = tmp_path / "source_dir" 60 | source_path_subdir = source_path / "a_subdir" 61 | 62 | dm.utils.fs.mkdir(source_path) 63 | 64 | assert dm.utils.fs.is_dir(source_path) 65 | assert not dm.utils.fs.is_dir(source_path_subdir) 66 | 67 | dm.utils.fs.mkdir(source_path_subdir) 68 | 69 | assert dm.utils.fs.is_dir(source_path) 70 | assert dm.utils.fs.is_dir(source_path_subdir) 71 | 72 | 73 | @pytest.mark.skip_platform("win") 74 | def test_cache_dir(): 75 | cache_dir = dm.utils.fs.get_cache_dir("my_app") 76 
| assert str(cache_dir).endswith("my_app") 77 | assert cache_dir.exists() 78 | assert cache_dir.is_dir() 79 | 80 | cache_dir = dm.utils.fs.get_cache_dir("my_app", suffix="likelydonotalreadyexist", create=False) 81 | assert str(cache_dir).endswith("likelydonotalreadyexist") 82 | assert not cache_dir.exists() 83 | assert not cache_dir.is_dir() 84 | 85 | cache_dir = dm.utils.fs.get_cache_dir("my_app", suffix="iamasuffix") 86 | assert str(cache_dir).endswith("iamasuffix") 87 | assert "my_app" in str(cache_dir) 88 | assert cache_dir.exists() 89 | assert cache_dir.is_dir() 90 | 91 | 92 | def test_get_mapper(tmp_path): 93 | fsmapper = dm.utils.fs.get_mapper(str(tmp_path / "test.txt")) 94 | 95 | # NOTE(hadim): depends the fsspec version 96 | assert fsmapper.fs.protocol in ["file", ("file", "local")] 97 | 98 | 99 | @pytest.mark.skip_platform("win") 100 | def test_get_basename(tmp_path): 101 | assert dm.utils.fs.get_basename(str(tmp_path / "test.txt")) == "test.txt" 102 | assert dm.utils.fs.get_basename("s3://a-bucket-that-likely-do-not-exist/test.txt") == "test.txt" 103 | 104 | 105 | def test_get_extension(tmp_path): 106 | assert dm.utils.fs.get_extension(str(tmp_path / "test.txt")) == "txt" 107 | assert dm.utils.fs.get_extension("s3://a-bucket-that-likely-do-not-exist/test.txt") == "txt" 108 | 109 | 110 | def test_exists(tmp_path): 111 | tmp_file = tmp_path / "test.txt" 112 | 113 | assert not dm.utils.fs.exists(tmp_file) 114 | assert not dm.utils.fs.is_file(tmp_file) 115 | 116 | assert dm.utils.fs.is_dir(tmp_path) 117 | assert not dm.utils.fs.is_dir(tmp_path / "likely-does-not-exist") 118 | 119 | with open(tmp_file, "w") as f: 120 | f.write("hello") 121 | 122 | assert dm.utils.fs.exists(tmp_file) 123 | assert dm.utils.fs.is_file(tmp_file) 124 | 125 | assert not dm.utils.fs.is_file(open(tmp_file)) 126 | assert not dm.utils.fs.is_dir(open(tmp_file)) 127 | 128 | 129 | def test_get_protocol(tmp_path): 130 | assert dm.utils.fs.get_protocol(tmp_path / "ahahah.txt") == "file" 131 | assert dm.utils.fs.get_protocol("s3://a-bucket-that-likely-do-not-exist/test.txt") == "s3" 132 | 133 | 134 | def test_is_local_path(tmp_path): 135 | assert dm.utils.fs.is_local_path(tmp_path / "ahahah.txt") 136 | assert not dm.utils.fs.is_local_path("s3://a-bucket-that-likely-do-not-exist/test.txt") 137 | 138 | 139 | @pytest.mark.skip_platform("win") 140 | def test_join(tmp_path): 141 | assert ( 142 | dm.utils.fs.join("s3://a-bucket-that-likely-do-not-exist", "test.txt") 143 | == "s3://a-bucket-that-likely-do-not-exist/test.txt" 144 | ) 145 | assert dm.utils.fs.join(tmp_path, "test.txt") == str(tmp_path / "test.txt") 146 | 147 | 148 | def test_get_size(tmp_path): 149 | tmp_file = tmp_path / "test.txt" 150 | 151 | with open(tmp_file, "w") as f: 152 | f.write("hello") 153 | 154 | assert dm.utils.fs.get_size(tmp_file) > 0 155 | assert dm.utils.fs.get_size(open(tmp_file)) > 0 156 | assert dm.utils.fs.get_size(fsspec.open(tmp_file)) > 0 157 | 158 | 159 | def test_md5(tmp_path): 160 | tmp_file = tmp_path / "test.txt" 161 | 162 | with open(tmp_file, "w") as f: 163 | f.write("hello") 164 | 165 | assert dm.utils.fs.md5(tmp_file) == "5d41402abc4b2a76b9719d911017c592" 166 | 167 | 168 | @pytest.mark.skip_platform("win") 169 | def test_glob(tmp_path): 170 | for i in range(5): 171 | tmp_file = tmp_path / f"test_{i}.txt" 172 | 173 | with open(tmp_file, "w") as f: 174 | f.write("hello") 175 | 176 | tmp_path_regex = tmp_path / "*.txt" 177 | assert len(dm.utils.fs.glob(tmp_path_regex)) == 5 178 | 179 | 180 | def test_copy_file(tmp_path): 181 | 
tmp_file = tmp_path / "test.txt" 182 | 183 | assert dm.utils.fs.is_dir(tmp_path) 184 | assert dm.utils.fs.is_dir(str(tmp_path)) 185 | assert dm.utils.fs.is_dir(pathlib.Path(str(tmp_path))) 186 | 187 | assert not dm.utils.fs.is_dir(tmp_path / "not_exist_dir") 188 | assert not dm.utils.fs.is_dir(str(tmp_path / "not_exist_dir")) 189 | assert not dm.utils.fs.is_dir(pathlib.Path(str(tmp_path / "not_exist_dir"))) 190 | 191 | with open(tmp_file, "w") as f: 192 | f.write("hello") 193 | 194 | tmp_file2 = tmp_path / "test2.txt" 195 | assert not dm.utils.fs.is_file(tmp_file2) 196 | assert not dm.utils.fs.is_file(str(tmp_file2)) 197 | assert not dm.utils.fs.is_file(pathlib.Path(str(tmp_file2))) 198 | 199 | dm.utils.fs.copy_file(tmp_file, tmp_file2) 200 | 201 | assert dm.utils.fs.is_file(tmp_file2) 202 | assert dm.utils.fs.is_file(str(tmp_file2)) 203 | assert dm.utils.fs.is_file(pathlib.Path(str(tmp_file2))) 204 | assert open(tmp_file2).read() == "hello" 205 | 206 | with pytest.raises(ValueError): 207 | dm.utils.fs.copy_file(tmp_file, tmp_file2) 208 | 209 | tmp_file3 = tmp_path / "test3.txt" 210 | dm.utils.fs.copy_file(tmp_file, tmp_file3, progress=True) 211 | assert dm.utils.fs.is_file(tmp_file3) 212 | assert dm.utils.fs.is_file(str(tmp_file3)) 213 | assert dm.utils.fs.is_file(pathlib.Path(str(tmp_file3))) 214 | assert open(tmp_file3).read() == "hello" 215 | -------------------------------------------------------------------------------- /tests/test_utils_jobs.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numbers 3 | import operator 4 | import unittest 5 | 6 | import numpy as np 7 | import pandas as pd 8 | 9 | from functools import reduce 10 | 11 | import datamol as dm 12 | 13 | 14 | def random_fn(*args, op="mul", **kwargs): 15 | """Perform random functions on a list""" 16 | all_values = [x for x in args if isinstance(x, numbers.Number)] 17 | all_values += [x for x in kwargs.values() if isinstance(x, numbers.Number)] 18 | op_fn = getattr(operator, op, None) 19 | if op_fn is None: 20 | op_fn = getattr(math, op) 21 | return op_fn(all_values[0]) 22 | return reduce(op_fn, all_values) 23 | 24 | 25 | class TestJobs(unittest.TestCase): 26 | def test_sequential(self): 27 | jobrunner = dm.JobRunner(n_jobs=None, progress=False) 28 | # practically do nothing (add a single value with nothing) 29 | o1 = jobrunner(random_fn, [9, 25, 1024], op="add") 30 | self.assertEqual(o1, [9, 25, 1024]) 31 | 32 | # take the sqrt 33 | o2 = jobrunner(random_fn, [9, 25, 1024], op="sqrt") 34 | self.assertEqual(o2, [3, 5, 32]) 35 | 36 | # multiply all inputs 37 | o3 = jobrunner(random_fn, [(1, 2, 3), (4, 5, 6), (3, 4, 0)], arg_type="args", op="mul") 38 | self.assertEqual(o3, [6, 4 * 5 * 6, 0]) 39 | 40 | # do the same thing but with kwargs 41 | o4 = jobrunner( 42 | random_fn, 43 | iter([dict(a=1, b=2, c=3), dict(a=4, b=5, c=6), dict(a=3, b=4, c=0)]), 44 | arg_type="kwargs", 45 | op="mul", 46 | ) 47 | self.assertEqual(o4, [6, 4 * 5 * 6, 0]) 48 | 49 | o5 = jobrunner(random_fn, np.asarray([9, 25, 1024]), op="add") 50 | self.assertEqual(o5, [9, 25, 1024]) 51 | 52 | def test_parallel(self): 53 | jobrunner1 = dm.JobRunner(n_jobs=4, progress=True) # use loky backend 54 | o1 = jobrunner1(random_fn, [9, 25, 1024], op="add") 55 | self.assertEqual(o1, [9, 25, 1024]) 56 | 57 | o5 = jobrunner1(random_fn, np.asarray([9, 25, 1024]), op="add") 58 | self.assertEqual(o5, [9, 25, 1024]) 59 | 60 | o3 = jobrunner1(random_fn, [(1, 2, 3), (4, 5, 6), (3, 4, 0)], arg_type="args", op="mul") 61 | 
self.assertEqual(o3, [6, 4 * 5 * 6, 0]) 62 | 63 | # use threads instead, no progress 64 | jobrunner2 = dm.JobRunner(n_jobs=2, progress=False, prefer="threads") 65 | o2 = jobrunner2(random_fn, [9, 25, 1024], op="sqrt") 66 | self.assertEqual(o2, [3, 5, 32]) 67 | 68 | o4 = jobrunner2( 69 | random_fn, 70 | iter([dict(a=1, b=2, c=3), dict(a=4, b=5, c=6), dict(a=3, b=4, c=0)]), 71 | arg_type="kwargs", 72 | op="mul", 73 | ) 74 | self.assertEqual(o4, [6, 4 * 5 * 6, 0]) 75 | 76 | def test_seq_vs_parallel(self): 77 | # test parallel vs sequential 78 | jobrunner = dm.JobRunner(n_jobs=4, progress=False) # use loky backend 79 | o_seq = jobrunner.sequential( 80 | random_fn, [(1, 2, 3), (4, 5, 6), (3, 4, 0)], arg_type="args", op="mul" 81 | ) 82 | o_par = jobrunner.parallel( 83 | random_fn, [(1, 2, 3), (4, 5, 6), (3, 4, 0)], arg_type="args", op="mul" 84 | ) 85 | self.assertEqual(o_seq, o_par) 86 | 87 | def test_parallelized(self): 88 | def fn(x): 89 | return x**2 90 | 91 | results = dm.parallelized( 92 | fn, 93 | [{"x": i} for i in range(10)], 94 | scheduler="processes", 95 | n_jobs=None, 96 | arg_type="kwargs", 97 | progress=True, 98 | ) 99 | assert results == [0, 1, 4, 9, 16, 25, 36, 49, 64, 81] 100 | 101 | results = dm.parallelized( 102 | fn, 103 | [[i] for i in range(10)], 104 | scheduler="processes", 105 | n_jobs=None, 106 | arg_type="args", 107 | progress=True, 108 | ) 109 | assert results == [0, 1, 4, 9, 16, 25, 36, 49, 64, 81] 110 | 111 | results = dm.parallelized( 112 | fn, 113 | range(10), 114 | scheduler="processes", 115 | n_jobs=None, 116 | progress=False, 117 | ) 118 | assert results == [0, 1, 4, 9, 16, 25, 36, 49, 64, 81] 119 | 120 | def test_job_kwargs(self): 121 | def fn(x): 122 | return x**2 123 | 124 | results = dm.parallelized( 125 | fn, 126 | [{"x": i} for i in range(10)], 127 | scheduler="processes", 128 | n_jobs=None, 129 | arg_type="kwargs", 130 | progress=True, 131 | verbose=100, 132 | ) 133 | assert results == [0, 1, 4, 9, 16, 25, 36, 49, 64, 81] 134 | 135 | def test_tqdm_kwargs(self): 136 | def fn(x): 137 | return x**2 138 | 139 | results = dm.parallelized( 140 | fn, 141 | [{"x": i} for i in range(10)], 142 | scheduler="processes", 143 | n_jobs=None, 144 | arg_type="kwargs", 145 | progress=True, 146 | tqdm_kwargs=dict(desc="My progress bar"), 147 | ) 148 | assert results == [0, 1, 4, 9, 16, 25, 36, 49, 64, 81] 149 | 150 | def test_with_batch_size(self): 151 | def _fn(n): 152 | return n * 3 153 | 154 | def _fn_return_none(n): 155 | return None 156 | 157 | results = dm.utils.parallelized( 158 | _fn, 159 | range(997), 160 | n_jobs=-1, 161 | progress=True, 162 | batch_size=10, 163 | ) 164 | assert len(results) == 997 165 | 166 | results = dm.utils.parallelized( 167 | _fn_return_none, 168 | range(997), 169 | n_jobs=-1, 170 | progress=True, 171 | batch_size=10, 172 | ) 173 | assert len(results) == 997 174 | 175 | def test_with_total(self): 176 | def _fn_process_fn(_, row): 177 | datum = {} 178 | datum["smiles"] = row["smiles"] 179 | return pd.Series(datum) 180 | 181 | data = dm.freesolv() 182 | data = data.iloc[:50] 183 | 184 | # parallel mode 185 | 186 | ## check the `total` arg is ok 187 | dm.parallelized( 188 | _fn_process_fn, 189 | data.iterrows(), 190 | n_jobs=-1, 191 | progress=True, 192 | arg_type="args", 193 | total=50, 194 | ) 195 | 196 | ## check collision between guessed total and provided one 197 | dm.parallelized( 198 | _fn_process_fn, 199 | list(data.iterrows()), 200 | n_jobs=-1, 201 | progress=True, 202 | arg_type="args", 203 | total=50, 204 | ) 205 | 206 | # sequential 
mode 207 | 208 | ## check the `total` arg is ok 209 | dm.parallelized( 210 | _fn_process_fn, 211 | data.iterrows(), 212 | n_jobs=1, 213 | progress=True, 214 | arg_type="args", 215 | total=50, 216 | ) 217 | 218 | ## check collision between guessed total and provided one 219 | dm.parallelized( 220 | _fn_process_fn, 221 | list(data.iterrows()), 222 | n_jobs=1, 223 | progress=True, 224 | arg_type="args", 225 | total=50, 226 | ) 227 | 228 | 229 | def test_parallelized_with_batches(): 230 | data = dm.freesolv() 231 | data = data.iloc[:10] 232 | 233 | def _fn1(smiles): 234 | return len(smiles) 235 | 236 | results1 = dm.parallelized( 237 | _fn1, 238 | data["smiles"], 239 | progress=False, 240 | n_jobs=-1, 241 | ) 242 | 243 | def _fn2(smiles_list): 244 | return [len(s) for s in smiles_list] 245 | 246 | results2 = dm.parallelized_with_batches( 247 | _fn2, 248 | data["smiles"], 249 | batch_size=2, 250 | progress=False, 251 | n_jobs=-1, 252 | ) 253 | 254 | assert results1 == results2 255 | -------------------------------------------------------------------------------- /tests/test_utils_perf.py: -------------------------------------------------------------------------------- 1 | import datamol as dm 2 | 3 | 4 | def test_watch_duration(): 5 | def fn(n): 6 | for i in range(n): 7 | print(i) 8 | 9 | with dm.utils.perf.watch_duration(log=True) as w: 10 | fn(5) 11 | 12 | assert isinstance(w.duration, float) 13 | -------------------------------------------------------------------------------- /tests/test_viz.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import base64 4 | import io 5 | 6 | import numpy as np 7 | import ipywidgets as widgets 8 | 9 | import PIL 10 | from PIL import Image 11 | 12 | import datamol as dm 13 | 14 | 15 | # NOTE(hadim): rdkit returns different image objects 16 | # according to the Python process context (Jupyter notebook vs terminal). 17 | # In consequence, those tests will fail if they are executed within a 18 | # Jupyter notebook. 
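# The grid-size assertions in this file are consistent with a simple layout rule:
# with use_svg=False, dm.viz.to_image(mols, n_cols=c, mol_size=(w, h)) lays the
# molecules out on ceil(len(mols) / c) rows, so the decoded PNG array has shape
# (rows * h, c * w, 3). The helper below is only an illustrative sketch of that
# rule and is not used by the tests; its name is arbitrary.
def _expected_grid_shape(n_mols, n_cols, mol_size=(200, 200)):
    import math

    n_rows = math.ceil(n_mols / n_cols)
    return (n_rows * mol_size[1], n_cols * mol_size[0], 3)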
19 | 20 | 21 | def _convert_ipython_to_array(image): 22 | """convert ipython image to numpy array""" 23 | image_obj = base64.b64decode(str(image._repr_png_())) 24 | try: 25 | image_obj = Image.open(io.BytesIO(image_obj)) 26 | return np.array(image_obj) 27 | except Exception: 28 | return np.array(image) 29 | 30 | 31 | def test_to_image(): 32 | # Get a list of molecules 33 | data = dm.data.freesolv() 34 | mols = dm.from_df(data) # type: ignore 35 | mols = mols[:8] 36 | 37 | # With multiple molecules 38 | legends = [dm.to_smiles(mol) for mol in mols] 39 | image = dm.viz.to_image(mols, legends=legends, n_cols=4, mol_size=(200, 200), use_svg=False) 40 | image = _convert_ipython_to_array(image) 41 | 42 | print(type(image)) 43 | 44 | image = np.array(image) 45 | 46 | assert image.dtype == np.uint8 47 | assert image.shape == (400, 800, 3) 48 | assert image.shape[1] == 200 * 4 49 | 50 | # With a single molecule 51 | mol = mols[0] 52 | legends = dm.to_smiles(mol) 53 | image = dm.viz.to_image(mol, legends=legends, mol_size=(200, 200), use_svg=False) 54 | image = _convert_ipython_to_array(image) 55 | image = np.array(image) 56 | 57 | assert image.dtype == np.uint8 58 | assert image.shape == (200, 200, 3) 59 | 60 | dm.viz.to_image(mol, indices=True, mol_size=400) 61 | 62 | # With input smiles 63 | mol = "CCCOCc1cc(c2ncccc2)ccc1" 64 | legends = mol 65 | image = dm.viz.to_image(mol, legends=legends, mol_size=(200, 200), use_svg=False) 66 | image = _convert_ipython_to_array(image) 67 | image = np.array(image) 68 | 69 | assert image.dtype == np.uint8 70 | assert image.shape == (200, 200, 3) 71 | 72 | 73 | def test_to_image_incorrect_aromaticity(): 74 | query = "C-c1cn(-C-2-[N,O:3]-[#6@H](-C-[#6,#8:1]-[*:2])-C(-[#8])-C-2-[#1,#8,#9:4])c2ncnc(-C)c12" 75 | mol = dm.from_smarts(query) 76 | dm.to_image( 77 | mol, 78 | mol_size=300, 79 | use_svg=False, 80 | legends="a legend", 81 | legend_fontsize=40, 82 | stereo_annotations=False, 83 | ) 84 | 85 | 86 | def test_to_image_save_file(tmpdir): 87 | smiles = "CCCOCc1cc(c2ncccc2)ccc1" 88 | mol = dm.to_mol(smiles) 89 | 90 | image_path = str(tmpdir.join("mol.png")) 91 | dm.viz.to_image(mol, outfile=image_path, use_svg=False) 92 | 93 | # check whether the png is valid 94 | try: 95 | img = Image.open(image_path) 96 | img.verify() 97 | except PIL.UnidentifiedImageError: 98 | pytest.fail(f"The image {image_path} is invalid.") 99 | 100 | image_path = str(tmpdir.join("mol.svg")) 101 | dm.viz.to_image(mol, outfile=image_path, use_svg=True) 102 | 103 | # check whether the svg looks valid 104 | with open(image_path) as f: 105 | content = f.read().strip() 106 | assert content.startswith("<?xml") or content.startswith("<svg") 107 | assert content.endswith("</svg>") 108 | 109 | 110 | def test_conformers(): 111 | import nglview as nv 112 | 113 | smiles = "CCCC=O" 114 | mol = dm.to_mol(smiles) 115 | mol = dm.conformers.generate(mol) 116 | 117 | # one conformer 118 | view = dm.viz.conformers(mol) 119 | assert type(view) == nv.widget.NGLWidget 120 | 121 | # multiple conformers 122 | view = dm.viz.conformers(mol, n_confs=12) 123 | assert type(view) == widgets.GridspecLayout 124 | 125 | 126 | @pytest.mark.skipif( 127 | not dm.is_greater_than_current_rdkit_version("2023.03"), 128 | reason="Circle Grid requires rdkit>2022.09", 129 | ) 130 | def test_circle_grid(tmp_path): 131 | mol = dm.to_mol("CC(=O)OC1=CC=CC=C1C(=O)O") 132 | dm.viz.circle_grid( 133 | mol, 134 | [ 135 | [dm.to_mol("CCC"), dm.to_mol("CCCCCCC")], 136 | [dm.to_mol("CCCO"), dm.to_mol("CCCCCCCO")], 137 | ], 138 | outfile=str(tmp_path / "image.png"), 139 | ) 140 | 141 | 142 | @pytest.mark.skipif( 143 |
not dm.is_greater_than_current_rdkit_version("2023.03"), 144 | reason="Circle Grid requires rdkit>2022.09", 145 | ) 146 | def test_circle_grid_with_hex_color(tmp_path): 147 | mol = dm.to_mol("CC(=O)OC1=CC=CC=C1C(=O)O") 148 | dm.viz.circle_grid( 149 | mol, 150 | [ 151 | [dm.to_mol("CCC"), dm.to_mol("CCCCCCC")], 152 | [dm.to_mol("CCCO"), dm.to_mol("CCCCCCCO")], 153 | ], 154 | ring_color="#ff1472", 155 | layout_random_seed=None, 156 | ) 157 | 158 | 159 | @pytest.mark.skipif( 160 | not dm.is_greater_than_current_rdkit_version("2023.03"), 161 | reason="Circle Grid requires rdkit>2022.09", 162 | ) 163 | def test_circle_grid_with_angle_start(tmp_path): 164 | mol = dm.to_mol("CC(=O)OC1=CC=CC=C1C(=O)O") 165 | dm.viz.circle_grid( 166 | mol, 167 | [ 168 | [dm.to_mol("CCC"), dm.to_mol("CCCCCCC"), dm.to_mol("CCCCCO")], 169 | [ 170 | dm.to_mol("CCCO"), 171 | ], 172 | ], 173 | # ring_color=(0, 0, 0, 0.5), 174 | ring_color="#ff1472aa", 175 | layout_random_seed=19, 176 | ring_mol_start_angles_degrees=[90, 90], 177 | ) 178 | 179 | 180 | def test_to_image_align(): 181 | # Get a list of molecules 182 | data = dm.data.freesolv() 183 | mols = dm.from_df(data) # type: ignore 184 | mols = mols[:8] 185 | 186 | # With multiple molecules 187 | dm.viz.to_image(mols, align=True) 188 | 189 | 190 | def test_to_image_align_template(): 191 | # Get a list of molecules 192 | data = dm.data.freesolv() 193 | mols = dm.from_df(data) # type: ignore 194 | mols = mols[:8] 195 | 196 | dm.viz.to_image(mols, align=mols[0]) 197 | -------------------------------------------------------------------------------- /tests/test_viz_lasso_highlight.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import datamol as dm 3 | 4 | 5 | # The following tests are supposed to work and should not raise any errors 6 | def test_original_working_solution_str(): 7 | smi = "CO[C@@H](O)C1=C(O[C@H](F)Cl)C(C#N)=C1ONNC[NH3+]" 8 | smarts_list = "CONN" 9 | assert dm.lasso_highlight_image(smi, smarts_list) 10 | 11 | 12 | # The following tests are supposed to work and should not raise any errors 13 | def test_from_mol(): 14 | smi = "CO[C@@H](O)C1=C(O[C@H](F)Cl)C(C#N)=C1ONNC[NH3+]" 15 | mol = dm.to_mol(smi) 16 | smarts_list = "CONN" 17 | assert dm.lasso_highlight_image(mol, smarts_list) 18 | 19 | 20 | def test_with_highlight(): 21 | smi = "CO[C@@H](O)C1=C(O[C@H](F)Cl)C(C#N)=C1ONNC[NH3+]" 22 | mol = dm.to_mol(smi) 23 | smarts_list = "CONN" 24 | highlight_atoms = [4, 5, 6] 25 | highlight_bonds = [1, 2, 3, 4] 26 | highlight_atom_colors = {4: (230, 230, 250), 5: (230, 230, 250), 6: (230, 230, 250)} 27 | highlight_bond_colors = { 28 | 1: (230, 230, 250), 29 | 2: (230, 230, 250), 30 | 3: (230, 230, 250), 31 | 4: (230, 230, 250), 32 | } 33 | assert dm.lasso_highlight_image( 34 | mol, 35 | smarts_list, 36 | highlight_atoms=highlight_atoms, 37 | highlight_bonds=highlight_bonds, 38 | highlight_atom_colors=highlight_atom_colors, 39 | highlight_bond_colors=highlight_bond_colors, 40 | continuousHighlight=False, 41 | ) 42 | 43 | 44 | def test_original_working_solution_list_single_str(): 45 | smi = "CO[C@@H](O)C1=C(O[C@H](F)Cl)C(C#N)=C1ONNC[NH3+]" 46 | smarts_list = ["CONN"] 47 | assert dm.lasso_highlight_image(smi, smarts_list) 48 | 49 | 50 | def test_original_working_solution_list_str(): 51 | smi = "CO[C@@H](O)C1=C(O[C@H](F)Cl)C(C#N)=C1ONNC[NH3+]" 52 | smarts_list = ["CONN", "N#CC~CO", "C=CON", "CONNCN"] 53 | assert dm.lasso_highlight_image(smi, smarts_list) 54 | 55 | 56 | def test_original_working_solution_mol(): 57 | 
smi = "CO[C@@H](O)C1=C(O[C@H](F)Cl)C(C#N)=C1ONNC[NH3+]" 58 | smarts_list = dm.to_mol("CONN") 59 | assert dm.lasso_highlight_image(smi, smarts_list) 60 | 61 | 62 | def test_original_working_solution_list_single_mol(): 63 | smi = "CO[C@@H](O)C1=C(O[C@H](F)Cl)C(C#N)=C1ONNC[NH3+]" 64 | smarts_list = [dm.to_mol("CONN")] 65 | assert dm.lasso_highlight_image(smi, smarts_list) 66 | 67 | 68 | def test_original_working_solution_List_mol(): 69 | smi = "CO[C@@H](O)C1=C(O[C@H](F)Cl)C(C#N)=C1ONNC[NH3+]" 70 | smarts_list = [dm.to_mol("CONN"), dm.to_mol("N#CC~CO"), dm.to_mol("C=CON"), dm.to_mol("CONNCN")] 71 | assert dm.lasso_highlight_image(smi, smarts_list) 72 | 73 | 74 | def test_wokring_solution_with_more_structures_than_colors(): 75 | smi = "CO[C@@H](O)C1=C(O[C@H](F)Cl)C(C#N)=C1ONNC[NH3+]" 76 | smarts_list = ["CONN", "N#CC~CO", "C=CON", "CONNCN", "FCCl", "OCO", "N#C", "N#CC", "CC#N"] 77 | assert dm.lasso_highlight_image(smi, smarts_list) 78 | 79 | 80 | def test_drawing_options(): 81 | smi = "CO[C@@H](O)C1=C(O[C@H](F)Cl)C(C#N)=C1ONNC[NH3+]" 82 | smarts_list = ["CONN", "N#CC~CO", "C=CON", "CONNCN", "FCCl", "OCO", "N#C", "N#CC", "CC#N"] 83 | assert dm.lasso_highlight_image(smi, smarts_list, bondLineWidth=15) 84 | 85 | 86 | def test_wrong_drawing_options(): 87 | smi = "CO[C@@H](O)C1=C(O[C@H](F)Cl)C(C#N)=C1ONNC[NH3+]" 88 | smarts_list = ["CONN", "N#CC~CO", "C=CON", "CONNCN", "FCCl", "OCO", "N#C", "N#CC", "CC#N"] 89 | 90 | with pytest.raises(ValueError): 91 | dm.lasso_highlight_image(smi, smarts_list, bondLineWidthXXXXXXX=15) 92 | 93 | 94 | def test_input_mol_is_none(): 95 | smarts_list = ["CONN", "N#CC~CO", "C=CON", "CONNCN", "FCCl", "OCO", "N#C", "N#CC", "CC#N"] 96 | 97 | with pytest.raises(ValueError): 98 | dm.lasso_highlight_image(None, smarts_list) 99 | 100 | 101 | def test_search_input_error_empty_list(): 102 | # should still go through but just print out the structure without any highlights 103 | smi = "CO[C@@H](O)C1=C(O[C@H](F)Cl)C(C#N)=C1ONNC[NH3+]" 104 | smarts_list = [] 105 | assert dm.lasso_highlight_image(smi, smarts_list) 106 | 107 | 108 | def test_target_input_error_empty_str(): 109 | with pytest.raises(ValueError): 110 | smi = "" 111 | smarts_list = ["CONN", "N#CC~CO", "C=CON", "CONNCN"] 112 | dm.lasso_highlight_image(smi, smarts_list) 113 | 114 | 115 | def test_target_input_error_None(): 116 | with pytest.raises(ValueError): 117 | smi = None 118 | smarts_list = ["CONN", "N#CC~CO", "C=CON", "CONNCN"] 119 | dm.lasso_highlight_image(smi, smarts_list) 120 | 121 | 122 | def test_search_input_error_smarts_no_substructure(): 123 | # This test should still continue but will just print out a structure without any highlights and a warning 124 | smi = "CO[C@@H](O)C1=C(O[C@H](F)Cl)C(C#N)=C1ONNC[NH3+]" 125 | smarts_list = ["CCCCCC"] 126 | assert dm.lasso_highlight_image(smi, smarts_list) 127 | 128 | 129 | # testing using " == str(type(img)) so to not bring in IPython 130 | # as a dependency for the tests 131 | def test_SVG_is_returned_explicit(): 132 | smi = "CO[C@@H](O)C1=C(O[C@H](F)Cl)C(C#N)=C1ONNC[NH3+]" 133 | smarts_list = ["CC"] 134 | img = dm.lasso_highlight_image(smi, smarts_list, use_svg=True) 135 | assert isinstance(img, str) 136 | 137 | 138 | def test_SVG_is_returned_implict(): 139 | smi = "CO[C@@H](O)C1=C(O[C@H](F)Cl)C(C#N)=C1ONNC[NH3+]" 140 | smarts_list = ["CC"] 141 | img = dm.lasso_highlight_image(smi, smarts_list) 142 | assert isinstance(img, str) 143 | 144 | 145 | def test_PNG_is_returned(): 146 | smi = "CO[C@@H](O)C1=C(O[C@H](F)Cl)C(C#N)=C1ONNC[NH3+]" 147 | smarts_list = ["CC"] 148 | 
img = dm.lasso_highlight_image(smi, smarts_list, use_svg=False) 149 | 150 | from PIL import Image 151 | 152 | assert isinstance(img, Image.Image) 153 | 154 | 155 | def test_aromatic_query_work(): 156 | smi = "CC(N)Cc1c[nH]c2ccc3c(c12)CCCO3" 157 | smarts_list = ["c1ccccc1"] 158 | assert dm.lasso_highlight_image(smi, smarts_list) 159 | 160 | 161 | def test_smarts_query(): 162 | smi = "CC(N)Cc1c[nH]c2ccc3c(c12)CCCO3" 163 | smarts_list = "[#6]" 164 | assert dm.lasso_highlight_image(smi, smarts_list) 165 | 166 | 167 | def test_query_and_atom_indices_list(): 168 | dm.viz.lasso_highlight_image( 169 | "CC(N)Cc1c[nH]c2ccc3c(c12)CCCO3", 170 | search_molecules="c1ccccc1", 171 | atom_indices=[[4, 5, 6], [1, 2, 3, 4]], 172 | ) 173 | 174 | 175 | def test_multiple_mol_lasso(): 176 | img = dm.viz.lasso_highlight_image( 177 | ["CC(N)Cc1c[nH]c2ccc3c(c12)CCCO3", "c1ccccc1"], 178 | search_molecules="c1ccccc1", 179 | ) 180 | assert isinstance(img, str) 181 | 182 | img = dm.viz.lasso_highlight_image( 183 | ["CC(N)Cc1c[nH]c2ccc3c(c12)CCCO3", "c1ccccc1"], 184 | search_molecules="c1ccccc1", 185 | mol_size=(200, 200), 186 | n_cols=1, 187 | use_svg=False, 188 | ) 189 | from PIL import Image 190 | 191 | assert isinstance(img, Image.Image) 192 | assert img.size == (200, 400)  # (width, height) for a single column of two 200x200 panels 193 | 194 | 195 | def test_multiple_mol_lasso_different_scale_legends(): 196 | dm.viz.lasso_highlight_image( 197 | ["CC(N)Cc1c[nH]c2ccc3c(c12)CCCO3", "c1ccccc1"], 198 | legends=["Mol1", "Mol2"], 199 | search_molecules="c1ccccc1", 200 | n_cols=1, 201 | draw_mols_same_scale=False, 202 | ) 203 | 204 | 205 | def test_atom_indices_list_of_list(): 206 | dm.viz.lasso_highlight_image( 207 | "CC(N)Cc1c[nH]c2ccc3c(c12)CCCO3", 208 | search_molecules=None, 209 | atom_indices=[[4, 5, 6], [1, 2, 3, 4]], 210 | ) 211 | 212 | 213 | def test_atom_indices_list(): 214 | dm.viz.lasso_highlight_image( 215 | "CC(N)Cc1c[nH]c2ccc3c(c12)CCCO3", 216 | search_molecules=None, 217 | atom_indices=[4, 5, 6], 218 | ) 219 | 220 | 221 | def test_with_hex_color(): 222 | dm.viz.lasso_highlight_image( 223 | "CC(N)Cc1c[nH]c2ccc3c(c12)CCCO3", 224 | search_molecules=None, 225 | atom_indices=[4, 5, 6], 226 | color_list=["#ff1472"], 227 | ) 228 | -------------------------------------------------------------------------------- /tests/test_viz_substrcture.py: -------------------------------------------------------------------------------- 1 | import datamol as dm 2 | 3 | 4 | def test_match_substructure(): 5 | mol1 = dm.to_mol("CC(=O)OC1=CC=CC=C1C(=O)O") 6 | mol2 = dm.to_mol("CCN(CC)CC(=O)CC(C)NC1=C2C=CC(=CC2=NC=C1)Cl") 7 | 8 | query1 = dm.from_smarts("[C;H0](=O)") 9 | query2 = dm.to_mol("CN(C)") 10 | 11 | # Test multiple scenarios 12 | 13 | dm.viz.match_substructure( 14 | mols=[mol1, mol2], 15 | queries=[query1, query2], 16 | highlight_bonds=True, 17 | use_svg=True, 18 | ) 19 | dm.viz.match_substructure( 20 | mols=mol1, 21 | queries=[query1, query2], 22 | highlight_bonds=True, 23 | use_svg=True, 24 | ) 25 | dm.viz.match_substructure( 26 | mols=[mol1, mol2], 27 | queries=query1, 28 | highlight_bonds=False, 29 | use_svg=False, 30 | ) 31 | dm.viz.match_substructure( 32 | mols=mol1, 33 | queries=query2, 34 | highlight_bonds=True, 35 | use_svg=False, 36 | ) 37 | --------------------------------------------------------------------------------
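For quick reference, the snippet below is a minimal end-to-end sketch of the visualization APIs exercised by tests/test_viz.py, tests/test_viz_lasso_highlight.py and tests/test_viz_substrcture.py. It is illustrative only: it reuses the arguments shown in those tests, and the molecules and output filename are arbitrary examples.

import datamol as dm

# Build two molecules and a substructure query (same calls as in the tests).
mols = [dm.to_mol("CC(=O)OC1=CC=CC=C1C(=O)O"), dm.to_mol("CCCOCc1cc(c2ncccc2)ccc1")]
query = dm.from_smarts("[C;H0](=O)")

# Render a grid with legends, as in test_to_image.
image = dm.viz.to_image(mols, legends=[dm.to_smiles(m) for m in mols], n_cols=2, mol_size=(200, 200), use_svg=False)

# Highlight substructure matches, as in test_match_substructure.
dm.viz.match_substructure(mols=mols, queries=query, highlight_bonds=True, use_svg=True)

# Lasso-style highlighting from a SMARTS/SMILES query, as in the lasso tests.
svg = dm.lasso_highlight_image("CC(N)Cc1c[nH]c2ccc3c(c12)CCCO3", ["c1ccccc1"])

# Write a rendering to disk, as in test_to_image_save_file (the path is arbitrary).
dm.viz.to_image(mols[0], outfile="aspirin.png", use_svg=False)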