├── .coveragerc
├── .editorconfig
├── .gitattributes
├── .github
└── workflows
│ ├── codecov.yml
│ ├── docs.yml
│ └── tests.yml
├── .gitignore
├── .isort.cfg
├── .markdownlint.yaml
├── .mypy.ini
├── .pytest.ini
├── .ruff.toml
├── .zenodo.json
├── CHANGELOG.md
├── CODE_OF_CONDUCT.md
├── LICENSE.md
├── README.md
├── dimorphite_dl
├── __init__.py
├── cli.py
├── io.py
├── mol.py
├── neutralize.py
├── protonate
│ ├── __init__.py
│ ├── change.py
│ ├── data.py
│ ├── detect.py
│ ├── results.py
│ ├── run.py
│ └── site.py
└── smarts
│ └── site_substructures.smarts
├── docs
├── .nav.yml
├── .overrides
│ └── main.html
├── css
│ ├── base.css
│ ├── colors.css
│ ├── jupyter.css
│ ├── launchy.css
│ └── mkdocstrings.css
├── development.md
├── gen_ref_pages.py
├── img
│ └── launchy
│ │ └── colab.svg
├── index.md
└── js
│ └── mathjax-config.js
├── hooks
└── launchy.py
├── mkdocs.yml
├── pixi.lock
├── pixi.toml
├── pyproject.toml
├── tests
├── conftest.py
├── files
│ └── sample_molecules.smi
├── mol
│ ├── test_detect_substruct.py
│ └── test_neutralize.py
├── protonate
│ ├── test_data.py
│ └── test_run.py
├── test_smiles_io.py
└── tmp
│ └── .gitignore
└── training_data
├── README.md
└── training_data.json
/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | branch = true
3 | data_file = .coverage
4 | source = "tests"
5 |
6 | [paths]
7 | source = dimorphite_dl
8 |
9 | [report]
10 | show_missing = true
11 | skip_empty = true
12 |
--------------------------------------------------------------------------------
/.editorconfig:
--------------------------------------------------------------------------------
1 | # Check http://editorconfig.org for more information
2 |
3 | root = true
4 |
5 | [*]
6 | indent_style = space
7 | indent_size = 4
8 | trim_trailing_whitespace = true
9 | insert_final_newline = true
10 | charset = utf-8
11 | end_of_line = lf
12 |
13 | [*.{py,pyi}]
14 | indent_style = space
15 | indent_size = 4
16 |
17 | [*.bat]
18 | indent_style = tab
19 | end_of_line = crlf
20 |
21 | [Makefile]
22 | indent_style = tab
23 |
24 | [*.{yml,yaml}]
25 | indent_size = 2
26 | trim_trailing_whitespace = true
27 |
28 | [*.md]
29 | indent_size = 4
30 | trim_trailing_whitespace = true
31 |
32 | [LICENSE]
33 | insert_final_newline = false
34 |
35 | [*.{diff,patch}]
36 | trim_trailing_whitespace = false
37 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Set the default behavior, in case people don't have core.autocrlf set.
2 | * text=auto
3 |
4 | # Explicitly declare text files you want to always be normalized and converted
5 | # to native line endings on checkout.
6 | *.md text
7 | *.rst text
8 |
9 | # Denote all files that are truly binary and should not be modified.
10 | *.png binary
11 | *.jpg binary
12 |
13 | pixi.lock linguist-language=YAML linguist-generated=true
14 |
--------------------------------------------------------------------------------
/.github/workflows/codecov.yml:
--------------------------------------------------------------------------------
1 | name: Codecov
2 |
3 | on:
4 | push:
5 | branches: [ "main" ]
6 | pull_request:
7 | branches: [ "main" ]
8 | workflow_dispatch:
9 |
10 | jobs:
11 | run:
12 | name: codecov
13 | runs-on: ubuntu-latest
14 |
15 | steps:
16 | - name: Checkout code
17 | uses: actions/checkout@v4
18 | with:
19 | lfs: true
20 |
21 | - name: Install pixi
22 | uses: prefix-dev/setup-pixi@v0.8.8
23 | with:
24 | locked: false
25 | frozen: false
26 | cache: true
27 | cache-write: ${{ github.event_name == 'push' && github.ref_name == 'main' }}
28 |
29 | - name: Setup environment
30 | run: pixi install -e dev
31 |
32 | - name: Get test coverage
33 | run: pixi run tests
34 |
35 | - name: Upload to Codecov
36 | uses: codecov/codecov-action@v5
37 | with:
38 | env_vars: OS,PYTHON
39 | fail_ci_if_error: true
40 | verbose: true
41 | token: ${{ secrets.CODECOV_TOKEN }}
42 |
43 |
--------------------------------------------------------------------------------
/.github/workflows/docs.yml:
--------------------------------------------------------------------------------
1 | name: Documentation
2 |
3 | on:
4 | push:
5 | branches: ["main"]
6 |
7 | workflow_dispatch:
8 |
9 | permissions:
10 | contents: read
11 | pages: write
12 | id-token: write
13 |
14 | concurrency:
15 | group: "pages"
16 | cancel-in-progress: true
17 |
18 | jobs:
19 | deploy:
20 | name: docs
21 | environment:
22 | name: github-pages
23 | url: ${{ steps.deployment.outputs.page_url }}
24 | runs-on: ubuntu-latest
25 |
26 | steps:
27 | - name: Checkout
28 | uses: actions/checkout@v4
29 | with:
30 | fetch-depth: 0
31 |
32 | - name: Install pixi
33 | uses: prefix-dev/setup-pixi@v0.8.8
34 | with:
35 | locked: false
36 | frozen: false
37 | cache: true
38 | cache-write: ${{ github.event_name == 'push' && github.ref_name == 'main' }}
39 |
40 | - name: Setup environment
41 | run: pixi install -e docs
42 |
43 | - name: Build documentation
44 | run: pixi run docs
45 |
46 | - name: Setup Pages
47 | uses: actions/configure-pages@v5
48 |
49 | - name: Upload artifact
50 | uses: actions/upload-pages-artifact@v3
51 | with:
52 | path: 'public/'
53 |
54 | - name: Deploy to GitHub Pages
55 | id: deployment
56 | uses: actions/deploy-pages@v4
57 |
58 |
--------------------------------------------------------------------------------
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
1 | name: Tests
2 |
3 | on:
4 | push:
5 | branches: [ "main" ]
6 | pull_request:
7 | branches: [ "main" ]
8 | # Allows you to run this workflow manually from the Actions tab
9 | workflow_dispatch:
10 |
11 | jobs:
12 | build:
13 | runs-on: ubuntu-latest
14 |
15 | steps:
16 | - name: Checkout repo
17 | uses: actions/checkout@v4
18 |
19 | - name: Install pixi
20 | uses: prefix-dev/setup-pixi@v0.8.8
21 | with:
22 | locked: false
23 | frozen: false
24 | cache: true
25 | cache-write: ${{ github.event_name == 'push' && github.ref_name == 'main' }}
26 |
27 | - name: Setup environment
28 | run: pixi install -e dev
29 |
30 | - name: Run tests
31 | run: pixi run tests
32 |
33 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | **/.DS_Store
2 | __pycache__
3 | .venv
4 | .env
5 | dist/
6 | **/_version.py
7 |
8 | # Jupyter Notebook
9 | **/.ipynb_checkpoints
10 |
11 | # IDE settings
12 | .vscode/
13 | .idea/
14 |
15 | public/
16 |
17 | .cache
18 |
19 | node_modules
20 | package-lock.json
21 | package.json
22 |
23 | # pixi environments
24 | .pixi
25 | *.egg-info
26 |
27 | # coverage
28 | coverage.xml
29 | report.xml
30 | .coverage
31 |
--------------------------------------------------------------------------------
/.isort.cfg:
--------------------------------------------------------------------------------
1 | [settings]
2 | py_version = 313
3 | skip=.bzr,.direnv,.eggs,.git,.hg,.mypy_cache,.nox,.pants.d,.svn,.tox,.venv,__pypackages__,_build,buck-out,build,dist,node_modules,venv,.pixi
4 | line_length = 88
5 | known_typing = typing,types,typing_extensions,mypy,mypy_extensions
6 | sections = FUTURE,TYPING,STDLIB,THIRDPARTY,FIRSTPARTY,LOCALFOLDER
7 | profile = black
8 | include_trailing_comma = true
9 | multi_line_output = 3
10 | indent = 4
11 |
--------------------------------------------------------------------------------
/.markdownlint.yaml:
--------------------------------------------------------------------------------
1 | # https://github.com/DavidAnson/markdownlint/blob/main/schema/.markdownlint.yaml
2 | MD007:
3 | indent: 4
4 | MD013: false
5 | MD022: false
6 | MD024: false
7 | MD026: false
8 | MD028: false
9 | MD030:
10 | ol_multi: 2
11 | ol_single: 2
12 | ul_multi: 3
13 | ul_single: 3
14 | MD031: false
15 | MD032: false
16 | MD033: false
17 | MD034: false
18 | MD036: false
19 | MD038: false
20 | MD041: false
21 | MD046: false
22 | MD052: false
23 | MD053: false
24 |
--------------------------------------------------------------------------------
/.mypy.ini:
--------------------------------------------------------------------------------
1 | [mypy]
2 | python_version = 3.12
3 | pretty = true
4 | show_traceback = true
5 | color_output = true
6 | allow_redefinition = false
7 | check_untyped_defs = true
8 | disallow_any_generics = true
9 | disallow_incomplete_defs = true
10 | ignore_missing_imports = true
11 | implicit_reexport = false
12 | no_implicit_optional = true
13 | show_column_numbers = true
14 | show_error_codes = true
15 | show_error_context = true
16 | strict_equality = true
17 | strict_optional = true
18 | warn_no_return = true
19 | warn_redundant_casts = true
20 | warn_return_any = true
21 | warn_unreachable = true
22 | warn_unused_configs = true
23 | warn_unused_ignores = true
24 |
--------------------------------------------------------------------------------
/.pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | norecursedirs =
3 | dimorphite_dl
4 | *.egg
5 | .eggs
6 | dist
7 | build
8 | docs
9 | .tox
10 | .git
11 | __pycache__
12 | doctest_optionflags =
13 | NUMBER
14 | NORMALIZE_WHITESPACE
15 | IGNORE_EXCEPTION_DETAIL
16 | addopts =
17 | --strict-markers
18 | --tb=short
19 | --doctest-modules
20 | --doctest-continue-on-failure
21 | testpaths = tests
22 |
--------------------------------------------------------------------------------
/.ruff.toml:
--------------------------------------------------------------------------------
1 | exclude = [
2 | ".bzr",
3 | ".direnv",
4 | ".eggs",
5 | ".git",
6 | ".git-rewrite",
7 | ".hg",
8 | ".ipynb_checkpoints",
9 | ".mypy_cache",
10 | ".nox",
11 | ".pants.d",
12 | ".pyenv",
13 | ".pytest_cache",
14 | ".pytype",
15 | ".ruff_cache",
16 | ".svn",
17 | ".tox",
18 | ".venv",
19 | ".vscode",
20 | "__pypackages__",
21 | "_build",
22 | "buck-out",
23 | "build",
24 | "dist",
25 | "node_modules",
26 | "site-packages",
27 | "venv",
28 | ".pixi",
29 | ".pytest_cache",
30 | ]
31 |
32 | line-length = 88
33 | indent-width = 4
34 |
35 | [format]
36 | quote-style = "double"
37 | indent-style = "space"
38 | line-ending = "lf"
39 | docstring-code-format = true
40 | docstring-code-line-length = "dynamic"
41 | skip-magic-trailing-comma = true
42 |
43 |
--------------------------------------------------------------------------------
/.zenodo.json:
--------------------------------------------------------------------------------
1 | {
2 | "upload_type": "software",
3 | "title": "Dimorphite-DL",
4 | "creators": [
5 | {
6 | "name": "Ropp, Patrick J."
7 | },
8 | {
9 | "name": "Kaminsky, Jesse C.",
10 | "orcid": "0000-0001-5796-2874"
11 | },
12 | {
13 | "name": "Yablonski, Sara"
14 | },
15 | {
16 | "name": "Spiegel, Jacob O.",
17 | "orcid": "0000-0002-8496-6915"
18 | },
19 | {
20 | "name": "Maldonado, Alex M.",
21 | "orcid": "0000-0003-3280-062X",
22 | "affiliation": "Department of Biological Sciences, University of Pittsburgh"
23 | },
24 | {
25 | "name": "Durrant, Jacob D.",
26 | "orcid": "0000-0002-5808-4097",
27 | "affiliation": "Department of Biological Sciences, University of Pittsburgh"
28 | }
29 | ],
30 | "access_right": "open",
31 | "license": {
32 | "id": "apache-2.0"
33 | },
34 | "language": "eng"
35 | }
36 |
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # Changelog
2 |
3 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
4 |
5 | ## [Unreleased]
6 |
7 | ### Added
8 |
9 | - Turn on and control logging through the CLI.
10 | - `colorize` keyword argument for `enable_logging` for logs to not use ANSI color codes.
11 |
12 | ### Fixed
13 |
14 | - Determining if a provided input string was a SMILES or path to file.
15 | `CCC(C)=C(Cl)C/C(I)=C(\C)F` was incorrectly classified as a file.
16 |
17 | ## [2.0.1] - 2025-06-03
18 |
19 | ### Changed
20 |
21 | - Rearranged `__init__.py` imports to mainly have `from dimorphite_dl import protonate_smiles`.
22 |
23 | ### Fixed
24 |
25 | - Circular import of SMARTS.
26 |
27 | ## [2.0.0] - 2025-06-01
28 |
29 | ### Changed
30 |
31 | - Fallback mechanism now uses the previous successful site protonation.
32 | In previous versions, sometimes only the last successful protonation site type was returned.
33 | If the third phosphate protonation failed, then it would fall back to the last successful protonation before the first phosphate.
34 | Now, we would return the second phosphate protonation.
35 | - Major refactor of practically everything.
36 |
37 | ## [1.2.5] - 2025-05-21
38 |
39 | ### Changed
40 |
41 | - Major reorganization of the original `dimorphite_dl.py` file into Python modules under the package name `dimorphite_dl`. No code logic has been changed, just refactored.
42 |
43 | ## [1.2.4]
44 |
45 | ### Added
46 |
47 | - Added test cases for ATP and NAD.
48 |
49 | ### Changed
50 |
51 | - Dimorphite-DL now better protonates compounds with polyphosphate chains
52 | (e.g., ATP). See `site_substructures.smarts` for the rationale behind the
53 | added pKa values.
54 | - `site_substructures.smarts` now allows comments (lines that start with `#`).
55 | - Improved support for the `--silent` option.
56 | - Reformatted code per the [*Black* Python code formatter](https://github.com/psf/black).
57 |
58 | ### Fixed
59 |
60 | - Fixed a bug that affected how Dimorphite-DL deals with new protonation
61 | states that yield invalid SMILES strings.
62 | - Previously, it simply returned the original input SMILES in these rare
63 | cases (better than nothing). Now, it instead returns the last valid SMILES
64 | produced, not necessarily the original SMILES.
65 | - Consider `O=C(O)N1C=CC=C1` at pH 3.5 as an example.
66 | - Dimorphite-DL first deprotonates the carboxyl group, producing
67 | `O=C([O-])n1cccc1` (a valid SMILES).
68 | - It then attempts to protonate the aromatic nitrogen, producing
69 | `O=C([O-])[n+]1cccc1`, an invalid SMILES.
70 | - Previously, it would output the original SMILES, `O=C(O)N1C=CC=C1`. Now
71 | it outputs the last valid SMILES, `O=C([O-])n1cccc1`.
72 |
73 | ## [1.2.3]
74 |
75 | ### Added
76 |
77 | - Added "silent" option to suppress all output.
78 | - Added code to suppress unnecessary RDKit warnings.
79 |
80 | ### Changed
81 |
82 | - Updated protonation of nitrogen, oxygen, and sulfur atoms to be compatible
83 | with the latest version of RDKit, which broke backwards compatibility.
84 | - Updated copyright to 2020.
85 |
86 | ## [1.2.2]
87 |
88 | ### Added
89 |
90 | - Added a new parameter to limit the number of variants per compound
91 | (`--max_variants`). The default is 128.
92 |
93 | ## [1.2.1]
94 |
95 | ### Fixed
96 |
97 | - Corrected a bug that rarely misprotonated/deprotonated compounds with
98 | multiple ionization sites (e.g., producing a carbanion).
99 |
100 | ## [1.2.0]
101 |
102 | ### Fixed
103 |
104 | - Corrected a bug that led Dimorphite-DL to sometimes produce output molecules
105 | that are non-physical.
106 | - Corrected a bug that gave incorrect protonation states for rare molecules
107 | (aromatic rings with nitrogens that are protonated when electrically
108 | neutral, e.g. pyridin-4(1H)-one).
109 | - `run_with_mol_list()` now preserves non-string properties.
110 | - `run_with_mol_list()` throws a warning if it cannot process a molecule,
111 | rather than terminating the program with an error.
112 |
113 | ## [1.1.0]
114 |
115 | ### Added
116 |
117 | - Dimorphite-DL now distinguishes between indoles/pyrroles and
118 | Aromatic_nitrogen_protonated.
119 | - It is now possible to call Dimorphite-DL from another Python script, in
120 | addition to the command line. See the `README.md` file for instructions.
121 |
122 | ## [1.0.0]
123 |
124 | The original version described in:
125 |
126 | Ropp PJ, Kaminsky JC, Yablonski S, Durrant JD (2019) Dimorphite-DL: An
127 | open-source program for enumerating the ionization states of drug-like small
128 | molecules. J Cheminform 11:14. doi:10.1186/s13321-019-0336-9.
129 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Covenant Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | We as members, contributors, and leaders pledge to make participation in our community a harassment-free experience for everyone, regardless of age, body size, visible or invisible disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, caste, color, religion, or sexual identity and orientation.
6 |
7 | We pledge to act and interact in ways that contribute to an open, welcoming, diverse, inclusive, and healthy community.
8 |
9 | ## Our Standards
10 |
11 | Examples of behavior that contributes to a positive environment for our community include:
12 |
13 | - Demonstrating empathy and kindness toward other people
14 | - Being respectful of differing opinions, viewpoints, and experiences
15 | - Giving and gracefully accepting constructive feedback
16 | - Accepting responsibility and apologizing to those affected by our mistakes, and learning from the experience
17 | - Focusing on what is best not just for us as individuals but for the overall community
18 |
19 | Examples of unacceptable behavior include:
20 |
21 | - The use of sexualized language or imagery and sexual attention or advances of any kind;
22 | - Trolling, insulting or derogatory comments, and personal or political attacks;
23 | - Public or private harassment;
24 | - Publishing others' private information, such as a physical or email address, without their explicit permission;
25 | - Other conduct that could reasonably be considered inappropriate in a professional setting.
26 |
27 | ## Enforcement Responsibilities
28 |
29 | Community leaders are responsible for clarifying and enforcing our standards of acceptable behavior.
30 | They will take appropriate and fair corrective action in response to any behavior that they deem inappropriate, threatening, offensive, or harmful.
31 |
32 | Community leaders have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned with this Code of Conduct and will communicate reasons for moderation decisions when appropriate.
33 |
34 | ## Scope
35 |
36 | This Code of Conduct applies within all community spaces and when an individual officially represents the community in public spaces.
37 | Examples of representing our community include:
38 |
39 | - using an official email address,
40 | - posting via an official social media account,
41 | - or acting as an appointed representative at an online or offline event.
42 |
43 | ## Enforcement
44 |
45 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to the community leaders responsible for enforcement at durrantj@pitt.edu.
46 | All complaints will be reviewed and investigated promptly and fairly.
47 |
48 | All community leaders must respect the privacy and security of the reporter of any incident.
49 |
50 | ## Enforcement Guidelines
51 |
52 | Community leaders will follow these Community Impact Guidelines in determining the consequences for any action they deem in violation of this Code of Conduct:
53 |
54 | ### 1. Correction
55 |
56 | **Community Impact**:
57 | Use of inappropriate language or other behavior deemed unprofessional or unwelcome in the community.
58 |
59 | **Consequence**:
60 | A private, written warning from community leaders, providing clarity around the nature of the violation and an explanation of why the behavior was inappropriate.
61 | A public apology may be requested.
62 |
63 | ### 2. Warning
64 |
65 | **Community Impact**:
66 | A violation through a single incident or series of actions.
67 |
68 | **Consequence**:
69 | A warning with consequences for continued behavior.
70 | No interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, for a specified period of time.
71 | This includes avoiding interactions in community spaces and external channels like social media.
72 | Violating these terms may lead to a temporary or permanent ban.
73 |
74 | ### 3. Temporary Ban
75 |
76 | **Community Impact**:
77 | A severe violation of community standards, including sustained inappropriate behavior.
78 |
79 | **Consequence**:
80 | A temporary ban from any sort of interaction or public communication with the community for a specified period of time.
81 | No public or private interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, is allowed during this period.
82 | Violating these terms may lead to a permanent ban.
83 |
84 | ### 4. Permanent Ban
85 |
86 | **Community Impact**:
87 | Demonstrating a pattern of violation of community standards, including sustained inappropriate behavior, harassment of an individual, or aggression toward or disparagement of classes of individuals.
88 |
89 | **Consequence**:
90 | A permanent ban from any sort of public interaction within the community.
91 |
92 | ## Attribution
93 |
94 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 2.1, available at [https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1].
95 |
96 | Community Impact Guidelines were inspired by [Mozilla's code of conduct enforcement ladder][Mozilla CoC].
97 |
98 | For answers to common questions about this code of conduct, see the FAQ at [https://www.contributor-covenant.org/faq][FAQ].
99 | Translations are available at [https://www.contributor-covenant.org/translations][translations].
100 |
101 | [homepage]: https://www.contributor-covenant.org
102 | [v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html
103 | [Mozilla CoC]: https://github.com/mozilla/diversity
104 | [FAQ]: https://www.contributor-covenant.org/faq
105 | [translations]: https://www.contributor-covenant.org/translations
106 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | # Apache License
2 |
3 | *Version 2.0, January 2004*
4 | <http://www.apache.org/licenses>
5 |
6 | ## Terms and Conditions for use, reproduction, and distribution
7 |
8 | ### 1. Definitions
9 |
10 | "License" shall mean the terms and conditions for use, reproduction, and
11 | distribution as defined by Sections 1 through 9 of this document.
12 |
13 | "Licensor" shall mean the copyright owner or entity authorized by the
14 | copyright owner that is granting the License.
15 |
16 | "Legal Entity" shall mean the union of the acting entity and all other
17 | entities that control, are controlled by, or are under common control with
18 | that entity. For the purposes of this definition, "control" means (i) the
19 | power, direct or indirect, to cause the direction or management of such
20 | entity, whether by contract or otherwise, or (ii) ownership of
21 | fifty percent (50%) or more of the outstanding shares, or (iii) beneficial
22 | ownership of such entity.
23 |
24 | "You" (or "Your") shall mean an individual or Legal Entity exercising
25 | permissions granted by this License.
26 |
27 | "Source" form shall mean the preferred form for making modifications,
28 | including but not limited to software source code, documentation source,
29 | and configuration files.
30 |
31 | "Object" form shall mean any form resulting from mechanical transformation
32 | or translation of a Source form, including but not limited to compiled
33 | object code, generated documentation, and conversions to
34 | other media types.
35 |
36 | "Work" shall mean the work of authorship, whether in Source or Object
37 | form, made available under the License, as indicated by a copyright notice
38 | that is included in or attached to the work (an example is provided in the
39 | Appendix below).
40 |
41 | "Derivative Works" shall mean any work, whether in Source or Object form,
42 | that is based on (or derived from) the Work and for which the editorial
43 | revisions, annotations, elaborations, or other modifications represent,
44 | as a whole, an original work of authorship. For the purposes of this
45 | License, Derivative Works shall not include works that remain separable
46 | from, or merely link (or bind by name) to the interfaces of, the Work and
47 | Derivative Works thereof.
48 |
49 | "Contribution" shall mean any work of authorship, including the original
50 | version of the Work and any modifications or additions to that Work or
51 | Derivative Works thereof, that is intentionally submitted to Licensor for
52 | inclusion in the Work by the copyright owner or by an individual or
53 | Legal Entity authorized to submit on behalf of the copyright owner.
54 | For the purposes of this definition, "submitted" means any form of
55 | electronic, verbal, or written communication sent to the Licensor or its
56 | representatives, including but not limited to communication on electronic
57 | mailing lists, source code control systems, and issue tracking systems
58 | that are managed by, or on behalf of, the Licensor for the purpose of
59 | discussing and improving the Work, but excluding communication that is
60 | conspicuously marked or otherwise designated in writing by the copyright
61 | owner as "Not a Contribution."
62 |
63 | "Contributor" shall mean Licensor and any individual or Legal Entity on
64 | behalf of whom a Contribution has been received by Licensor and
65 | subsequently incorporated within the Work.
66 |
67 | ### 2. Grant of Copyright License
68 |
69 | Subject to the terms and conditions of this License, each Contributor
70 | hereby grants to You a perpetual, worldwide, non-exclusive, no-charge,
71 | royalty-free, irrevocable copyright license to reproduce, prepare
72 | Derivative Works of, publicly display, publicly perform, sublicense,
73 | and distribute the Work and such Derivative Works in
74 | Source or Object form.
75 |
76 | ### 3. Grant of Patent License
77 |
78 | Subject to the terms and conditions of this License, each Contributor
79 | hereby grants to You a perpetual, worldwide, non-exclusive, no-charge,
80 | royalty-free, irrevocable (except as stated in this section) patent
81 | license to make, have made, use, offer to sell, sell, import, and
82 | otherwise transfer the Work, where such license applies only to those
83 | patent claims licensable by such Contributor that are necessarily
84 | infringed by their Contribution(s) alone or by combination of their
85 | Contribution(s) with the Work to which such Contribution(s) was submitted.
86 | If You institute patent litigation against any entity (including a
87 | cross-claim or counterclaim in a lawsuit) alleging that the Work or a
88 | Contribution incorporated within the Work constitutes direct or
89 | contributory patent infringement, then any patent licenses granted to
90 | You under this License for that Work shall terminate as of the date such
91 | litigation is filed.
92 |
93 | ### 4. Redistribution
94 |
95 | You may reproduce and distribute copies of the Work or Derivative Works
96 | thereof in any medium, with or without modifications, and in Source or
97 | Object form, provided that You meet the following conditions:
98 |
99 | 1. You must give any other recipients of the Work or Derivative Works a
100 | copy of this License; and
101 |
102 | 2. You must cause any modified files to carry prominent notices stating
103 | that You changed the files; and
104 |
105 | 3. You must retain, in the Source form of any Derivative Works that You
106 | distribute, all copyright, patent, trademark, and attribution notices from
107 | the Source form of the Work, excluding those notices that do not pertain
108 | to any part of the Derivative Works; and
109 |
110 | 4. If the Work includes a "NOTICE" text file as part of its distribution,
111 | then any Derivative Works that You distribute must include a readable copy
112 | of the attribution notices contained within such NOTICE file, excluding
113 | those notices that do not pertain to any part of the Derivative Works,
114 | in at least one of the following places: within a NOTICE text file
115 | distributed as part of the Derivative Works; within the Source form or
116 | documentation, if provided along with the Derivative Works; or, within a
117 | display generated by the Derivative Works, if and wherever such
118 | third-party notices normally appear. The contents of the NOTICE file are
119 | for informational purposes only and do not modify the License.
120 | You may add Your own attribution notices within Derivative Works that You
121 | distribute, alongside or as an addendum to the NOTICE text from the Work,
122 | provided that such additional attribution notices cannot be construed
123 | as modifying the License.
124 |
125 | You may add Your own copyright statement to Your modifications and may
126 | provide additional or different license terms and conditions for use,
127 | reproduction, or distribution of Your modifications, or for any such
128 | Derivative Works as a whole, provided Your use, reproduction, and
129 | distribution of the Work otherwise complies with the conditions
130 | stated in this License.
131 |
132 | ### 5. Submission of Contributions
133 |
134 | Unless You explicitly state otherwise, any Contribution intentionally
135 | submitted for inclusion in the Work by You to the Licensor shall be under
136 | the terms and conditions of this License, without any additional
137 | terms or conditions. Notwithstanding the above, nothing herein shall
138 | supersede or modify the terms of any separate license agreement you may
139 | have executed with Licensor regarding such Contributions.
140 |
141 | ### 6. Trademarks
142 |
143 | This License does not grant permission to use the trade names, trademarks,
144 | service marks, or product names of the Licensor, except as required for
145 | reasonable and customary use in describing the origin of the Work and
146 | reproducing the content of the NOTICE file.
147 |
148 | ### 7. Disclaimer of Warranty
149 |
150 | Unless required by applicable law or agreed to in writing, Licensor
151 | provides the Work (and each Contributor provides its Contributions)
152 | on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
153 | either express or implied, including, without limitation, any warranties
154 | or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS
155 | FOR A PARTICULAR PURPOSE. You are solely responsible for determining the
156 | appropriateness of using or redistributing the Work and assume any risks
157 | associated with Your exercise of permissions under this License.
158 |
159 | ### 8. Limitation of Liability
160 |
161 | In no event and under no legal theory, whether in tort
162 | (including negligence), contract, or otherwise, unless required by
163 | applicable law (such as deliberate and grossly negligent acts) or agreed
164 | to in writing, shall any Contributor be liable to You for damages,
165 | including any direct, indirect, special, incidental, or consequential
166 | damages of any character arising as a result of this License or out of
167 | the use or inability to use the Work (including but not limited to damages
168 | for loss of goodwill, work stoppage, computer failure or malfunction,
169 | or any and all other commercial damages or losses), even if such
170 | Contributor has been advised of the possibility of such damages.
171 |
172 | ### 9. Accepting Warranty or Additional Liability
173 |
174 | While redistributing the Work or Derivative Works thereof, You may choose
175 | to offer, and charge a fee for, acceptance of support, warranty,
176 | indemnity, or other liability obligations and/or rights consistent with
177 | this License. However, in accepting such obligations, You may act only
178 | on Your own behalf and on Your sole responsibility, not on behalf of any
179 | other Contributor, and only if You agree to indemnify, defend, and hold
180 | each Contributor harmless for any liability incurred by, or claims
181 | asserted against, such Contributor by reason of your accepting any such
182 | warranty or additional liability.
183 |
184 | *END OF TERMS AND CONDITIONS*
185 |
186 | ## APPENDIX: How to apply the Apache License to your work
187 |
188 | To apply the Apache License to your work, attach the following boilerplate
189 | notice, with the fields enclosed by brackets "[]" replaced with your own
190 | identifying information. (Don't include the brackets!) The text should be
191 | enclosed in the appropriate comment syntax for the file format. We also
192 | recommend that a file or class name and description of purpose be included
193 | on the same "printed page" as the copyright notice for easier
194 | identification within third-party archives.
195 |
196 | Copyright 2025 durrantlab
197 |
198 | Licensed under the Apache License, Version 2.0 (the "License");
199 | you may not use this file except in compliance with the License.
200 | You may obtain a copy of the License at
201 |
202 | https://www.apache.org/licenses/LICENSE-2.0
203 |
204 | Unless required by applicable law or agreed to in writing, software
205 | distributed under the License is distributed on an "AS IS" BASIS,
206 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
207 | or implied. See the License for the specific language governing
208 | permissions and limitations under the License.
209 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
dimorphite_dl
2 |
3 | Adds hydrogen atoms to molecular representations as specified by pH
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 | Dimorphite-DL is a fast, accurate, accessible, and modular open-source program designed for enumerating small-molecule ionization states.
34 | It specifically adds or removes hydrogen atoms from molecular representations to achieve the appropriate protonation state for a user-specified pH range.
35 |
36 | Accurate protonation states are crucial in cheminformatics and computational drug discovery, as a molecule's ionization state significantly impacts its physicochemical properties, biological activity, and interactions with targets.
37 | Dimorphite-DL addresses this by providing a robust solution for preparing molecules for various downstream applications like docking, molecular dynamics, and virtual screening.
38 |
39 | ## Installation
40 |
41 | You can install the latest released version on [PyPI](https://pypi.org/project/dimorphite-dl/) using the following command.
42 |
43 | ```bash
44 | pip install dimorphite_dl
45 | ```
46 |
47 | Or you can install the latest development version from the `main` branch on [GitHub](https://github.com/durrantlab/dimorphite_dl) using
48 |
49 | ```bash
50 | pip install git+https://github.com/durrantlab/dimorphite_dl.git
51 | ```
52 |
53 | ## Usage
54 |
55 | ### CLI
56 |
57 | The command-line interface (`dimorphite_dl`) provides straightforward access to Dimorphite-DL's functionalities.
58 |
59 | **Positional Arguments:**
60 |
61 | - `SMI`: SMILES string or path to a file containing SMILES strings to protonate.
62 |
63 | **Options:**
64 |
65 | - `--ph_min MIN`: Minimum pH to consider (default: 6.4).
66 | - `--ph_max MAX`: Maximum pH to consider (default: 8.4).
67 | - `--precision PRE`: pKa precision factor, representing the number of standard deviations from the mean pKa to consider when determining ionization states (default: 1.0).
68 | - `--output_file FILE`: Optional path to a file to write the protonated SMILES results.
69 | - `--max_variants MXV`: Limits the number of protonation variants generated per input compound (default: 128).
70 | - `--label_states`: If set, output SMILES will be labeled with their target ionization state ("DEPROTONATED", "PROTONATED", or "BOTH").
71 | - `--log_level`: Enable logging and set the level.
72 | Can be `none`, `debug`, `info`, `warning`, `error`, or `critical`.
73 | Defaults to no logging.
74 |
75 | #### Examples
76 |
77 | Protonate molecules from a file:
78 |
79 | ```bash
80 | dimorphite_dl sample_molecules.smi
81 | ```
82 |
83 | Protonate a single SMILES string within a specific pH range:
84 |
85 | ```bash
86 | dimorphite_dl --ph_min -3.0 --ph_max -2.0 "CCC(=O)O"
87 | ```
88 |
89 | Protonate a SMILES string and save output to a file:
90 |
91 | ```bash
92 | dimorphite_dl --ph_min -3.0 --ph_max -2.0 --output_file output.smi "CCCN"
93 | ```
94 |
95 | Protonate molecules from a file with increased pKa precision and state labels:
96 |
97 | ```bash
98 | dimorphite_dl --precision 2.0 --label_states sample_molecules.smi
99 | ```
100 |
101 | ### Scripting
102 |
103 | Dimorphite-DL can be easily integrated into your Python scripts.
104 | The primary function for this is `protonate_smiles`, which can be imported directly from `dimorphite_dl`.
105 |
106 | ```python
107 | from dimorphite_dl import protonate_smiles
108 |
109 | # Protonate a single SMILES string with custom pH range and precision
110 | protonated_mol_1: list[str] = protonate_smiles(
111 | "CCC(=O)O", ph_min=6.8, ph_max=7.9, precision=0.5
112 | )
113 | print(f"Protonated 'CCC(=O)O': {protonated_mol_1}")
114 |
115 | # Protonate a list of SMILES strings
116 | protonated_mol_list: list[str] = protonate_smiles(["CCC(=O)O", "CCCN"])
117 | print(f"Protonated list: {protonated_mol_list}")
118 |
119 | # Protonate molecules from a SMILES file
120 | # Make sure '~/example.smi' exists and contains SMILES strings
121 | # protonated_from_file: list[str] = protonate_smiles("~/example.smi")
122 | # print(f"Protonated from file: {protonated_from_file}")
123 |
124 | # Example with labeling states and limiting variants
125 | protonated_labeled: list[str] = protonate_smiles(
126 | "C1CCCCC1C(=O)O", ph_min=7.0, ph_max=7.4, label_states=True, max_variants=5
127 | )
128 | print(f"Protonated with labels: {protonated_labeled}")
129 | ```
130 |
131 | ## Known issues
132 |
133 | Dimorphite_dl is designed to handle the vast majority of ionizable functional groups accurately, but there are some edge cases where the current SMARTS patterns and pKa assignments may not behave as expected.
134 | The following are known limitations that users should be aware of when working with specific molecular substructures:
135 |
136 | - **Tertiary Amides**: Tertiary amides (e.g., N-acetylpiperidine `CC(=O)N1CCCCC1`) are incorrectly treated as basic amines (pKa ~8) instead of neutral species because current amide SMARTS patterns require an N-H bond.
137 | - **Indoles and Pyrroles**: These heterocycles are correctly deprotonated around pH 14.5 but are not protonated at very low pH (~-3.5) where they would be expected to protonate under extremely acidic conditions.
138 |
139 | ## Development
140 |
141 | We use [pixi](https://pixi.sh/latest/) to manage Python environments and simplify the developer workflow.
142 | Once you have [pixi](https://pixi.sh/latest/) installed, move into the `dimorphite_dl` directory (e.g., `cd dimorphite_dl`) and install the environment using the following command:
143 |
144 | ```bash
145 | pixi install
146 | ```
147 |
148 | Now you can activate the new virtual environment using
149 |
150 | ```sh
151 | pixi shell
152 | ```
153 |
154 | ## Citation
155 |
156 | If you use Dimorphite-DL in your research, please cite:
157 |
158 | Ropp PJ, Kaminsky JC, Yablonski S, Durrant JD (2019) Dimorphite-DL: An open-source program for enumerating the ionization states of drug-like small
159 | molecules. *J Cheminform 11*:14. doi: [10.1186/s13321-019-0336-9](https://doi.org/10.1186/s13321-019-0336-9).
160 |
161 | ## License
162 |
163 | This project is released under the Apache-2.0 License as specified in `LICENSE.md`.
164 |
--------------------------------------------------------------------------------
/dimorphite_dl/__init__.py:
--------------------------------------------------------------------------------
1 | """Adds hydrogen atoms to molecular representations as specified by pH"""
2 |
3 | from typing import Any
4 |
5 | import os
6 | import sys
7 | from ast import literal_eval
8 |
9 | from loguru import logger
10 |
11 | from .protonate.run import protonate_smiles
12 |
13 | __all__ = ["protonate_smiles"]
14 |
# Expose the package version from the `_version` module when it can be
# imported (presumably generated at build time -- TODO confirm); otherwise
# fall back to a placeholder so importing the package never fails here.
try:
    from ._version import version as __version__
except ImportError:
    __version__ = "unknown"

# Keep this package's log records silent until enable_logging() is called.
logger.disable("dimorphite_dl")

# Default record layout used by enable_logging():
# time | level padded to 8 columns | name, function, line - message
LOG_FORMAT = (
    "{time:HH:mm:ss} | "
    "{level: <8} | "
    "{name} :{function} :{line} - {message} "
)
27 |
28 |
def enable_logging(
    level_set: int,
    stdout_set: bool = True,
    file_path: str | os.PathLike[str] | None = None,
    log_format: str = LOG_FORMAT,
    colorize: bool = True,
) -> None:
    r"""Configure loguru handlers and enable logging for `dimorphite_dl`.

    The handler list built here is passed to `logger.configure`, so the
    handlers requested by this call become the active loguru configuration.

    Args:
        level_set: Minimum severity to emit: `10` is debug, `20` is info.
        stdout_set: If true, write log records to `sys.stdout`.
        file_path: Optional file to also write log records to. Both `str`
            and path-like objects (e.g., `pathlib.Path`) are accepted; any
            other value, including `None`, adds no file handler.
        log_format: Loguru format string applied to every handler.
        colorize: Passed to every handler, including the file handler.
    """
    handlers: list[dict[str, Any]] = []
    if stdout_set:
        handlers.append(
            {
                "sink": sys.stdout,
                "level": level_set,
                "format": log_format,
                "colorize": colorize,
            }
        )
    # Accept path-like objects too; previously a pathlib.Path was silently
    # ignored because only `str` passed the type check.
    if isinstance(file_path, (str, os.PathLike)):
        handlers.append(
            {
                "sink": file_path,
                "level": level_set,
                "format": log_format,
                "colorize": colorize,
            }
        )
    # https://loguru.readthedocs.io/en/stable/api/logger.html#loguru._logger.Logger.configure
    logger.configure(handlers=handlers)

    logger.enable("dimorphite_dl")
65 |
66 |
def _env_flag(name: str, default: bool) -> bool:
    """Read a boolean switch from the environment without raising.

    Values that are valid Python literals (`True`, `False`, `1`, `0`, ...)
    keep their previous meaning: the literal's truthiness. Values that are
    not literals no longer raise at import time; `true`, `yes`, and `on`
    (case-insensitive) count as enabled and anything else as disabled.

    Args:
        name: Environment variable to read.
        default: Value returned when the variable is not set.

    Returns:
        The interpreted flag.
    """
    raw = os.environ.get(name)
    if raw is None:
        return default
    try:
        return bool(literal_eval(raw))
    except (ValueError, SyntaxError):
        # e.g. DIMORPHITE_DL_LOG=true used to crash `import dimorphite_dl`.
        return raw.strip().lower() in {"true", "yes", "on"}


# Optional logging setup driven entirely by environment variables.
if _env_flag("DIMORPHITE_DL_LOG", False):
    level = int(os.environ.get("DIMORPHITE_DL_LOG_LEVEL", 20))
    stdout = _env_flag("DIMORPHITE_DL_STDOUT", True)
    log_file_path = os.environ.get("DIMORPHITE_DL_LOG_FILE_PATH", None)
    enable_logging(level, stdout, log_file_path)
72 |
--------------------------------------------------------------------------------
/dimorphite_dl/cli.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | from loguru import logger
4 |
5 | from dimorphite_dl import __version__, enable_logging, protonate_smiles
6 |
# Maps each `--log_level` choice (other than "none") to the integer level
# that run_cli() passes to enable_logging().
LOG_LEVEL_TO_INT = {"debug": 10, "info": 20, "warning": 30, "error": 40, "critical": 50}
8 |
9 |
def run_cli() -> None:
    """Command-line entry point: parse arguments and protonate SMILES.

    Each protonated SMILES is printed to stdout, or written one per line to
    `--output_file` when that option is given. The output file is opened in
    a `with` block so it is flushed and closed even if protonation fails
    part-way through.
    """
    parser = argparse.ArgumentParser(description=f"dimorphite_dl v{__version__}")
    parser.add_argument(
        "--ph_min",
        metavar="MIN",
        type=float,
        default=6.4,
        help="Minimum pH to consider (default: 6.4)",
    )
    parser.add_argument(
        "--ph_max",
        metavar="MAX",
        type=float,
        default=8.4,
        help="Maximum pH to consider (default: 8.4)",
    )
    parser.add_argument(
        "--precision",
        metavar="PRE",
        type=float,
        default=1.0,
        help="pKa precision factor (i.e., number of standard devations)",
    )
    parser.add_argument(
        "--output_file",
        metavar="FILE",
        type=str,
        help="Output file to write protonated SMILES (optional)",
    )
    parser.add_argument(
        "--max_variants",
        metavar="MXV",
        type=int,
        default=128,
        help="Limit number of variants per input compound (default: 128)",
    )
    parser.add_argument(
        "--label_states",
        action="store_true",
        help="label protonated SMILES with target state "
        + '(i.e., "DEPROTONATED", "PROTONATED", or "BOTH").',
    )
    parser.add_argument(
        "--log_level",
        choices=["none", "debug", "info", "warning", "error", "critical"],
        default="none",
        help="Enable and set logging level. Defaults to none (i.e., no logging)",
    )
    parser.add_argument(
        "smiles", metavar="SMI", type=str, help="SMILES or path to SMILES to protonate"
    )

    args = parser.parse_args()
    if args.log_level != "none":
        enable_logging(LOG_LEVEL_TO_INT[args.log_level])

    protonate_kwargs = {
        "smiles_input": args.smiles,
        "ph_min": args.ph_min,
        "ph_max": args.ph_max,
        "precision": args.precision,
        "label_states": args.label_states,
        "max_variants": args.max_variants,
    }

    if args.output_file is None:
        for smiles_protonated in protonate_smiles(**protonate_kwargs):
            print(smiles_protonated)
        return

    logger.info("Writing smiles to {}", args.output_file)
    # The file is opened before protonation starts (as before), but is now
    # guaranteed to be closed when the loop finishes or raises.
    with open(args.output_file, "w", encoding="utf-8") as f:
        for smiles_protonated in protonate_smiles(**protonate_kwargs):
            f.write(smiles_protonated + "\n")
83 |
--------------------------------------------------------------------------------
/dimorphite_dl/io.py:
--------------------------------------------------------------------------------
1 | """
2 | Robust, memory-efficient SMILES string handling library.
3 |
4 | Provides unified streaming interface for processing SMILES from various sources
5 | with comprehensive error handling, validation, and memory optimization.
6 | """
7 |
8 | from typing import Any, TextIO
9 |
10 | import gzip
11 | import os
12 | import pathlib
13 | from collections.abc import Iterable, Iterator
14 | from dataclasses import dataclass, field
15 |
16 | from loguru import logger
17 | from rdkit.Chem.MolStandardize import rdMolStandardize
18 |
19 |
@dataclass
class SMILESRecord:
    """Container for a SMILES string with metadata.

    Instances are produced by `SMILESProcessor` for every accepted input
    entry.
    """

    # The SMILES string itself (the processor strips surrounding whitespace).
    smiles: str
    # Optional name from the second whitespace-separated column of the input.
    identifier: str = ""
    # 1-based position of the entry in its source (line or item number).
    source_line: int | None = None
    # Free-form extra data; a fresh dict per instance.
    metadata: dict[str, Any] = field(default_factory=dict)
28 |
29 |
class SMILESValidationError(Exception):
    """Signals that a SMILES string was rejected during validation."""
34 |
35 |
class SMILESStreamError(Exception):
    """Signals that an input source could not be streamed."""
40 |
41 |
class SMILESProcessor:
    """
    Memory-efficient SMILES string processor with robust error handling.

    Accepts a single SMILES string, a path to a (optionally gzip-compressed)
    text file, or any iterable of strings, and yields `SMILESRecord` objects
    lazily so large inputs never have to be held in memory at once.

    Error policy is controlled by `skip_invalid`: when true, problems are
    logged and counted; when false, they are raised.
    """

    def __init__(
        self,
        validate_smiles: bool = True,
        skip_invalid: bool = True,
        max_length: int | None = 10000,
        chunk_size: int = 1000,
    ):
        """
        Initialize SMILES processor.

        Args:
            validate_smiles: Whether to validate SMILES syntax
            skip_invalid: Skip invalid SMILES instead of raising errors
            max_length: Maximum allowed SMILES length. `None` (or `0`)
                disables the check.
            chunk_size: Default batch size used by `stream_batches`
        """
        self.validate_smiles = validate_smiles
        self.skip_invalid = skip_invalid
        self.max_length = max_length
        self.chunk_size = chunk_size
        self._stats: dict[str, int] = {"processed": 0, "skipped": 0, "errors": 0}

    def stream(
        self, input_data: str | Iterable[str] | Iterator[str]
    ) -> Iterator[SMILESRecord]:
        """
        Stream SMILES records from various input types.

        This is a generator: statistics are reset and work begins only once
        iteration starts.

        Args:
            input_data: File path, single SMILES, or iterable of SMILES

        Yields:
            SMILESRecord: Validated SMILES records with metadata

        Raises:
            SMILESValidationError: If a record is rejected and `skip_invalid`
                is false.
            SMILESStreamError: If the input cannot be processed and
                `skip_invalid` is false. When `skip_invalid` is true, such
                failures are logged and the stream simply ends.
        """
        self._reset_stats()

        try:
            if isinstance(input_data, str):
                yield from self._handle_string_input(input_data)
            elif hasattr(input_data, "__iter__"):
                yield from self._handle_iterable_input(input_data)
            else:
                raise SMILESStreamError(f"Unsupported input type: {type(input_data)}")

        except Exception as e:
            logger.error(f"Error streaming SMILES: {e}")
            if not self.skip_invalid:
                # Validation failures keep their own type; everything else is
                # reported as a streaming failure.
                if isinstance(e, SMILESValidationError):
                    raise e
                else:
                    raise SMILESStreamError(f"Failed to process input: {e}") from e

    def stream_batches(
        self, input_data: str | Iterable[str], batch_size: int | None = None
    ) -> Iterator[list[SMILESRecord]]:
        """
        Stream SMILES records in batches for efficient processing.

        Args:
            input_data: Input source
            batch_size: Size of each batch (uses instance default if None or 0)

        Yields:
            Batches of SMILES records; the final batch may be shorter.
        """
        batch_size = batch_size or self.chunk_size
        batch = []

        for record in self.stream(input_data):
            batch.append(record)
            if len(batch) >= batch_size:
                yield batch
                batch = []

        if batch:  # Yield remaining records
            yield batch

    def _handle_string_input(self, input_str: str) -> Iterator[SMILESRecord]:
        """Handle string input - either file path or single SMILES."""
        if self._is_file_path(input_str):
            yield from self._stream_from_file(input_str)
        else:
            # Single SMILES string
            record = self._create_record(input_str, source_line=1)
            if record:
                yield record

    def _handle_iterable_input(self, iterable: Iterable[str]) -> Iterator[SMILESRecord]:
        """Handle iterable input (list, generator, etc.).

        Each string item may hold a SMILES and an optional identifier
        separated by whitespace. Empty items are skipped. An item with more
        than two fields raises `ValueError`, which ends the stream (see
        `stream` for how that is reported).
        """
        logger.debug("Handling iterable input of {}", iterable)
        for line_num, line in enumerate(iterable, 1):
            if isinstance(line, str):
                line = line.strip()
                line_split = line.split()
                if len(line_split) > 2:
                    logger.warning(
                        f"Lines can only contain a smiles string and identifier, but we were given {line}"
                    )
                    raise ValueError(
                        "Line contains more than two items (smiles and identifier)"
                    )
                if len(line_split) == 0:
                    continue
                smiles = line_split[0]
                if len(line_split) == 2:
                    identifier = line_split[1]
                else:
                    identifier = ""
                record = self._create_record(smiles, identifier, source_line=line_num)
                if record:
                    yield record
            else:
                self._handle_error(
                    f"Non-string item at position {line_num}: {type(line)}"
                )

    def _stream_from_file(self, filepath: str) -> Iterator[SMILESRecord]:
        """Stream SMILES from a file, transparently reading gzip files.

        Raises:
            SMILESStreamError: If the file does not exist or any error occurs
                while reading it.
        """
        logger.debug("Streaming from {}", filepath)
        path = pathlib.Path(filepath)

        if not path.exists():
            raise SMILESStreamError(f"File not found: {filepath}")

        # Handle compressed files. The comparison is case-insensitive so it
        # agrees with the suffix handling in _stream_from_file_object.
        is_gzip = path.suffix.lower() == ".gz"
        open_func = gzip.open if is_gzip else open
        mode = "rt" if is_gzip else "r"

        try:
            with open_func(filepath, mode, encoding="utf-8", errors="replace") as f:
                yield from self._stream_from_file_object(f, path)
        except Exception as e:
            raise SMILESStreamError(f"Error reading file {filepath}: {e}") from e

    def _stream_from_file_object(
        self, file_obj: TextIO, path: pathlib.Path
    ) -> Iterator[SMILESRecord]:
        """Stream from file object based on file extension.

        Every extension is currently read as plain text; unrecognized ones
        only add a warning.
        """
        suffix = path.suffix.lower().replace(".gz", "")

        if suffix in {".smiles", ".smi", ".txt", ""}:
            yield from self._stream_from_text(file_obj)
        else:
            logger.warning(f"Unknown file format {suffix}, treating as text")
            yield from self._stream_from_text(file_obj)

    def _stream_from_text(self, file_obj: TextIO) -> Iterator[SMILESRecord]:
        """Stream SMILES from plain text file.

        Blank lines and lines starting with `#` are ignored. The first
        whitespace-separated field is the SMILES and the second, if present,
        the identifier; further fields are ignored.
        """
        for line_num, line in enumerate(file_obj, 1):
            line = line.strip()
            if line and not line.startswith("#"):
                # Handle multi-column format (SMILES ID)
                parts = line.split()
                smiles = parts[0]
                identifier = parts[1] if len(parts) > 1 else ""

                record = self._create_record(
                    smiles, identifier=identifier, source_line=line_num
                )
                if record:
                    yield record

    def _create_record(
        self,
        smiles: str,
        identifier: str = "",
        source_line: int | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> SMILESRecord | None:
        """Create and validate a SMILES record.

        Returns:
            The record, or `None` when the SMILES is empty or was rejected
            while `skip_invalid` is true.

        Raises:
            SMILESValidationError: If the SMILES is rejected and
                `skip_invalid` is false.
        """
        smiles = smiles.strip()

        if not smiles:
            return None

        try:
            # Length validation
            if self.max_length and len(smiles) > self.max_length:
                self._handle_error(
                    f"SMILES too long ({len(smiles)} > {self.max_length}): {smiles[:50]}..."
                )
                return None

            # Basic syntax validation
            if self.validate_smiles:
                if not self._validate_smiles_syntax(smiles):
                    self._handle_error(f"Invalid SMILES syntax: {smiles}")
                    return None

            self._stats["processed"] += 1
            return SMILESRecord(
                smiles=smiles,
                identifier=identifier,
                source_line=source_line,
                metadata=metadata or {},
            )

        except SMILESValidationError:
            # Raised by _handle_error above in strict mode. It has already
            # been counted, so propagate it untouched rather than handling
            # it a second time (which doubled the error count and nested
            # the message).
            raise
        except Exception as e:
            self._handle_error(f"Error creating record for '{smiles}': {e}")
            return None

    def _validate_smiles_syntax(self, smiles: str) -> bool:
        """SMILES syntax validation using RDKit.

        Returns `True` when `rdMolStandardize.ValidateSmiles` completes
        without raising, `False` otherwise.
        """
        logger.info("Processing {}", smiles)
        try:
            rdMolStandardize.ValidateSmiles(smiles)
            logger.debug("SMILES is valid")
            return True
        except Exception:
            logger.info("SMILES is NOT valid")
            return False

    def _is_file_path(self, s: str) -> bool:
        """Check if string is likely a file path.

        This is a heuristic: a string qualifies only if it exists on disk,
        contains a path separator, or ends with a known extension, and then
        either the path or its parent directory exists or the suffix is a
        known one.
        """
        # Don't treat very long strings as file paths
        if len(s) > 1000:
            return False

        # Check if it has any file path indicators
        has_path_indicators = (
            os.path.exists(s)
            or os.path.sep in s
            or (os.path.altsep and os.path.altsep in s)
            or s.endswith((".smiles", ".smi", ".txt", ".csv", ".sdf", ".gz"))
        )

        if not has_path_indicators:
            return False

        # If it looks like a path, try to validate it as a real file path
        try:
            path = pathlib.Path(s)
            # If the path exists, it's definitely a file
            if path.exists():
                return True
            # If the parent directory exists, it could be a valid file path
            if path.parent.exists():
                return True
            # If it has a valid file extension and reasonable structure, assume it's a path
            if path.suffix in {".smiles", ".smi", ".txt", ".csv", ".sdf", ".gz"}:
                return True
        except (OSError, ValueError):
            # If we can't even create a Path object, it's probably not a file path
            return False

        return False

    def _handle_error(self, message: str):
        """Handle errors based on skip_invalid setting.

        Always counts the error. In skip mode it is also counted as skipped
        and logged; otherwise `SMILESValidationError` is raised.
        """
        self._stats["errors"] += 1
        if self.skip_invalid:
            self._stats["skipped"] += 1
            logger.warning(message)
        else:
            raise SMILESValidationError(message)

    def _reset_stats(self):
        """Reset processing statistics."""
        self._stats = {"processed": 0, "skipped": 0, "errors": 0}

    def get_stats(self) -> dict[str, int]:
        """Get a copy of the processing statistics."""
        return self._stats.copy()
319 |
320 |
321 | # Convenience functions
def stream_smiles(input_data: str | Iterable[str], **kwargs) -> Iterator[SMILESRecord]:
    """Stream records from `input_data` using a one-off `SMILESProcessor`.

    Keyword arguments are forwarded to the `SMILESProcessor` constructor.
    """
    yield from SMILESProcessor(**kwargs).stream(input_data)
326 |
327 |
def process_smiles_file(
    filepath: str | os.PathLike[str], **kwargs
) -> Iterator[SMILESRecord]:
    """Convenience function for processing SMILES files.

    Args:
        filepath: Path to the SMILES file. Path-like objects such as
            `pathlib.Path` are converted with `os.fspath`; previously they
            were not recognized as a path and produced no records.
        **kwargs: Forwarded to the `SMILESProcessor` constructor.

    Yields:
        Records produced by `SMILESProcessor.stream`.
    """
    processor = SMILESProcessor(**kwargs)
    # Only path-like objects are converted; any other value is passed
    # through unchanged so existing callers behave exactly as before.
    source = os.fspath(filepath) if isinstance(filepath, os.PathLike) else filepath
    yield from processor.stream(source)
332 |
--------------------------------------------------------------------------------
/dimorphite_dl/mol.py:
--------------------------------------------------------------------------------
1 | """
2 | Class for handling SMILES strings and RDKit mol objects.
3 | """
4 |
5 | from typing import Any
6 |
7 | import copy
8 | import os
9 | import sys
10 |
11 | from loguru import logger
12 | from rdkit import Chem
13 |
14 | from dimorphite_dl.neutralize import MoleculeNeutralizer
15 |
16 |
17 | class MoleculeRecord:
18 | """
19 | Enhanced class for managing SMILES strings and RDKit mol objects.
20 |
21 | Handles all molecule-related operations including validation, conversion,
22 | neutralization, and hydrogen management.
23 | """
24 |
25 | def __init__(self, smiles: str, identifier: str = "") -> None:
26 | """
27 | Initialize a MoleculeRecord.
28 |
29 | Args:
30 | smiles: SMILES string representation of the molecule
31 | identifier: Optional unique identifier for the molecule
32 |
33 | Raises:
34 | ValueError: If smiles is not a valid string
35 | """
36 | assert isinstance(smiles, str)
37 | assert isinstance(identifier, str)
38 |
39 | smiles = smiles.strip()
40 | if not smiles:
41 | raise ValueError("SMILES string cannot be empty")
42 |
43 | self.smiles_original = smiles
44 | """Original SMILES used to initialize this MoleculeRecord"""
45 |
46 | self.identifier = identifier
47 | """Unique identifier for molecule"""
48 |
49 | self.smiles = smiles
50 | """Current SMILES after any processing"""
51 |
52 | self._mol: Chem.Mol | None = None
53 | """Cached RDKit mol object"""
54 |
55 | self._mol_with_hs: Chem.Mol | None = None
56 | """Cached RDKit mol object with explicit hydrogens"""
57 |
58 | self._neutralizer: MoleculeNeutralizer | None = None
59 | """Cached neutralizer instance"""
60 |
@property
def mol(self) -> Chem.Mol | None:
    """Cached RDKit mol; built from the current SMILES on first access.

    While the cache holds `None`, `to_mol()` is called on every access.
    """
    cached = self._mol
    if cached is None:
        cached = self._mol = self.to_mol()
    return cached

@mol.setter
def mol(self, value: Chem.Mol | None) -> None:
    """Replace the cached mol and drop the explicit-hydrogen cache."""
    self._mol_with_hs = None  # Derived from the old mol, so no longer valid
    self._mol = value
73 |
def to_mol(self) -> Chem.Mol | None:
    """
    Parse the current SMILES into an RDKit Mol object.

    The result is not cached by this method. On failure, a warning is
    logged that includes the captured error text when there is any.

    Returns:
        RDKit Mol object or None if conversion fails
    """
    outcome = self.to_mol_silenced(self.smiles)
    parsed = outcome["mol"]

    if parsed is not None:
        logger.trace("SMILES after conversion: {}", Chem.MolToSmiles(parsed))
        return parsed

    rdkit_message = outcome["stderr_content"].strip()
    if rdkit_message:
        logger.warning(
            "RDKit failed to parse SMILES '{}'. RDKit error: {}",
            self.smiles,
            rdkit_message,
        )
    else:
        logger.warning(
            "RDKit failed to parse SMILES '{}' (no specific error message)",
            self.smiles,
        )
    return None
101 |
def to_mol_with_hs(self) -> Chem.Mol | None:
    """
    Return the molecule with explicit hydrogens, caching the result.

    Returns:
        RDKit Mol object with explicit hydrogens or None if conversion fails
    """
    if self._mol_with_hs is not None:
        return self._mol_with_hs

    parent = self.mol
    if parent is not None:
        self._mol_with_hs = self.add_hydrogens(parent)
    return self._mol_with_hs
114 |
def refresh_mol_from_smiles(self) -> bool:
    """
    Rebuild the cached mol object from the current SMILES string.

    Both caches are cleared first. The freshly parsed mol is then stored,
    so a following `mol` access reuses it instead of parsing the SMILES a
    second time.

    Returns:
        True if successful, False otherwise
    """
    self._mol = None
    self._mol_with_hs = None
    self._mol = self.to_mol()
    return self._mol is not None
126 |
def update_smiles_from_mol(self, mol: Chem.Mol | None = None) -> bool:
    """
    Regenerate the stored SMILES (canonical, isomeric) from a mol object.

    Args:
        mol: Optional mol object to use. If None, uses self.mol

    Returns:
        True if successful, False otherwise
    """
    source = self.mol if mol is None else mol
    if source is None:
        logger.warning("Cannot update SMILES: no valid mol object available")
        return False

    try:
        generated = Chem.MolToSmiles(source, isomericSmiles=True, canonical=True)
    except Exception as e:
        logger.warning("Error generating SMILES from mol: {}", str(e))
        return False

    if not generated:
        return False

    self.smiles = generated
    # An externally supplied mol replaces the cached one, which also makes
    # the explicit-hydrogen cache stale.
    if source is not self._mol:
        self._mol = source
        self._mol_with_hs = None
    return True
157 |
158 | def get_canonical_smiles(self, isomeric: bool = True) -> str | None:
159 | """
160 | Get canonical SMILES representation.
161 |
162 | Args:
163 | isomeric: Whether to include stereochemistry information
164 |
165 | Returns:
166 | Canonical SMILES string or None if conversion fails
167 | """
168 | mol = self.mol
169 | if mol is None:
170 | return None
171 |
172 | try:
173 | return Chem.MolToSmiles(mol, isomericSmiles=isomeric, canonical=True)
174 | except Exception as e:
175 | logger.warning("Error generating canonical SMILES: {}", str(e))
176 | return None
177 |
def make_canonical(self, isomeric: bool = True) -> None:
    """
    Replace the current SMILES with its canonical form.

    Nothing changes when no canonical SMILES can be generated.

    Args:
        isomeric: Whether to include stereochemistry information
    """
    canonical = self.get_canonical_smiles(isomeric=isomeric)
    if canonical is None:
        return
    self._update_smiles(canonical)
182 |
def is_valid(self) -> bool:
    """
    Report whether a mol object is available for the current SMILES.

    Returns:
        True if valid, False otherwise
    """
    parsed = self.mol
    return parsed is not None
191 |
192 | def _update_smiles(self, smiles: str) -> None:
193 | self.smiles = smiles
194 | # Clear caches since SMILES changed
195 | self._mol = None
196 | self._mol_with_hs = None
197 |
def get_neutralized(self, smiles: str) -> str:
    """
    Return the neutralized form of a SMILES string.

    The `MoleculeNeutralizer` is created on first use and then reused.

    Args:
        smiles: SMILES string to neutralize

    Returns:
        Neutralized SMILES string

    Raises:
        RuntimeError: If the neutralizer returns None
    """
    neutralizer = self._neutralizer
    if neutralizer is None:
        neutralizer = self._neutralizer = MoleculeNeutralizer()

    result = neutralizer.neutralize_smiles(smiles)
    if result is None:
        raise RuntimeError("Issue neutralizing SMILES")
    logger.debug("Successfully neutralized molecule")
    return result
207 |
def neutralize(self):
    """
    Neutralize the current SMILES in place.

    The neutralized SMILES becomes the current SMILES. Nothing is returned;
    a failure propagates as the exception raised by `get_neutralized`.
    """
    self._update_smiles(self.get_neutralized(self.smiles))
217 |
@staticmethod
def add_hydrogens(mol: Chem.Mol) -> Chem.Mol | None:
    """
    Return a copy of the molecule with explicit hydrogens (`Chem.AddHs`).

    Args:
        mol: RDKit mol object

    Returns:
        Mol object with explicit hydrogens or None if failed
    """
    if mol is None:
        return None

    logger.debug("Adding hydrogens to molecule")
    try:
        hydrogenated = Chem.AddHs(mol)
        if hydrogenated is not None:
            logger.trace("After adding hydrogens: {}", Chem.MolToSmiles(hydrogenated))
            return hydrogenated
        logger.warning("Failed to add hydrogens to molecule")
    except Exception as e:
        logger.warning("Error adding hydrogens to molecule: {}", str(e))
    return None
243 |
@staticmethod
def remove_hydrogens(mol: Chem.Mol) -> Chem.Mol | None:
    """
    Return the molecule with hydrogens removed (`Chem.RemoveHs`).

    Args:
        mol: RDKit mol object

    Returns:
        Mol object without explicit hydrogens or None if failed
    """
    if mol is None:
        logger.info("No molecule was provided")
        return None

    logger.debug("Removing hydrogens from molecule")
    try:
        stripped = Chem.RemoveHs(mol)
        if stripped is not None:
            return stripped
        logger.warning("Failed to remove hydrogens from molecule")
    except Exception as e:
        logger.warning("Error removing hydrogens from molecule: {}", str(e))
    return None
269 |
@staticmethod
def unprotect_atoms(mol: Chem.Mol) -> Chem.Mol:
    """
    Mark every atom as unprotected by setting `_protected` to "0".

    Args:
        mol: RDKit mol object to unprotect

    Returns:
        The same mol object (modified in place)
    """
    logger.trace("Unprotecting each atom")
    for each_atom in mol.GetAtoms():
        each_atom.SetProp("_protected", "0")
    return mol
285 |
@staticmethod
def protect_atoms(mol: Chem.Mol, atom_indices: list[int]) -> Chem.Mol:
    """
    Mark the listed atoms of ``mol`` as protected.

    Each addressed atom gets its ``_protected`` string property set to ``"1"``.
    An index that cannot be processed is logged and skipped; the remaining
    indices are still handled.

    Args:
        mol: RDKit mol object
        atom_indices: List of atom indices to protect

    Returns:
        The same mol object that was passed in (modified in place)
    """
    logger.trace("Protecting atom(s): {}", atom_indices)
    for position in atom_indices:
        try:
            mol.GetAtomWithIdx(position).SetProp("_protected", "1")
        except Exception as err:
            logger.warning(
                "Could not protect atom at index {}: {}", position, str(err)
            )
    return mol
306 |
@staticmethod
def is_atom_protected(mol: Chem.Mol, atom_idx: int) -> bool:
    """
    Report whether the atom at ``atom_idx`` carries ``_protected == "1"``.

    Args:
        mol: RDKit mol object
        atom_idx: Atom index to check

    Returns:
        True if the atom's ``_protected`` property equals ``"1"``; False
        otherwise, including when the lookup raises for any reason
    """
    try:
        flag = mol.GetAtomWithIdx(atom_idx).GetProp("_protected")
    except Exception:
        # Any lookup failure is treated as "not protected".
        return False
    return flag == "1"
325 |
def process_azides(self) -> None:
    """
    Rewrite azide spellings in the record's SMILES string.

    Occurrences of ``N=N=N`` and ``NN#N`` in ``self.smiles`` are replaced by
    ``N=[N+]=N``. The resulting string (changed or not) is always passed to
    ``self._update_smiles``.

    Returns:
        None. The record is updated in place.
    """
    candidate = self.smiles
    has_azide = any(token in candidate for token in ("N=N=N", "NN#N"))
    if has_azide:
        logger.info("Attempting to fix azide patterns in: '{}'", candidate)
        candidate = candidate.replace("N=N=N", "N=[N+]=N").replace(
            "NN#N", "N=[N+]=N"
        )
        if candidate != self.smiles:
            logger.info("Modified SMILES: '{}' -> '{}'", self.smiles, candidate)
    self._update_smiles(candidate)
346 |
def prepare_for_protonation(self) -> Chem.Mol:
    """
    Build the mol object used for protonation site detection.

    Steps, in order: fix azide spellings, neutralize, convert to a mol,
    add explicit hydrogens, unprotect every atom, and finally store the
    SMILES of the prepared mol back on the record.

    Returns:
        Prepared RDKit mol object (with explicit hydrogens)

    Raises:
        RuntimeError: If the SMILES cannot be converted to a mol or the
            hydrogens cannot be added
        AssertionError: If the prepared mol has no atoms
    """
    logger.info("Preparing molecule for analysis")

    # String-level clean-up happens before any mol object is built.
    self.process_azides()
    self.neutralize()

    mol_plain = self.to_mol()
    if mol_plain is None:
        raise RuntimeError("Could not convert SMILES to RDKit Mol")

    mol_hydrogenated = self.add_hydrogens(mol_plain)
    if mol_hydrogenated is None:
        raise RuntimeError("Could not add Hydrogens to Mol")

    result = self.unprotect_atoms(mol_hydrogenated)

    n_atoms = result.GetNumAtoms()
    logger.trace("Molecule prepared with {} atoms", n_atoms)
    assert n_atoms > 0  # Molecule must have at least one atom

    self._update_smiles(Chem.MolToSmiles(result))
    return result
375 |
@staticmethod
def to_mol_silenced(smiles: str) -> dict[str, Any]:
    """
    Convert SMILES to a mol while capturing what RDKit writes to stderr.

    The process-level stderr descriptor is temporarily redirected into a pipe
    so that messages written during ``Chem.MolFromSmiles`` can be returned to
    the caller instead of being printed.

    When ``sys.stderr`` has no usable file descriptor (it is ``None`` or a
    stream object without ``fileno()``), redirection is impossible; the
    conversion is then run directly and ``stderr_content`` is empty.

    Args:
        smiles: SMILES string to convert

    Returns:
        Dictionary with 'mol' (RDKit Mol or None) and 'stderr_content' (string)
    """
    logger.debug("Converting SMILES to RDKit mol: {}", smiles)

    try:
        stderr_fileno = sys.stderr.fileno()
    except (AttributeError, OSError, ValueError):
        # No real descriptor behind sys.stderr: convert without capturing
        # rather than failing before the conversion is even attempted.
        try:
            mol = Chem.MolFromSmiles(smiles)
            stderr_content = ""
        except Exception as e:
            logger.error("Error during SMILES conversion: {}", str(e))
            mol = None
            stderr_content = f"Exception during conversion: {str(e)}"
        return {"mol": mol, "stderr_content": stderr_content}

    # Keep a duplicate of the real stderr so it can be restored afterwards.
    stderr_save = os.dup(stderr_fileno)
    pipe_read = None
    pipe_write = None
    mol = None
    stderr_content = ""

    try:
        pipe_read, pipe_write = os.pipe()

        # Redirect stderr to the pipe; the descriptor itself now acts as the
        # only remaining write end once the original one is closed.
        os.dup2(pipe_write, stderr_fileno)
        os.close(pipe_write)
        pipe_write = None

        # Convert SMILES to mol (this may write to stderr)
        mol = Chem.MolFromSmiles(smiles)

        # Closing the last write end makes the reads below terminate with
        # EOF instead of blocking when nothing was written.
        os.close(stderr_fileno)

        # Drain the pipe completely instead of truncating at one read.
        chunks = []
        while True:
            chunk = os.read(pipe_read, 4096)
            if not chunk:
                break
            chunks.append(chunk)
        stderr_content = b"".join(chunks).decode("utf-8", errors="ignore")

    except Exception as e:
        logger.error("Error during SMILES conversion: {}", str(e))
        mol = None
        stderr_content = f"Exception during conversion: {str(e)}"

    finally:
        # Release whichever pipe ends are still open.
        for fd in (pipe_read, pipe_write):
            if fd is not None:
                try:
                    os.close(fd)
                except OSError:
                    pass
        # Restore stderr; the saved duplicate is closed even if that fails.
        try:
            os.dup2(stderr_save, stderr_fileno)
        except OSError:
            pass
        finally:
            try:
                os.close(stderr_save)
            except OSError:
                pass

    return {"mol": mol, "stderr_content": stderr_content}
426 |
def copy(self) -> "MoleculeRecord":
    """
    Build an independent duplicate of this record.

    The duplicate is constructed from the current SMILES and identifier,
    inherits ``smiles_original``, and receives deep copies of the cached
    ``_mol`` / ``_mol_with_hs`` objects when those are not None.

    Returns:
        New MoleculeRecord instance
    """
    clone = MoleculeRecord(self.smiles, self.identifier)
    clone.smiles_original = self.smiles_original

    # Cached mol objects are duplicated so the two records never share them.
    for attr_name in ("_mol", "_mol_with_hs"):
        cached = getattr(self, attr_name)
        if cached is not None:
            setattr(clone, attr_name, copy.deepcopy(cached))

    return clone
444 |
def get_atom_count(self) -> int:
    """
    Count the atoms of ``self.mol``.

    Returns:
        ``self.mol.GetNumAtoms()``, or 0 when ``self.mol`` is None
    """
    current = self.mol
    if current is None:
        return 0
    return current.GetNumAtoms()
454 |
def get_heavy_atom_count(self) -> int:
    """
    Count the heavy (non-hydrogen) atoms of ``self.mol``.

    Returns:
        ``self.mol.GetNumHeavyAtoms()``, or 0 when ``self.mol`` is None
    """
    current = self.mol
    if current is None:
        return 0
    return current.GetNumHeavyAtoms()
464 |
def has_substructure(self, pattern: str | Chem.Mol) -> bool:
    """
    Tell whether ``self.mol`` contains the given substructure.

    Args:
        pattern: SMARTS string or RDKit mol object to search for

    Returns:
        True if the substructure is found. False if it is not found, if
        ``self.mol`` is None, if a SMARTS string cannot be parsed, or if the
        search raises
    """
    target = self.mol
    if target is None:
        return False

    try:
        query = pattern
        if isinstance(pattern, str):
            # Strings are interpreted as SMARTS and compiled first.
            query = Chem.MolFromSmarts(pattern)
            if query is None:
                logger.warning("Invalid SMARTS pattern: {}", pattern)
                return False
        return target.HasSubstructMatch(query)
    except Exception as err:
        logger.warning("Error checking substructure: {}", str(err))
        return False
492 |
def get_substructure_matches(
    self, pattern: str | Chem.Mol
) -> list[tuple[int, ...]]:
    """
    Collect every match of a substructure pattern in ``self.mol``.

    Args:
        pattern: SMARTS string or RDKit mol object to search for

    Returns:
        List of tuples containing atom indices for each match. The list is
        empty if nothing matches, if ``self.mol`` is None, if a SMARTS string
        cannot be parsed, or if the search raises
    """
    target = self.mol
    if target is None:
        return []

    try:
        query = pattern
        if isinstance(pattern, str):
            # Strings are interpreted as SMARTS and compiled first.
            query = Chem.MolFromSmarts(pattern)
            if query is None:
                logger.warning("Invalid SMARTS pattern: {}", pattern)
                return []
        found = target.GetSubstructMatches(query)
        return list(found)
    except Exception as err:
        logger.warning("Error finding substructure matches: {}", str(err))
        return []
522 |
def __str__(self) -> str:
    """Render the record as ``MoleculeRecord('<smiles>'[, '<identifier>'])``."""
    if not self.identifier:
        # Falsy identifiers are left out of the rendering.
        return f"MoleculeRecord('{self.smiles}')"
    return f"MoleculeRecord('{self.smiles}', '{self.identifier}')"
528 |
def __repr__(self) -> str:
    """Return the same text as ``__str__``."""
    rendered = self.__str__()
    return rendered
532 |
def __eq__(self, other: object) -> bool:
    """
    Compare two records by canonical SMILES.

    Records are equal only when both canonical SMILES are available (not
    None) and identical; anything that is not a MoleculeRecord is unequal.
    """
    if not isinstance(other, MoleculeRecord):
        return False

    mine = self.get_canonical_smiles()
    theirs = other.get_canonical_smiles()

    if mine is None or theirs is None:
        return False
    return mine == theirs
546 |
def __hash__(self) -> int:
    """Hash the canonical SMILES, falling back to the raw SMILES when it is None."""
    key = self.get_canonical_smiles()
    if key is None:
        key = self.smiles
    return hash(key)
551 |
--------------------------------------------------------------------------------
/dimorphite_dl/neutralize.py:
--------------------------------------------------------------------------------
1 | from loguru import logger
2 | from rdkit import Chem
3 | from rdkit.Chem import AllChem
4 |
# Default neutralization rules as (reactant SMARTS, product SMARTS) pairs.
# ReactionRegistry turns each pair into a NeutralizationReaction and tries
# them in the order listed here.
RXN_DATA = (
    # O- bonded to only one atom: add a hydrogen.
    ("[Ov1-1:1]", "[Ov2+0:1]-[H]"),
    # N+ bonded to a hydrogen: remove the hydrogen.
    ("[#7v4+1:1]-[H]", "[#7v3+0:1]"),
    # O- bonded to two atoms: should not be negative, so clear the charge.
    ("[Ov2-:1]", "[Ov2+0:1]"),
    # N+ bonded to three atoms: should not be positive, so clear the charge.
    ("[#7v3+1:1]", "[#7v3+0:1]"),
    # N- bonded to two atoms: add a hydrogen.
    ("[#7v2-1:1]", "[#7+0:1]-[H]"),
    # Badly written azide: R-N-N#N should be R-N=[N+]=N.
    ("[H]-[N:1]-[N:2]#[N:3]", "[N:1]=[N+1:2]=[N:3]-[H]"),
)
19 |
20 |
class NeutralizationReaction:
    """
    One neutralization rule, described by a reactant SMARTS and a product SMARTS.

    The reactant SMARTS is compiled once into a query pattern (used by
    ``matches``) and the pair is compiled once into an RDKit reaction (used by
    ``apply``).
    """

    def __init__(self, smarts_reactant: str, smarts_product: str):
        """
        Args:
            smarts_reactant: SMARTS for detecting the reactants of a defined
                neutralization reaction.
            smarts_product: SMARTS for what the detected `smarts_reactant` should
                be transformed to.
        """
        self.smarts_reactant = smarts_reactant
        self.smarts_product = smarts_product
        self._pattern = Chem.MolFromSmarts(smarts_reactant)
        reaction_smarts = f"{smarts_reactant}>>{smarts_product}"
        self._rxn = AllChem.ReactionFromSmarts(reaction_smarts)

    def __str__(self) -> str:
        return f"{self.smarts_reactant} >> {self.smarts_product}"

    def __repr__(self) -> str:
        rendered = self.__str__()
        return rendered

    def matches(self, mol: Chem.Mol) -> bool:
        """Tell whether the reactant pattern occurs in the given molecule."""
        return mol.HasSubstructMatch(self._pattern)

    def apply(self, mol: Chem.Mol) -> Chem.Mol:
        """
        Run the reaction on the molecule and return the first product.

        Only the first product of the first product set is returned, even if
        the reaction generated more. When the reaction generates nothing, the
        input molecule itself is returned.
        """
        product_sets = self._rxn.RunReactants((mol,))
        if not product_sets:
            return mol
        # product_sets is a tuple of tuples: first product set, first product.
        first_set = product_sets[0]
        return first_set[0]
59 |
60 |
class ReactionRegistry:
    """
    Holds a collection of NeutralizationReaction objects and applies them repeatedly
    until no further matches are found.
    """

    def __init__(self, rxn_data: tuple[tuple[str, str], ...]):
        """
        Args:
            rxn_data: Any number of (reactant SMARTS, product SMARTS) pairs. One
                NeutralizationReaction is built per pair, in the given order.
        """
        self.reactions = [
            NeutralizationReaction(reactant, product)
            for reactant, product in rxn_data
        ]

    def neutralize(self, mol: Chem.Mol) -> Chem.Mol:
        """
        Apply all registered neutralization reactions to the molecule in a loop
        until no further transformations are possible. Assumes explicit H atoms
        have already been added.

        After every successful transformation the scan restarts from the first
        reaction. A reaction whose pattern matches but whose application hands
        back the very same mol object (``NeutralizationReaction.apply`` does so
        when no product was generated) is skipped, so it cannot keep the loop
        spinning forever.

        Args:
            mol: RDKit mol object to neutralize

        Returns:
            The neutralized and sanitized mol object

        Raises:
            RuntimeError: If the final sanitization reports a failed step
        """
        mol.UpdatePropertyCache(strict=False)
        changed = True
        while changed:
            changed = False
            for reaction in self.reactions:
                if not reaction.matches(mol):
                    logger.trace("No match to reaction: {}", str(reaction))
                    continue

                logger.debug("Found reaction match: {}", str(reaction))
                product = reaction.apply(mol)
                if product is mol:
                    # Nothing was produced: treating this as progress would
                    # re-run the identical match on the next pass, endlessly.
                    logger.warning(
                        "Reaction matched but produced no product: {}",
                        str(reaction),
                    )
                    continue

                mol = product
                mol.UpdatePropertyCache(strict=False)
                changed = True
                break  # restart scanning from first reaction

        # Final sanitization; with catchErrors=True the failing step is
        # returned instead of being raised.
        sanitized = Chem.SanitizeMol(
            mol, sanitizeOps=Chem.rdmolops.SanitizeFlags.SANITIZE_ALL, catchErrors=True
        )
        if sanitized.name == "SANITIZE_NONE":
            logger.debug("After neutralizing: {}", Chem.MolToSmiles(mol))
            return mol
        logger.warning("Sanitization failed at step {}", sanitized.name)
        raise RuntimeError("Ran into issue sanitizing mol")
99 |
100 |
class MoleculeNeutralizer:
    """
    Convenience front end: SMILES in, neutralized SMILES out.

    Parses the SMILES, adds explicit hydrogens, runs the reaction registry,
    strips the hydrogens again and serializes the result.
    """

    def __init__(self, rxn_data: tuple[tuple[str, str]] | None = None):
        """
        Args:
            rxn_data: Reaction definitions handed to ReactionRegistry. The
                module-level ``RXN_DATA`` is used when None is given.
        """
        self.registry = ReactionRegistry(RXN_DATA if rxn_data is None else rxn_data)

    def neutralize_smiles(self, smiles: str) -> str | None:
        """
        Neutralize a molecule given as SMILES.

        Args:
            smiles: Input SMILES string

        Returns:
            SMILES of the neutralized molecule

        Raises:
            ValueError: If RDKit cannot parse ``smiles``
        """
        logger.debug("Neutralizing {}", smiles)
        parsed = Chem.MolFromSmiles(smiles)
        if parsed is None:
            raise ValueError(f"Invalid SMILES: {smiles}")

        # The registry expects explicit hydrogens on the molecule.
        hydrogenated = Chem.AddHs(parsed)
        logger.debug("After adding hydrogens: {}", Chem.MolToSmiles(hydrogenated))

        neutral = self.registry.neutralize(hydrogenated)

        stripped = Chem.RemoveHs(neutral)
        logger.debug("After removing hydrogens: {}", Chem.MolToSmiles(stripped))

        return Chem.MolToSmiles(stripped)
128 |
--------------------------------------------------------------------------------
/dimorphite_dl/protonate/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/dimorphite_dl/protonate/change.py:
--------------------------------------------------------------------------------
1 | import copy
2 |
3 | from loguru import logger
4 | from rdkit import Chem
5 | from rdkit.Chem import Mol
6 |
7 | from dimorphite_dl.protonate.site import ProtonationSite, ProtonationState
8 |
9 |
def protonate_site(
    mols: list[Mol],
    site: ProtonationSite,
    ph_min: float,
    ph_max: float,
    precision: float,
) -> list[Mol]:
    """Apply the protonation states of one site to a list of molecules.

    For every (atom index, state) pair reported by the site, the state's
    charges are applied. A state other than ``BOTH`` is applied to the whole
    working list in one call; a ``BOTH`` state is applied molecule by
    molecule and the resulting variants are concatenated.

    Args:
        mols: List of molecule objects.
        site: ProtonationSite object with protonation information.
        ph_min: Minimum pH to expose the site to.
        ph_max: Maximum pH to expose the site to.
        precision: pKa standard deviation prefactor to consider.

    Returns:
        List of appropriately protonated molecule objects. If there is any issue,
        this will return an empty list.
    """
    if not mols:
        logger.warning("No molecules provided for protonation")
        return []

    logger.debug("Protonating site: {}", site.name)

    states = list(site.get_unique_states(ph_min, ph_max, precision))
    working = mols

    for idx_atom, state in states:
        charges = state.get_charges()

        if state != ProtonationState.BOTH:
            # Exclusive state: one charge, no branching.
            logger.debug(
                "Site {} atom {} has exclusive state {}; applying to all molecules",
                site.name,
                idx_atom,
                state.to_str(),
            )
            updated = set_protonation_charge(working, idx_atom, charges, site.name)
            if not updated:
                return []
            working = updated
            continue

        logger.debug(
            "Site {} atom {} is BOTH; branching into {} variants per molecule",
            site.name,
            idx_atom,
            charges,
        )

        variants_all = []
        for single in working:
            try:
                variants_all.extend(
                    set_protonation_charge([single], idx_atom, charges, site.name)
                )
            except Exception as err:
                logger.error("Error protonating site {}: {}", idx_atom, str(err))
                return []
        working = variants_all

    return working
79 |
80 |
def set_protonation_charge(
    mols: list[Mol], idx: int, charges: list[int], prot_site_name: str
) -> list[Mol]:
    """Produce charged copies of the molecules for one atom position.

    Every charge is applied to every molecule (charges in the outer loop,
    molecules in the inner loop). For nitrogen atoms the charge used is
    ``charge + 1``, except when the site name contains ``*``, in which case
    the charge is used unchanged.

    Args:
        mols: List of input molecule objects.
        idx: Index of the atom to modify.
        charges: List of charges to assign at this site.
        prot_site_name: Name of the protonation site.

    Returns:
        List of processed molecule objects. If anything goes wrong, then we return
        an empty list.
    """
    # Sites flagged with "*" keep the plain charge on nitrogen.
    nitrogen_offset = 0 if "*" in prot_site_name else 1

    results = []
    for charge in charges:
        nitrogen_charge = charge + nitrogen_offset

        for mol in mols:
            try:
                charged = _apply_charge_to_molecule(mol, idx, charge, nitrogen_charge)
            except Exception as err:
                logger.warning(
                    "Error processing molecule with charge {}: {}", charge, str(err)
                )
                return []
            if charged is None:
                # A single failure invalidates the whole batch.
                return []
            results.append(charged)
    return results
121 |
122 |
def _apply_charge_to_molecule(
    mol: Mol, idx: int, charge: int, nitrogen_charge: int
) -> Mol | None:
    """Apply charge to a specific atom in a molecule.

    Works on a deep copy with hydrogens removed; the input molecule is not
    modified.

    Args:
        mol: Input molecule
        idx: Atom index (must lie in ``[0, number of atoms)`` after hydrogen
            removal)
        charge: Charge for non-nitrogen atoms
        nitrogen_charge: Charge for nitrogen atoms

    Returns:
        Modified molecule or None if processing fails
    """
    logger.trace(
        "Applying charge of {} at index {} to SMILES: {}",
        charge,
        idx,
        Chem.MolToSmiles(mol),
    )
    # Create deep copy to avoid modifying original
    mol_copy = copy.deepcopy(mol)

    # Remove hydrogens first
    try:
        mol_copy = Chem.RemoveHs(mol_copy)
        if mol_copy is None:
            logger.warning("RemoveHs returned None for molecule")
            return None
    except Exception as e:
        logger.warning("Failed to remove hydrogens: {}", str(e))
        return None

    # Validate atom index in both directions: a negative index must be
    # rejected here as well instead of failing inside GetAtomWithIdx.
    num_atoms = mol_copy.GetNumAtoms()
    if not 0 <= idx < num_atoms:
        logger.warning(
            "Atom index {} out of range (molecule has {} atoms)",
            idx,
            num_atoms,
        )
        return None

    atom = mol_copy.GetAtomWithIdx(idx)
    element = atom.GetAtomicNum()

    # Calculate explicit bond order
    try:
        explicit_bond_order_total = sum(
            b.GetBondTypeAsDouble() for b in atom.GetBonds()
        )
    except Exception as e:
        logger.warning("Error calculating bond order for atom {}: {}", idx, str(e))
        return None

    # Set formal charge and explicit hydrogens based on element type
    try:
        if element == 7:  # Nitrogen
            _set_nitrogen_properties(atom, nitrogen_charge, explicit_bond_order_total)
        else:
            _set_other_element_properties(
                atom, charge, element, explicit_bond_order_total
            )

        # Special case for aromatic nitrogen deprotonation
        mol_smiles = Chem.MolToSmiles(mol_copy)
        if "[nH-]" in mol_smiles:
            logger.debug("Detected [nH-]; setting number of Hs to zero for this atom")
            atom.SetNumExplicitHs(0)

        # Update property cache
        mol_copy.UpdatePropertyCache(strict=False)

    except Exception as e:
        logger.warning("Error setting atom properties: {}", str(e))
        return None

    return mol_copy
201 |
202 |
def _set_nitrogen_properties(
    atom: Chem.Atom, charge: int, bond_order_total: int
) -> None:
    """Set formal charge and explicit hydrogen count on a nitrogen atom.

    An aromatic, three-connected nitrogen with charge 1 and a bond order
    total of 4.0 is left untouched. Otherwise the formal charge is set and,
    when the (charge, bond order) combination is listed in the lookup table,
    the explicit hydrogen count is set as well. The bond order total is
    truncated with ``int()`` for the lookup.
    """
    atom_idx = atom.GetIdx()
    is_aromatic = atom.GetIsAromatic()
    degree = atom.GetDegree()
    logger.trace(
        "Setting N properties: index={}, charge={}, bond_order={}, aromatic={}, degree={}",
        atom_idx,
        charge,
        bond_order_total,
        is_aromatic,
        degree,
    )

    # Handling niche cases of aromatics often detected on NADP
    leave_untouched = (
        charge == 1 and bond_order_total == 4.0 and is_aromatic and degree == 3
    )
    if leave_untouched:
        return

    atom.SetFormalCharge(charge)
    logger.debug("Set formal charge to {}", charge)

    # (formal charge, explicit bond order) -> explicit hydrogen count
    hydrogens_by_state = {
        # Positive charge
        (1, 1): 3,
        (1, 2): 2,
        (1, 3): 1,
        # Neutral
        (0, 1): 2,
        (0, 2): 1,
        # Negative charge
        (-1, 1): 1,
        (-1, 2): 0,
    }

    key = (int(charge), int(bond_order_total))
    if key in hydrogens_by_state:
        h_count = hydrogens_by_state[key]
        logger.debug("Setting hydrogen count to {}", h_count)
        atom.SetNumExplicitHs(h_count)
241 |
242 |
def _set_other_element_properties(
    atom: Chem.Atom, charge: int, element: int, bond_order_total: float
) -> None:
    """Set formal charge (and, for O/S, explicit hydrogens) on a non-nitrogen atom.

    The formal charge is always set. For oxygen and sulfur with a bond order
    total of 1, the explicit hydrogen count is set to 1 for charge 0 and to 0
    for charge -1.
    """
    atom_idx = atom.GetIdx()
    is_aromatic = atom.GetIsAromatic()
    degree = atom.GetDegree()
    logger.trace(
        "Setting {} properties: index={}, charge={}, bond_order={}, aromatic={}, degree={}",
        element,
        atom_idx,
        charge,
        bond_order_total,
        is_aromatic,
        degree,
    )

    atom.SetFormalCharge(charge)
    logger.debug("Set formal charge to {}", charge)

    # Only singly bonded oxygen (8) and sulfur (16) get their hydrogens adjusted.
    if element not in (8, 16):
        return
    if bond_order_total != 1:
        return

    if charge == 0:
        atom.SetNumExplicitHs(1)
        logger.debug("Set explicit hydrogens for this atom to 1")
    elif charge == -1:
        atom.SetNumExplicitHs(0)
        logger.debug("Set explicit hydrogens for this atom to 0")
271 |
--------------------------------------------------------------------------------
/dimorphite_dl/protonate/data.py:
--------------------------------------------------------------------------------
1 | import importlib.resources as pkg_resources
2 | from collections.abc import Iterator
3 |
4 | from loguru import logger
5 | from rdkit import Chem
6 |
7 | from dimorphite_dl.protonate.site import PKaDatum, SubstructureDatum
8 |
9 |
class PKaData:
    """Singleton access to the protonation substructures shipped with the package.

    The data is read from ``site_substructures.smarts`` the first time the
    class is instantiated and is stored on the class, so every later
    ``PKaData()`` call returns the same instance with the same data.
    """

    # All loaded data for our protonation substructures.
    _data: list[SubstructureDatum] = []

    # The shared instance; stays None until the data has loaded successfully.
    _instance = None

    def __new__(cls):
        if cls._instance is None:
            instance = super().__new__(cls)
            # Load before publishing the instance: if loading raises, the next
            # construction attempt retries instead of silently handing out a
            # singleton without any data.
            cls._load_data()
            cls._instance = instance
        return cls._instance

    @classmethod
    def _load_data(cls) -> None:
        """Read and parse the substructure file into ``cls._data``."""
        cls._data = [cls._parse_substructure_line(line) for line in cls._load_lines()]

    @classmethod
    def _load_lines(cls) -> list[str]:
        """Load the substructure SMARTS file, filtering out comments and blank lines.

        Returns:
            List of valid SMARTS lines from the file.

        Raises:
            FileNotFoundError: If the substructure file cannot be found.
            IOError: If there are issues reading the file.
        """
        logger.trace("Loading substructure data from site_substructures.smarts")

        try:
            with pkg_resources.open_text(
                "dimorphite_dl.smarts", "site_substructures.smarts"
            ) as f:
                stripped_lines = (raw.strip() for raw in f)
                # Skip empty lines and comments
                lines = [
                    text
                    for text in stripped_lines
                    if text and not text.startswith("#")
                ]

            logger.info("Loaded {} valid SMARTS patterns", len(lines))
            return lines

        except FileNotFoundError:
            logger.error("Could not find site_substructures.smarts file")
            raise
        except Exception as e:
            logger.error("Error reading substructure file: {}", str(e))
            raise IOError(f"Failed to read substructure file: {e}") from e

    @classmethod
    def _parse_substructure_line(cls, line: str) -> SubstructureDatum:
        """Parse a single line from the substructure data file.

        Args:
            line: Line from the substructure file

        Returns:
            SubstructureDatum object.

        Raises:
            ValueError: If the line has fewer than three fields, the SMARTS is
                invalid, or the pKa fields are malformed.

        Notes:
            Below is an example line of the file.

            ```text
            *Azide	[N+0:1]=[N+:2]=[N+0:3]-[H]	2	4.65	0.07071067811865513
            ```

            The line is split on any whitespace, so no field may contain
            spaces. The fields are:

            - Name of the substructure. A `*` prefix indicates that it is an aromatic
                nitrogen that needs special treatment.
            - [SMARTS](https://www.daylight.com/dayhtml/doc/theory/theory.smarts.html)
                of this particular substructure.
            - Data about the protonation site always in a set of threes. You can have
                more than one site.
                - The site index.
                - pKa mean
                - pKa standard deviation.
        """
        parts = line.split()
        if len(parts) < 3:
            logger.warning("Invalid line format (too few parts): '{}'", line)
            raise ValueError(f"Invalid line format (too few parts): '{line}'")

        name = parts[0]
        logger.trace("Substructure name is {}", name)
        smarts = parts[1]
        logger.trace("Substructure SMARTS is {}", smarts)
        mol = cls._create_rdkit_mol(smarts)

        # Parse pKa ranges (groups of 3: site, mean, std)
        pka_data = cls._parse_pka_line(parts[2:])
        return SubstructureDatum(name=name, smarts=smarts, pkas=pka_data, mol=mol)

    @classmethod
    def _create_rdkit_mol(cls, smarts: str) -> Chem.Mol:
        """Compile a SMARTS string into an RDKit query mol.

        Raises:
            ValueError: If RDKit rejects the SMARTS (returns None or raises).
        """
        logger.trace("Attempting to make RDKit mol from SMARTS")
        try:
            mol = Chem.MolFromSmarts(smarts)
        except Exception as e:
            logger.warning("Error creating mol from SMARTS '{}' : {}", smarts, str(e))
            raise ValueError(f"Error creating mol from SMARTS '{smarts}': {e}") from e

        # Checked outside the try block so this failure is logged only once.
        if mol is None:
            logger.warning("Invalid SMARTS pattern: {}", smarts)
            raise ValueError(f"Invalid SMARTS pattern: {smarts}")
        return mol

    @classmethod
    def _parse_pka_line(cls, line_parts: list[str]) -> list[PKaDatum]:
        """Parse the (site index, mean, standard deviation) triples of one line.

        Args:
            line_parts: The fields following name and SMARTS; the count must
                be a multiple of three.

        Returns:
            One PKaDatum per triple, in file order.

        Raises:
            ValueError: If the field count is not a multiple of three or a
                triple cannot be converted.
        """
        if len(line_parts) % 3 != 0:
            logger.warning(
                "Invalid pKa data format, expected groups of 3, got {}", len(line_parts)
            )
            raise ValueError(
                f"Invalid pKa data format, expected groups of 3, got {len(line_parts)}"
            )

        pka_data = []
        for i in range(0, len(line_parts), 3):
            try:
                idx_site = int(line_parts[i])
                mean = float(line_parts[i + 1])
                stdev = float(line_parts[i + 2])
                pka_data.append(PKaDatum(idx_site=idx_site, mean=mean, stdev=stdev))
            except (ValueError, IndexError) as e:
                logger.warning("Error parsing pKa data: {}", line_parts)
                raise ValueError(f"Error parsing pKa data: {line_parts}") from e
        return pka_data

    @classmethod
    def get_substructures(cls) -> Iterator[SubstructureDatum]:
        """Yield every loaded substructure in file order."""
        yield from cls._data
152 |
--------------------------------------------------------------------------------
/dimorphite_dl/protonate/detect.py:
--------------------------------------------------------------------------------
1 | """
2 | This module provides functionality to detect protonation sites in molecules
3 | using substructure matching with comprehensive error handling and validation.
4 | """
5 |
6 | from collections.abc import Iterator
7 |
8 | from loguru import logger
9 | from rdkit import Chem
10 |
11 | from dimorphite_dl.mol import MoleculeRecord
12 | from dimorphite_dl.protonate.data import PKaData
13 | from dimorphite_dl.protonate.site import ProtonationSite, SubstructureDatum
14 |
15 |
class ProtonationSiteDetectionError(Exception):
    """Signals that protonation site detection ran into an error."""
20 |
21 |
22 | class ProtonationSiteDetector:
23 | """
24 | Robust detector for finding protonation sites in molecules.
25 |
26 | Uses substructure matching to identify potential protonation sites
27 | based on known pKa patterns. Handles atom protection to prevent
28 | overlapping matches.
29 | """
30 |
def __init__(self, validate_sites: bool = True, max_sites_per_molecule: int = 50):
    """
    Set up the detector configuration, data source and counters.

    Args:
        validate_sites: Whether detected sites are checked with ``is_valid()``
            before being returned
        max_sites_per_molecule: Upper bound used when collecting sites; must
            be an int between 1 and 1000 (inclusive)
    """
    assert isinstance(validate_sites, bool)
    assert isinstance(max_sites_per_molecule, int)
    assert max_sites_per_molecule > 0
    assert max_sites_per_molecule <= 1000  # Reasonable upper bound

    self.validate_sites = validate_sites
    self.max_sites_per_molecule = max_sites_per_molecule
    self.pka_data = PKaData()

    # Statistics counters, all starting at zero.
    for counter_name in (
        "_stats_molecules_processed",
        "_stats_sites_found",
        "_stats_sites_validated",
        "_stats_sites_rejected",
        "_stats_substructures_matched",
    ):
        setattr(self, counter_name, 0)
54 |
def find_sites(
    self, mol_record: MoleculeRecord
) -> tuple[MoleculeRecord, list[ProtonationSite]]:
    """
    Detect the protonation sites of one molecule. This is the main entry point.

    The record is prepared for protonation, its ``mol`` is replaced by the
    prepared mol, and all sites detected on it are collected (filtered by
    ``is_valid()`` when ``validate_sites`` is set).

    Args:
        mol_record: MoleculeRecord to analyze

    Returns:
        Tuple of (updated_mol_record, list_of_protonation_sites)

    Raises:
        ProtonationSiteDetectionError: If any step raises an exception
    """
    assert isinstance(mol_record, MoleculeRecord)
    assert mol_record.smiles  # SMILES cannot be empty

    logger.debug("Finding protonation sites for '{}'", mol_record.smiles)
    self._stats_molecules_processed += 1

    try:
        prepared = mol_record.prepare_for_protonation()
        if prepared is None:
            logger.warning("Failed to prepare molecule: '{}'", mol_record.smiles)
            return mol_record, []

        mol_record.mol = prepared

        accepted = [
            site
            for site in self._detect_all_sites_in_molecule(prepared)
            if not self.validate_sites or site.is_valid()
        ]

        # One site object may carry several pKa entries; each counts once.
        n_pkas = sum(len(site.pkas) for site in accepted)

        logger.info(
            "Found {} protonation site(s) for '{}'", n_pkas, mol_record.smiles
        )
        return mol_record, accepted

    except Exception as error:
        logger.error(
            "Critical error detecting sites for '{}': {}",
            mol_record.smiles,
            str(error),
        )
        raise ProtonationSiteDetectionError(f"Detection failed: {error}") from error
108 |
def _detect_all_sites_in_molecule(self, mol: Chem.Mol) -> Iterator[ProtonationSite]:
    """
    Detect all protonation sites in the prepared molecule.

    Substructures are tried in the order they are provided. After the sites
    of a substructure have been yielded, its matched atoms are protected so
    that later substructures cannot claim them again.

    The pKa counter that is compared with ``max_sites_per_molecule`` restarts
    for every substructure pattern, and the limit is inclusive: a site is
    dropped only when yielding it would exceed the limit.
    NOTE(review): the attribute name suggests a per-molecule limit, while the
    counter works per pattern — confirm which is intended.

    Args:
        mol: Prepared RDKit mol object

    Yields:
        Detected protonation sites.
    """
    assert mol is not None
    assert mol.GetNumAtoms() > 0

    for substructure_data in self._iterate_available_substructures():
        if substructure_data.mol is None:
            logger.debug(
                "Skipping substructure '{}' - no mol object", substructure_data.name
            )
            continue

        matches = self._find_unprotected_matches_for_substructure(
            mol, substructure_data
        )
        if not matches:
            continue

        self._stats_substructures_matched += 1
        n_sites = 0

        for site in self._create_sites_from_matches(
            mol, matches, substructure_data
        ):
            n_sites += len(site.pkas)
            # Strictly greater: a site that lands exactly on the limit is
            # still within it and must be yielded.
            if n_sites > self.max_sites_per_molecule:
                logger.warning(
                    "Site limit of {} reached for substructure '{}'",
                    self.max_sites_per_molecule,
                    substructure_data.name,
                )
                break

            yield site

        mol = self._protect_matched_atoms_in_molecule(mol, matches)
153 |
def _iterate_available_substructures(self) -> Iterator[SubstructureDatum]:
    """
    Stream the substructure patterns held by ``self.pka_data``.

    Yields:
        SubstructureDatum objects for pattern matching

    Raises:
        ProtonationSiteDetectionError: If iterating the data raises
    """
    try:
        seen = 0
        for seen, datum in enumerate(self.pka_data.get_substructures(), start=1):
            assert isinstance(datum, SubstructureDatum)
            yield datum

        logger.trace("Iterated over {} substructures", seen)

    except Exception as error:
        logger.error("Error loading substructure data: {}", str(error))
        raise ProtonationSiteDetectionError(
            f"Failed to load substructure data: {error}"
        ) from error
175 |
def _find_unprotected_matches_for_substructure(
    self, mol: Chem.Mol, substructure_data: SubstructureDatum
) -> list[tuple[int, ...]]:
    """
    Find unprotected matches for a specific substructure pattern.

    All matches of the pattern are collected with a single substructure
    search and then filtered by protection status.

    Args:
        mol: RDKit mol object to search in
        substructure_data: Substructure pattern to match

    Returns:
        List of tuples containing atom indices for unprotected matches. The
        list is empty when nothing matches or when the search raises.
    """
    assert mol is not None
    assert substructure_data is not None
    assert substructure_data.mol is not None

    try:
        # One search is enough: an empty result means "no match".
        all_matches = list(mol.GetSubstructMatches(substructure_data.mol))
        if not all_matches:
            return []

        total_matches = len(all_matches)
        logger.debug(
            "Found {} '{}' group(s)", total_matches, substructure_data.name
        )

        unprotected_matches = self._filter_matches_by_protection_status(
            mol, all_matches
        )
        unprotected_count = len(unprotected_matches)

        # The format string needs a third placeholder so the substructure
        # name actually appears in the message.
        logger.debug(
            "{}/{} '{}' matches were unprotected",
            unprotected_count,
            total_matches,
            substructure_data.name,
        )

        return unprotected_matches

    except Exception as error:
        logger.warning(
            "Error finding matches for substructure '{}': {}",
            substructure_data.name,
            str(error),
        )
        return []
225 |
226 | def _filter_matches_by_protection_status(
227 | self, mol: Chem.Mol, all_matches: list[tuple[int, ...]]
228 | ) -> list[tuple[int, ...]]:
229 | """
230 | Filter matches to only include those with unprotected atoms.
231 |
232 | Args:
233 | mol: RDKit mol object
234 | all_matches: List of all matches to filter
235 |
236 | Returns:
237 | List of matches where all atoms are unprotected
238 | """
239 | assert mol is not None
240 | assert isinstance(all_matches, list)
241 |
242 | unprotected_matches = []
243 | atom_count = mol.GetNumAtoms()
244 |
245 | for match in all_matches:
246 | assert isinstance(match, tuple)
247 |
248 | # Validate atom indices are within bounds
249 | for atom_index in match:
250 | assert isinstance(atom_index, int)
251 | assert 0 <= atom_index < atom_count
252 |
253 | if self._are_all_atoms_in_match_unprotected(mol, match):
254 | unprotected_matches.append(match)
255 |
256 | return unprotected_matches
257 |
258 | def _are_all_atoms_in_match_unprotected(
259 | self, mol: Chem.Mol, match: tuple[int, ...]
260 | ) -> bool:
261 | """
262 | Check if all atoms in a match are unprotected.
263 |
264 | Args:
265 | mol: RDKit mol object
266 | match: Tuple of atom indices to check
267 |
268 | Returns:
269 | True if all atoms in match are unprotected, False otherwise
270 | """
271 | assert mol is not None
272 | assert isinstance(match, tuple)
273 | assert len(match) > 0 # Match cannot be empty
274 |
275 | try:
276 | for atom_index in match:
277 | assert isinstance(atom_index, int)
278 | if MoleculeRecord.is_atom_protected(mol, atom_index):
279 | return False
280 | return True
281 |
282 | except Exception as error:
283 | logger.debug(
284 | "Error checking protection for match {}: {}", match, str(error)
285 | )
286 | return False
287 |
288 | def _create_sites_from_matches(
289 | self,
290 | mol: Chem.Mol,
291 | matches: list[tuple[int, ...]],
292 | substructure_data: SubstructureDatum,
293 | ) -> Iterator[ProtonationSite]:
294 | """
295 | Create ProtonationSite objects from matches.
296 |
297 | Args:
298 | mol: RDKit mol object used to detect this protonation site.
299 | matches: List of atom index tuples
300 | substructure_data: Substructure information
301 |
302 | Yields:
303 | Detected protonation sites.
304 | """
305 | assert isinstance(matches, list)
306 | assert isinstance(substructure_data, SubstructureDatum)
307 |
308 | for match_indices in matches:
309 | site = ProtonationSite(
310 | mol=mol,
311 | idxs_match=tuple(match_indices),
312 | pkas=substructure_data.pkas,
313 | smarts=substructure_data.smarts,
314 | name=substructure_data.name,
315 | )
316 | yield site
317 |
318 | def _protect_matched_atoms_in_molecule(
319 | self, mol: Chem.Mol, matches: list[tuple[int, ...]]
320 | ) -> Chem.Mol:
321 | """
322 | Protect all atoms involved in matches to prevent overlap.
323 |
324 | Args:
325 | mol: RDKit mol object
326 | matches: List of matches whose atoms should be protected
327 |
328 | Returns:
329 | Mol object with matched atoms protected (as returned by MoleculeRecord.protect_atoms)
330 | """
331 | assert mol is not None
332 | assert isinstance(matches, list)
333 |
334 | for match in matches:
335 | assert isinstance(match, tuple)
336 | atom_indices = list(match)
337 | logger.debug("Protecting atoms: {}", match)
338 | mol = MoleculeRecord.protect_atoms(mol, atom_indices)
339 |
340 | return mol
341 |
342 | def get_stats(self) -> dict[str, int]:
343 | """
344 | Get detection statistics.
345 |
346 | Returns:
347 | Dictionary of detection statistics
348 | """
349 | return {
350 | "molecules_processed": self._stats_molecules_processed,
351 | "sites_found": self._stats_sites_found,
352 | "sites_validated": self._stats_sites_validated,
353 | "sites_rejected": self._stats_sites_rejected,
354 | "substructures_matched": self._stats_substructures_matched,
355 | }
356 |
357 | def reset_stats(self) -> None:
358 | """
359 | Reset all detection statistics to zero.
360 |
361 | """
362 | self._stats_molecules_processed = 0
363 | self._stats_sites_found = 0
364 | self._stats_sites_validated = 0
365 | self._stats_sites_rejected = 0
366 | self._stats_substructures_matched = 0
367 |
368 |
369 | def canonicalize_smiles_list(
370 | mols: list[Chem.Mol], original_smiles: str = ""
371 | ) -> list[str]:
372 | """
373 | Generate canonical SMILES from molecule objects.
374 |
375 | Args:
376 | mols: List of RDKit mol objects to convert
377 | original_smiles: Original SMILES for logging context
378 |
379 | Returns:
380 | List of unique canonical SMILES strings
381 | """
382 | assert isinstance(mols, list)
383 | assert isinstance(original_smiles, str)
384 |
385 | if len(mols) == 0:
386 | return []
387 |
388 | logger.debug("Generating canonical SMILES for {} molecules", len(mols))
389 |
390 | try:
391 | unique_smiles = set()
392 | valid_mol_count = 0
393 |
394 | for mol in mols:
395 | if mol is None:
396 | continue
397 |
398 | valid_mol_count += 1
399 | canonical_smiles = _generate_single_canonical_smiles(mol)
400 | if canonical_smiles is not None:
401 | unique_smiles.add(canonical_smiles)
402 |
403 | smiles_list = list(unique_smiles)
404 | unique_count = len(smiles_list)
405 |
406 | context_msg = f" for '{original_smiles}'" if original_smiles else ""
407 | logger.debug(
408 | "Generated {} unique canonical SMILES from {} valid molecules{}",
409 | unique_count,
410 | valid_mol_count,
411 | context_msg,
412 | )
413 |
414 | return smiles_list
415 |
416 | except Exception as error:
417 | context_msg = f" for '{original_smiles}'" if original_smiles else ""
418 | logger.warning(
419 | "Error in canonical SMILES generation{}: {}", context_msg, str(error)
420 | )
421 | return []
422 |
423 |
424 | def _generate_single_canonical_smiles(mol: Chem.Mol) -> str | None:
425 | """
426 | Generate canonical SMILES for a single molecule.
427 |
428 | Args:
429 | mol: RDKit mol object
430 |
431 | Returns:
432 | Canonical SMILES string or None if generation failed
433 | """
434 | assert mol is not None
435 |
436 | try:
437 | canonical = Chem.MolToSmiles(mol, isomericSmiles=True, canonical=True)
438 | if canonical and len(canonical) > 0:
439 | return canonical
440 | else:
441 | logger.debug("Generated empty SMILES string")
442 | return None
443 |
444 | except Exception as error:
445 | logger.debug("Error generating canonical SMILES: {}", str(error))
446 | return None
447 |
448 |
449 | # Convenience functions for backward compatibility
450 | def find(
451 | mol_record: MoleculeRecord,
452 | validate_sites: bool = True,
453 | max_sites_per_molecule: int = 50,
454 | ) -> tuple[MoleculeRecord, list[ProtonationSite]]:
455 | """
456 | Convenience function that builds a ProtonationSiteDetector from the given settings and finds sites.
457 |
458 | Args:
459 | mol_record: MoleculeRecord to analyze
460 |
461 | Returns:
462 | Tuple of (updated_mol_record, list_of_protonation_sites)
463 | """
464 | assert isinstance(mol_record, MoleculeRecord)
465 | assert isinstance(validate_sites, bool)
466 | assert isinstance(max_sites_per_molecule, int)
467 | assert max_sites_per_molecule > 0
468 |
469 | detector = ProtonationSiteDetector(
470 | validate_sites=validate_sites, max_sites_per_molecule=max_sites_per_molecule
471 | )
472 | return detector.find_sites(mol_record)
473 |
--------------------------------------------------------------------------------
/dimorphite_dl/protonate/results.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 |
3 | from dimorphite_dl.protonate.site import ProtonationState
4 |
5 |
6 | @dataclass
7 | class ProtonationResult:
8 | """Data class for protonation results."""
9 |
10 | smiles: str
11 | identifier: str
12 | states: ProtonationState | None = None
13 |
14 | def to_string(self, include_states: bool = False) -> str:
15 | """Convert to output string format."""
16 | if include_states and self.states:
17 | return f"{self.smiles}\t{self.identifier}\t{self.states.to_str()}"
18 | return f"{self.smiles}\t{self.identifier}"
19 |
20 |
21 | @dataclass
22 | class ProtonationStats:
23 | """Statistics for protonation processing."""
24 |
25 | molecules_processed: int = 0
26 | total_variants_generated: int = 0
27 | variants_validated: int = 0
28 | variants_rejected: int = 0
29 | molecules_with_sites: int = 0
30 | molecules_without_sites: int = 0
31 | fallback_used: int = 0
32 |
--------------------------------------------------------------------------------
/dimorphite_dl/protonate/site.py:
--------------------------------------------------------------------------------
1 | """
2 | Protonation site data structures and state calculations.
3 |
4 | This module defines the core data structures for protonation sites,
5 | including state enumerations, pKa data, and site information.
6 | Each class has clear responsibilities and comprehensive validation.
7 | """
8 |
9 | from collections.abc import Iterator
10 | from dataclasses import dataclass, field
11 | from enum import Enum
12 |
13 | from loguru import logger
14 | from rdkit import Chem
15 |
16 |
17 | class ProtonationState(Enum):
18 | """
19 | Enumeration of possible protonation states for a site.
20 |
21 | Values are explicitly assigned for clarity and debugging.
22 | """
23 |
24 | UNKNOWN = 0
25 | DEPROTONATED = 1
26 | PROTONATED = 2
27 | BOTH = 3
28 |
29 | def to_str(self) -> str:
30 | """
31 | Convert protonation state to string representation.
32 |
33 | Returns:
34 | String representation of the protonation state
35 | """
36 | # Use explicit if-elif chain for clarity (TigerStyle)
37 | if self == ProtonationState.DEPROTONATED:
38 | return "DEPROTONATED"
39 | elif self == ProtonationState.PROTONATED:
40 | return "PROTONATED"
41 | elif self == ProtonationState.BOTH:
42 | return "BOTH"
43 | else:
44 | return "UNKNOWN"
45 |
46 | def get_charges(self) -> list[int]:
47 | """
48 | Get the formal charges associated with this protonation state.
49 |
50 | Returns:
51 | List of integer formal charges for this state
52 | """
53 | # Use explicit if-elif chain for clarity (TigerStyle)
54 | if self == ProtonationState.DEPROTONATED:
55 | return [-1]
56 | elif self == ProtonationState.PROTONATED:
57 | return [0]
58 | elif self == ProtonationState.BOTH:
59 | return [-1, 0]
60 | else:
61 | return []
62 |
63 |
64 | class PKaDatum:
65 | """
66 | Data structure for pKa information at a specific site.
67 |
68 | Contains the site index, mean pKa value, and standard deviation
69 | for calculating protonation states at different pH values.
70 | """
71 |
72 | def __init__(self, idx_site: int, mean: float, stdev: float):
73 | """
74 | Initialize pKa data with validation.
75 |
76 | Args:
77 | idx_site: Site index (non-negative integer)
78 | mean: Mean pKa value (converted to float; no range check is applied)
79 | stdev: Standard deviation of the pKa (converted to float; no range check is applied)
80 | """
81 | assert isinstance(idx_site, int)
82 | assert isinstance(mean, (int, float))
83 | assert isinstance(stdev, (int, float))
84 | assert idx_site >= 0, f"Site index must be non-negative, got: {idx_site}"
85 |
86 | self.idx_site = idx_site
87 | """Index of the atom we would protonate in the SMARTS substructure pattern"""
88 | self.mean = float(mean)
89 | self.stdev = float(stdev)
90 |
91 | def get_state(
92 | self, ph_min: float, ph_max: float, precision: float
93 | ) -> ProtonationState:
94 | """
95 | Calculate protonation state for given pH range and precision.
96 |
97 | Args:
98 | ph_min: Minimum pH value
99 | ph_max: Maximum pH value (must be greater than or equal to ph_min)
100 | precision: Multiplier applied to the pKa standard deviation (non-negative)
101 |
102 | Returns:
103 | ProtonationState based on pH range and pKa statistics
104 | """
105 | assert isinstance(ph_min, (int, float))
106 | assert isinstance(ph_max, (int, float))
107 | assert isinstance(precision, (int, float))
108 | assert ph_min <= ph_max, (
109 | f"ph_min ({ph_min}) must be less than ph_max ({ph_max})"
110 | )
111 | assert precision >= 0.0, f"precision must be positive, got: {precision}"
112 |
113 | # Calculate effective pKa range based on precision
114 | effective_stdev = precision * self.stdev
115 | pka_min = self.mean - effective_stdev
116 | pka_max = self.mean + effective_stdev
117 |
118 | # Determine protonation state based on pH and pKa overlap
119 | # Use explicit conditions for clarity (TigerStyle)
120 | pka_overlaps_ph_range = (pka_min <= ph_max) and (ph_min <= pka_max)
121 | if pka_overlaps_ph_range:
122 | protonation_state = ProtonationState.BOTH
123 | elif self.mean > ph_max:
124 | protonation_state = ProtonationState.PROTONATED
125 | elif self.mean < ph_min:
126 | protonation_state = ProtonationState.DEPROTONATED
127 | else:
128 | protonation_state = ProtonationState.UNKNOWN
129 |
130 | return protonation_state
131 |
132 |
133 | @dataclass
134 | class SubstructureDatum:
135 | """
136 | Data structure for substructure pattern matching information.
137 |
138 | Contains the pattern name, SMARTS string, RDKit mol object,
139 | and associated pKa data for protonation site detection.
140 | """
141 |
142 | name: str = ""
143 | smarts: str = ""
144 | mol: Chem.Mol | None = None
145 | pkas: list[PKaDatum] = field(default_factory=list)
146 |
147 | def __post_init__(self):
148 | """Validate substructure data after initialization."""
149 | assert isinstance(self.name, str)
150 | assert isinstance(self.smarts, str)
151 | assert isinstance(self.pkas, list)
152 |
153 | # Name and SMARTS must be either both empty or both non-empty
154 | if len(self.name) > 0 or len(self.smarts) > 0:
155 | assert len(self.name) > 0, "Substructure name cannot be empty"
156 | assert len(self.smarts) > 0, "SMARTS pattern cannot be empty"
157 |
158 | # Validate all pKa data entries
159 | for pka in self.pkas:
160 | assert isinstance(pka, PKaDatum)
161 |
162 | def has_valid_pattern(self) -> bool:
163 | """
164 | Check if substructure has a valid molecular pattern.
165 |
166 | Returns:
167 | True if mol object exists and is valid
168 | """
169 | return self.mol is not None
170 |
171 | def has_pka_data(self) -> bool:
172 | """
173 | Check if substructure has pKa data available.
174 |
175 | Returns:
176 | True if pKa data list is non-empty
177 | """
178 | return len(self.pkas) > 0
179 |
180 | def get_pka_count(self) -> int:
181 | """
182 | Get the number of pKa data points for this substructure.
183 |
184 | Returns:
185 | Number of pKa data entries
186 | """
187 | return len(self.pkas)
188 |
189 | def is_valid_for_matching(self) -> bool:
190 | """
191 | Check if substructure is valid for pattern matching.
192 |
193 | Returns:
194 | True if both pattern and pKa data are available
195 | """
196 | return self.has_valid_pattern() and self.has_pka_data()
197 |
198 |
199 | @dataclass
200 | class ProtonationSite:
201 | """
202 | Data structure for detected protonation site information.
203 |
204 | Contains atom indices and associated substructure data
205 | for a specific protonation site in a molecule.
206 | """
207 |
208 | mol: Chem.Mol
209 | """RDKit Mol object in which this protonation site was detected"""
210 |
211 | idxs_match: tuple[int, ...]
212 | """Atom indices of substructure match"""
213 |
214 | pkas: list[PKaDatum]
215 | """Observed pKas of this site."""
216 |
217 | smarts: str
218 | """SMARTS used to detect the protonation site."""
219 |
220 | name: str
221 | """Name of identified protonation site."""
222 |
223 | def get_states(
224 | self, ph_min: float, ph_max: float, precision: float
225 | ) -> Iterator[tuple[int, ProtonationState]]:
226 | """
227 | Generate protonation states for all pKa data at this site.
228 |
229 | Args:
230 | ph_min: Minimum pH value
231 | ph_max: Maximum pH value
232 | precision: Precision factor for pKa calculation
233 |
234 | Yields:
235 | Atom index of Mol and ProtonationState for each pKa datum at this site
236 | """
237 | assert isinstance(ph_min, (int, float))
238 | assert isinstance(ph_max, (int, float))
239 | assert isinstance(precision, (int, float))
240 | assert ph_min <= ph_max, (
241 | f"ph_min ({ph_min}) must be less than or equal to ph_max ({ph_max})"
242 | )
243 | assert precision >= 0.0, f"precision must be positive, got: {precision}"
244 |
245 | pka_count = len(self.pkas)
246 | assert pka_count > 0, "Cannot generate states without pKa data"
247 |
248 | states_generated = 0
249 | for pka in self.pkas:
250 | assert isinstance(pka, PKaDatum)
251 | idx_atom = self.idxs_match[pka.idx_site]
252 | state = pka.get_state(ph_min, ph_max, precision)
253 | states_generated += 1
254 | yield idx_atom, state
255 |
256 | assert states_generated == pka_count, (
257 | f"Expected {pka_count} states, generated {states_generated}"
258 | )
259 |
260 | def get_unique_states(
261 | self, ph_min: float, ph_max: float, precision: float
262 | ) -> tuple[tuple[int, ProtonationState], ...]:
263 | """
264 | Get the unique (atom index, protonation state) pairs for this site.
265 |
266 | Args:
267 | ph_min: Minimum pH value
268 | ph_max: Maximum pH value (must be greater than or equal to ph_min)
269 | precision: Multiplier applied to the pKa standard deviation (non-negative)
270 |
271 | Returns:
272 | Tuple of unique (atom index, ProtonationState) pairs; order is not guaranteed
273 | """
274 | gen = tuple(state for state in self.get_states(ph_min, ph_max, precision))
275 | states_unique = tuple(set(gen))
276 | return states_unique
277 |
278 | def is_valid(self) -> bool:
279 | if self.mol is None:
280 | logger.debug("Site validation failed: no mol object")
281 | return False
282 |
283 | atom_count = self.mol.GetNumAtoms()
284 | if atom_count <= 0:
285 | return False
286 |
287 | for atom_index in self.idxs_match:
288 | if not isinstance(atom_index, int):
289 | logger.debug("Invalid atom index type: {}", type(atom_index))
290 | return False
291 | if atom_index < 0:
292 | logger.debug("Negative atom index: {}", atom_index)
293 | return False
294 | if atom_index >= atom_count:
295 | logger.debug(
296 | "Atom index {} out of range (molecule has {} atoms)",
297 | atom_index,
298 | atom_count,
299 | )
300 | return False
301 |
302 | return True
303 |
304 |
305 | def validate_ph_range(ph_min: float, ph_max: float) -> bool:
306 | """
307 | Validate pH range parameters.
308 |
309 | Args:
310 | ph_min: Minimum pH value
311 | ph_max: Maximum pH value
312 |
313 | Returns:
314 | True if both values are numbers within [0, 14] and ph_min < ph_max
315 | """
316 | try:
317 | assert isinstance(ph_min, (int, float))
318 | assert isinstance(ph_max, (int, float))
319 | assert 0.0 <= ph_min <= 14.0
320 | assert 0.0 <= ph_max <= 14.0
321 | assert ph_min < ph_max
322 | return True
323 | except AssertionError:
324 | return False
325 |
326 |
327 | def create_pka_datum_safe(idx_site: int, mean: float, stdev: float) -> PKaDatum | None:
328 | """
329 | Create PKaDatum with error handling.
330 |
331 | Args:
332 | idx_site: Site index
333 | mean: Mean pKa value
334 | stdev: Standard deviation
335 |
336 | Returns:
337 | PKaDatum object or None if parameters are invalid
338 | """
339 | try:
340 | return PKaDatum(idx_site, mean, stdev)
341 | except (AssertionError, ValueError):
342 | return None
343 |
344 |
345 | def create_protonation_site_safe(
346 | mol: Chem.Mol,
347 | idxs_match: tuple[int, ...],
348 | substructure: SubstructureDatum | None = None,
349 | ) -> ProtonationSite | None:
350 | """
351 | Create ProtonationSite with error handling.
352 |
353 | Args:
354 | mol: RDKit Mol object we are creating a protonation site for.
355 | idxs_match: Atom indices of substructure match.
356 | substructure: Substructure data
357 |
358 | Returns:
359 | ProtonationSite object or None if parameters are invalid
360 | """
361 | try:
362 | if substructure is None:
363 | substructure = SubstructureDatum()
364 | return ProtonationSite(
365 | mol=mol,
366 | idxs_match=idxs_match,
367 | pkas=substructure.pkas,
368 | smarts=substructure.smarts,
369 | name=substructure.name,
370 | )
371 | except (AssertionError, ValueError):
372 | return None
373 |
--------------------------------------------------------------------------------
/dimorphite_dl/smarts/site_substructures.smarts:
--------------------------------------------------------------------------------
1 | *Azide [N+0:1]=[N+:2]=[N+0:3]-[H] 2 4.65 0.07071067811865513
2 | Nitro [C,c,N,n,O,o:1]-[NX3:2](=[O:3])-[O:4]-[H] 3 -1000.0 0
3 | AmidineGuanidine1 [N:1]-[C:2](-[N:3])=[NX2:4]-[H:5] 3 12.025333333333334 1.5941046150769165
4 | AmidineGuanidine2 [C:1](-[N:2])=[NX2+0:3] 2 10.035538461538462 2.1312826469414716
5 | Sulfate [SX4:1](=[O:2])(=[O:3])([O:4]-[C,c,N,n:5])-[OX2:6]-[H] 5 -2.36 1.3048043093561141
6 | Sulfonate [SX4:1](=[O:2])(=[O:3])(-[C,c,N,n:4])-[OX2:5]-[H] 4 -1.8184615384615386 1.4086213481855594
7 | Sulfinic_acid [SX3:1](=[O:2])-[O:3]-[H] 2 1.7933333333333332 0.4372070447739835
8 | Phenyl_carboxyl [c,n,o:1]-[C:2](=[O:3])-[O:4]-[H] 3 3.463441968255319 1.2518054407928614
9 | Carboxyl [C:1](=[O:2])-[O:3]-[H] 2 3.456652971502591 1.2871420886834017
10 | Thioic_acid [C,c,N,n:1](=[O,S:2])-[SX2,OX2:3]-[H] 2 0.678267 1.497048763660801
11 | Phenyl_Thiol [c,n:1]-[SX2:2]-[H] 1 4.978235294117647 2.6137000480499806
12 | Thiol [C,N:1]-[SX2:2]-[H] 1 9.12448275862069 1.3317968158171463
13 |
14 | # [*]OP(=O)(O[H])O[H]. Note that this matches terminal phosphate of ATP, ADP, AMP.
15 | Phosphate [PX4:1](=[O:2])(-[OX2:3]-[H])(-[O+0:4])-[OX2:5]-[H] 2 2.4182608695652172 1.1091177991945305 5 6.5055 0.9512787792174668
16 |
17 | # Note that Internal_phosphate_polyphos_chain and
18 | # Initial_phosphate_like_in_ATP_ADP were added on 6/2/2020 to better deal with
19 | # molecules that have polyphosphate chains (e.g., ATP, ADP, NADH, etc.). Unlike
20 | # the other protonation states, these two were not determined by analyzing a set
21 | # of many compounds with experimentally determined pKa values.
22 |
23 | # For Internal_phosphate_polyphos_chain, we use a mean pKa value of 0.9, per
24 | # DOI: 10.7554/eLife.38821. For the precision value we use 1.0, which is roughly
25 | # the precision of the two ionizable hydroxyls from Phosphate (see above). Note
26 | # that when using recursive SMARTS strings, RDKit considers only the first atom
27 | # to be a match. Subsequent atoms define the environment.
28 | Internal_phosphate_polyphos_chain [$([PX4:1](=O)([OX2][PX4](=O)([OX2])(O[H]))([OX2][PX4](=O)(O[H])([OX2])))][O:2]-[H] 1 0.9 1.0
29 |
30 | # For Initial_phosphate_like_in_ATP_ADP, we use the same values found for the
31 | # lower-pKa hydroxyl of Phosphate (above).
32 | Initial_phosphate_like_in_ATP_ADP [$([PX4:1]([OX2][C,c,N,n])(=O)([OX2][PX4](=O)([OX2])(O[H])))]O-[H] 1 2.4182608695652172 1.1091177991945305
33 |
34 | # [*]P(=O)(O[H])O[H]. Cannot match terminal phosphate of ATP because O not among [C,c,N,n]
35 | Phosphonate [PX4:1](=[O:2])(-[OX2:3]-[H])(-[C,c,N,n:4])-[OX2:5]-[H] 2 1.8835714285714287 0.5925999820080644 5 7.247254901960784 0.8511476450801531
36 |
37 | Phenol [c,n,o:1]-[O:2]-[H] 1 7.065359866910526 3.277356122295936
38 | Peroxide1 [O:1]([$(C=O),$(C[Cl]),$(CF),$(C[Br]),$(CC#N):2])-[O:3]-[H] 2 8.738888888888889 0.7562592839596507
39 | Peroxide2 [C:1]-[O:2]-[O:3]-[H] 2 11.978235294117647 0.8697645895163075
40 | O=C-C=C-OH [O:1]=[C;R:2]-[C;R:3]=[C;R:4]-[O:5]-[H] 4 3.554 0.803339458581667
41 | Vinyl_alcohol [C:1]=[C:2]-[O:3]-[H] 2 8.871850714285713 1.660200255394124
42 | Alcohol [C:1]-[O:2]-[H] 1 14.780384615384616 2.546464970533435
43 | N-hydroxyamide [C:1](=[O:2])-[N:3]-[O:4]-[H] 3 9.301904761904762 1.2181897185891002
44 | *Ringed_imide1 [O,S:1]=[C;R:2]([$([#8]),$([#7]),$([#16]),$([#6][Cl]),$([#6]F),$([#6][Br]):3])-[N;R:4]([C;R:5]=[O,S:6])-[H] 3 6.4525 0.5555627777308341
45 | *Ringed_imide2 [O,S:1]=[C;R:2]-[N;R:3]([C;R:4]=[O,S:5])-[H] 2 8.681666666666667 1.8657779975741713
46 | *Imide [F,Cl,Br,S,s,P,p:1][#6:2][CX3:3](=[O,S:4])-[NX3+0:5]([CX3:6]=[O,S:7])-[H] 4 2.466666666666667 1.4843629385474877
47 | *Imide2 [O,S:1]=[CX3:2]-[NX3+0:3]([CX3:4]=[O,S:5])-[H] 2 10.23 1.1198214143335534
48 | *Amide_electronegative [C:1](=[O:2])-[N:3](-[Br,Cl,I,F,S,O,N,P:4])-[H] 2 3.4896 2.688124315081677
49 | *Amide [C:1](=[O:2])-[N:3]-[H] 2 12.00611111111111 4.512491341218857
50 | *Sulfonamide [SX4:1](=[O:2])(=[O:3])-[NX3+0:4]-[H] 3 7.9160326086956525 1.9842121316708763
51 | Anilines_primary [c:1]-[NX3+0:2]([H:3])[H:4] 1 3.899298673194805 2.068768503987161
52 | Anilines_secondary [c:1]-[NX3+0:2]([H:3])[!H:4] 1 4.335408163265306 2.1768842022330843
53 | Anilines_tertiary [c:1]-[NX3+0:2]([!H:3])[!H:4] 1 4.16690685045614 2.005865735782679
54 | Aromatic_nitrogen_unprotonated [n+0&H0:1] 0 4.3535441240733945 2.0714072661859584
55 | Amines_primary_secondary_tertiary [C:1]-[NX3+0:2] 1 8.159107682388349 2.5183597445318147
56 |
57 | # e.g., [*]P(=O)(O[H])[*]. Note that this cannot match the internal phosphates of ATP, because
58 | # oxygen is not among [C,c,N,n,F,Cl,Br,I]
59 | Phosphinic_acid [PX4:1](=[O:2])(-[C,c,N,n,F,Cl,Br,I:3])(-[C,c,N,n,F,Cl,Br,I:4])-[OX2:5]-[H] 4 2.9745 0.6867886750744557
60 |
61 | # e.g., [*]OP(=O)(O[H])O[*]. Cannot match ATP because P not among [C,c,N,n,F,Cl,Br,I]
62 | Phosphate_diester [PX4:1](=[O:2])(-[OX2:3]-[C,c,N,n,F,Cl,Br,I:4])(-[O+0:5]-[C,c,N,n,F,Cl,Br,I:4])-[OX2:6]-[H] 6 2.7280434782608696 2.5437448856908316
63 |
64 | # e.g., [*]P(=O)(O[H])O[*]. Cannot match ATP because O not among [C,c,N,n,F,Cl,Br,I].
65 | Phosphonate_ester [PX4:1](=[O:2])(-[OX2:3]-[C,c,N,n,F,Cl,Br,I:4])(-[C,c,N,n,F,Cl,Br,I:5])-[OX2:6]-[H] 5 2.0868 0.4503028610465036
66 |
67 | Primary_hydroxyl_amine [C,c:1]-[O:2]-[NH2:3] 2 4.035714285714286 0.8463816543155368
68 | *Indole_pyrrole [c;R:1]1[c;R:2][c;R:3][c;R:4][n;R:5]1[H] 4 14.52875 4.06702491591416
69 | *Aromatic_nitrogen_protonated [n:1]-[H] 0 7.17 2.94602395490212
70 |
--------------------------------------------------------------------------------
/docs/.nav.yml:
--------------------------------------------------------------------------------
1 | nav:
2 | - Home: index.md
3 | - API: api
4 | - Development: development.md
5 |
6 | sort:
7 | type: natural
8 | ignore_case: true
9 | by: title
10 | direction: asc
11 | flatten_single_child_sections: true
12 |
--------------------------------------------------------------------------------
/docs/.overrides/main.html:
--------------------------------------------------------------------------------
1 | {% extends "base.html" %}
2 |
3 | {% block header %}
4 | {{ super() }}
5 |
6 |
7 |
8 | {% endblock %}
9 |
10 |
11 |
--------------------------------------------------------------------------------
/docs/css/base.css:
--------------------------------------------------------------------------------
1 |
2 | /*Make the content wider and relative to window size.*/
3 | .md-grid {
4 | max-width: 85%
5 | }
6 |
7 | :root {
8 | --md-tooltip-width: 600px;
9 | }
10 |
11 | @page {
12 | size: letter;
13 | max-width: 100%;
14 | margin-top: 1in;
15 | margin-right: 0in;
16 | margin-bottom: 1in;
17 | margin-left: 0in;
18 | }
19 |
20 | .md-typeset h2 {
21 | line-height: 1.13;
22 | }
23 |
--------------------------------------------------------------------------------
/docs/css/colors.css:
--------------------------------------------------------------------------------
1 | [data-md-color-scheme="default"] {
2 |
3 | /* Primary color shades */
4 | --md-primary-fg-color: #025099;
5 | --md-primary-fg-color--light: #0437AD;
6 | --md-primary-bg-color: #ffffff; /* Header text */
7 | --md-primary-bg-color--light: #DBDBDB; /* Secondary header text */
8 |
9 | /* Default color shades */
10 | --md-default-fg-color: #646464; /* ??? */
11 | --md-default-fg-color--light: #7A7A7A; /* h1 */
12 | --md-default-fg-color--lighter: #9B9B9B; /* ??? */
13 | --md-default-fg-color--lightest: #BCBCBC; /* ??? */
14 |
15 | --md-default-bg-color: #FAFAFA; /* Body background */
16 | --md-default-bg-color--light: #FAFAFA;
17 | --md-default-bg-color--lighter: #FAFAFA;
18 | --md-default-bg-color--lightest: #FAFAFA;
19 |
20 | /* Code color shades */
21 | --md-code-fg-color: #36464e; /* Code block text color */
22 | --md-code-bg-color: #f1f1f1; /* Code block background */
23 |
24 | /* Code highlighting color shades */
25 | --md-code-hl-color: #0000ff;
26 | --md-code-hl-color--light: #0000ff;
27 | --md-code-hl-number-color: #d52a2a;
28 | --md-code-hl-special-color: #db1457;
29 | --md-code-hl-function-color: #a846b9;
30 | --md-code-hl-constant-color: #6e59d9;
31 | --md-code-hl-keyword-color: #3f6ec6;
32 | --md-code-hl-string-color: #1c7d4d;
33 | --md-code-hl-name-color: #36464e;
34 | --md-code-hl-operator-color: var(--md-primary-fg-color);
35 | --md-code-hl-punctuation-color: var(--md-primary-fg-color);
36 | --md-code-hl-comment-color: var(--md-primary-fg-color);
37 | --md-code-hl-generic-color: var(--md-primary-fg-color);
38 | --md-code-hl-variable-color: var(--md-primary-fg-color);
39 |
40 | /* Typeset color shades */
41 | --md-typeset-color: #212529; /* Main text color */
42 |
43 | /* Typeset `a` color shades */
44 | --md-typeset-a-color: #01a0d7; /* Link color */
45 |
46 | /* Typeset `table` color shades */
47 | --md-typeset-table-color: #a5a5a5; /* Outline color */
48 | --md-typeset-table-color--light: #e3e2e2; /* Hover color */
49 |
50 | /* Footer color shades */
51 | --md-footer-fg-color: #ffffff; /* ??? */
52 | --md-footer-fg-color--light: #e9ecef; /* Footer text */
53 | --md-footer-fg-color--lighter: #adb5bd; /* ??? */
54 | --md-footer-bg-color: #000000;
55 | --md-footer-bg-color--dark: #212529; /* Footer background */
56 |
57 | /* Accent color shades */
58 | --md-accent-fg-color: #032779; /* Hover over link */
59 | --md-accent-fg-color--transparent: #caf0f8; /* Hover over transparent (e.g., code with link) */
60 | --md-accent-bg-color: #ffffff;
61 | --md-accent-bg-color--light: #e5e5e5;
62 |
63 | /* Admonition colors */
64 | --md-admonition-fg-color: #212529;
65 | --md-admonition-bg-color: #FAFAFA;
66 | }
67 |
68 | [data-md-color-scheme="dark"] {
69 |
70 | /* Primary color shades */
71 | --md-primary-fg-color: #23243D;
72 | --md-primary-fg-color--light: #0437AD;
73 | --md-primary-bg-color: #ffffff; /* Header text */
74 | --md-primary-bg-color--light: #DBDBDB; /* Secondary header text */
75 |
76 | /* Default color shades */
77 | --md-default-fg-color: #e2e4e9; /* ??? */
78 | --md-default-fg-color--light: #ffffff; /* h1 */
79 | --md-default-fg-color--lighter: #e2e4e9; /* ??? */
80 | --md-default-fg-color--lightest: #e2e4e9; /* ??? */
81 |
82 | --md-default-bg-color: #212529; /* Body background */
83 | --md-default-bg-color--light: #FAFAFA;
84 | --md-default-bg-color--lighter: #FAFAFA;
85 | --md-default-bg-color--lightest: #FAFAFA;
86 |
87 | /* Code color shades */
88 | --md-code-fg-color: #dddddd; /* Code block text color */
89 | --md-code-bg-color: #333333; /* Code block background */
90 |
91 | /* Code highlighting color shades */
92 | --md-code-hl-color: #aeaeff;
93 | --md-code-hl-color--light: #aeaeff;
94 | --md-code-hl-number-color: #ff9494;
95 | --md-code-hl-special-color: #ffa0c0;
96 | --md-code-hl-function-color: #f3adff;
97 | --md-code-hl-constant-color: #bdaeff;
98 | --md-code-hl-keyword-color: #a0c1ff;
99 | --md-code-hl-string-color: #9fffcf;
100 | --md-code-hl-name-color: #f5f5f5;
101 | --md-code-hl-operator-color: #a6f0ff;
102 | --md-code-hl-punctuation-color: #a6f0ff;
103 | --md-code-hl-comment-color: #a6f0ff;
104 | --md-code-hl-generic-color: #a6f0ff;
105 | --md-code-hl-variable-color: #a6f0ff;
106 |
107 | /* Typeset color shades */
108 | --md-typeset-color: #ffffff; /* Main text color */
109 |
110 | /* Typeset `a` color shades */
111 | --md-typeset-a-color: #96E4FE; /* Link color */
112 |
113 | /* Typeset `table` color shades */
114 | --md-typeset-table-color: #a5a5a5; /* Outline color */
115 | --md-typeset-table-color--light: #343a40; /* Hover color */
116 |
117 | /* Footer color shades */
118 | --md-footer-fg-color: #ffffff; /* ??? */
119 | --md-footer-fg-color--light: #e9ecef; /* Footer text */
120 | --md-footer-fg-color--lighter: #adb5bd; /* ??? */
121 | --md-footer-bg-color: #000000;
122 | --md-footer-bg-color--dark: #171717; /* Footer background */
123 |
124 | /* Accent color shades */
125 | --md-accent-fg-color: #90e0ef; /* Hover over link */
126 | --md-accent-fg-color--transparent: #6D6D6D; /* Hover over transparent (e.g., code with link) */
127 | --md-accent-bg-color: #ffffff;
128 | --md-accent-bg-color--light: #e5e5e5;
129 |
130 | /* Admonition colors */
131 | --md-admonition-fg-color: #ffffff;
132 | --md-admonition-bg-color: #212529;
133 |
134 | .highlight-ipynb {
135 | --jp-mirror-editor-string-color: #98c379;
136 | --jp-mirror-editor-number-color: #d19a66;
137 | --jp-mirror-editor-keyword-color: #c678dd;
138 | --jp-mirror-editor-operator-color: #c678dd;
139 | }
140 |
141 |
142 |
143 | .highlight-ipynb {
144 | margin: 0;
145 | padding: 5px 10px;
146 | background-color: #23262A;
147 | }
148 |
149 | .highlight-ipynb .nf {
150 | color: #61afef;
151 | }
152 |
153 | .highlight-ipynb .p {
154 | color: #ffffff;
155 | }
156 |
157 | .highlight-ipynb .nb {
158 | color: #56b6c2;
159 | }
160 |
161 | .highlight-ipynb .kc {
162 | color: #d19a66;
163 | }
164 |
165 | .highlight-ipynb .c1 {
166 | color: #8f8f8f;
167 | }
168 |
169 | .jupyter-wrapper .jp-InputArea-editor {
170 | position: relative;
171 | border-color: #30363C;
172 | }
173 |
174 | .jupyter-wrapper .highlight pre {
175 | background-color: transparent;
176 | padding: 10px;
177 | overflow: auto;
178 | }
179 | }
180 |
--------------------------------------------------------------------------------
/docs/css/jupyter.css:
--------------------------------------------------------------------------------
1 | html
2 | /*
3 | This adjusts the font size of Jupyter notebook code blocks to be closer to normal.
4 | */
5 | .highlight {
6 | font-size: 85%;
7 | }
8 |
--------------------------------------------------------------------------------
/docs/css/launchy.css:
--------------------------------------------------------------------------------
1 | .launchy-container {
2 | display: flex;
3 | justify-content: flex-end;
4 | position: relative;
5 | margin-top: -4.4em;
6 | margin-bottom: 2em;
7 | margin-right: 0.4em;
8 | }
9 |
--------------------------------------------------------------------------------
/docs/css/mkdocstrings.css:
--------------------------------------------------------------------------------
1 | /*Indentation.*/
2 | div.doc-contents:not(.first) {
3 | padding-left: 35px; /*25px is the default*/
4 | border-left: .15rem solid #ededed;
5 | }
6 |
7 | .doc-heading .highlight {
8 | font-size: 18px;
9 | background-color: transparent;
10 | }
11 |
12 | /*Mark external links as such. */
13 | a.external::after,
14 | a.autorefs-external::after {
15 | /* */
16 | mask-image: url('data:image/svg+xml, ');
17 | -webkit-mask-image: url('data:image/svg+xml, ');
18 | content: ' ';
19 |
20 | display: inline-block;
21 | vertical-align: middle;
22 | position: relative;
23 |
24 | height: 1em;
25 | width: 1em;
26 | background-color: var(--md-typeset-a-color);
27 | }
28 |
29 | a.external:hover::after,
30 | a.autorefs-external:hover::after {
31 | background-color: var(--md-accent-fg-color);
32 | }
33 |
34 | /* Fancier color for operators such as * and |. */
35 | .doc-signature .o {
36 | color: var(--md-code-hl-special-color);
37 | }
38 |
39 | /* Fancier color for constants such as None, True, and False. */
40 | .doc-signature .kc {
41 | color: var(--md-code-hl-constant-color);
42 | }
43 |
44 | /* Fancier color for built-in types (only useful when cross-references are used). */
45 | .doc-signature .n > a[href^="https://docs.python.org/"][href*="/functions.html#"],
46 | .doc-signature .n > a[href^="https://docs.python.org/"][href*="/stdtypes.html#"] {
47 | color: var(--md-code-hl-constant-color);
48 | }
49 |
--------------------------------------------------------------------------------
/docs/development.md:
--------------------------------------------------------------------------------
1 | # Development
2 |
3 | This comprehensive guide provides detailed instructions to help maintainers effectively develop, test, document, build, and release new versions of `dimorphite_dl`.
4 |
5 | ## Setting up the Development Environment
6 |
7 | `dimorphite_dl` utilizes [`pixi`](https://pixi.sh/latest/) for managing environments and dependencies, streamlining the setup process. Follow these precise steps to configure your development environment:
8 |
9 | 1. **Clone the repository:**
10 | Begin by obtaining a local copy of the `dimorphite_dl` codebase:
11 |
12 | ```bash
13 | git clone git@github.com:durrantlab/dimorphite_dl.git
14 | cd dimorphite_dl
15 | ```
16 | 2. **Install dependencies:**
17 | Install all necessary dependencies by running:
18 |
19 | ```bash
20 | pixi install
21 | ```
22 | 3. **Activate the development environment:**
23 | To enter the isolated virtual environment configured specifically for `dimorphite_dl` development, execute:
24 |
25 | ```bash
26 | pixi shell
27 | ```
28 |
29 | You are now fully prepared and equipped to develop `dimorphite_dl`.
30 |
31 | ## Code Formatting and Style Guide
32 |
33 | Maintaining consistent style and formatting across the codebase is crucial for readability and maintainability.
34 | `dimorphite_dl` employs automated formatting tools configured to enforce standardized style guidelines.
35 | Execute the following command to apply formatting automatically:
36 |
37 | ```bash
38 | pixi run format
39 | ```
40 |
41 | This command runs `markdownlint-cli2` to enforce markdown formatting standards, `isort` to manage imports, and `ruff format` for Python formatting, ensuring your contributions align with project conventions.
42 |
43 | ## Documentation
44 |
45 | `dimorphite_dl`'s documentation is built using MkDocs, allowing easy creation and maintenance of high-quality documentation.
46 | To locally preview documentation changes, serve the documentation by running:
47 |
48 | ```bash
49 | pixi run -e docs serve
50 | ```
51 |
52 | After execution, open your web browser and visit [`http://127.0.0.1:8000/`](http://127.0.0.1:8000/) to review changes in real-time.
53 |
54 | ## Testing
55 |
56 | Writing and maintaining tests is essential for ensuring code correctness, reliability, and stability.
57 | Execute `dimorphite_dl`'s tests with:
58 |
59 | ```bash
60 | pixi run -e dev tests
61 | ```
62 |
63 | Additionally, you can evaluate test coverage to identify untested areas and improve overall reliability by running:
64 |
65 | ```bash
66 | pixi run -e dev coverage
67 | ```
68 |
69 | Review the generated coverage reports to address any gaps in testing.
70 |
71 | ## Bumping Version
72 |
73 | Releasing a new version of `dimorphite_dl` requires updating version information, documenting changes, and creating a corresponding release tag. Follow these steps precisely to ensure consistency and traceability:
74 |
75 | 1. **Update the changelog:**
76 | Document all notable changes since the previous release in the `CHANGELOG.md` file. Follow a consistent and clear format to help users understand what has changed.
77 |
78 | 3. **Commit the changes:**
79 | Stage and commit the version bump and changelog update using a clear and standardized message, for example:
80 |
81 | ```bash
82 | git add .
83 | git commit -m "bump: v1.2.5"
84 | ```
85 |
86 | 4. **Tag the commit:**
87 | Create a version tag that follows the `v<major>.<minor>.<patch>` format (for example, `v1.2.5`):
88 |
89 | ```bash
90 | git tag v1.2.5
91 | git push origin main --tags
92 | ```
93 |
94 | 5. **Create a GitHub release:**
95 | Navigate to the [GitHub Releases](https://github.com/durrantlab/dimorphite_dl/releases) page and draft a new release:
96 |
97 | - Tag version: `v1.2.5`
98 | - Release title: `v1.2.5`
99 | - Description: Copy the relevant changelog section or summarize the key changes.
100 |
101 | Attach the built package files from the `dist/` directory, if desired.
102 |
103 | ## Building the Package
104 |
105 | Prepare `dimorphite_dl` for publishing or distribution by building the package.
106 | Execute:
107 |
108 | ```bash
109 | pixi run build
110 | ```
111 |
112 | Upon completion, inspect the `dist` directory for the generated distribution files, which are ready for publication.
113 |
114 | ## Publishing to PyPI
115 |
116 | Once the version number is updated and the package is built, it can be published to PyPI.
117 | Execute:
118 |
119 | ```bash
120 | pixi run publish
121 | ```
122 |
123 | For preliminary testing or release candidates, it is highly recommended to publish to TestPyPI first.
124 | Execute:
125 |
126 | ```bash
127 | pixi run publish-test
128 | ```
129 |
130 | Publishing to TestPyPI allows you to validate packaging correctness and installation processes without affecting production users.
131 |
132 | ## Maintenance Best Practices
133 |
134 | To maintain high quality and reliability of `dimorphite_dl`, adhere to the following best practices:
135 |
136 | - Regularly synchronize your local repository with the main branch to incorporate the latest updates:
137 |
138 | ```bash
139 | git pull origin main
140 | ```
141 | - Frequently review and address open issues and pull requests on GitHub.
142 | - Clearly document changes in commit messages, issue descriptions, and pull requests.
143 | - Routinely verify dependencies and update them as necessary to maintain compatibility and security.
144 |
145 | Adhering to these guidelines ensures a robust, stable, and continuously improving `dimorphite_dl` project.
146 |
147 | This expanded documentation guide covers the entire workflow comprehensively, providing clarity and precision for effective `dimorphite_dl` project maintenance.
148 |
--------------------------------------------------------------------------------
/docs/gen_ref_pages.py:
--------------------------------------------------------------------------------
1 | """Generate the code reference pages."""
2 |
3 | import os
4 | from pathlib import Path
5 |
6 | import mkdocs_gen_files
7 |
# Package directory that is scanned for Python modules.
SRC_DIR = "dimorphite_dl"
# Directory (relative to the docs root) that receives the generated pages.
WRITE_DIR = "api"

# Create the output directory once, before the loop. `exist_ok=True` replaces
# the per-file "exists? then mkdir" probe and its check-then-create race.
os.makedirs(WRITE_DIR, exist_ok=True)

for path in sorted(Path(SRC_DIR).rglob("*.py")):
    # Location of the module relative to the package root, computed once and
    # reused for both the dotted identifier and the Markdown file name.
    relative_path = path.relative_to(SRC_DIR)
    module_path = relative_path.with_suffix("")
    doc_path = relative_path.with_suffix(".md")
    full_doc_path = Path(WRITE_DIR, doc_path)

    parts = list(module_path.parts)

    if parts[-1] == "__init__":
        # A package is identified by its directory, not by `__init__`.
        parts = parts[:-1]
    elif parts[-1] == "__main__":
        # Entry-point modules get no reference page.
        continue

    if not parts:
        # The top-level `__init__.py` leaves nothing to identify; skip it.
        continue

    with mkdocs_gen_files.open(full_doc_path, "w") as fd:
        # A single mkdocstrings directive (`::: dotted.name`) per page.
        identifier = ".".join(parts)
        print("::: " + identifier, file=fd)

    # Point the page's "edit" link at the Python source file.
    mkdocs_gen_files.set_edit_path(full_doc_path, path)
37 |
--------------------------------------------------------------------------------
/docs/img/launchy/colab.svg:
--------------------------------------------------------------------------------
1 |
2 |
17 |
19 |
40 |
43 |
47 |
51 |
55 |
59 |
63 |
64 |
65 |
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | --8<-- "README.md"
2 |
--------------------------------------------------------------------------------
/docs/js/mathjax-config.js:
--------------------------------------------------------------------------------
1 | window.MathJax = {
2 | tex: {
3 | inlineMath: [["\\(", "\\)"], ["$", "$"]],
4 | displayMath: [["\\[", "\\]"], ["$$", "$$"]],
5 | processEscapes: true,
6 | processEnvironments: true
7 | },
8 | options: {
9 | ignoreHtmlClass: ".*|",
10 | processHtmlClass: "arithmatex"
11 | }
12 | };
13 |
14 | document$.subscribe(() => {
15 | MathJax.startup.output.clearCache()
16 | MathJax.typesetClear()
17 | MathJax.texReset()
18 | MathJax.typesetPromise()
19 | })
20 |
--------------------------------------------------------------------------------
/hooks/launchy.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from bs4 import BeautifulSoup
4 |
5 |
def is_jupyter(markdown=None, html=None):
    """Report whether page content was rendered from a Jupyter notebook.

    Args:
        markdown: Raw Markdown of the page. Markdown input is never treated
            as a notebook.
        html: Rendered HTML of the page. It is a notebook when it contains
            the notebook wrapper class.

    Returns:
        True if `html` contains the notebook wrapper marker, otherwise False.

    Raises:
        ValueError: If neither `markdown` nor `html` is provided.
    """
    # NOTE(review): the marker literal was empty in this listing, which made
    # every page count as a notebook. `jupyter-wrapper` is the class the site
    # CSS targets for notebook pages -- confirm against mkdocs-jupyter output.
    jupyter_marker = "jupyter-wrapper"

    # Compare against None so an empty (but provided) page body is answered
    # with False instead of falling through to the ValueError below.
    if html is not None:
        return jupyter_marker in html
    if markdown is not None:
        return False
    raise ValueError("Must provide either markdown or html")
12 |
13 |
def on_page_content(html, page, config, files):
    """Add a Google Colab launch button below the first heading of notebook pages.

    Args:
        html: Rendered HTML of the page.
        page: Page object; `page.url` and `page.file.abs_src_path` are read.
        config: Site configuration; `config["colab_base_url"]` is read.
        files: Unused.

    Returns:
        The HTML with the launch button inserted after the first `<h1>`, or
        the unchanged `html` when the page is not a notebook or has no `<h1>`.
    """
    # Empty pages and non-notebook pages pass through untouched.
    if not html or not is_jupyter(html=html):
        return html

    # Turn the page URL back into the notebook's path in the repository.
    page_url = page.url
    if page.file.abs_src_path.endswith("/index.ipynb"):
        page_url += "index.ipynb"
    else:
        # Drop the trailing "/" of the directory-style URL.
        page_url = page_url[:-1] + ".ipynb"

    # URLs always use "/"; os.path.join would use the platform separator.
    colab_url = config["colab_base_url"].rstrip("/") + "/" + page_url.lstrip("/")

    # NOTE(review): the button markup was empty in this listing. This version
    # is reconstructed from the `.launchy-container` CSS rule and the bundled
    # `img/launchy/colab.svg` asset -- confirm markup and image path.
    colab_launch_html = f"""
    <div class="launchy-container">
        <a href="{colab_url}" target="_blank">
            <img src="/img/launchy/colab.svg" alt="Open in Colab"/>
        </a>
    </div>
    """

    soup = BeautifulSoup(html, "html.parser")
    h1_tag = soup.find("h1")
    if h1_tag is None:
        # No heading to anchor the button to; leave the page as it is
        # rather than failing the build with an AttributeError.
        return html
    h1_tag.insert_after(BeautifulSoup(colab_launch_html, "html.parser"))
    return soup.prettify()
36 |
--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
1 | docs_dir: docs
2 |
3 | site_name: dimorphite_dl
4 | site_author: durrantlab
5 |
6 | repo_name: durrantlab/dimorphite_dl
7 | repo_url: https://github.com/durrantlab/dimorphite_dl
8 | copyright:
CC BY-NC-SA 4.0 by OASCI
9 |
10 | # https://squidfunk.github.io/mkdocs-material/
11 | theme:
12 | name: material
13 | custom_dir: docs/.overrides
14 | language: en
15 | # logo: img/logo.svg
16 | palette:
17 | # Palette toggle for light mode
18 | - scheme: default
19 | toggle:
20 | icon: material/lightbulb-outline
21 | name: Switch to dark mode
22 |
23 | # Palette toggle for dark mode
24 | - scheme: dark
25 | toggle:
26 | icon: material/lightbulb
27 | name: Switch to light mode
28 | font:
29 | text: Roboto
30 | code: Roboto Mono
31 | icon:
32 | repo: fontawesome/brands/github
33 | annotation: material/star-four-points-circle
34 | features:
35 | - content.code.annotate
36 | - content.code.copy
37 | - content.code.select
38 | - content.tooltips
39 | - content.tabs.link
40 | - navigation.tabs
41 | - navigation.tabs.sticky
42 | - navigation.tracking
43 | - navigation.top
44 | - navigation.indexes
45 | - navigation.path
46 | - navigation.prune
47 | - toc.follow
48 | - search.suggest
49 |
50 | validation:
51 | omitted_files: warn
52 | absolute_links: warn
53 | unrecognized_links: warn
54 |
55 | # Options need to be indented twice for some reason?
56 | plugins:
57 | - search
58 | - autorefs
59 | - material-plausible
60 | - gen-files:
61 | scripts:
62 | - docs/gen_ref_pages.py
63 | - mkdocstrings:
64 | handlers:
65 | python:
66 | inventories:
67 | - "https://docs.python.org/3/objects.inv"
68 | paths: ["dimorphite_dl"]
69 | options:
70 | show_source: false
71 | show_root_heading: false
72 | annotations_path: brief
73 | docstring_style: google
74 | merge_init_into_class: true
75 | docstring_section_style: spacy
76 | show_if_no_docstring: true
77 | show_labels: false
78 | parameter_headings: false
79 | show_symbol_type_heading: true
80 | show_symbol_type_toc: true
81 | - mkdocs-jupyter:
82 | no_input: False
83 | include_requirejs: true
84 | include_source: True
85 | ignore: ["*.py"]
86 | remove_tag_config:
87 | remove_input_tags:
88 | - hide_code
89 | - awesome-nav
90 | - glightbox
91 | - macros
92 | - print-site
93 | - git-revision-date-localized:
94 | type: iso_datetime
95 | timezone: America/Detroit
96 | fallback_to_build_date: true
97 |
98 | hooks:
99 | - hooks/launchy.py
100 | colab_base_url: https://colab.research.google.com/github/durrantlab/dimorphite_dl/blob/main/study
101 |
102 |
103 | extra:
104 | generator: false
105 |
106 | extra_css:
107 | - css/base.css
108 | - css/colors.css
109 | - css/jupyter.css
110 | - css/mkdocstrings.css
111 | - css/launchy.css
112 |
113 | extra_javascript:
114 | - js/mathjax-config.js
115 |
116 | markdown_extensions:
117 | - abbr
118 | - toc:
119 | permalink: true
120 | - admonition
121 | - attr_list
122 | - def_list
123 | - footnotes
124 | - md_in_html
125 | - tables
126 | - pymdownx.arithmatex:
127 | generic: true
128 | - pymdownx.betterem
129 | - pymdownx.caret
130 | - pymdownx.details
131 | - pymdownx.highlight:
132 | anchor_linenums: true
133 | line_spans: __span
134 | pygments_lang_class: true
135 | - pymdownx.inlinehilite
136 | - pymdownx.keys
137 | - pymdownx.mark
138 | - pymdownx.smartsymbols
139 | - pymdownx.snippets
140 | - pymdownx.superfences:
141 | custom_fences:
142 | - name: mermaid
143 | class: mermaid
144 | format: !!python/name:pymdownx.superfences.fence_code_format
145 | - pymdownx.tabbed:
146 | alternate_style: true
147 | - pymdownx.tasklist:
148 | custom_checkbox: true
149 | - pymdownx.tilde
150 |
--------------------------------------------------------------------------------
/pixi.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | authors = [
3 | "durrantlab
",
4 | ]
5 | channels = ["conda-forge"]
6 | name = "dimorphite_dl"
7 | description = "Adds hydrogen atoms to molecular representations as specified by pH"
8 | platforms = ["win-64", "linux-64", "osx-64"]
9 | license = "Apache-2.0"
10 | readme = "README.md"
11 |
12 | [pypi-dependencies]
13 | dimorphite_dl = { path = ".", editable = true }
14 |
15 | [environments]
16 | dev = ["dev"]
17 | docs = ["docs"]
18 |
19 | [tasks]
20 |
21 | [dependencies]
22 | python = "==3.13"
23 | rdkit = ">=2020.3.3,<2026"
24 |
25 | [feature.dev.dependencies]
26 | isort = ">=5.12.0"
27 | pylint = ">=3.0.1"
28 | mypy = ">=1.6.0"
29 | pytest = ">=7.4.2"
30 | pytest-cov = ">=4.1.0"
31 | coverage = ">=7.3.1"
32 | pytest-html = ">=4.0.1"
33 | colorama = ">=0.4.6"
34 | basedpyright = ">=1.29.1,<2"
35 | ruff = ">=0.11.10,<0.12"
36 | twine = ">=6.1.0,<7"
37 | ipykernel = ">=6.29.5,<7"
38 |
39 | [feature.dev.tasks]
40 | mdlint = { cmd = ["markdownlint-cli2", '"**/*.{md,markdown}"', "--fix", "--config", ".markdownlint.yaml", "||", "true"] }
41 | isort = { cmd = ["isort", "--settings-path", ".isort.cfg", "./dimorphite_dl", "./tests", "||", "true"] }
42 | ruff = { cmd = ["ruff", "format", "--config", ".ruff.toml", "./dimorphite_dl", "./tests", "||", "true"] }
43 | format = { depends-on = ["mdlint", "isort", "ruff"] }
44 | tests = { cmd = [
45 | "PYTHONPATH=.",
46 | "pytest",
47 | "-c",
48 | ".pytest.ini",
49 | "--cov='dimorphite_dl'",
50 | "--cov-report=xml",
51 | "--junit-xml=report.xml",
52 | "--failed-first",
53 | ]}
54 | coverage = { cmd = ["coverage", "report"] }
55 | cleanup-build = { cmd = ["rm", "-rf", "./build", "./dist"] }
56 | build = { cmd = ["python3", "-m", "build"], depends-on = ["cleanup-build"]}
57 | publish-test = { cmd = ["twine", "upload", "--repository", "testpypi", "dist/*"] }
58 | publish = { cmd = ["twine", "upload", "dist/*"] }
59 |
60 | [feature.dev.pypi-dependencies]
61 | build = ">=1.2.2.post1,<2"
62 | mypy-extensions = ">=1.0.0"
63 | black = { version = ">=23.10.0", extras = ["jupyter"] }
64 | pyrefly = ">=0.16.0, <0.17"
65 | setuptools-scm = ">=8.0.0"
66 |
67 | [feature.docs.dependencies]
68 | mkdocs = ">=1.6.1,<2"
69 | mkdocs-material = ">=9.6.5,<10"
70 | mkdocstrings = ">=0.28.2,<0.29"
71 | mkdocstrings-python = ">=1.16.2,<2"
72 | pymdown-extensions = ">=10.14.3,<11"
73 | mkdocs-table-reader-plugin = ">=3.1.0,<4"
74 | mkdocs-gen-files = ">=0.4.0,<0.5"
75 | mkdocs-macros-plugin = ">=1.3.7,<2"
76 | mkdocs-jupyter = ">=0.25.1,<0.26"
77 | mkdocs-glightbox = ">=0.4.0,<0.5"
78 | mkdocs-git-revision-date-localized-plugin = ">=1.2.9,<2"
79 |
80 | [feature.docs.pypi-dependencies]
81 | material-plausible-plugin = ">=0.2.0,<0.3"
82 | mkdocs-print-site-plugin = ">=2.6.0,<3"
83 | mkdocs-awesome-nav = ">=3.0.0,<4"
84 |
85 | [feature.docs.tasks]
86 | docs = { cmd = ["rm", "-rf", "./public/", "&&", "mkdocs", "build", "-d", "public/"] }
87 | serve = { cmd = ["mkdocs", "serve"] }
88 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | authors = [
3 | {name = "Durrant Lab @ Pitt", email = "durrantj@pitt.edu"}
4 | ]
5 | maintainers = [
6 | {name = "Alex M. Maldonado", email = "alex.maldonado@pitt.edu"}
7 | ]
8 | description = "Adds hydrogen atoms to molecular representations as specified by pH"
9 | name = "dimorphite_dl"
10 | dynamic = ["version"]
11 | readme = "README.md"
12 | requires-python = ">=3.10"
13 | license = "Apache-2.0"
14 | classifiers = [
15 | "Development Status :: 5 - Production/Stable",
16 | "Intended Audience :: Developers",
17 | "Intended Audience :: Science/Research",
18 | "Natural Language :: English",
19 | "Programming Language :: Python",
20 | "Programming Language :: Python :: 3",
21 | "Programming Language :: Python :: 3.10",
22 | "Programming Language :: Python :: 3.11",
23 | "Programming Language :: Python :: 3.12",
24 | "Programming Language :: Python :: 3.13",
25 | "Topic :: Scientific/Engineering :: Chemistry",
26 | ]
27 |
28 | # TODO: Keep this here until pixi releases building capabilities
29 | dependencies = [
30 | "loguru>=0.7.2,<0.8",
31 | "rdkit>=2020.3.3,<2026",
32 | ]
33 |
34 | [project.urls]
35 | Documentation = "https://durrantlab.github.io/dimorphite_dl"
36 | Repository = "https://github.com/durrantlab/dimorphite_dl"
37 | Issues = "https://github.com/durrantlab/dimorphite_dl/issues"
38 | Changelog = "https://github.com/durrantlab/dimorphite_dl/blob/main/CHANGELOG.md"
39 |
40 | [project.scripts]
41 | dimorphite_dl = "dimorphite_dl.cli:run_cli"
42 |
43 | [build-system]
44 | requires = ["setuptools>=61.0", "setuptools-scm>=8", "wheel"]
45 |
46 | [tool.setuptools.packages.find]
47 | where = ["."]
48 | include = ["dimorphite_dl*"]
49 |
50 | [tool.setuptools.package-data]
51 | dimorphite_dl = ["smarts/*.smarts"]
52 |
53 | [tool.setuptools_scm]
54 | write_to = "dimorphite_dl/_version.py"
55 | version_scheme = "guess-next-dev"
56 | local_scheme = "node-and-timestamp"
57 |
58 |
--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import pytest
4 | from rdkit import Chem
5 |
6 | from dimorphite_dl import enable_logging
7 | from dimorphite_dl.io import SMILESProcessor
8 |
9 | TEST_DIR = os.path.dirname(__file__)
10 |
11 |
def compare_smiles(smiles1, smiles2):
    """Assert that two SMILES strings canonicalize to the same isomeric SMILES.

    Args:
        smiles1: SMILES produced by the code under test.
        smiles2: Expected SMILES.

    Raises:
        AssertionError: If either string cannot be parsed, or if the
            canonical forms differ.
    """
    detected_mol = Chem.MolFromSmiles(smiles1)
    # MolFromSmiles returns None on a parse failure; report that directly
    # instead of letting MolToSmiles fail with an opaque argument error.
    assert detected_mol is not None, f"could not parse SMILES {smiles1!r}"
    expected_mol = Chem.MolFromSmiles(smiles2)
    assert expected_mol is not None, f"could not parse SMILES {smiles2!r}"

    detected_can = Chem.MolToSmiles(detected_mol, isomericSmiles=True)
    assert isinstance(detected_can, str)
    expected_can = Chem.MolToSmiles(expected_mol, isomericSmiles=True)
    assert isinstance(expected_can, str)
    assert detected_can == expected_can, f"got {smiles1}, expected {smiles2}"
18 |
19 |
def compare_smarts(smarts1, smarts2):
    """Assert that two SMARTS patterns are equal after an RDKit round trip.

    Args:
        smarts1: SMARTS produced by the code under test.
        smarts2: Expected SMARTS.

    Raises:
        AssertionError: If either pattern cannot be parsed, or if the
            round-tripped patterns differ.
    """
    detected_pattern = Chem.MolFromSmarts(smarts1)
    # MolFromSmarts returns None on a parse failure; surface that clearly.
    assert detected_pattern is not None, f"could not parse SMARTS {smarts1!r}"
    expected_pattern = Chem.MolFromSmarts(smarts2)
    assert expected_pattern is not None, f"could not parse SMARTS {smarts2!r}"

    detected_can = Chem.MolToSmarts(detected_pattern)
    expected_can = Chem.MolToSmarts(expected_pattern)
    assert detected_can == expected_can, f"got {smarts1}, expected {smarts2}"
24 |
25 |
@pytest.fixture(scope="session", autouse=True)
def turn_on_logging():
    """Session-scoped autouse fixture that calls `enable_logging(0)`."""
    log_level = 0
    enable_logging(log_level)
29 |
30 |
31 | # Pytest fixtures for reusable test data
@pytest.fixture
def sample_smiles_list():
    """Return a fresh list of five small example SMILES strings."""
    smiles_strings = [
        "CCO",
        "CCC",
        "c1ccccc1",
        "CC(C)C",
        "CCN",
    ]
    return smiles_strings
36 |
37 |
@pytest.fixture
def sample_smiles_file(tmp_path):
    """Write a three-line SMILES file under `tmp_path` and return its path as a string."""
    smi_path = tmp_path / "test_molecules.smi"
    # Each line holds a SMILES string followed by a name.
    smi_path.write_text("CCO ethanol\nCCC propane\nc1ccccc1 benzene\n")
    return str(smi_path)
45 |
46 |
@pytest.fixture
def processor_no_validation():
    """Return a `SMILESProcessor` constructed with `validate_smiles=False`."""
    processor = SMILESProcessor(validate_smiles=False)
    return processor
51 |
--------------------------------------------------------------------------------
/tests/files/sample_molecules.smi:
--------------------------------------------------------------------------------
1 | C#CCO Alcohol
2 | C(=O)N Amide
3 | CC(=O)NOC(C)=O Amide_electronegative
4 | COC(=N)N AmidineGuanidine2
5 | Brc1ccc(C2NCCS2)cc1 Amines_primary_secondary_tertiary
6 | CC(=O)[n+]1ccc(N)cc1 Anilines_primary
7 | CCNc1ccccc1 Anilines_secondary
8 | Cc1ccccc1N(C)C Anilines_tertiary
9 | BrC1=CC2=C(C=C1)NC=C2 Aromatic_nitrogen_protonated
10 | C-N=[N+]=[N@H] Azide
11 | BrC(C(O)=O)CBr Carboxyl
12 | NC(NN=O)=N AmidineGuanidine1
13 | C(F)(F)(F)C(=O)NC(=O)C Imide
14 | O=C(C)NC(C)=O Imide2
15 | CC(C)(C)C(N(C)O)=O N-hydroxyamide
16 | C[N+](O)=O Nitro
17 | O=C1C=C(O)CC1 O=C-C=C-OH
18 | C1CC1OO Peroxide2
19 | C(=O)OO Peroxide1
20 | Brc1cc(O)cc(Br)c1 Phenol
21 | CC(=O)c1ccc(S)cc1 Phenyl_Thiol
22 | C=CCOc1ccc(C(=O)O)cc1 Phenyl_carboxyl
23 | COP(=O)(O)OC Phosphate_diester
24 | CP(C)(=O)O Phosphinic_acid
25 | CC(C)OP(C)(=O)O Phosphonate_ester
26 | CC1(C)OC(=O)NC1=O Ringed_imide1
27 | O=C(N1)C=CC1=O Ringed_imide2
28 | O=S(OC)(O)=O Sulfate
29 | COc1ccc(S(=O)O)cc1 Sulfinic_acid
30 | CS(N)(=O)=O Sulfonamide
31 | CC(=O)CSCCS(O)(=O)=O Sulfonate
32 | CC(=O)S Thioic_acid
33 | C(C)(C)(C)(S) Thiol
34 | Brc1cc[nH+]cc1 Aromatic_nitrogen_unprotonated
35 | C=C(O)c1c(C)cc(C)cc1C Vinyl_alcohol
36 | CC(=O)ON Primary_hydroxyl_amine
37 | O=P(O)(O)OCCCC Phosphate
38 | CC(P(O)(O)=O)C Phosphonate
39 |
--------------------------------------------------------------------------------
/tests/mol/test_detect_substruct.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from conftest import compare_smarts, compare_smiles # type: ignore
3 | from rdkit import Chem
4 |
5 | from dimorphite_dl.mol import MoleculeRecord
6 | from dimorphite_dl.protonate.detect import ProtonationSiteDetector
7 |
8 |
@pytest.mark.parametrize(
    ("smiles", "smiles_prepped_correct", "expected_smarts", "expected_idxs_match"),
    [
        ("C#CCO", "[H]C#CC([H])([H])O[H]", "[C:1]-[O:2]-[#1]", (2, 3, 7)),
        ("Brc1cc[nH+]cc1", "[H]c1nc([H])c([H])c(Br)c1[H]", "[n&+0&H0:1]", (4,)),
        (
            "C-N=[N+]=[N@H]",
            "[H]N=[N+]=NC([H])([H])[H]",
            "[N&+0:1]=[N&+:2]=[N&+0:3]-[#1]",
            (1, 2, 3, 7),
        ),
        (
            "O=P(O)(O)OCCCC",
            "[H]OP(=O)(O[H])OC([H])([H])C([H])([H])C([H])([H])C([H])([H])[H]",
            "[P&X4:1](=[O:2])(-[O&X2:3]-[#1])(-[O&+0:4])-[O&X2:5]-[#1]",
            (5, 6, 7, 18, 4, 8, 19),
        ),
    ],
)
def test_substructure_detect(
    smiles, smiles_prepped_correct, expected_smarts, expected_idxs_match
):
    """Check molecule preparation and the first detected protonation site.

    For each case, the prepared molecule must match `smiles_prepped_correct`,
    and the first detected site must carry the expected SMARTS pattern and
    matched atom indices.
    """
    mol_record = MoleculeRecord(smiles)

    detector = ProtonationSiteDetector()

    # Prepare the molecule and verify the prepared form.
    mol = mol_record.prepare_for_protonation()
    smiles_prepped = Chem.MolToSmiles(mol)
    compare_smiles(smiles_prepped, smiles_prepped_correct)

    # Detect substructures; fail with context rather than an IndexError
    # when nothing is detected.
    substructures = list(detector._detect_all_sites_in_molecule(mol))
    assert substructures, f"no protonation sites detected for {smiles!r}"
    sub_match = substructures[0]

    # Canonicalize both SMARTS before comparing instead of raw string equality.
    compare_smarts(sub_match.smarts, expected_smarts)
    # Atom indices are compared exactly.
    assert sub_match.idxs_match == expected_idxs_match
48 |
--------------------------------------------------------------------------------
/tests/mol/test_neutralize.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from dimorphite_dl.mol import MoleculeRecord
4 |
5 |
@pytest.mark.parametrize(
    ("input_smiles", "exp_azides", "exp_neutral", "exp_canonical"),
    [
        ("C#CCO", "C#CCO", "C#CCO", "C#CCO"),
        ("Brc1cc[nH+]cc1", "Brc1cc[nH+]cc1", "Brc1ccncc1", "Brc1ccncc1"),
        ("C-N=[N+]=[N@H]", "C-N=[N+]=[N@H]", "CN=[N+]=N", "CN=[N+]=N"),
        ("O=P(O)(O)OCCCC", "O=P(O)(O)OCCCC", "CCCCOP(=O)(O)O", "CCCCOP(=O)(O)O"),
    ],
)
def test_molecule_preparation_steps(
    input_smiles, exp_azides, exp_neutral, exp_canonical
):
    """Run the preparation steps in order and check `smiles` after each one."""
    record = MoleculeRecord(input_smiles)

    # Step 1: azide processing.
    record.process_azides()
    assert record.smiles == exp_azides, (
        f"after process_azides: got {record.smiles!r}, expected {exp_azides!r}"
    )

    # Step 2: neutralization.
    record.neutralize()
    assert record.smiles == exp_neutral, (
        f"after neutralize: got {record.smiles!r}, expected {exp_neutral!r}"
    )

    # Step 3: canonicalization.
    record.make_canonical()
    assert record.smiles == exp_canonical, (
        f"after make_canonical: got {record.smiles!r}, expected {exp_canonical!r}"
    )
31 |
--------------------------------------------------------------------------------
/tests/protonate/test_data.py:
--------------------------------------------------------------------------------
1 | """
2 | Tests for loading protonation site data
3 | """
4 |
5 | from dimorphite_dl.protonate.data import PKaData
6 |
7 |
def test_data_init():
    """Two `PKaData` instances compare equal and hold 41 substructure entries."""
    first, second = PKaData(), PKaData()
    assert first == second

    assert len(first._data) == 41
15 |
--------------------------------------------------------------------------------
/tests/protonate/test_run.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from conftest import compare_smiles # type: ignore
3 |
4 | from dimorphite_dl import protonate_smiles
5 |
6 |
# Every molecule should be protonated.
# Each case is (input SMILES, SMILES expected at an extremely low pH); the
# trailing comment names the substructure the case is meant to exercise.
_VERY_ACIDIC_CASES = [
    ("C#CCO", "C#CCO"),  # Alcohol
    ("C(=O)N", "NC=O"),  # Amide
    ("CC(=O)NOC(C)=O", "CC(=O)NOC(C)=O"),  # Amide_electronegative
    ("COC(=N)N", "COC(N)=[NH2+]"),  # AmidineGuanidine2
    # Amines_primary_secondary_tertiary
    ("Brc1ccc(C2NCCS2)cc1", "Brc1ccc(C2[NH2+]CCS2)cc1"),
    ("CC(=O)[n+]1ccc(N)cc1", "CC(=O)[n+]1ccc([NH3+])cc1"),  # Anilines_primary
    ("CCNc1ccccc1", "CC[NH2+]c1ccccc1"),  # Anilines_secondary
    ("Cc1ccccc1N(C)C", "Cc1ccccc1[NH+](C)C"),  # Anilines_tertiary
    ("BrC1=CC2=C(C=C1)NC=C2", "Brc1ccc2[nH]ccc2c1"),  # Indole_pyrrole
    ("O=c1cc[nH]cc1", "O=c1cc[nH]cc1"),  # Aromatic_nitrogen_protonated
    ("C-N=[N+]=[N@H]", "CN=[N+]=N"),  # Azide
    ("BrC(C(O)=O)CBr", "O=C(O)C(Br)CBr"),  # Carboxyl
    ("NC(NN=O)=N", "NC(=[NH2+])NN=O"),  # AmidineGuanidine1
    ("C(F)(F)(F)C(=O)NC(=O)C", "CC(=O)NC(=O)C(F)(F)F"),  # Imide
    ("O=C(C)NC(C)=O", "CC(=O)NC(C)=O"),  # Imide2
    ("CC(C)(C)C(N(C)O)=O", "CN(O)C(=O)C(C)(C)C"),  # N-hydroxyamide
    ("C[N+](O)=O", "C[N+](=O)O"),  # Nitro
    ("O=C1C=C(O)CC1", "O=C1C=C(O)CC1"),  # O=C-C=C-OH
    ("C1CC1OO", "OOC1CC1"),  # Peroxide2
    ("C(=O)OO", "O=COO"),  # Peroxide1
    ("Brc1cc(O)cc(Br)c1", "Oc1cc(Br)cc(Br)c1"),  # Phenol
    ("CC(=O)c1ccc(S)cc1", "CC(=O)c1ccc(S)cc1"),  # Phenyl_Thiol
    ("C=CCOc1ccc(C(=O)O)cc1", "C=CCOc1ccc(C(=O)O)cc1"),  # Phenyl_carboxyl
    ("COP(=O)(O)OC", "COP(=O)(O)OC"),  # Phosphate_diester
    ("CP(C)(=O)O", "CP(C)(=O)O"),  # Phosphinic_acid
    ("CC(C)OP(C)(=O)O", "CC(C)OP(C)(=O)O"),  # Phosphonate_ester
    ("CC1(C)OC(=O)NC1=O", "CC1(C)OC(=O)NC1=O"),  # Ringed_imide1
    ("O=C(N1)C=CC1=O", "O=C1C=CC(=O)N1"),  # Ringed_imide2
    ("O=S(OC)(O)=O", "COS(=O)(=O)O"),  # Sulfate
    ("COc1ccc(S(=O)O)cc1", "COc1ccc(S(=O)O)cc1"),  # Sulfinic_acid
    ("CS(N)(=O)=O", "CS(N)(=O)=O"),  # Sulfonamide
    ("CC(=O)CSCCS(O)(=O)=O", "CC(=O)CSCCS(=O)(=O)O"),  # Sulfonate
    ("CC(=O)S", "CC(=O)S"),  # Thioic_acid
    ("C(C)(C)(C)(S)", "CC(C)(C)S"),  # Thiol
    ("Brc1cc[nH+]cc1", "Brc1cc[nH+]cc1"),  # Aromatic_nitrogen_unprotonated
    ("C=C(O)c1c(C)cc(C)cc1C", "C=C(O)c1c(C)cc(C)cc1C"),  # Vinyl_alcohol
    ("CC(=O)ON", "CC(=O)O[NH3+]"),  # Primary_hydroxyl_amine
    ("O=P(O)(O)OCCCC", "CCCCOP(=O)(O)O"),  # Phosphate
    ("CC(P(O)(O)=O)C", "CC(C)P(=O)(O)O"),  # Phosphonate
]


@pytest.mark.parametrize(("smiles_input", "smiles_correct"), _VERY_ACIDIC_CASES)
def test_very_acidic_single(smiles_input, smiles_correct):
    """Check the single variant produced at an extremely low pH.

    ``ph_min`` and ``ph_max`` are both set to the same very negative value, so
    exactly one output is expected; it is checked against ``smiles_correct``
    with ``compare_smiles``.
    """
    extreme_acid_ph = -10000000

    variants = protonate_smiles(
        smiles_input,
        ph_min=extreme_acid_ph,
        ph_max=extreme_acid_ph,
        precision=0.5,
    )
    assert len(variants) == 1

    compare_smiles(variants[0], smiles_correct)
64 |
65 |
# Every molecule should be deprotonated.
# Each case is (input SMILES, SMILES expected at an extremely high pH); the
# trailing comment names the substructure the case is meant to exercise.
_VERY_BASIC_CASES = [
    ("C#CCO", "C#CC[O-]"),  # Alcohol
    ("C(=O)N", "[NH-]C=O"),  # Amide
    ("CC(=O)NOC(C)=O", "CC(=O)[N-]OC(C)=O"),  # Amide_electronegative
    ("COC(=N)N", "COC(=N)N"),  # AmidineGuanidine2
    # Amines_primary_secondary_tertiary
    ("Brc1ccc(C2NCCS2)cc1", "Brc1ccc(C2NCCS2)cc1"),
    ("CC(=O)[n+]1ccc(N)cc1", "CC(=O)[n+]1ccc(N)cc1"),  # Anilines_primary
    ("CCNc1ccccc1", "CCNc1ccccc1"),  # Anilines_secondary
    ("Cc1ccccc1N(C)C", "Cc1ccccc1N(C)C"),  # Anilines_tertiary
    ("BrC1=CC2=C(C=C1)NC=C2", "Brc1ccc2[n-]ccc2c1"),  # Indole_pyrrole
    ("O=c1cc[nH]cc1", "O=c1cc[n-]cc1"),  # Aromatic_nitrogen_protonated
    ("C-N=[N+]=[N@H]", "CN=[N+]=[N-]"),  # Azide
    ("BrC(C(O)=O)CBr", "O=C([O-])C(Br)CBr"),  # Carboxyl
    ("NC(NN=O)=N", "N=C(N)NN=O"),  # AmidineGuanidine1
    ("C(F)(F)(F)C(=O)NC(=O)C", "CC(=O)[N-]C(=O)C(F)(F)F"),  # Imide
    ("O=C(C)NC(C)=O", "CC(=O)[N-]C(C)=O"),  # Imide2
    ("CC(C)(C)C(N(C)O)=O", "CN([O-])C(=O)C(C)(C)C"),  # N-hydroxyamide
    ("C[N+](O)=O", "C[N+](=O)[O-]"),  # Nitro
    ("O=C1C=C(O)CC1", "O=C1C=C([O-])CC1"),  # O=C-C=C-OH
    ("C1CC1OO", "[O-]OC1CC1"),  # Peroxide2
    ("C(=O)OO", "O=CO[O-]"),  # Peroxide1
    ("Brc1cc(O)cc(Br)c1", "[O-]c1cc(Br)cc(Br)c1"),  # Phenol
    ("CC(=O)c1ccc(S)cc1", "CC(=O)c1ccc([S-])cc1"),  # Phenyl_Thiol
    ("C=CCOc1ccc(C(=O)O)cc1", "C=CCOc1ccc(C(=O)[O-])cc1"),  # Phenyl_carboxyl
    ("COP(=O)(O)OC", "COP(=O)([O-])OC"),  # Phosphate_diester
    ("CP(C)(=O)O", "CP(C)(=O)[O-]"),  # Phosphinic_acid
    ("CC(C)OP(C)(=O)O", "CC(C)OP(C)(=O)[O-]"),  # Phosphonate_ester
    ("CC1(C)OC(=O)NC1=O", "CC1(C)OC(=O)[N-]C1=O"),  # Ringed_imide1
    ("O=C(N1)C=CC1=O", "O=C1C=CC(=O)[N-]1"),  # Ringed_imide2
    ("O=S(OC)(O)=O", "COS(=O)(=O)[O-]"),  # Sulfate
    ("COc1ccc(S(=O)O)cc1", "COc1ccc(S(=O)[O-])cc1"),  # Sulfinic_acid
    ("CS(N)(=O)=O", "CS([NH-])(=O)=O"),  # Sulfonamide
    ("CC(=O)CSCCS(O)(=O)=O", "CC(=O)CSCCS(=O)(=O)[O-]"),  # Sulfonate
    ("CC(=O)S", "CC(=O)[S-]"),  # Thioic_acid
    ("C(C)(C)(C)(S)", "CC(C)(C)[S-]"),  # Thiol
    ("Brc1cc[nH+]cc1", "Brc1ccncc1"),  # Aromatic_nitrogen_unprotonated
    ("C=C(O)c1c(C)cc(C)cc1C", "C=C([O-])c1c(C)cc(C)cc1C"),  # Vinyl_alcohol
    ("CC(=O)ON", "CC(=O)ON"),  # Primary_hydroxyl_amine
    ("O=P(O)(O)OCCCC", "CCCCOP(=O)([O-])[O-]"),  # Phosphate
    ("CC(P(O)(O)=O)C", "CC(C)P(=O)([O-])[O-]"),  # Phosphonate
]


@pytest.mark.parametrize(("smiles_input", "smiles_correct"), _VERY_BASIC_CASES)
def test_very_basic(smiles_input, smiles_correct):
    """Check the single variant produced at an extremely high pH.

    ``ph_min`` and ``ph_max`` are both set to the same very large value, so
    exactly one output is expected; it is checked against ``smiles_correct``
    with ``compare_smiles``.
    """
    extreme_basic_ph = 10000000

    variants = list(
        protonate_smiles(
            smiles_input,
            ph_min=extreme_basic_ph,
            ph_max=extreme_basic_ph,
            precision=0.5,
        )
    )
    assert len(variants) == 1

    compare_smiles(variants[0], smiles_correct)
125 |
126 |
# Each case is (input SMILES, protonated SMILES, deprotonated SMILES, pKa value
# used as the pH); the trailing comment names the substructure being exercised.
_PKA_AVERAGE_CASES = [
    ("C#CCO", "C#CCO", "C#CC[O-]", 14.780384615384616),  # Alcohol
    ("C(=O)N", "NC=O", "[NH-]C=O", 12.00611111111111),  # Amide
    # Amide_electronegative
    ("CC(=O)NOC(C)=O", "CC(=O)NOC(C)=O", "CC(=O)[N-]OC(C)=O", 3.4896),
    # AmidineGuanidine2
    ("COC(=N)N", "COC(N)=[NH2+]", "COC(=N)N", 10.035538461538462),
    # Amines_primary_secondary_tertiary
    (
        "Brc1ccc(C2NCCS2)cc1",
        "Brc1ccc(C2[NH2+]CCS2)cc1",
        "Brc1ccc(C2NCCS2)cc1",
        8.159107682388349,
    ),
    # Anilines_primary
    (
        "CC(=O)[n+]1ccc(N)cc1",
        "CC(=O)[n+]1ccc([NH3+])cc1",
        "CC(=O)[n+]1ccc(N)cc1",
        3.899298673194805,
    ),
    # Anilines_secondary
    ("CCNc1ccccc1", "CC[NH2+]c1ccccc1", "CCNc1ccccc1", 4.335408163265306),
    # Anilines_tertiary
    ("Cc1ccccc1N(C)C", "Cc1ccccc1[NH+](C)C", "Cc1ccccc1N(C)C", 4.16690685045614),
    # Indole_pyrrole
    (
        "BrC1=CC2=C(C=C1)NC=C2",
        "Brc1ccc2[nH]ccc2c1",
        "Brc1ccc2[n-]ccc2c1",
        14.52875,
    ),
    # Aromatic_nitrogen_protonated
    ("O=c1cc[nH]cc1", "O=c1cc[nH]cc1", "O=c1cc[n-]cc1", 7.17),
    ("C-N=[N+]=[N@H]", "CN=[N+]=N", "CN=[N+]=[N-]", 4.65),  # Azide
    # Carboxyl
    (
        "BrC(C(O)=O)CBr",
        "O=C(O)C(Br)CBr",
        "O=C([O-])C(Br)CBr",
        3.456652971502591,
    ),
    # AmidineGuanidine1
    ("NC(NN=O)=N", "NC(=[NH2+])NN=O", "N=C(N)NN=O", 12.025333333333334),
    # Imide
    (
        "C(F)(F)(F)C(=O)NC(=O)C",
        "CC(=O)NC(=O)C(F)(F)F",
        "CC(=O)[N-]C(=O)C(F)(F)F",
        2.466666666666667,
    ),
    ("O=C(C)NC(C)=O", "CC(=O)NC(C)=O", "CC(=O)[N-]C(C)=O", 10.23),  # Imide2
    # N-hydroxyamide
    (
        "CC(C)(C)C(N(C)O)=O",
        "CN(O)C(=O)C(C)(C)C",
        "CN([O-])C(=O)C(C)(C)C",
        9.301904761904762,
    ),
    ("C[N+](O)=O", "C[N+](=O)O", "C[N+](=O)[O-]", -1000.0),  # Nitro
    ("C1CC1OO", "OOC1CC1", "[O-]OC1CC1", 11.978235294117647),  # Peroxide2
    ("C(=O)OO", "O=COO", "O=CO[O-]", 8.738888888888889),  # Peroxide1
    # Phenol
    (
        "Brc1cc(O)cc(Br)c1",
        "Oc1cc(Br)cc(Br)c1",
        "[O-]c1cc(Br)cc(Br)c1",
        7.065359866910526,
    ),
    # Phenyl_Thiol
    (
        "CC(=O)c1ccc(S)cc1",
        "CC(=O)c1ccc(S)cc1",
        "CC(=O)c1ccc([S-])cc1",
        4.978235294117647,
    ),
    # Phenyl_carboxyl
    (
        "C=CCOc1ccc(C(=O)O)cc1",
        "C=CCOc1ccc(C(=O)O)cc1",
        "C=CCOc1ccc(C(=O)[O-])cc1",
        3.463441968255319,
    ),
    # Phosphate_diester
    ("COP(=O)(O)OC", "COP(=O)(O)OC", "COP(=O)([O-])OC", 2.7280434782608696),
    ("CP(C)(=O)O", "CP(C)(=O)O", "CP(C)(=O)[O-]", 2.9745),  # Phosphinic_acid
    # Phosphonate_ester
    ("CC(C)OP(C)(=O)O", "CC(C)OP(C)(=O)O", "CC(C)OP(C)(=O)[O-]", 2.0868),
    # Ringed_imide1
    ("CC1(C)OC(=O)NC1=O", "CC1(C)OC(=O)NC1=O", "CC1(C)OC(=O)[N-]C1=O", 6.4525),
    # Ringed_imide2
    (
        "O=C(N1)C=CC1=O",
        "O=C1C=CC(=O)N1",
        "O=C1C=CC(=O)[N-]1",
        8.681666666666667,
    ),
    ("O=S(OC)(O)=O", "COS(=O)(=O)O", "COS(=O)(=O)[O-]", -2.36),  # Sulfate
    # Sulfinic_acid
    (
        "COc1ccc(S(=O)O)cc1",
        "COc1ccc(S(=O)O)cc1",
        "COc1ccc(S(=O)[O-])cc1",
        1.7933333333333332,
    ),
    # Sulfonamide
    ("CS(N)(=O)=O", "CS(N)(=O)=O", "CS([NH-])(=O)=O", 7.9160326086956525),
    # Sulfonate
    (
        "CC(=O)CSCCS(O)(=O)=O",
        "CC(=O)CSCCS(=O)(=O)O",
        "CC(=O)CSCCS(=O)(=O)[O-]",
        -1.8184615384615386,
    ),
    ("CC(=O)S", "CC(=O)S", "CC(=O)[S-]", 0.678267),  # Thioic_acid
    ("C(C)(C)(C)(S)", "CC(C)(C)S", "CC(C)(C)[S-]", 9.12448275862069),  # Thiol
    # Aromatic_nitrogen_unprotonated
    ("Brc1cc[nH+]cc1", "Brc1cc[nH+]cc1", "Brc1ccncc1", 4.3535441240733945),
    # Vinyl_alcohol
    (
        "C=C(O)c1c(C)cc(C)cc1C",
        "C=C(O)c1c(C)cc(C)cc1C",
        "C=C([O-])c1c(C)cc(C)cc1C",
        8.871850714285713,
    ),
    # Primary_hydroxyl_amine
    ("CC(=O)ON", "CC(=O)O[NH3+]", "CC(=O)ON", 4.035714285714286),
]


@pytest.mark.parametrize(
    ("smiles_input", "smiles_protonated", "smiles_deprotonated", "pka_avg"),
    _PKA_AVERAGE_CASES,
)
def test_pka_average(smiles_input, smiles_protonated, smiles_deprotonated, pka_avg):
    """Test that when the pH is equal to the average pKa, the protonation
    state is always both.

    Exactly two variants are expected. Outputs and expectations are each sorted
    as plain strings and then compared pairwise with ``compare_smiles``.
    """
    variants = list(
        protonate_smiles(smiles_input, ph_min=pka_avg, ph_max=pka_avg, precision=0.5)
    )
    assert len(variants) == 2

    expected_in_order = sorted((smiles_protonated, smiles_deprotonated))
    for produced, expected in zip(sorted(variants), expected_in_order):
        compare_smiles(produced, expected)
307 |
308 |
def test_no_carbanion():
    """Ensure protonating this molecule never yields a carbanion.

    Each output variant is upper-cased before searching for ``[C-]`` so that a
    lower-case (aromatic) ``[c-]`` is caught by the same check. The test fails
    with a message listing the offending variants.
    """
    smi = (
        "Cc1nc2cc(-c3[nH]c4cc5ccccc5c5c4c3CCN(C(=O)O)[C@@H]5O)cc3c(=O)[nH][nH]c(n1)c23"
    )
    variants = list(protonate_smiles(smi))

    # Check variants one by one so a failure names the exact SMILES at fault.
    offenders = [variant for variant in variants if "[C-]" in variant.upper()]
    assert not offenders, (
        "Processing "
        + smi
        + " produced a molecule with a carbanion! Offending variants: "
        + ", ".join(offenders)
    )
320 |
321 |
def test_max_variants():
    """Make sure the max number of variants is limited (guards an old bug).

    The input is protonated with default settings and must yield exactly 128
    variants.
    """
    large_smiles = "CCCC[C@@H](C(=O)N)NC(=O)[C@@H](NC(=O)[C@@H](NC(=O)[C@@H](NC(=O)[C@H](C(C)C)NC(=O)[C@@H](NC(=O)[C@H](Cc1c[nH]c2c1cccc2)NC(=O)[C@@H](NC(=O)[C@@H](Cc1ccc(cc1)O)N)CCC(=O)N)C)C)Cc1nc[nH]c1)Cc1ccccc1"
    n_variants = len(list(protonate_smiles(large_smiles)))

    assert n_variants == 128, f"Should produce 128 mol, but produced {n_variants}"
328 |
329 |
@pytest.mark.parametrize("smiles", [r"CCC(C)=C(Cl)C/C(I)=C(\C)F"])
def test_no_protonation_sites(smiles):
    """Expect the input SMILES to come back as the one and only output variant."""
    variants = protonate_smiles(smiles)
    assert len(variants) == 1

    sole_variant = variants[0]
    compare_smiles(sole_variant, smiles)
335 |
336 |
# The two input molecules reused across the pH cases below.
_MULTI_PH_INPUT_A = "O=P(O)(OP(O)(OP(O)(OCC1OC(C(C1O)O)N2C=NC3=C2N=CN=C3N)=O)=O)O"
_MULTI_PH_INPUT_B = "O=P(O)(OP(O)(OCC1C(O)C(O)C(N2C=NC3=C(N)N=CN=C32)O1)=O)OCC(O4)C(O)C(O)C4[N+]5=CC=CC(C(N)=O)=C5"

# Each case is (input SMILES, pH used for both ph_min and ph_max, expected SMILES).
_MULTIPLE_PH_CASES = [
    (
        _MULTI_PH_INPUT_A,
        0.5,
        "[NH3+]c1[nH+]c[nH+]c2c1[nH+]cn2C1OC(COP(=O)(O)OP(=O)(O)OP(=O)(O)O)C(O)C1O",
    ),
    (
        _MULTI_PH_INPUT_A,
        1.0,
        "[NH3+]c1[nH+]c[nH+]c2c1[nH+]cn2C1OC(COP(=O)(O)OP(=O)([O-])OP(=O)(O)O)C(O)C1O",
    ),
    (
        _MULTI_PH_INPUT_A,
        2.6,
        "[NH3+]c1[nH+]c[nH+]c2c1[nH+]cn2C1OC(COP(=O)([O-])OP(=O)([O-])OP(=O)([O-])O)C(O)C1O",
    ),
    (
        _MULTI_PH_INPUT_A,
        7.0,
        "Nc1ncnc2c1ncn2C1OC(COP(=O)([O-])OP(=O)([O-])OP(=O)([O-])[O-])C(O)C1O",
    ),
    # Changed output NC(=O)c1ccc[n+](C2OC(COP(=O)(O)OP(=O)(O)OCC3OC(n4cnc5c([NH3+])ncnc54)C(O)C3O)C(O)C2O)c1
    # to NC(=O)c1ccc[n+](C2OC(COP(=O)(O)OP(=O)(O)OCC3OC(n4c[nH+]c5c([NH3+])[nH+]c[nH+]c54)C(O)C3O)C(O)C2O)c1
    (
        _MULTI_PH_INPUT_B,
        0.5,
        "NC(=O)c1ccc[n+](C2OC(COP(=O)(O)OP(=O)(O)OCC3OC(n4c[nH+]c5c([NH3+])[nH+]c[nH+]c54)C(O)C3O)C(O)C2O)c1",
    ),
    # Changed output NC(=O)c1ccc[n+](C2OC(COP(=O)([O-])OP(=O)([O-])OCC3OC(n4cnc5c([NH3+])ncnc54)C(O)C3O)C(O)C2O)c1
    # to NC(=O)c1ccc[n+](C2OC(COP(=O)([O-])OP(=O)([O-])OCC3OC(n4c[nH+]c5c([NH3+])[nH+]c[nH+]c54)C(O)C3O)C(O)C2O)c1
    # Old version of dimorphite would inconsistently handle failed protonation by sometimes referring to
    # the last successful site OR the last successful site TYPE.
    (
        _MULTI_PH_INPUT_B,
        2.5,
        "NC(=O)c1ccc[n+](C2OC(COP(=O)([O-])OP(=O)([O-])OCC3OC(n4c[nH+]c5c([NH3+])[nH+]c[nH+]c54)C(O)C3O)C(O)C2O)c1",
    ),
    (
        _MULTI_PH_INPUT_B,
        7.4,
        "NC(=O)c1ccc[n+](C2OC(COP(=O)([O-])OP(=O)([O-])OCC3OC(n4cnc5c(N)ncnc54)C(O)C3O)C(O)C2O)c1",
    ),
]


@pytest.mark.parametrize(("smiles_input", "ph", "smiles_correct"), _MULTIPLE_PH_CASES)
def test_multiple_ph(smiles_input, ph, smiles_correct):
    """Protonate the same molecule at several single pH values.

    ``ph_min`` and ``ph_max`` are both ``ph`` with ``precision=0.0`` and
    ``validate_output=True``; exactly one variant is expected and it is
    checked against ``smiles_correct`` with ``compare_smiles``.
    """
    variants = protonate_smiles(
        smiles_input,
        ph_min=ph,
        ph_max=ph,
        precision=0.0,
        validate_output=True,
    )
    assert len(variants) == 1

    compare_smiles(variants[0], smiles_correct)
390 |
--------------------------------------------------------------------------------
/tests/tmp/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !.gitignore
3 |
--------------------------------------------------------------------------------
/training_data/README.md:
--------------------------------------------------------------------------------
1 | Training Data
2 | =============
3 |
4 | Format
5 | ------
6 |
7 | To allow others to reproduce our work, we here include the data used to
8 | calculate typical pKa ranges for 38 ionizable substructures. Please see the
9 | `training_data.json` file.
10 |
11 | The keys of the JSON are the labels of each substructure (e.g.,
12 | "Thioic_acid"). The JSON values are lists of pKa values. For example:
13 |
14 | ``` json
15 | {
16 | "Aromatic_nitrogen_protonated": [
17 | 7.7, 3.0, 11.1, ...
18 | ],
19 | "Vinyl_alcohol": [
20 | 9.2, 9.5, 9.5, ...
21 | ]
22 | }
23 | ```
24 |
25 | In the case of "Phosphate" and "Phosphonate" groups, the values are lists of
26 | two pKa values (pKa1 and pKa2). Where one of these pKa values is unavailable,
27 | it is listed as `null`. For example:
28 |
29 | ``` json
30 | {
31 | "Phosphonate": [
32 | [1.1, 6.5], [2.7, 8.4], [null, 8.7], ...
33 | ]
34 | }
35 | ```
36 |
37 | Reaxys Terms and Conditions
38 | ---------------------------
39 |
40 | Most of the pKa data used to train Dimorphite-DL was taken from the [Reaxys
41 | database](https://www.reaxys.com/#/about-content), owned and operated by
42 | Elsevier Information Systems GmbH. [Facts are not
43 | copyrightable](https://www.copyright.gov/help/faq/faq-protect.html), but in
44 | using the database we did agree to Elsevier's [Terms and
45 | Conditions](https://www.elsevier.com/legal/elsevier-website-terms-and-conditions).
46 |
47 | Ideally, we would like to include both the SMILES strings and precise
48 | catalogued pKa values for all training examples. But, given the Terms and
49 | Conditions, it is unclear whether this use is permissible:
50 |
51 | > Unless otherwise set out herein, content comprised within the Services,
52 | > including text... and other information (collectively, the "Content")... is
53 | > owned by Elsevier, its licensors or its content providers and is protected
54 | > by copyright, trademark and other intellectual property and unfair
55 | > competition laws.
56 |
57 | Do the catalogued SMILES strings and pKa values fall under this definition of
58 | "content"? As facts, they are not copyrightable, which perhaps suggests they do
59 | not. Moreover, publication is certainly a kind of "scholarly use":
60 |
61 | > ...you may print or download Content from the Services for your own
62 | > personal, non-commercial, informational or scholarly use, provided that you
63 | > keep intact all copyright and other proprietary notices.
64 |
65 | But, later in the terms, publication seems to be expressly prohibited:
66 |
67 | > You may not copy, display, distribute, modify, publish, reproduce, store,
68 | > transmit, post, translate or create other derivative works from, or sell,
69 | > rent or license all or any part of the Content... in any medium to anyone,
70 | > except as otherwise expressly permitted under these Terms and Conditions, or
71 | > any relevant license or subscription agreement or authorization by us.
72 |
73 | We emailed Reaxys seeking clarification but did not hear back from them.
74 |
75 | Solution
76 | --------
77 |
78 | Given this uncertainty, we opted not to publish the exact SMILES structures
79 | taken from the Reaxys database. We further opted to round the pKa values to
80 | the nearest tenth, to avoid directly redistributing Reaxys data. The data we
81 | do provide should allow others to recalculate our pKa ranges with reasonable
82 | accuracy.
83 |
--------------------------------------------------------------------------------
/training_data/training_data.json:
--------------------------------------------------------------------------------
1 | {
2 | "Azide": [
3 | 4.6, 4.7
4 | ],
5 | "Nitro": [
6 | -1000.0
7 | ],
8 | "AmidineGuanidine1": [
9 | 13.6, 10.1, 13.4, 10.4, 11.7, 13.6, 11.5, 13.7, 11.8, 12.1, 13.6, 13.4,
10 | 8.3, 10.9, 12.3
11 | ],
12 | "AmidineGuanidine2": [
13 | 11.1, 10.3, 8.3, 11.1, 10.9, 9.2, 8.9, 7.7, 6.7, 8.3, 6.8, 10.7, 4.8,
14 | 8.1, 9.2, 8.5, 8.1, 9.1, 8.0, 9.0, 9.7, 9.3, 9.3, 8.6, 8.1, 9.5, 10.3,
15 | 9.8, 8.6, 8.3, 7.9, 9.1, 8.4, 9.2, 8.5, 9.4, 4.7, 8.3, 10.7, 12.4,
16 | 12.3, 12.5, 11.6, 14.4, 13.9, 13.9, 7.5, 12.2, 8.9, 9.1, 14.3, 10.6,
17 | 12.2, 12.3, 12.4, 12.3, 12.3, 12.2, 10.9, 12.5, 12.6, 12.0, 11.7, 11.2,
18 | 11.5
19 | ],
20 | "Sulfate": [
21 | -1.5, -1.4, -1.7, -1.4, -1.2, -3.4, -3.9, -4.4
22 | ],
23 | "Sulfonate": [
24 | -1.9, -1.7, -5.9, -2.8, -0.8, -0.6, -2.5, -1.0, -1.3, -1.0, -0.6, -1.8,
25 | -1.8
26 | ],
27 | "Sulfinic_acid": [
28 | 1.9, 1.7, 2.0, 1.5, 2.2, 1.0, 2.3, 1.4, 2.2
29 | ],
30 | "Phenyl_carboxyl": [
31 | 3.7, 4.5, 3.7, 3.0, 3.8, 3.1, 4.5, 4.5, 4.3, 4.3, 3.6, 4.5, 4.5, 4.5,
32 | 1.9, 0.1, 3.6, 3.6, 4.5, 4.2, 2.8, 3.8, 4.2, 3.9, 4.2, 3.9, 4.3, 3.6,
33 | 3.9, 4.6, 3.8, 3.5, 3.4, 4.1, -0.6, 3.3, 4.0, 2.9, 3.4, 3.7, 4.0, 4.2,
34 | 3.6, -0.6, 4.3, 4.2, 3.4, 2.4, 4.1, 2.5, 2.7, 3.8, 3.8, 3.9, 3.8, 3.8,
35 | 3.8, 4.1, 3.5, 3.6, 3.1, 3.9, 3.5, 2.9, 3.8, 4.5, 3.0, 3.6, 1.5, 3.3,
36 | 3.2, 1.5, 3.3, 3.4, 3.1, 3.2, 2.4, 1.7, 1.9, 2.3, 3.9, 3.9, 3.8, 4.1,
37 | 4.4, 5.7, 5.3, 1.8, 4.3, 3.4, 3.1, 3.1, 3.1, 3.1, 3.2, 4.4, 3.6, 4.0,
38 | 4.4, 4.4, 4.1, 5.2, 5.5, 4.2, 3.9, 4.4, 4.5, 3.5, 5.0, 4.3, 3.3, 3.4,
39 | 3.4, 3.8, 4.3, 4.1, 5.3, 5.5, 4.0, 3.6, 4.2, 3.7, -0.6, 3.5, 3.6, -0.6,
40 | 2.9, 3.5, -0.5, -0.5, -0.5, 3.7, 3.7, 3.5, 2.9, 1.7, 4.2, 5.0, 4.2,
41 | 3.6, 1.2
42 | ],
43 | "Carboxyl": [
44 | 4.2, 4.6, 4.6, 4.8, 5.8, 3.2, 2.7, 4.5, 4.7, 4.7, 4.2, 1.9, 3.2, 2.6,
45 | 3.6, 4.7, 5.1, 4.6, 4.6, 4.7, 3.2, 4.2, 4.0, 4.8, 3.2, 4.2, 3.8, 2.4,
46 | 3.8, 3.8, 3.8, 2.6, 3.7, 4.7, 4.8, -0.7, 2.6, 2.5, 3.7, 3.4, 4.7, 4.9,
47 | 4.6, -0.7, 4.7, 1.7, 1.9, 2.6, 3.5, 3.6, 5.0, 4.8, 3.4, 4.1, 4.5, 4.5,
48 | 3.6, 4.7, 1.9, 2.6, 4.8, 4.8, 3.6, 4.9, 4.9, 3.6, 4.1, -0.7, 3.6, -0.7,
49 | -0.7, -0.7, 4.6, 4.9, 3.6, 3.8, 2.6, 3.8, 5.7, 1.1, 2.7, 4.3, 3.1, 2.5,
50 | 3.6, 3.4, 4.3, 4.6, 4.7, 1.7, 4.2, 4.5, 4.6, 2.6, 2.9, 5.4, 5.0, 3.4,
51 | -0.5, 3.5, 4.2, 4.5, 4.2, 4.5, 4.6, 4.3, 4.5, 3.3, 1.8, 4.3, 2.0, 3.5,
52 | 4.4, 4.7, 2.6, 2.8, 3.7, 4.6, 4.0, 2.0, 4.4, 4.7, 3.1, 2.6, 3.5, 4.8,
53 | 3.2, 3.9, 3.7, 3.9, 3.5, 4.6, 4.9, 5.0, 1.3, 1.4, 4.1, 4.5, 5.0, 5.0,
54 | 5.3, 5.0, 1.8, 2.6, 2.4, 3.2, 1.3, 2.7, 2.6, 3.4, 3.1, 2.3, 2.4, 2.2,
55 | 4.0, 4.0, 4.0, 3.9, 4.0, 1.8, 3.8, 4.4, 4.6, 4.8, 3.2, 3.4, 4.0, 3.9,
56 | 3.9, 2.2, 2.9, 3.0, 3.0, 2.0, 2.0, 2.0, 1.5, 3.2, 3.4, 3.4, 3.2, 3.3,
57 | -0.7, 3.3, 1.4, 3.1, 3.2, 3.3, 4.6, 4.3, 4.3, 4.3, -0.6, 1.9, 1.9, 2.7,
58 | 2.5, 1.9, 2.0, 3.1, 2.1, 2.2, 2.2, 2.5, 1.4, 1.1, 1.2, 0.2, 4.4, 2.5,
59 | 0.7, 0.3, 3.2, 2.9, 3.4, 2.5, 4.2, 5.1, 4.1, 3.3, 3.3, 4.9, 4.8, 4.8,
60 | 3.6, 3.5, 4.8, 5.0, 4.9, 3.9, 3.6, 3.4, 3.1, 3.3, -0.6, 4.3, 4.0, 3.7,
61 | 4.4, 4.5, 3.3, 3.4, 3.4, 4.4, 5.0, 3.6, 3.0, 4.7, 4.8, 4.4, 4.3, 4.6,
62 | 5.0, 4.5, 4.2, 4.8, 4.1, 4.9, 4.9, 5.0, 5.1, 5.0, 4.9, 5.0, 5.0, 4.5,
63 | 4.0, 4.4, 4.2, -0.7, 4.9, 4.3, 4.7, 4.7, 4.4, 4.3, 4.4, 4.5, 3.1, 3.5,
64 | 3.4, 2.8, 3.1, 3.1, 3.1, 3.1, 3.2, 3.1, 3.1, 3.2, 3.1, 3.1, 3.1, 3.2,
65 | 3.1, 3.2, 3.0, 3.1, 3.2, 4.1, 3.6, 2.4, 1.9, 2.5, 2.5, 2.6, 2.4, 2.5,
66 | 2.5, 2.6, 2.5, 2.4, 2.1, 2.7, 2.7, 2.8, 2.7, 2.6, 2.7, 2.8, 2.7, 2.7,
67 | 2.7, 2.9, 3.0, 4.3, 3.7, 3.5, 3.7, 3.8, 3.6, 3.6, 3.5, 3.6, 3.6, 3.5,
68 | 3.5, 3.5, 3.5, 3.9, 3.9, 3.8, 3.8, 3.6, 5.2, 5.3, 5.4, 4.9, 5.4, -0.7,
69 | -0.6, 3.9, 4.3, 3.7, 3.9, 4.1, 4.1, 4.1, 3.9, 4.6, 4.5, 4.5, 4.3, 4.5,
70 | 4.4, 4.3, 4.0, 4.1, 3.4, -0.6, 2.0, 2.3, 4.0, 2.8, 3.3, 4.8, 3.8, 2.6,
71 | 4.4, 4.8, 4.8, 4.5, 4.7, 2.0, 1.8, 1.8
72 | ],
73 | "Thioic_acid": [
74 | -0.6, 1.7, -0.6, 2.5, -0.4, -0.4, 2.6
75 | ],
76 | "Phenyl_Thiol": [
77 | 5.3, 6.3, -0.8, 6.4, 2.7, 2.8, 4.9, -0.8, 5.5, 5.9, 7.2, 6.2, 7.0, 5.4,
78 | 5.5, 8.6, 6.6
79 | ],
80 | "Thiol": [
81 | 7.7, 7.3, 7.7, 7.2, 7.6, 7.5, 9.4, 10.4, 10.2, 10.6, 10.3, 8.5, 10.0,
82 | 10.9, 8.6, 7.9, 11.1, 11.2, 7.9, 9.4, 7.3, 10.7, 9.3, 7.9, 9.5, 8.4,
83 | 9.9, 10.2, 10.2
84 | ],
85 | "Phosphate": [
86 | [2.0, 6.5], [2.0, 6.5], [2.3, 6.8], [4.2, 9.7], [4.0, 7.3], [2.7, 6.7],
87 | [1.3, 5.8], [2.0, 6.8], [2.9, 5.9], [5.9, 6.2], [1.6, 6.7], [1.0, 5.8],
88 | [2.0, 5.8], [2.0, 5.6], [3.4, 6.2], [3.1, 7.0], [1.8, null],
89 | [1.8, null], [1.6, null], [1.6, null], [null, 6.6], [null, 6.2],
90 | [null, 4.8], [1.7, null], [2.4, null], [2.5, null], [null, 6.7]
91 | ],
92 | "Phosphonate": [
93 | [1.1, 6.5], [2.7, 8.4], [2.8, 8.7], [1.3, 6.5], [2.7, 8.5], [2.9, 9.0],
94 | [2.6, 8.2], [2.5, 8.2], [2.1, 7.5], [1.4, 6.7], [1.3, 6.5], [1.3, 6.7],
95 | [1.6, 7.0], [1.1, 6.3], [2.3, 7.5], [2.4, 7.3], [1.8, 7.2], [3.8, 8.1],
96 | [1.7, 6.9], [1.8, 8.4], [2.4, 8.1], [1.4, 6.3], [1.8, 7.1], [1.9, 7.3],
97 | [1.7, 6.8], [1.6, 4.8], [2.7, 8.4], [1.3, 6.5], [1.9, 7.3], [1.6, 6.7],
98 | [2.2, 7.8], [1.9, 7.4], [1.6, 7.0], [1.4, 6.6], [1.5, 6.7], [1.4, 6.6],
99 | [1.6, 7.0], [2.1, 7.3], [1.7, 7.1], [1.8, 7.2], [1.1, 5.6], [1.3, 5.9],
100 | [null, 8.0], [null, 7.1], [null, 8.1], [null, 7.8], [null, 8.6],
101 | [null, 6.6], [null, 7.2], [null, 6.8], [null, 8.2]
102 | ],
103 | "Phenol": [
104 | 3.9, 4.3, 5.2, 6.0, 7.6, 5.2, 6.3, 7.8, 4.7, 6.6, 10.0, 5.0, 10.0, 8.9,
105 | 0.1, -0.7, -0.8, 10.2, -1.1, -1.0, 6.3, 6.5, 6.4, 6.3, 10.3, 10.3, 7.2,
106 | 6.7, -1.0, 8.3, -1.0, 8.1, 8.4, -1.0, 7.1, 6.1, 8.3, 8.7, -1.0, 10.1,
107 | 9.6, 7.7, 8.0, 6.9, 8.3, 10.2, -0.7, -0.8, 8.4, 7.4, 7.8, 7.8, 2.3,
108 | 7.4, 9.9, 7.2, 7.9, 10.0, 9.2, 8.0, 9.7, 9.5, 9.5, 10.5, 5.9, 5.3,
109 | 10.6, 10.1, 6.6, 10.6, 8.1, 12.6, 6.4, 7.3, 5.6, 6.3, 10.5, 10.4, 10.0,
110 | 10.6, 10.2, 8.9, 11.8, 7.7, 7.8, 6.3, 5.4, 7.1, 7.2, 9.6, 7.8, 6.1,
111 | 8.6, 6.8, 2.1, 8.6, 8.2, 8.3, 8.0, 8.6, -0.6, -0.8, 8.3, 7.4, 6.2, 7.4,
112 | 3.3, 8.9, 6.6, 6.6, 8.2, 1.6, 7.2, 6.1, 8.3, 3.9, 3.8, 4.1, 7.8, 1.6,
113 | 6.4, 1.4, 2.2, 2.0, 6.0, 8.7, 7.9, 5.2, 10.7, 10.8, 10.9, 6.3, 6.6,
114 | -0.9, 10.0, 10.3, 7.6, 7.6, -0.9, -0.9, 9.9, -1.0, 7.8, 8.3, 8.3, 9.6,
115 | -0.9, 10.2, 9.5, 9.1, 9.1, 9.9, 9.2, 9.6, 7.8, 7.4, 5.6, 9.0, 9.1, 9.0,
116 | 8.4, 8.8, 9.8, 9.5, 10.2, 8.6, 9.9, 7.2, 10.7, 8.4, 8.4, 8.8, 10.0, 6.8,
117 | 8.2, 7.3, 9.3, 9.9, 11.9, 10.4, 9.6, 8.1, 5.4, 9.3, 9.3, 9.4, 8.2,
118 | 10.1, 9.8, 7.3
119 | ],
120 | "Peroxide1": [
121 | 7.1, 9.3, 9.8, 8.6, 9.6, 8.1, 9.6, 8.9, 8.2, 9.0, 9.4, 9.0, 8.7, 8.9,
122 | 7.8, 7.4, 9.0, 8.9
123 | ],
124 | "Peroxide2": [
125 | 12.6, 11.5, 11.2, 11.2, 11.9, 10.6, 13.3, 11.4, 12.8, 11.0, 12.8, 12.8,
126 | 10.5, 12.5, 12.6, 12.4, 12.6
127 | ],
128 | "O=C-C=C-OH": [
129 | 4.2, 3.6, 3.6, 3.6, 3.1, 3.6, 1.7, 4.0, 3.5, 3.6, 3.6, 2.4, 3.2, 5.2,
130 | 4.3
131 | ],
132 | "Vinyl_alcohol": [
133 | 9.2, 9.5, 9.5, 9.6, 9.5, 3.9, 8.5, 10.7, 10.4, 8.6, 9.5, 10.5, 9.3,
134 | 9.2, 9.1, 9.4, 10.5, 9.4, 7.4, 5.7, 6.8
135 | ],
136 | "Alcohol": [
137 | 14.6, 15.4, 15.5, 15.6, 15.7, 15.1, 15.1, 13.3, 14.3, 12.4, 14.9, 15.5,
138 | 9.2, 12.2, 15.4, 12.1, 12.2, 16.0, 17.0, 14.4, 13.3, 24.0, 14.0, 15.1,
139 | 16.8, 15.2
140 | ],
141 | "N-hydroxyamide": [
142 | 8.0, 9.0, 8.3, 11.1, 10.8, 8.4, 8.8, 10.5, 8.5, 10.1, 8.7, 8.2, 9.8,
143 | 10.8, 8.1, 11.2, 11.1, 8.2, 9.9, 7.7, 8.3
144 | ],
145 | "Ringed_imide1": [
146 | 6.1, 6.9, 7.5, 5.8, 6.2, 6.4, 6.0, 6.6
147 | ],
148 | "Ringed_imide2": [
149 | 9.3, 9.5, 9.6, 11.4, 10.5, 10.2, 10.0, 4.4, 9.8, 8.7, 9.4, 8.8, 7.7,
150 | 6.2, 10.5, 11.2, 9.4, 9.4, 7.7, 6.2, 8.7, 8.3, 6.3, 5.2
151 | ],
152 | "Imide": [
153 | 1.2, 2.1, 4.1
154 | ],
155 | "Imide2": [
156 | 9.8, 10.3, 9.1, 9.4, 11.0, 10.3, 12.9, 10.2, 10.3, 9.1
157 | ],
158 | "Amide_electronegative": [
159 | -0.6, -0.7, 6.8, 6.8, 9.3, 1.7, 1.9, 2.1, 2.4, 8.9, 1.5, 2.0, 2.1, 2.2,
160 | -0.8, 5.0, 4.2, 3.8, 1.6, 5.9, 2.9, 4.0, 4.7, 4.9, 4.5
161 | ],
162 | "Amide": [
163 | 10.4, 3.4, 10.0, 13.5, 8.2, 11.7, 3.5, 9.3, 9.6, 13.4, 13.4, 13.3,
164 | 18.5, 18.5, 15.1, 19.4, 10.4, 14.5
165 | ],
166 | "Sulfonamide": [
167 | 5.0, 7.8, 10.7, 8.5, 3.8, 7.6, 8.9, 9.7, 11.5, 8.0, 9.4, 10.8, 8.6,
168 | 8.3, 9.6, 9.6, 8.5, 9.5, 7.5, 7.9, 9.2, 8.9, 9.0, 6.5, 6.5, 6.3, 6.8,
169 | 3.9, 2.8, 4.9, 3.5, 9.0, 8.2, 9.3, 7.5, 6.8, 7.5, 10.3, 7.2, 7.7, 8.8,
170 | 8.2, 9.4, 10.3, 8.2, 6.2
171 | ],
172 | "Anilines_primary": [
173 | 3.5, 3.6, 0.3, 9.2, 2.3, 2.3, 5.0, 4.4, 4.3, 2.5, 4.4, 2.2, 3.1, 5.3,
174 | 4.2, 4.4, 5.2, 4.4, 9.2, 2.4, 3.5, 3.7, 4.1, 4.5, 4.4, 4.7, 4.8, 4.4,
175 | 4.3, 4.9, 4.9, 4.6, 5.1, 3.0, 4.7, 3.9, 4.4, 2.0, 3.4, -0.4, 2.6, 10.6,
176 | 1.2, 0.9, 2.4, 5.0, 4.0, 5.2, 2.8, 3.8, 4.0, 3.8, 2.9, 1.5, 4.4, 3.5,
177 | 4.7, 4.7, 4.5, 13.8, 3.4, 3.6, -0.5, 3.5, 3.6, 3.6, 2.7, 4.6, 2.5, 3.8,
178 | 2.6, 3.8, 1.8, 3.9, 2.5, 3.9, 3.9
179 | ],
180 | "Anilines_secondary": [
181 | 7.2, 4.2, 4.4, 7.0, 6.8, 6.3, 1.2, 4.1, 4.6, 5.1, 5.2, 5.1, 5.1, 5.4,
182 | 4.9, 4.6, 3.3, 5.0, 4.2, 2.5, 3.7, 4.0, 6.1, 5.4, 5.9, 4.9, 3.7, 4.6,
183 | 8.6, 4.6, 4.6, 3.3, 5.0, 4.3, 2.5, 3.7, 4.0, 7.3, 6.5, -0.8, 0.3, 0.8,
184 | 4.5, -0.9, 7.2, 7.2, 0.8, -0.7, 5.2
185 | ],
186 | "Anilines_tertiary": [
187 | 4.1, 3.9, 3.6, 4.0, 4.1, 2.1, 3.3, 4.9, 6.4, 5.7, 6.1, 8.0, 7.2, 6.6,
188 | 0.7, 1.4, 7.2, 4.8, 4.1, 2.6, 2.6, 2.6, 2.2, 7.5, 5.4, 4.4, 5.8, 4.2,
189 | 2.7, 2.4, 2.2, 2.1, 1.9, 4.6, 3.3, 5.1, 6.0, 2.6, 8.8, 5.5, 4.8, 4.1,
190 | 5.8, 4.0, 5.6, 4.1, 2.3, 0.7, 1.8, 3.0, 2.6, 0.6, 4.5, 1.6, 4.5, 7.1,
191 | 7.8
192 | ],
193 | "Aromatic_nitrogen_unprotonated": [
194 | 0.9, 2.9, -0.4, 2.8, 4.0, 5.0, 5.0, 3.7, 4.9, 5.7, 4.9, 4.9, 5.6, 2.3,
195 | 3.3, 3.7, 2.6, 5.0, 5.7, 5.8, 5.8, 5.7, 6.7, 6.0, 6.0, 2.7, 5.5, 6.0,
196 | 5.3, 4.3, 3.2, 4.8, 6.0, 5.6, 6.0, 8.4, 9.0, 8.2, 3.8, 1.8, 3.3, 3.4,
197 | 3.5, 3.4, 3.0, 4.1, 1.8, 5.0, 0.9, 4.8, 3.4, 4.9, 6.7, 1.5, 3.1, 4.2,
198 | 5.4, 3.6, 4.4, 9.6, 4.9, 7.5, 6.4, 6.0, 5.2, 2.3, 6.8, 3.5, 6.0, 5.7,
199 | 6.8, 6.2, 6.7, 3.1, 6.6, 0.7, 2.7, 0.6, -0.4, 3.0, 1.8, 3.3, 3.5, 1.5,
200 | 2.0, 3.4, 3.5, 4.2, 4.0, 12.7, 2.3, 2.6, 2.3, 2.4, 4.2, 6.2, 0.7, 4.6,
201 | 4.9, 4.2, 4.7, 5.0, 4.2, 4.6, 5.4, 2.7, 5.6, 5.7, 3.7
202 | ],
203 | "Amines_primary_secondary_tertiary": [
204 | 5.0, 5.5, 8.5, 10.8, 7.0, 11.0, 10.8, 11.3, 11.1, 9.7, 9.0, 9.6, 11.3,
205 | 9.5, 6.3, 7.9, -1.1, 9.1, 10.2, 9.8, 9.1, 5.2, 9.4, 7.1, 7.9, 9.8, 8.1,
206 | 7.8, 8.7, 8.3, 7.4, 6.9, 9.7, 7.0, 9.3, 9.6, 8.7, 9.3, 9.9, 10.3, 7.9,
207 | 7.5, 8.6, 10.6, 8.9, 7.2, 11.9, 8.8, 7.0, 7.8, 10.4, 10.6, 9.2, 8.3,
208 | 8.5, 7.7, 8.8, 8.0, 7.5, 6.1, 6.9, 8.7, 9.1, 5.8, 3.4, 11.1, 10.2, 9.4,
209 | 10.0, 9.7, 9.5, 10.1, 9.6, 11.4, 11.6, 11.9, 11.3, 11.0, 10.3, 7.5,
210 | 8.3, 10.9, 8.2, 10.9, 11.3, 9.0, 8.6, 9.2, 8.9, 7.0, 6.7, 8.3, 8.3,
211 | 10.6, 6.3, 8.3, 8.5, 8.3, 11.9, 10.2, 10.1, 10.6, 8.3, 10.0, 10.7, 4.5,
212 | 10.7, 11.2, 6.7, 6.0, 10.0, 10.9, 8.3, 6.6, 4.2, 4.6, 10.7, 10.5, 10.7,
213 | 10.8, 10.0, 9.1, 6.9, 6.2, 6.5, 4.3, 3.7, 3.6, 8.9, 8.7, 6.2, 9.3,
214 | 10.7, 5.0, 10.4, 8.9, 6.2, 6.7, 6.6, 3.9, 9.0, 8.5, 8.9, 10.1, 4.5,
215 | 8.7, 10.5, 1.0, 7.6, 7.5, 8.7, 1.9, 8.6, 7.5, 6.3, 11.9, 10.6, 10.4,
216 | 10.7, 7.7, 11.0, 12.1, 9.2, 11.1, 9.8, 0.9, 7.7, 3.1, 5.2, 9.0, 8.8,
217 | 9.6, 8.6, 7.0, 7.7, 7.3, 8.4, 8.3, 5.7, 6.0, 6.5, 6.7, 6.5, 3.9, 6.4,
218 | 6.7, 6.7, 9.3, 9.3, 6.6, 5.5, 9.4, 8.0, 8.4, 8.1, 7.8, 7.7, 8.1, 8.1,
219 | 5.5, 10.7, 9.9, 10.4, 8.9, 10.4, 9.9, 7.2, 10.5, 4.5, 5.0, 9.8, 9.4,
220 | 4.8, 8.0, 8.3, 6.5, 7.3, 8.0, 4.2, 9.5, 8.1, 8.1, 9.4, 9.2, 9.8, 9.6,
221 | 9.7, 9.9, 9.9, 9.9, 9.9, 8.1, 8.3, 8.6, 8.9, 8.2, 8.1, 7.0, 8.3, 7.8,
222 | 7.7, 9.1, 8.7, 7.9, 9.9, 10.4, 10.5, 10.2, 10.1, 10.7, 6.2, 6.0, 10.8,
223 | 10.6, 9.0, 6.1, 9.7, 1.7, 7.2, 7.8, 8.7, 7.2, 8.4, 9.4, 7.7, 3.8, 9.1,
224 | 7.3, 9.0, 8.8, 8.9, 9.7, 9.9, 9.4, 9.0, -0.9, 9.2, 9.4, -0.9, 8.7, 5.5,
225 | -0.9, 8.8, 8.3, 10.5, 9.3, 9.6, 9.5, 10.5, 10.6, 11.4, 10.6, 11.4, 6.8,
226 | 5.3, 9.1, 9.8, 5.3, 5.1, 5.3, 7.0, 7.5, 9.7, 10.2, 6.3, 8.8, 6.2, 5.0,
227 | 0.6, 3.1, 6.6, 7.4, 6.0, 2.6, 8.1, 6.1, 5.8, 5.8, 7.8, 10.1, 9.3, 7.0,
228 | 2.0, 4.5, 1.1, 5.3, -0.6, 4.5, 6.1, 7.0, 8.0, 6.5, 0.8, 0.4, 8.9, 9.1,
229 | 10.7, 9.2, 9.5, -0.9, 5.3, 5.7, 7.1, 5.8, 10.0, 11.5, 5.7, 7.1, 9.5,
230 | 8.6, 9.1, 8.8, 8.6, 9.7, 10.2, 10.4, 10.5, 9.9, 10.4, 10.8, 10.3, 8.8,
231 | 8.2, 7.3, 11.0, 6.8, 10.1, 9.7, 9.8, 9.6, 9.7, 9.8, 10.1, 11.0, 7.7,
232 | 9.6, 8.7, 8.9, 9.2, 9.7, 9.4, 8.6, 8.1, 8.3, 6.2, 8.0, 7.3, 7.0, 12.2,
233 | 10.3, -0.8, 7.6, 6.1, 7.5, 8.1, 3.1, 7.1, 9.1, 8.9, 10.0, 7.6, 9.2,
234 | 8.7, 8.8, 10.7, 10.9, 8.7, 7.7, 7.5, 14.8, 14.8, 14.7
235 | ],
236 | "Phosphinic_acid": [
237 | 3.3, 3.3, 4.2, 3.6, 3.2, 3.2, 3.2, 3.3, 3.2, 2.4, 2.7, 0.8, 3.1, 3.5,
238 | 2.0, 3.1, 2.5, 2.7, 3.1, 3.0
239 | ],
240 | "Phosphate_diester": [
241 | 1.1, 1.0, 1.1, 1.2, 1.1, 1.0, 1.1, 0.9, 0.7, 0.5, 3.6, 3.9, 3.6, 1.6,
242 | 2.9, 4.2, 3.5, 12.7, 3.8, 4.1, 3.5, 3.4, 2.4
243 | ],
244 | "Phosphonate_ester": [
245 | 2.0, 2.2, 2.0, 1.9, 1.9, 1.9, 1.9, 2.0, 1.8, 2.0, 1.7, 1.7, 3.7, 2.1,
246 | 2.2, 2.3, 2.2, 2.1, 2.2, 2.2, 2.2, 2.4, 2.3, 2.3, 0.9
247 | ],
248 | "primary_hydroxyl_amine_2": [
249 | 4.1, 4.6, 4.4, 4.2, 4.4, 4.3, 2.1
250 | ],
251 | "Indole_pyrrole": [
252 | 14.9, 15.3, 14.5, 0.1, 17.5, 17.0, 16.5, 16.6, 16.3, 16.3, 15.2, 15.4,
253 | 13.0, 12.5, 15.2, 16.1
254 | ],
255 | "Aromatic_nitrogen_protonated": [
256 | 7.7, 3.0, 11.1, 9.4, 8.3, 2.9, 8.7, 6.3
257 | ]
258 | }
259 |
--------------------------------------------------------------------------------