├── .coveragerc ├── .editorconfig ├── .gitattributes ├── .github └── workflows │ ├── codecov.yml │ ├── docs.yml │ └── tests.yml ├── .gitignore ├── .isort.cfg ├── .markdownlint.yaml ├── .mypy.ini ├── .pytest.ini ├── .ruff.toml ├── .zenodo.json ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── LICENSE.md ├── README.md ├── dimorphite_dl ├── __init__.py ├── cli.py ├── io.py ├── mol.py ├── neutralize.py ├── protonate │ ├── __init__.py │ ├── change.py │ ├── data.py │ ├── detect.py │ ├── results.py │ ├── run.py │ └── site.py └── smarts │ └── site_substructures.smarts ├── docs ├── .nav.yml ├── .overrides │ └── main.html ├── css │ ├── base.css │ ├── colors.css │ ├── jupyter.css │ ├── launchy.css │ └── mkdocstrings.css ├── development.md ├── gen_ref_pages.py ├── img │ └── launchy │ │ └── colab.svg ├── index.md └── js │ └── mathjax-config.js ├── hooks └── launchy.py ├── mkdocs.yml ├── pixi.lock ├── pixi.toml ├── pyproject.toml ├── tests ├── conftest.py ├── files │ └── sample_molecules.smi ├── mol │ ├── test_detect_substruct.py │ └── test_neutralize.py ├── protonate │ ├── test_data.py │ └── test_run.py ├── test_smiles_io.py └── tmp │ └── .gitignore └── training_data ├── README.md └── training_data.json /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = true 3 | data_file = .coverage 4 | source = "tests" 5 | 6 | [paths] 7 | source = dimorphite_dl 8 | 9 | [report] 10 | show_missing = true 11 | skip_empty = true 12 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # Check http://editorconfig.org for more information 2 | 3 | root = true 4 | 5 | [*] 6 | indent_style = space 7 | indent_size = 4 8 | trim_trailing_whitespace = true 9 | insert_final_newline = true 10 | charset = utf-8 11 | end_of_line = lf 12 | 13 | [*.{py, pyi}] 14 | indent_style = space 15 | indent_size = 4 16 | 17 | [*.bat] 
18 | indent_style = tab 19 | end_of_line = crlf 20 | 21 | [Makefile] 22 | indent_style = tab 23 | 24 | [*.{yml, yaml}] 25 | indent_size = 2 26 | trim_trailing_whitespace = true 27 | 28 | [*.md] 29 | indent_size = 4 30 | trim_trailing_whitespace = true 31 | 32 | [LICENSE] 33 | insert_final_newline = false 34 | 35 | [*.{diff,patch}] 36 | trim_trailing_whitespace = false 37 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Set the default behavior, in case people don't have core.autocrlf set. 2 | * text=auto 3 | 4 | # Explicitly declare text files you want to always be normalized and converted 5 | # to native line endings on checkout. 6 | *.md text 7 | *.rst text 8 | 9 | # Denote all files that are truly binary and should not be modified. 10 | *.png binary 11 | *.jpg binary 12 | 13 | pixi.lock linguist-language=YAML linguist-generated=true 14 | -------------------------------------------------------------------------------- /.github/workflows/codecov.yml: -------------------------------------------------------------------------------- 1 | name: Codecov 2 | 3 | on: 4 | push: 5 | branches: [ "main" ] 6 | pull_request: 7 | branches: [ "main" ] 8 | workflow_dispatch: 9 | 10 | jobs: 11 | run: 12 | name: codecov 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - name: Checkout code 17 | uses: actions/checkout@v4 18 | with: 19 | lfs: true 20 | 21 | - name: Install pixi 22 | uses: prefix-dev/setup-pixi@v0.8.8 23 | with: 24 | locked: false 25 | frozen: false 26 | cache: true 27 | cache-write: ${{ github.event_name == 'push' && github.ref_name == 'main' }} 28 | 29 | - name: Setup environment 30 | run: pixi install -e dev 31 | 32 | - name: Get test coverage 33 | run: pixi run tests 34 | 35 | - name: Upload to Codecov 36 | uses: codecov/codecov-action@v5 37 | with: 38 | env_vars: OS,PYTHON 39 | fail_ci_if_error: true 40 | verbose: true 41 | 
token: ${{ secrets.CODECOV_TOKEN }} 42 | 43 | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: Documentation 2 | 3 | on: 4 | push: 5 | branches: ["main"] 6 | 7 | workflow_dispatch: 8 | 9 | permissions: 10 | contents: read 11 | pages: write 12 | id-token: write 13 | 14 | concurrency: 15 | group: "pages" 16 | cancel-in-progress: true 17 | 18 | jobs: 19 | deploy: 20 | name: docs 21 | environment: 22 | name: github-pages 23 | url: ${{ steps.deployment.outputs.page_url }} 24 | runs-on: ubuntu-latest 25 | 26 | steps: 27 | - name: Checkout 28 | uses: actions/checkout@v4 29 | with: 30 | fetch-depth: 0 31 | 32 | - name: Install pixi 33 | uses: prefix-dev/setup-pixi@v0.8.8 34 | with: 35 | locked: false 36 | frozen: false 37 | cache: true 38 | cache-write: ${{ github.event_name == 'push' && github.ref_name == 'main' }} 39 | 40 | - name: Setup environment 41 | run: pixi install -e docs 42 | 43 | - name: Build documentation 44 | run: pixi run docs 45 | 46 | - name: Setup Pages 47 | uses: actions/configure-pages@v5 48 | 49 | - name: Upload artifact 50 | uses: actions/upload-pages-artifact@v3 51 | with: 52 | path: 'public/' 53 | 54 | - name: Deploy to GitHub Pages 55 | id: deployment 56 | uses: actions/deploy-pages@v4 57 | 58 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | push: 5 | branches: [ "main" ] 6 | pull_request: 7 | branches: [ "main" ] 8 | # Allows you to run this workflow manually from the Actions tab 9 | workflow_dispatch: 10 | 11 | jobs: 12 | build: 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - name: Checkout repo 17 | uses: actions/checkout@v4 18 | 19 | - name: Install pixi 20 | uses: prefix-dev/setup-pixi@v0.8.8 21 | with: 22 | locked: 
false 23 | frozen: false 24 | cache: true 25 | cache-write: ${{ github.event_name == 'push' && github.ref_name == 'main' }} 26 | 27 | - name: Setup environment 28 | run: pixi install -e dev 29 | 30 | - name: Run tests 31 | run: pixi run tests 32 | 33 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | **/.DS_Store 2 | __pycache__ 3 | .venv 4 | .env 5 | dist/ 6 | **/_version.py 7 | 8 | # Jupyter Notebook 9 | **/.ipynb_checkpoints 10 | 11 | # IDE settings 12 | .vscode/ 13 | .idea/ 14 | 15 | public/ 16 | 17 | .cache 18 | 19 | node_modules 20 | package-lock.json 21 | package.json 22 | 23 | # pixi environments 24 | .pixi 25 | *.egg-info 26 | 27 | # coverage 28 | coverage.xml 29 | report.xml 30 | .coverage 31 | -------------------------------------------------------------------------------- /.isort.cfg: -------------------------------------------------------------------------------- 1 | [settings] 2 | py_version = 313 3 | skip=.bzr,.direnv,.eggs,.git,.hg,.mypy_cache,.nox,.pants.d,.svn,.tox,.venv,__pypackages__,_build,buck-out,build,dist,node_modules,venv,.pixi 4 | line_length = 88 5 | known_typing = typing,types,typing_extensions,mypy,mypy_extensions 6 | sections = FUTURE,TYPING,STDLIB,THIRDPARTY,FIRSTPARTY,LOCALFOLDER 7 | profile = black 8 | include_trailing_comma = true 9 | multi_line_output = 3 10 | indent = 4 11 | -------------------------------------------------------------------------------- /.markdownlint.yaml: -------------------------------------------------------------------------------- 1 | # https://github.com/DavidAnson/markdownlint/blob/main/schema/.markdownlint.yaml 2 | MD007: 3 | indent: 4 4 | MD013: false 5 | MD022: false 6 | MD024: false 7 | MD026: false 8 | MD028: false 9 | MD030: 10 | ol_multi: 2 11 | ol_single: 2 12 | ul_multi: 3 13 | ul_single: 3 14 | MD031: false 15 | MD032: false 16 | MD033: false 17 | MD034: false 18 | 
MD036: false 19 | MD038: false 20 | MD041: false 21 | MD046: false 22 | MD052: false 23 | MD053: false 24 | -------------------------------------------------------------------------------- /.mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | python_version = "3.12" 3 | pretty = true 4 | show_traceback = true 5 | color_output = true 6 | allow_redefinition = false 7 | check_untyped_defs = true 8 | disallow_any_generics = true 9 | disallow_incomplete_defs = true 10 | ignore_missing_imports = true 11 | implicit_reexport = false 12 | no_implicit_optional = true 13 | show_column_numbers = true 14 | show_error_codes = true 15 | show_error_context = true 16 | strict_equality = true 17 | strict_optional = true 18 | warn_no_return = true 19 | warn_redundant_casts = true 20 | warn_return_any = true 21 | warn_unreachable = true 22 | warn_unused_configs = true 23 | warn_unused_ignores = true 24 | -------------------------------------------------------------------------------- /.pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | norecursedirs = 3 | dimorphite_dl 4 | *.egg 5 | .eggs 6 | dist 7 | build 8 | docs 9 | .tox 10 | .git 11 | __pycache__ 12 | doctest_optionflags = 13 | NUMBER 14 | NORMALIZE_WHITESPACE 15 | IGNORE_EXCEPTION_DETAIL 16 | addopts = 17 | --strict-markers 18 | --tb=short 19 | --doctest-modules 20 | --doctest-continue-on-failure 21 | testpaths = tests 22 | -------------------------------------------------------------------------------- /.ruff.toml: -------------------------------------------------------------------------------- 1 | exclude = [ 2 | ".bzr", 3 | ".direnv", 4 | ".eggs", 5 | ".git", 6 | ".git-rewrite", 7 | ".hg", 8 | ".ipynb_checkpoints", 9 | ".mypy_cache", 10 | ".nox", 11 | ".pants.d", 12 | ".pyenv", 13 | ".pytest_cache", 14 | ".pytype", 15 | ".ruff_cache", 16 | ".svn", 17 | ".tox", 18 | ".venv", 19 | ".vscode", 20 | "__pypackages__", 21 
| "_build", 22 | "buck-out", 23 | "build", 24 | "dist", 25 | "node_modules", 26 | "site-packages", 27 | "venv", 28 | ".pixi", 29 | ".pytest_cache", 30 | ] 31 | 32 | line-length = 88 33 | indent-width = 4 34 | 35 | [format] 36 | quote-style = "double" 37 | indent-style = "space" 38 | line-ending = "lf" 39 | docstring-code-format = true 40 | docstring-code-line-length = "dynamic" 41 | skip-magic-trailing-comma = true 42 | 43 | -------------------------------------------------------------------------------- /.zenodo.json: -------------------------------------------------------------------------------- 1 | { 2 | "upload_type": "software", 3 | "title": "Dimorphite-DL", 4 | "creators": [ 5 | { 6 | "name": "Ropp, Patrick J." 7 | }, 8 | { 9 | "name": "Kaminsky, Jesse C.", 10 | "orcid": "0000-0001-5796-2874" 11 | }, 12 | { 13 | "name": "Yablonski, Sara" 14 | }, 15 | { 16 | "name": "Spiegel, Jacob O.", 17 | "orcid": "0000-0002-8496-6915" 18 | }, 19 | { 20 | "name": "Maldonado, Alex M.", 21 | "orcid": "0000-0003-3280-062X", 22 | "affiliation": "Department of Biological Sciences, University of Pittsburgh" 23 | }, 24 | { 25 | "name": "Durrant, Jacob D.", 26 | "orcid": "0000-0002-5808-4097", 27 | "affiliation": "Department of Biological Sciences, University of Pittsburgh" 28 | } 29 | ], 30 | "access_right": "open", 31 | "license": { 32 | "id": "apache-2.0" 33 | }, 34 | "language": "eng" 35 | } 36 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 4 | 5 | ## [Unreleased] 6 | 7 | ### Added 8 | 9 | - Turn on and control logging through the CLI. 10 | - `colorize` keyword argument for `enable_logging` for logs to not use ANSI color codes.
11 | 12 | ### Fixed 13 | 14 | - Determining if a provided input string was a SMILES or path to file. 15 | `CCC(C)=C(Cl)C/C(I)=C(\C)F` was incorrectly classified as a file. 16 | 17 | ## [2.0.1] - 2025-06-03 18 | 19 | ### Changed 20 | 21 | - Rearranged `__init__.py` imports to mainly have `from dimorphite_dl import protonate_smiles`. 22 | 23 | ### Fixed 24 | 25 | - Circular import of SMARTS. 26 | 27 | ## [2.0.0] - 2025-06-01 28 | 29 | ### Changed 30 | 31 | - Fallback mechanism now uses the previous successful site protonation. 32 | In previous versions, sometimes only the last successful protonation site type was returned. 33 | If the third phosphate protonation failed, then it would fall back to the last successful protonation before the first phosphate. 34 | Now, we would return the second phosphate protonation. 35 | - Major refactor of practically everything. 36 | 37 | ## [1.2.5] - 2025-05-21 38 | 39 | ### Changed 40 | 41 | - Major reorganization of the original `dimorphite_dl.py` file into Python modules under the package name `dimorphite_dl`. No code logic has been changed, just refactored. 42 | 43 | ## [1.2.4] 44 | 45 | ### Added 46 | 47 | - Added test cases for ATP and NAD. 48 | 49 | ### Changed 50 | 51 | - Dimorphite-DL now better protonates compounds with polyphosphate chains 52 | (e.g., ATP). See `site_substructures.smarts` for the rationale behind the 53 | added pKa values. 54 | - `site_substructures.smarts` now allows comments (lines that start with `#`). 55 | - Improved support for the `--silent` option. 56 | - Reformatted code per the [*Black* Python code formatter](https://github.com/psf/black). 57 | 58 | ### Fixed 59 | 60 | - Fixed a bug that affected how Dimorphite-DL deals with new protonation 61 | states that yield invalid SMILES strings. 62 | - Previously, it simply returned the original input SMILES in these rare 63 | cases (better than nothing). Now, it instead returns the last valid SMILES 64 | produced, not necessarily the original SMILES.
65 | - Consider `O=C(O)N1C=CC=C1` at pH 3.5 as an example. 66 | - Dimorphite-DL first deprotonates the carboxyl group, producing 67 | `O=C([O-])n1cccc1` (a valid SMILES). 68 | - It then attempts to protonate the aromatic nitrogen, producing 69 | `O=C([O-])[n+]1cccc1`, an invalid SMILES. 70 | - Previously, it would output the original SMILES, `O=C(O)N1C=CC=C1`. Now 71 | it outputs the last valid SMILES, `O=C([O-])n1cccc1`. 72 | 73 | ## [1.2.3] 74 | 75 | ### Added 76 | 77 | - Added "silent" option to suppress all output. 78 | - Added code to suppress unnecessary RDKit warnings. 79 | 80 | ### Changed 81 | 82 | - Updated protonation of nitrogen, oxygen, and sulfur atoms to be compatible 83 | with the latest version of RDKit, which broke backwards compatibility. 84 | - Updated copyright to 2020. 85 | 86 | ## [1.2.2] 87 | 88 | ### Added 89 | 90 | - Added a new parameter to limit the number of variants per compound 91 | (`--max_variants`). The default is 128. 92 | 93 | ## [1.2.1] 94 | 95 | ### Fixed 96 | 97 | - Corrected a bug that rarely misprotonated/deprotonated compounds with 98 | multiple ionization sites (e.g., producing a carbanion). 99 | 100 | ## [1.2.0] 101 | 102 | ### Fixed 103 | 104 | - Corrected a bug that led Dimorphite-DL to sometimes produce output molecules 105 | that are non-physical. 106 | - Corrected a bug that gave incorrect protonation states for rare molecules 107 | (aromatic rings with nitrogens that are protonated when electrically 108 | neutral, e.g. pyridin-4(1H)-one). 109 | - `run_with_mol_list()` now preserves non-string properties. 110 | - `run_with_mol_list()` throws a warning if it cannot process a molecule, 111 | rather than terminating the program with an error. 112 | 113 | ## [1.1.0] 114 | 115 | ### Added 116 | 117 | - Dimorphite-DL now distinguishes between indoles/pyrroles and 118 | Aromatic_nitrogen_protonated. 119 | - It is now possible to call Dimorphite-DL from another Python script, in 120 | addition to the command line. 
See the `README.md` file for instructions. 121 | 122 | ## [1.0.0] 123 | 124 | The original version described in: 125 | 126 | Ropp PJ, Kaminsky JC, Yablonski S, Durrant JD (2019) Dimorphite-DL: An 127 | open-source program for enumerating the ionization states of drug-like small 128 | molecules. J Cheminform 11:14. doi:10.1186/s13321-019-0336-9. 129 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our community a harassment-free experience for everyone, regardless of age, body size, visible or invisible disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, caste, color, religion, or sexual identity and orientation. 6 | 7 | We pledge to act and interact in ways that contribute to an open, welcoming, diverse, inclusive, and healthy community. 
8 | 9 | ## Our Standards 10 | 11 | Examples of behavior that contributes to a positive environment for our community include: 12 | 13 | - Demonstrating empathy and kindness toward other people 14 | - Being respectful of differing opinions, viewpoints, and experiences 15 | - Giving and gracefully accepting constructive feedback 16 | - Accepting responsibility and apologizing to those affected by our mistakes, and learning from the experience 17 | - Focusing on what is best not just for us as individuals but for the overall community 18 | 19 | Examples of unacceptable behavior include: 20 | 21 | - The use of sexualized language or imagery and sexual attention or advances of any kind; 22 | - Trolling, insulting or derogatory comments, and personal or political attacks; 23 | - Public or private harassment; 24 | - Publishing others' private information, such as a physical or email address, without their explicit permission; 25 | - Other conduct that could reasonably be considered inappropriate in a professional setting. 26 | 27 | ## Enforcement Responsibilities 28 | 29 | Community leaders are responsible for clarifying and enforcing our standards of acceptable behavior. 30 | They will take appropriate and fair corrective action in response to any behavior that they deem inappropriate, threatening, offensive, or harmful. 31 | 32 | Community leaders have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned with this Code of Conduct and will communicate reasons for moderation decisions when appropriate. 33 | 34 | ## Scope 35 | 36 | This Code of Conduct applies within all community spaces and when an individual officially represents the community in public spaces. 
37 | Examples of representing our community include: 38 | 39 | - using an official email address, 40 | - posting via an official social media account, 41 | - or acting as an appointed representative at an online or offline event. 42 | 43 | ## Enforcement 44 | 45 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to the community leaders responsible for enforcement at durrantj@pitt.edu. 46 | All complaints will be reviewed and investigated promptly and fairly. 47 | 48 | All community leaders must respect the privacy and security of the reporter of any incident. 49 | 50 | ## Enforcement Guidelines 51 | 52 | Community leaders will follow these Community Impact Guidelines in determining the consequences for any action they deem in violation of this Code of Conduct: 53 | 54 | ### 1. Correction 55 | 56 | **Community Impact**: 57 | Use of inappropriate language or other behavior deemed unprofessional or unwelcome in the community. 58 | 59 | **Consequence**: 60 | A private, written warning from community leaders, providing clarity around the nature of the violation and an explanation of why the behavior was inappropriate. 61 | A public apology may be requested. 62 | 63 | ### 2. Warning 64 | 65 | **Community Impact**: 66 | A violation through a single incident or series of actions. 67 | 68 | **Consequence**: 69 | A warning with consequences for continued behavior. 70 | No interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, for a specified period of time. 71 | This includes avoiding interactions in community spaces and external channels like social media. 72 | Violating these terms may lead to a temporary or permanent ban. 73 | 74 | ### 3. Temporary Ban 75 | 76 | **Community Impact**: 77 | A severe violation of community standards, including sustained inappropriate behavior. 
78 | 79 | **Consequence**: 80 | A temporary ban from any sort of interaction or public communication with the community for a specified period of time. 81 | No public or private interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, is allowed during this period. 82 | Violating these terms may lead to a permanent ban. 83 | 84 | ### 4. Permanent Ban 85 | 86 | **Community Impact**: 87 | Demonstrating a pattern of violation of community standards, including sustained inappropriate behavior, harassment of an individual, or aggression toward or disparagement of classes of individuals. 88 | 89 | **Consequence**: 90 | A permanent ban from any sort of public interaction within the community. 91 | 92 | ## Attribution 93 | 94 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 2.1, available at [https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1]. 95 | 96 | Community Impact Guidelines were inspired by [Mozilla's code of conduct enforcement ladder][Mozilla CoC]. 97 | 98 | For answers to common questions about this code of conduct, see the FAQ at [https://www.contributor-covenant.org/faq][FAQ]. 99 | Translations are available at [https://www.contributor-covenant.org/translations][translations]. 
100 | 101 | [homepage]: https://www.contributor-covenant.org 102 | [v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html 103 | [Mozilla CoC]: https://github.com/mozilla/diversity 104 | [FAQ]: https://www.contributor-covenant.org/faq 105 | [translations]: https://www.contributor-covenant.org/translations 106 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # Apache License 2 | 3 | *Version 2.0, January 2004* 4 | <[http://www.apache.org/licenses](http://www.apache.org/licenses)> 5 | 6 | ## Terms and Conditions for use, reproduction, and distribution 7 | 8 | ### 1. Definitions 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, and 11 | distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by the 14 | copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all other 17 | entities that control, are controlled by, or are under common control with 18 | that entity. For the purposes of this definition, "control" means (i) the 19 | power, direct or indirect, to cause the direction or management of such 20 | entity, whether by contract or otherwise, or (ii) ownership of 21 | fifty percent (50%) or more of the outstanding shares, or (iii) beneficial 22 | ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity exercising 25 | permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation source, 29 | and configuration files. 
30 | 31 | "Object" form shall mean any form resulting from mechanical transformation 32 | or translation of a Source form, including but not limited to compiled 33 | object code, generated documentation, and conversions to 34 | other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or Object 37 | form, made available under the License, as indicated by a copyright notice 38 | that is included in or attached to the work (an example is provided in the 39 | Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object form, 42 | that is based on (or derived from) the Work and for which the editorial 43 | revisions, annotations, elaborations, or other modifications represent, 44 | as a whole, an original work of authorship. For the purposes of this 45 | License, Derivative Works shall not include works that remain separable 46 | from, or merely link (or bind by name) to the interfaces of, the Work and 47 | Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including the original 50 | version of the Work and any modifications or additions to that Work or 51 | Derivative Works thereof, that is intentionally submitted to Licensor for 52 | inclusion in the Work by the copyright owner or by an individual or 53 | Legal Entity authorized to submit on behalf of the copyright owner. 54 | For the purposes of this definition, "submitted" means any form of 55 | electronic, verbal, or written communication sent to the Licensor or its 56 | representatives, including but not limited to communication on electronic 57 | mailing lists, source code control systems, and issue tracking systems 58 | that are managed by, or on behalf of, the Licensor for the purpose of 59 | discussing and improving the Work, but excluding communication that is 60 | conspicuously marked or otherwise designated in writing by the copyright 61 | owner as "Not a Contribution." 
62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity on 64 | behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | ### 2. Grant of Copyright License 68 | 69 | Subject to the terms and conditions of this License, each Contributor 70 | hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, 71 | royalty-free, irrevocable copyright license to reproduce, prepare 72 | Derivative Works of, publicly display, publicly perform, sublicense, 73 | and distribute the Work and such Derivative Works in 74 | Source or Object form. 75 | 76 | ### 3. Grant of Patent License 77 | 78 | Subject to the terms and conditions of this License, each Contributor 79 | hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, 80 | royalty-free, irrevocable (except as stated in this section) patent 81 | license to make, have made, use, offer to sell, sell, import, and 82 | otherwise transfer the Work, where such license applies only to those 83 | patent claims licensable by such Contributor that are necessarily 84 | infringed by their Contribution(s) alone or by combination of their 85 | Contribution(s) with the Work to which such Contribution(s) was submitted. 86 | If You institute patent litigation against any entity (including a 87 | cross-claim or counterclaim in a lawsuit) alleging that the Work or a 88 | Contribution incorporated within the Work constitutes direct or 89 | contributory patent infringement, then any patent licenses granted to 90 | You under this License for that Work shall terminate as of the date such 91 | litigation is filed. 92 | 93 | ### 4. Redistribution 94 | 95 | You may reproduce and distribute copies of the Work or Derivative Works 96 | thereof in any medium, with or without modifications, and in Source or 97 | Object form, provided that You meet the following conditions: 98 | 99 | 1. 
You must give any other recipients of the Work or Derivative Works a 100 | copy of this License; and 101 | 102 | 2. You must cause any modified files to carry prominent notices stating 103 | that You changed the files; and 104 | 105 | 3. You must retain, in the Source form of any Derivative Works that You 106 | distribute, all copyright, patent, trademark, and attribution notices from 107 | the Source form of the Work, excluding those notices that do not pertain 108 | to any part of the Derivative Works; and 109 | 110 | 4. If the Work includes a "NOTICE" text file as part of its distribution, 111 | then any Derivative Works that You distribute must include a readable copy 112 | of the attribution notices contained within such NOTICE file, excluding 113 | those notices that do not pertain to any part of the Derivative Works, 114 | in at least one of the following places: within a NOTICE text file 115 | distributed as part of the Derivative Works; within the Source form or 116 | documentation, if provided along with the Derivative Works; or, within a 117 | display generated by the Derivative Works, if and wherever such 118 | third-party notices normally appear. The contents of the NOTICE file are 119 | for informational purposes only and do not modify the License. 120 | You may add Your own attribution notices within Derivative Works that You 121 | distribute, alongside or as an addendum to the NOTICE text from the Work, 122 | provided that such additional attribution notices cannot be construed 123 | as modifying the License. 124 | 125 | You may add Your own copyright statement to Your modifications and may 126 | provide additional or different license terms and conditions for use, 127 | reproduction, or distribution of Your modifications, or for any such 128 | Derivative Works as a whole, provided Your use, reproduction, and 129 | distribution of the Work otherwise complies with the conditions 130 | stated in this License. 131 | 132 | ### 5. 
Submission of Contributions 133 | 134 | Unless You explicitly state otherwise, any Contribution intentionally 135 | submitted for inclusion in the Work by You to the Licensor shall be under 136 | the terms and conditions of this License, without any additional 137 | terms or conditions. Notwithstanding the above, nothing herein shall 138 | supersede or modify the terms of any separate license agreement you may 139 | have executed with Licensor regarding such Contributions. 140 | 141 | ### 6. Trademarks 142 | 143 | This License does not grant permission to use the trade names, trademarks, 144 | service marks, or product names of the Licensor, except as required for 145 | reasonable and customary use in describing the origin of the Work and 146 | reproducing the content of the NOTICE file. 147 | 148 | ### 7. Disclaimer of Warranty 149 | 150 | Unless required by applicable law or agreed to in writing, Licensor 151 | provides the Work (and each Contributor provides its Contributions) 152 | on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 153 | either express or implied, including, without limitation, any warranties 154 | or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS 155 | FOR A PARTICULAR PURPOSE. You are solely responsible for determining the 156 | appropriateness of using or redistributing the Work and assume any risks 157 | associated with Your exercise of permissions under this License. 158 | 159 | ### 8. 
Limitation of Liability 160 | 161 | In no event and under no legal theory, whether in tort 162 | (including negligence), contract, or otherwise, unless required by 163 | applicable law (such as deliberate and grossly negligent acts) or agreed 164 | to in writing, shall any Contributor be liable to You for damages, 165 | including any direct, indirect, special, incidental, or consequential 166 | damages of any character arising as a result of this License or out of 167 | the use or inability to use the Work (including but not limited to damages 168 | for loss of goodwill, work stoppage, computer failure or malfunction, 169 | or any and all other commercial damages or losses), even if such 170 | Contributor has been advised of the possibility of such damages. 171 | 172 | ### 9. Accepting Warranty or Additional Liability 173 | 174 | While redistributing the Work or Derivative Works thereof, You may choose 175 | to offer, and charge a fee for, acceptance of support, warranty, 176 | indemnity, or other liability obligations and/or rights consistent with 177 | this License. However, in accepting such obligations, You may act only 178 | on Your own behalf and on Your sole responsibility, not on behalf of any 179 | other Contributor, and only if You agree to indemnify, defend, and hold 180 | each Contributor harmless for any liability incurred by, or claims 181 | asserted against, such Contributor by reason of your accepting any such 182 | warranty or additional liability. 183 | 184 | *END OF TERMS AND CONDITIONS* 185 | 186 | ## APPENDIX: How to apply the Apache License to your work 187 | 188 | To apply the Apache License to your work, attach the following boilerplate 189 | notice, with the fields enclosed by brackets "[]" replaced with your own 190 | identifying information. (Don't include the brackets!) The text should be 191 | enclosed in the appropriate comment syntax for the file format. 
We also 192 | recommend that a file or class name and description of purpose be included 193 | on the same "printed page" as the copyright notice for easier 194 | identification within third-party archives. 195 | 196 | Copyright 2025 durrantlab 197 | 198 | Licensed under the Apache License, Version 2.0 (the "License"); 199 | you may not use this file except in compliance with the License. 200 | You may obtain a copy of the License at 201 | 202 | https://www.apache.org/licenses/LICENSE-2.0 203 | 204 | Unless required by applicable law or agreed to in writing, software 205 | distributed under the License is distributed on an "AS IS" BASIS, 206 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 207 | or implied. See the License for the specific language governing 208 | permissions and limitations under the License. 209 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

dimorphite_dl

2 | 3 |

Adds hydrogen atoms to molecular representations as specified by pH

4 | 5 |

6 | 7 | Build Status 8 | 9 | PyPI - Python Version 10 | 11 | codecov 12 | 13 | 14 | GitHub release (latest by date) 15 | 16 | 17 | PyPI - Downloads 18 | 19 | 20 | License 21 | 22 | 23 | GitHub repo size 24 | 25 | 26 | DOI 27 | 28 | 29 | Archived | https://doi.org/10.5281/zenodo.15486131 30 | 31 |

32 | 33 | Dimorphite-DL is a fast, accurate, accessible, and modular open-source program designed for enumerating small-molecule ionization states. 34 | It specifically adds or removes hydrogen atoms from molecular representations to achieve the appropriate protonation state for a user-specified pH range. 35 | 36 | Accurate protonation states are crucial in cheminformatics and computational drug discovery, as a molecule's ionization state significantly impacts its physicochemical properties, biological activity, and interactions with targets. 37 | Dimorphite-DL addresses this by providing a robust solution for preparing molecules for various downstream applications like docking, molecular dynamics, and virtual screening. 38 | 39 | ## Installation 40 | 41 | You can install the latest released version from [PyPI](https://pypi.org/project/dimorphite-dl/) using the following command. 42 | 43 | ```bash 44 | pip install dimorphite_dl 45 | ``` 46 | 47 | Or you can install the latest development version from the `main` branch on [GitHub](https://github.com/durrantlab/dimorphite_dl) using 48 | 49 | ```bash 50 | pip install git+https://github.com/durrantlab/dimorphite_dl.git 51 | ``` 52 | 53 | ## Usage 54 | 55 | ### CLI 56 | 57 | The command-line interface (`dimorphite_dl`) provides straightforward access to Dimorphite-DL's functionalities. 58 | 59 | **Positional Arguments:** 60 | 61 | - `SMI`: SMILES string or path to a file containing SMILES strings to protonate. 62 | 63 | **Options:** 64 | 65 | - `--ph_min MIN`: Minimum pH to consider (default: 6.4). 66 | - `--ph_max MAX`: Maximum pH to consider (default: 8.4). 67 | - `--precision PRE`: pKa precision factor, representing the number of standard deviations from the mean pKa to consider when determining ionization states (default: 1.0). 68 | - `--output_file FILE`: Optional path to a file to write the protonated SMILES results. 
69 | - `--max_variants MXV`: Limits the number of protonation variants generated per input compound (default: 128). 70 | - `--label_states`: If set, output SMILES will be labeled with their target ionization state ("DEPROTONATED", "PROTONATED", or "BOTH"). 71 | - `--log_level`: Enable logging and set the level. 72 | Can be `none`, `debug`, `info`, `warning`, `error`, or `critical`. 73 | Defaults to no logging. 74 | 75 | #### Examples 76 | 77 | Protonate molecules from a file: 78 | 79 | ```bash 80 | dimorphite_dl sample_molecules.smi 81 | ``` 82 | 83 | Protonate a single SMILES string within a specific pH range: 84 | 85 | ```bash 86 | dimorphite_dl --ph_min -3.0 --ph_max -2.0 "CCC(=O)O" 87 | ``` 88 | 89 | Protonate a SMILES string and save output to a file: 90 | 91 | ```bash 92 | dimorphite_dl --ph_min -3.0 --ph_max -2.0 --output_file output.smi "CCCN" 93 | ``` 94 | 95 | Protonate molecules from a file with increased pKa precision and state labels: 96 | 97 | ```bash 98 | dimorphite_dl --precision 2.0 --label_states sample_molecules.smi 99 | ``` 100 | 101 | ### Scripting 102 | 103 | Dimorphite-DL can be easily integrated into your Python scripts. 104 | The primary function for this is `protonate_smiles` from `dimorphite_dl.protonate`. 
105 | 106 | ```python 107 | from dimorphite_dl import protonate_smiles 108 | 109 | # Protonate a single SMILES string with custom pH range and precision 110 | protonated_mol_1: list[str] = protonate_smiles( 111 | "CCC(=O)O", ph_min=6.8, ph_max=7.9, precision=0.5 112 | ) 113 | print(f"Protonated 'CCC(=O)O': {protonated_mol_1}") 114 | 115 | # Protonate a list of SMILES strings 116 | protonated_mol_list: list[str] = protonate_smiles(["CCC(=O)O", "CCCN"]) 117 | print(f"Protonated list: {protonated_mol_list}") 118 | 119 | # Protonate molecules from a SMILES file 120 | # Make sure '~/example.smi' exists and contains SMILES strings 121 | # protonated_from_file: list[str] = protonate_smiles("~/example.smi") 122 | # print(f"Protonated from file: {protonated_from_file}") 123 | 124 | # Example with labeling states and limiting variants 125 | protonated_labeled: list[str] = protonate_smiles( 126 | "C1CCCCC1C(=O)O", ph_min=7.0, ph_max=7.4, label_states=True, max_variants=5 127 | ) 128 | print(f"Protonated with labels: {protonated_labeled}") 129 | ``` 130 | 131 | ## Known issues 132 | 133 | Dimorphite_dl is designed to handle the vast majority of ionizable functional groups accurately, but there are some edge cases where the current SMARTS patterns and pKa assignments may not behave as expected. 134 | The following are known limitations that users should be aware of when working with specific molecular substructures: 135 | 136 | - **Tertiary Amides**: Tertiary amides (e.g., N-acetylpiperidine `CC(=O)N1CCCCC1`) are incorrectly treated as basic amines (pKa ~8) instead of neutral species because current amide SMARTS patterns require an N-H bond. 137 | - **Indoles and Pyrroles**: These heterocycles are correctly deprotonated around pH 14.5 but are not protonated at very low pH (~-3.5) where they would be expected to protonate under extremely acidic conditions. 
138 | 139 | ## Development 140 | 141 | We use [pixi](https://pixi.sh/latest/) to manage Python environments and simplify the developer workflow. 142 | Once you have [pixi](https://pixi.sh/latest/) installed, move into `dimorphite_dl` directory (e.g., `cd dimorphite_dl`) and install the environment using the command 143 | 144 | ```bash 145 | pixi install 146 | ``` 147 | 148 | Now you can activate the new virtual environment using 149 | 150 | ```sh 151 | pixi shell 152 | ``` 153 | 154 | ## Citation 155 | 156 | If you use Dimorphite-DL in your research, please cite: 157 | 158 | Ropp PJ, Kaminsky JC, Yablonski S, Durrant JD (2019) Dimorphite-DL: An open-source program for enumerating the ionization states of drug-like small 159 | molecules. *J Cheminform 11*:14. doi: [10.1186/s13321-019-0336-9](https://doi.org/10.1186/s13321-019-0336-9). 160 | 161 | ## License 162 | 163 | This project is released under the Apache-2.0 License as specified in `LICENSE.md`. 164 | -------------------------------------------------------------------------------- /dimorphite_dl/__init__.py: -------------------------------------------------------------------------------- 1 | """Adds hydrogen atoms to molecular representations as specified by pH""" 2 | 3 | from typing import Any 4 | 5 | import os 6 | import sys 7 | from ast import literal_eval 8 | 9 | from loguru import logger 10 | 11 | from .protonate.run import protonate_smiles 12 | 13 | __all__ = ["protonate_smiles"] 14 | 15 | try: 16 | from ._version import version as __version__ 17 | except ImportError: 18 | __version__ = "unknown" 19 | 20 | logger.disable("dimorphite_dl") 21 | 22 | LOG_FORMAT = ( 23 | "{time:HH:mm:ss} | " 24 | "{level: <8} | " 25 | "{name}:{function}:{line} - {message}" 26 | ) 27 | 28 | 29 | def enable_logging( 30 | level_set: int, 31 | stdout_set: bool = True, 32 | file_path: str | None = None, 33 | log_format: str = LOG_FORMAT, 34 | colorize: bool = True, 35 | ) -> None: 36 | r"""Enable logging. 
def _build_cli_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the ``dimorphite_dl`` command."""
    parser = argparse.ArgumentParser(description=f"dimorphite_dl v{__version__}")
    parser.add_argument(
        "--ph_min",
        metavar="MIN",
        type=float,
        default=6.4,
        help="Minimum pH to consider (default: 6.4)",
    )
    parser.add_argument(
        "--ph_max",
        metavar="MAX",
        type=float,
        default=8.4,
        help="Maximum pH to consider (default: 8.4)",
    )
    parser.add_argument(
        "--precision",
        metavar="PRE",
        type=float,
        default=1.0,
        help="pKa precision factor (i.e., number of standard deviations; "
        + "default: 1.0)",
    )
    parser.add_argument(
        "--output_file",
        metavar="FILE",
        type=str,
        help="Output file to write protonated SMILES (optional)",
    )
    parser.add_argument(
        "--max_variants",
        metavar="MXV",
        type=int,
        default=128,
        help="Limit number of variants per input compound (default: 128)",
    )
    parser.add_argument(
        "--label_states",
        action="store_true",
        help="label protonated SMILES with target state "
        + '(i.e., "DEPROTONATED", "PROTONATED", or "BOTH").',
    )
    parser.add_argument(
        "--log_level",
        choices=["none", "debug", "info", "warning", "error", "critical"],
        default="none",
        help="Enable and set logging level. Defaults to none (i.e., no logging)",
    )
    parser.add_argument(
        "smiles", metavar="SMI", type=str, help="SMILES or path to SMILES to protonate"
    )
    return parser


def run_cli() -> None:
    """Entry point of the ``dimorphite_dl`` command-line interface.

    Parses the command-line arguments, optionally enables logging, protonates
    the requested SMILES (a literal SMILES string or a path to a SMILES file)
    and emits one protonated SMILES per line, either to ``--output_file`` or
    to standard output.
    """
    args = _build_cli_parser().parse_args()
    if args.log_level != "none":
        enable_logging(LOG_LEVEL_TO_INT[args.log_level])

    protonate_kwargs = {
        "smiles_input": args.smiles,
        "ph_min": args.ph_min,
        "ph_max": args.ph_max,
        "precision": args.precision,
        "label_states": args.label_states,
        "max_variants": args.max_variants,
    }

    if args.output_file is None:
        for smiles_protonated in protonate_smiles(**protonate_kwargs):
            print(smiles_protonated)
        return

    logger.info("Writing smiles to {}", args.output_file)
    # The context manager guarantees the file is flushed and closed even when
    # protonation raises part-way through; the previous bare open() never
    # closed the handle.
    with open(args.output_file, "w", encoding="utf-8") as f:
        for smiles_protonated in protonate_smiles(**protonate_kwargs):
            f.write(smiles_protonated + "\n")
3 | 4 | Provides unified streaming interface for processing SMILES from various sources 5 | with comprehensive error handling, validation, and memory optimization. 6 | """ 7 | 8 | from typing import Any, TextIO 9 | 10 | import gzip 11 | import os 12 | import pathlib 13 | from collections.abc import Iterable, Iterator 14 | from dataclasses import dataclass, field 15 | 16 | from loguru import logger 17 | from rdkit.Chem.MolStandardize import rdMolStandardize 18 | 19 | 20 | @dataclass 21 | class SMILESRecord: 22 | """Container for a SMILES string with metadata.""" 23 | 24 | smiles: str 25 | identifier: str = "" 26 | source_line: int | None = None 27 | metadata: dict[str, Any] = field(default_factory=dict) 28 | 29 | 30 | class SMILESValidationError(Exception): 31 | """Raised when SMILES validation fails.""" 32 | 33 | pass 34 | 35 | 36 | class SMILESStreamError(Exception): 37 | """Raised when streaming encounters an error.""" 38 | 39 | pass 40 | 41 | 42 | class SMILESProcessor: 43 | """ 44 | Memory-efficient SMILES string processor with robust error handling. 45 | 46 | Handles various input formats and provides streaming interface for 47 | processing large datasets without memory overflow. 48 | """ 49 | 50 | def __init__( 51 | self, 52 | validate_smiles: bool = True, 53 | skip_invalid: bool = True, 54 | max_length: int | None = 10000, 55 | chunk_size: int = 1000, 56 | ): 57 | """ 58 | Initialize SMILES processor. 
59 | 60 | Args: 61 | validate_smiles: Whether to validate SMILES syntax 62 | skip_invalid: Skip invalid SMILES instead of raising errors 63 | max_length: Maximum allowed SMILES length (None for no limit) 64 | chunk_size: Batch size for processing operations 65 | """ 66 | self.validate_smiles = validate_smiles 67 | self.skip_invalid = skip_invalid 68 | self.max_length = max_length 69 | self.chunk_size = chunk_size 70 | self._stats: dict[str, int] = {"processed": 0, "skipped": 0, "errors": 0} 71 | 72 | def stream( 73 | self, input_data: str | Iterable[str] | Iterator[str] 74 | ) -> Iterator[SMILESRecord]: 75 | """ 76 | Stream SMILES records from various input types. 77 | 78 | Args: 79 | input_data: File path, single SMILES, or iterable of SMILES 80 | 81 | Yields: 82 | SMILESRecord: Validated SMILES records with metadata 83 | 84 | Raises: 85 | SMILESStreamError: If input cannot be processed 86 | """ 87 | self._reset_stats() 88 | 89 | try: 90 | if isinstance(input_data, str): 91 | yield from self._handle_string_input(input_data) 92 | elif hasattr(input_data, "__iter__"): 93 | yield from self._handle_iterable_input(input_data) 94 | else: 95 | raise SMILESStreamError(f"Unsupported input type: {type(input_data)}") 96 | 97 | except Exception as e: 98 | logger.error(f"Error streaming SMILES: {e}") 99 | if not self.skip_invalid: 100 | if isinstance(e, SMILESValidationError): 101 | raise e 102 | else: 103 | raise SMILESStreamError(f"Failed to process input: {e}") from e 104 | 105 | def stream_batches( 106 | self, input_data: str | Iterable[str], batch_size: int | None = None 107 | ) -> Iterator[list[SMILESRecord]]: 108 | """ 109 | Stream SMILES records in batches for efficient processing. 
110 | 111 | Args: 112 | input_data: Input source 113 | batch_size: Size of each batch (uses instance default if None) 114 | 115 | Yields: 116 | Batches of SMILES records 117 | """ 118 | batch_size = batch_size or self.chunk_size 119 | batch = [] 120 | 121 | for record in self.stream(input_data): 122 | batch.append(record) 123 | if len(batch) >= batch_size: 124 | yield batch 125 | batch = [] 126 | 127 | if batch: # Yield remaining records 128 | yield batch 129 | 130 | def _handle_string_input(self, input_str: str) -> Iterator[SMILESRecord]: 131 | """Handle string input - either file path or single SMILES.""" 132 | if self._is_file_path(input_str): 133 | yield from self._stream_from_file(input_str) 134 | else: 135 | # Single SMILES string 136 | record = self._create_record(input_str, source_line=1) 137 | if record: 138 | yield record 139 | 140 | def _handle_iterable_input(self, iterable: Iterable[str]) -> Iterator[SMILESRecord]: 141 | """Handle iterable input (list, generator, etc.). 142 | 143 | This will skip empty lines. 
144 | """ 145 | logger.debug("Handling iterable input of {}", iterable) 146 | for line_num, line in enumerate(iterable, 1): 147 | if isinstance(line, str): 148 | line = line.strip() 149 | line_split = line.split() 150 | if len(line_split) > 2: 151 | logger.warning( 152 | f"Lines can only contain a smiles string and identifier, but we were given {line}" 153 | ) 154 | raise ValueError( 155 | "Line contains more than two items (smiles and identifier)" 156 | ) 157 | if len(line_split) == 0: 158 | continue 159 | smiles = line_split[0] 160 | if len(line_split) == 2: 161 | identifier = line_split[1] 162 | else: 163 | identifier = "" 164 | record = self._create_record(smiles, identifier, source_line=line_num) 165 | if record: 166 | yield record 167 | else: 168 | self._handle_error( 169 | f"Non-string item at position {line_num}: {type(line)}" 170 | ) 171 | 172 | def _stream_from_file(self, filepath: str) -> Iterator[SMILESRecord]: 173 | """Stream SMILES from file with format auto-detection.""" 174 | logger.debug("Streaming from {}", filepath) 175 | path = pathlib.Path(filepath) 176 | 177 | if not path.exists(): 178 | raise SMILESStreamError(f"File not found: {filepath}") 179 | 180 | # Handle compressed files 181 | open_func = gzip.open if path.suffix == ".gz" else open 182 | mode = "rt" if path.suffix == ".gz" else "r" 183 | 184 | try: 185 | with open_func(filepath, mode, encoding="utf-8", errors="replace") as f: 186 | yield from self._stream_from_file_object(f, path) 187 | except Exception as e: 188 | raise SMILESStreamError(f"Error reading file {filepath}: {e}") from e 189 | 190 | def _stream_from_file_object( 191 | self, file_obj: TextIO, path: pathlib.Path 192 | ) -> Iterator[SMILESRecord]: 193 | """Stream from file object based on file extension.""" 194 | suffix = path.suffix.lower().replace(".gz", "") 195 | 196 | if suffix in {".smiles", ".smi", ".txt", ""}: 197 | yield from self._stream_from_text(file_obj) 198 | else: 199 | logger.warning(f"Unknown file format 
{suffix}, treating as text") 200 | yield from self._stream_from_text(file_obj) 201 | 202 | def _stream_from_text(self, file_obj: TextIO) -> Iterator[SMILESRecord]: 203 | """Stream SMILES from plain text file.""" 204 | for line_num, line in enumerate(file_obj, 1): 205 | line = line.strip() 206 | if line and not line.startswith("#"): 207 | # Handle multi-column format (SMILES ID) 208 | parts = line.split() 209 | smiles = parts[0] 210 | identifier = parts[1] if len(parts) > 1 else "" 211 | 212 | record = self._create_record( 213 | smiles, identifier=identifier, source_line=line_num 214 | ) 215 | if record: 216 | yield record 217 | 218 | def _create_record( 219 | self, 220 | smiles: str, 221 | identifier: str = "", 222 | source_line: int | None = None, 223 | metadata: dict[str, Any] | None = None, 224 | ) -> SMILESRecord | None: 225 | """Create and validate a SMILES record.""" 226 | smiles = smiles.strip() 227 | 228 | if not smiles: 229 | return None 230 | 231 | try: 232 | # Length validation 233 | if self.max_length and len(smiles) > self.max_length: 234 | self._handle_error( 235 | f"SMILES too long ({len(smiles)} > {self.max_length}): {smiles[:50]}..." 
236 | ) 237 | return None 238 | 239 | # Basic syntax validation 240 | if self.validate_smiles: 241 | if not self._validate_smiles_syntax(smiles): 242 | self._handle_error(f"Invalid SMILES syntax: {smiles}") 243 | return None 244 | 245 | self._stats["processed"] += 1 246 | return SMILESRecord( 247 | smiles=smiles, 248 | identifier=identifier, 249 | source_line=source_line, 250 | metadata=metadata or {}, 251 | ) 252 | 253 | except Exception as e: 254 | self._handle_error(f"Error creating record for '{smiles}': {e}") 255 | return None 256 | 257 | def _validate_smiles_syntax(self, smiles: str) -> bool: 258 | """SMILES syntax validation using RDKit.""" 259 | logger.info("Processing {}", smiles) 260 | try: 261 | rdMolStandardize.ValidateSmiles(smiles) 262 | logger.debug("SMILES is valid") 263 | return True 264 | except Exception: 265 | logger.info("SMILES is NOT valid") 266 | return False 267 | 268 | def _is_file_path(self, s: str) -> bool: 269 | """Check if string is likely a file path.""" 270 | # Don't treat very long strings as file paths 271 | if len(s) > 1000: 272 | return False 273 | 274 | # Check if it has any file path indicators 275 | has_path_indicators = ( 276 | os.path.exists(s) 277 | or os.path.sep in s 278 | or (os.path.altsep and os.path.altsep in s) 279 | or s.endswith((".smiles", ".smi", ".txt", ".csv", ".sdf", ".gz")) 280 | ) 281 | 282 | if not has_path_indicators: 283 | return False 284 | 285 | # If it looks like a path, try to validate it as a real file path 286 | try: 287 | path = pathlib.Path(s) 288 | # If the path exists, it's definitely a file 289 | if path.exists(): 290 | return True 291 | # If the parent directory exists, it could be a valid file path 292 | if path.parent.exists(): 293 | return True 294 | # If it has a valid file extension and reasonable structure, assume it's a path 295 | if path.suffix in {".smiles", ".smi", ".txt", ".csv", ".sdf", ".gz"}: 296 | return True 297 | except (OSError, ValueError): 298 | # If we can't even create a 
Path object, it's probably not a file path 299 | return False 300 | 301 | return False 302 | 303 | def _handle_error(self, message: str): 304 | """Handle errors based on skip_invalid setting.""" 305 | self._stats["errors"] += 1 306 | if self.skip_invalid: 307 | self._stats["skipped"] += 1 308 | logger.warning(message) 309 | else: 310 | raise SMILESValidationError(message) 311 | 312 | def _reset_stats(self): 313 | """Reset processing statistics.""" 314 | self._stats = {"processed": 0, "skipped": 0, "errors": 0} 315 | 316 | def get_stats(self) -> dict[str, int]: 317 | """Get processing statistics.""" 318 | return self._stats.copy() 319 | 320 | 321 | # Convenience functions 322 | def stream_smiles(input_data: str | Iterable[str], **kwargs) -> Iterator[SMILESRecord]: 323 | """Convenience function for streaming SMILES.""" 324 | processor = SMILESProcessor(**kwargs) 325 | yield from processor.stream(input_data) 326 | 327 | 328 | def process_smiles_file(filepath: str, **kwargs) -> Iterator[SMILESRecord]: 329 | """Convenience function for processing SMILES files.""" 330 | processor = SMILESProcessor(**kwargs) 331 | yield from processor.stream(filepath) 332 | -------------------------------------------------------------------------------- /dimorphite_dl/mol.py: -------------------------------------------------------------------------------- 1 | """ 2 | Class for handling SMILES strings and RDKit mol objects. 3 | """ 4 | 5 | from typing import Any 6 | 7 | import copy 8 | import os 9 | import sys 10 | 11 | from loguru import logger 12 | from rdkit import Chem 13 | 14 | from dimorphite_dl.neutralize import MoleculeNeutralizer 15 | 16 | 17 | class MoleculeRecord: 18 | """ 19 | Enhanced class for managing SMILES strings and RDKit mol objects. 20 | 21 | Handles all molecule-related operations including validation, conversion, 22 | neutralization, and hydrogen management. 
@property
def mol(self) -> Chem.Mol | None:
    """Return the cached RDKit mol, building it from the SMILES on first use."""
    cached = self._mol
    if cached is None:
        cached = self.to_mol()
        self._mol = cached
    return cached

@mol.setter
def mol(self, value: Chem.Mol | None) -> None:
    """Replace the RDKit mol and invalidate the explicit-hydrogen cache."""
    self._mol_with_hs = None  # derived from the old mol, so no longer valid
    self._mol = value
def refresh_mol_from_smiles(self) -> bool:
    """
    Refresh the mol object from current SMILES string.

    Both cached mol objects are dropped and the SMILES is parsed again. The
    freshly parsed mol is stored as the new cache so that a following access
    to ``mol`` does not parse the SMILES (and log any parse warning) a second
    time.

    Returns:
        True if successful, False otherwise
    """
    # Invalidate first so a failure inside to_mol() never leaves stale caches.
    self._mol = None
    self._mol_with_hs = None
    self._mol = self.to_mol()
    return self._mol is not None
def make_canonical(self, isomeric: bool = True) -> None:
    """
    Replace the current SMILES with its canonical form.

    Nothing is changed when the canonical SMILES cannot be generated.

    Args:
        isomeric: Whether to include stereochemistry information
    """
    canonical = self.get_canonical_smiles(isomeric=isomeric)
    if canonical is None:
        return
    self._update_smiles(canonical)
def neutralize(self):
    """
    Neutralize the molecule in place.

    The current SMILES is passed to ``get_neutralized`` and the result is
    stored via ``_update_smiles``. Nothing is returned; any exception raised
    by ``get_neutralized`` propagates to the caller.
    """
    self._update_smiles(self.get_neutralized(self.smiles))
@staticmethod
def protect_atoms(mol: Chem.Mol, atom_indices: list[int]) -> Chem.Mol:
    """
    Mark the given atoms as protected by setting ``_protected`` to ``"1"``.

    An index that cannot be protected is logged as a warning and skipped.

    Args:
        mol: RDKit mol object
        atom_indices: List of atom indices to protect

    Returns:
        The same mol object (modified in place)
    """
    logger.trace("Protecting atom(s): {}", atom_indices)
    for atom_index in atom_indices:
        try:
            mol.GetAtomWithIdx(atom_index).SetProp("_protected", "1")
        except Exception as error:
            logger.warning(
                "Could not protect atom at index {}: {}", atom_index, str(error)
            )
    return mol
311 | 312 | Args: 313 | mol: RDKit mol object 314 | atom_idx: Atom index to check 315 | 316 | Returns: 317 | True if atom is protected, False otherwise 318 | """ 319 | try: 320 | atom = mol.GetAtomWithIdx(atom_idx) 321 | protected = atom.GetProp("_protected") 322 | return protected == "1" 323 | except Exception: 324 | return False 325 | 326 | def process_azides(self) -> None: 327 | """ 328 | Rewrite azide patterns in this record's SMILES string. 329 | 330 | Takes no arguments: reads `self.smiles` and replaces `N=N=N` and 331 | `NN#N` with `N=[N+]=N`. 332 | 333 | Returns: 334 | None; the result is stored via `_update_smiles` 335 | """ 336 | smiles_working = self.smiles 337 | if "N=N=N" in smiles_working or "NN#N" in smiles_working: 338 | logger.info("Attempting to fix azide patterns in: '{}'", smiles_working) 339 | smiles_working = smiles_working.replace("N=N=N", "N=[N+]=N") 340 | smiles_working = smiles_working.replace("NN#N", "N=[N+]=N") 341 | if smiles_working != self.smiles: 342 | logger.info( 343 | "Modified SMILES: '{}' -> '{}'", self.smiles, smiles_working 344 | ) 345 | self._update_smiles(smiles_working) 346 | 347 | def prepare_for_protonation(self) -> Chem.Mol: 348 | """ 349 | Prepare molecule for protonation site detection. 
350 | 351 | Returns: 352 | Prepared RDKit mol object; raises RuntimeError if preparation fails 353 | """ 354 | logger.info("Preparing molecule for analysis") 355 | 356 | self.process_azides() 357 | self.neutralize() 358 | 359 | base_mol = self.to_mol() 360 | if base_mol is None: 361 | raise RuntimeError("Could not convert SMILES to RDKit Mol") 362 | 363 | mol_with_hydrogens = self.add_hydrogens(base_mol) 364 | if mol_with_hydrogens is None: 365 | raise RuntimeError("Could not add Hydrogens to Mol") 366 | 367 | prepared_mol = self.unprotect_atoms(mol_with_hydrogens) 368 | 369 | atom_count = prepared_mol.GetNumAtoms() 370 | logger.trace("Molecule prepared with {} atoms", atom_count) 371 | assert atom_count > 0 # Molecule must have at least one atom 372 | 373 | self._update_smiles(Chem.MolToSmiles(prepared_mol)) 374 | return prepared_mol 375 | 376 | @staticmethod 377 | def to_mol_silenced(smiles: str) -> dict[str, Any]: 378 | """ 379 | Capture RDKit stderr output and return mol object with error messages. 
380 | 381 | Args: 382 | smiles: SMILES string to convert 383 | 384 | Returns: 385 | Dictionary with 'mol' (RDKit Mol or None) and 'stderr_content' (string) 386 | """ 387 | logger.debug("Converting SMILES to RDKit mol: {}", smiles) 388 | 389 | # Set up stderr capture 390 | stderr_fileno = sys.stderr.fileno() 391 | stderr_save = os.dup(stderr_fileno) 392 | stderr_pipe = os.pipe() 393 | 394 | try: 395 | # Redirect stderr to pipe 396 | os.dup2(stderr_pipe[1], stderr_fileno) 397 | os.close(stderr_pipe[1]) 398 | 399 | # Convert SMILES to mol (this may write to stderr) 400 | mol = Chem.MolFromSmiles(smiles) 401 | 402 | # Read captured stderr 403 | os.close(stderr_fileno) 404 | stderr_content = os.read(stderr_pipe[0], 1024).decode( 405 | "utf-8", errors="ignore" 406 | ) 407 | 408 | except Exception as e: 409 | logger.error("Error during SMILES conversion: {}", str(e)) 410 | mol = None 411 | stderr_content = f"Exception during conversion: {str(e)}" 412 | 413 | finally: 414 | # Restore stderr 415 | try: 416 | os.close(stderr_pipe[0]) 417 | except Exception: 418 | pass 419 | try: 420 | os.dup2(stderr_save, stderr_fileno) 421 | os.close(stderr_save) 422 | except Exception: 423 | pass 424 | 425 | return {"mol": mol, "stderr_content": stderr_content} 426 | 427 | def copy(self) -> "MoleculeRecord": 428 | """ 429 | Create a deep copy of this MoleculeRecord. 430 | 431 | Returns: 432 | New MoleculeRecord instance 433 | """ 434 | new_record = MoleculeRecord(self.smiles, self.identifier) 435 | new_record.smiles_original = self.smiles_original 436 | 437 | # Deep copy mol objects if they exist 438 | if self._mol is not None: 439 | new_record._mol = copy.deepcopy(self._mol) 440 | if self._mol_with_hs is not None: 441 | new_record._mol_with_hs = copy.deepcopy(self._mol_with_hs) 442 | 443 | return new_record 444 | 445 | def get_atom_count(self) -> int: 446 | """ 447 | Get the number of atoms in the molecule. 
448 | 449 | Returns: 450 | Number of atoms, or 0 if mol is invalid 451 | """ 452 | mol = self.mol 453 | return mol.GetNumAtoms() if mol is not None else 0 454 | 455 | def get_heavy_atom_count(self) -> int: 456 | """ 457 | Get the number of heavy (non-hydrogen) atoms in the molecule. 458 | 459 | Returns: 460 | Number of heavy atoms, or 0 if mol is invalid 461 | """ 462 | mol = self.mol 463 | return mol.GetNumHeavyAtoms() if mol is not None else 0 464 | 465 | def has_substructure(self, pattern: str | Chem.Mol) -> bool: 466 | """ 467 | Check if molecule contains a specific substructure. 468 | 469 | Args: 470 | pattern: SMARTS string or RDKit mol object to search for 471 | 472 | Returns: 473 | True if substructure is found, False otherwise 474 | """ 475 | mol = self.mol 476 | if mol is None: 477 | return False 478 | 479 | try: 480 | if isinstance(pattern, str): 481 | pattern_mol = Chem.MolFromSmarts(pattern) 482 | if pattern_mol is None: 483 | logger.warning("Invalid SMARTS pattern: {}", pattern) 484 | return False 485 | else: 486 | pattern_mol = pattern 487 | 488 | return mol.HasSubstructMatch(pattern_mol) 489 | except Exception as e: 490 | logger.warning("Error checking substructure: {}", str(e)) 491 | return False 492 | 493 | def get_substructure_matches( 494 | self, pattern: str | Chem.Mol 495 | ) -> list[tuple[int, ...]]: 496 | """ 497 | Get all matches of a substructure pattern. 
498 | 499 | Args: 500 | pattern: SMARTS string or RDKit mol object to search for 501 | 502 | Returns: 503 | List of tuples containing atom indices for each match 504 | """ 505 | mol = self.mol 506 | if mol is None: 507 | return [] 508 | 509 | try: 510 | if isinstance(pattern, str): 511 | pattern_mol = Chem.MolFromSmarts(pattern) 512 | if pattern_mol is None: 513 | logger.warning("Invalid SMARTS pattern: {}", pattern) 514 | return [] 515 | else: 516 | pattern_mol = pattern 517 | 518 | return list(mol.GetSubstructMatches(pattern_mol)) 519 | except Exception as e: 520 | logger.warning("Error finding substructure matches: {}", str(e)) 521 | return [] 522 | 523 | def __str__(self) -> str: 524 | """String representation of the molecule.""" 525 | if self.identifier: 526 | return f"MoleculeRecord('{self.smiles}', '{self.identifier}')" 527 | return f"MoleculeRecord('{self.smiles}')" 528 | 529 | def __repr__(self) -> str: 530 | """Detailed string representation of the molecule.""" 531 | return self.__str__() 532 | 533 | def __eq__(self, other: object) -> bool: 534 | """Check equality based on canonical SMILES.""" 535 | if not isinstance(other, MoleculeRecord): 536 | return False 537 | 538 | self_canonical = self.get_canonical_smiles() 539 | other_canonical = other.get_canonical_smiles() 540 | 541 | return ( 542 | self_canonical is not None 543 | and other_canonical is not None 544 | and self_canonical == other_canonical 545 | ) 546 | 547 | def __hash__(self) -> int: 548 | """Hash based on canonical SMILES.""" 549 | canonical = self.get_canonical_smiles() 550 | return hash(canonical) if canonical is not None else hash(self.smiles) 551 | -------------------------------------------------------------------------------- /dimorphite_dl/neutralize.py: -------------------------------------------------------------------------------- 1 | from loguru import logger 2 | from rdkit import Chem 3 | from rdkit.Chem import AllChem 4 | 5 | RXN_DATA = ( 6 | # To handle O- bonded to only one 
atom (add hydrogen). 7 | ("[Ov1-1:1]", "[Ov2+0:1]-[H]"), 8 | # To handle N+ bonded to a hydrogen (remove hydrogen). 9 | ("[#7v4+1:1]-[H]", "[#7v3+0:1]"), 10 | # To handle O- bonded to two atoms. Should not be Negative. 11 | ("[Ov2-:1]", "[Ov2+0:1]"), 12 | # To handle N+ bonded to three atoms. Should not be positive. 13 | ("[#7v3+1:1]", "[#7v3+0:1]"), 14 | # To handle N- Bonded to two atoms. Add hydrogen. 15 | ("[#7v2-1:1]", "[#7+0:1]-[H]"), 16 | # To handle bad azide. R-N-N#N should be R-N=[N+]=N. 17 | ("[H]-[N:1]-[N:2]#[N:3]", "[N:1]=[N+1:2]=[N:3]-[H]"), 18 | ) 19 | 20 | 21 | class NeutralizationReaction: 22 | """ 23 | Represents a single neutralization reaction defined by a pair of SMARTS strings 24 | """ 25 | 26 | def __init__(self, smarts_reactant: str, smarts_product: str): 27 | """ 28 | Args: 29 | smarts_reactant: SMARTS for detecting the reactants of a defined 30 | neutralization reaction. 31 | smarts_product: SMARTS for what the detected `smarts_reactant` should 32 | be transformed to. 33 | """ 34 | self.smarts_reactant = smarts_reactant 35 | self.smarts_product = smarts_product 36 | self._pattern = Chem.MolFromSmarts(smarts_reactant) 37 | self._rxn = AllChem.ReactionFromSmarts(f"{smarts_reactant}>>{smarts_product}") 38 | 39 | def __str__(self) -> str: 40 | return f"{self.smarts_reactant} >> {self.smarts_product}" 41 | 42 | def __repr__(self) -> str: 43 | return self.__str__() 44 | 45 | def matches(self, mol: Chem.Mol) -> bool: 46 | """Check if this reaction can be applied to the given molecule.""" 47 | return mol.HasSubstructMatch(self._pattern) 48 | 49 | def apply(self, mol: Chem.Mol) -> Chem.Mol: 50 | """ 51 | Apply the neutralization reaction to the molecule. Returns the first product. 52 | If multiple products are generated, only the first is returned. 
53 | """ 54 | products = self._rxn.RunReactants((mol,)) 55 | if products: 56 | # products is a tuple of tuples; take the first product set, first product 57 | return products[0][0] 58 | return mol 59 | 60 | 61 | class ReactionRegistry: 62 | """ 63 | Holds a collection of NeutralizationReaction objects and applies them repeatedly 64 | until no further matches are found. 65 | """ 66 | 67 | def __init__(self, rxn_data: tuple[tuple[str, str]]): 68 | self.reactions = [] 69 | for reactant, product in rxn_data: 70 | self.reactions.append(NeutralizationReaction(reactant, product)) 71 | 72 | def neutralize(self, mol: Chem.Mol) -> Chem.Mol: 73 | """ 74 | Apply all registered neutralization reactions to the molecule in a loop 75 | until no further transformations are possible. Assumes explicit H atoms 76 | have already been added. 77 | """ 78 | mol.UpdatePropertyCache(strict=False) 79 | changed = True 80 | while changed: 81 | changed = False 82 | for reaction in self.reactions: 83 | if reaction.matches(mol): 84 | logger.debug("Found reaction match: {}", str(reaction)) 85 | mol = reaction.apply(mol) 86 | mol.UpdatePropertyCache(strict=False) 87 | changed = True 88 | break # restart scanning from first reaction 89 | else: 90 | logger.trace("No match to reaction: {}", str(reaction)) 91 | # Final sanitization 92 | sanitized = Chem.SanitizeMol( 93 | mol, sanitizeOps=Chem.rdmolops.SanitizeFlags.SANITIZE_ALL, catchErrors=True 94 | ) 95 | if sanitized.name == "SANITIZE_NONE": 96 | logger.debug("After neutralizing: {}", Chem.MolToSmiles(mol)) 97 | return mol 98 | raise RuntimeError("Ran into issue sanitizing mol") 99 | 100 | 101 | class MoleculeNeutralizer: 102 | """ 103 | High-level class to take SMILES, handle preprocessing, add Hs, 104 | run neutralization, and return a clean SMILES. 
105 | """ 106 | 107 | def __init__(self, rxn_data: tuple[tuple[str, str]] | None = None): 108 | if rxn_data is None: 109 | rxn_data = RXN_DATA 110 | self.registry = ReactionRegistry(rxn_data) 111 | 112 | def neutralize_smiles(self, smiles: str) -> str | None: 113 | logger.debug("Neutralizing {}", smiles) 114 | mol = Chem.MolFromSmiles(smiles) 115 | if mol is None: 116 | raise ValueError(f"Invalid SMILES: {smiles}") 117 | 118 | # Add explicit Hs 119 | mol = Chem.AddHs(mol) 120 | logger.debug("After adding hydrogens: {}", Chem.MolToSmiles(mol)) 121 | # Run neutralization 122 | mol = self.registry.neutralize(mol) 123 | # Remove explicit Hs 124 | mol = Chem.RemoveHs(mol) 125 | logger.debug("After removing hydrogens: {}", Chem.MolToSmiles(mol)) 126 | # Generate final SMILES 127 | return Chem.MolToSmiles(mol) 128 | -------------------------------------------------------------------------------- /dimorphite_dl/protonate/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /dimorphite_dl/protonate/change.py: -------------------------------------------------------------------------------- 1 | import copy 2 | 3 | from loguru import logger 4 | from rdkit import Chem 5 | from rdkit.Chem import Mol 6 | 7 | from dimorphite_dl.protonate.site import ProtonationSite, ProtonationState 8 | 9 | 10 | def protonate_site( 11 | mols: list[Mol], 12 | site: ProtonationSite, 13 | ph_min: float, 14 | ph_max: float, 15 | precision: float, 16 | ) -> list[Mol]: 17 | """Protonate a specific site in a list of molecules. 18 | 19 | Args: 20 | mols: List of molecule objects. 21 | site: ProtonationSite object with protonation information. 22 | ph_min: Minimum pH to expose the site to. 23 | ph_max: Maximum pH to expose the site to. 24 | precision: pKa standard deviation prefactor to consider. 25 | 26 | Returns: 27 | List of appropriately protonated molecule objects. 
If there is any issue, 28 | this will return an empty list. 29 | """ 30 | if not mols: 31 | logger.warning("No molecules provided for protonation") 32 | return [] 33 | 34 | logger.debug("Protonating site: {}", site.name) 35 | 36 | unique_states = list(site.get_unique_states(ph_min, ph_max, precision)) 37 | 38 | current_mols = mols 39 | 40 | for idx_atom, state in unique_states: 41 | charges = state.get_charges() 42 | 43 | # If the state is not BOTH, we apply its single charge to each 44 | # molecule in current_mols without creating branches. 45 | if state != ProtonationState.BOTH: 46 | logger.debug( 47 | "Site {} atom {} has exclusive state {}; applying to all molecules", 48 | site.name, 49 | idx_atom, 50 | state.to_str(), 51 | ) 52 | processed = set_protonation_charge( 53 | current_mols, idx_atom, charges, site.name 54 | ) 55 | if len(processed) == 0: 56 | return [] 57 | current_mols = processed 58 | 59 | else: 60 | logger.debug( 61 | "Site {} atom {} is BOTH; branching into {} variants per molecule", 62 | site.name, 63 | idx_atom, 64 | charges, 65 | ) 66 | 67 | branched = [] 68 | for mol in current_mols: 69 | try: 70 | variants = set_protonation_charge( 71 | [mol], idx_atom, charges, site.name 72 | ) 73 | branched.extend(variants) 74 | except Exception as e: 75 | logger.error("Error protonating site {}: {}", idx_atom, str(e)) 76 | return [] 77 | current_mols = branched 78 | return current_mols 79 | 80 | 81 | def set_protonation_charge( 82 | mols: list[Mol], idx: int, charges: list[int], prot_site_name: str 83 | ) -> list[Mol]: 84 | """Set atomic charge on a specific site for a set of molecules. 85 | 86 | Args: 87 | mols: List of input molecule objects. 88 | idx: Index of the atom to modify. 89 | charges: List of charges to assign at this site. 90 | prot_site_name: Name of the protonation site. 91 | 92 | Returns: 93 | List of processed molecule objects. If anything goes wrong, then we return 94 | an empty list. 
95 | """ 96 | is_special_nitrogen = "*" in prot_site_name 97 | 98 | mols_charged = [] 99 | for charge in charges: 100 | nitrogen_charge = charge + 1 101 | 102 | # Special case for nitrogen moieties where acidic group is neutral 103 | if is_special_nitrogen: 104 | nitrogen_charge = nitrogen_charge - 1 105 | 106 | for mol in mols: 107 | try: 108 | processed_mol = _apply_charge_to_molecule( 109 | mol, idx, charge, nitrogen_charge 110 | ) 111 | if processed_mol is not None: 112 | mols_charged.append(processed_mol) 113 | else: 114 | return [] 115 | except Exception as e: 116 | logger.warning( 117 | "Error processing molecule with charge {}: {}", charge, str(e) 118 | ) 119 | return [] 120 | return mols_charged 121 | 122 | 123 | def _apply_charge_to_molecule( 124 | mol: Mol, idx: int, charge: int, nitrogen_charge: int 125 | ) -> Mol | None: 126 | """Apply charge to a specific atom in a molecule. 127 | 128 | Args: 129 | mol: Input molecule 130 | idx: Atom index, resolved after explicit hydrogens are removed 131 | charge: Charge for non-nitrogen atoms 132 | nitrogen_charge: Charge for nitrogen atoms (computed by the 133 | caller, set_protonation_charge) 134 | 135 | Returns: 136 | Modified molecule or None if processing fails 137 | """ 138 | logger.trace( 139 | "Applying charge of {} at index {} to SMILES: {}", 140 | charge, 141 | idx, 142 | Chem.MolToSmiles(mol), 143 | ) 144 | # Create deep copy to avoid modifying original 145 | mol_copy = copy.deepcopy(mol) 146 | 147 | # Remove hydrogens first 148 | try: 149 | mol_copy = Chem.RemoveHs(mol_copy) 150 | if mol_copy is None: 151 | logger.warning("RemoveHs returned None for molecule") 152 | return None 153 | except Exception as e: 154 | logger.warning("Failed to remove hydrogens: {}", str(e)) 155 | return None 156 | 157 | # Validate atom index 158 | if idx >= mol_copy.GetNumAtoms(): 159 | logger.warning( 160 | "Atom index {} out of range (molecule has {} atoms)", 161 | idx, 162 | mol_copy.GetNumAtoms(), 163 | ) 164 | return None 165 | 166 | atom = mol_copy.GetAtomWithIdx(idx) 167 | 
element = atom.GetAtomicNum() 168 | 169 | # Calculate explicit bond order 170 | try: 171 | explicit_bond_order_total = sum( 172 | b.GetBondTypeAsDouble() for b in atom.GetBonds() 173 | ) 174 | except Exception as e: 175 | logger.warning("Error calculating bond order for atom {}: {}", idx, str(e)) 176 | return None 177 | 178 | # Set formal charge and explicit hydrogens based on element type 179 | try: 180 | if element == 7: # Nitrogen 181 | _set_nitrogen_properties(atom, nitrogen_charge, explicit_bond_order_total) 182 | else: 183 | _set_other_element_properties( 184 | atom, charge, element, explicit_bond_order_total 185 | ) 186 | 187 | # Special case for aromatic nitrogen deprotonation 188 | mol_smiles = Chem.MolToSmiles(mol_copy) 189 | if "[nH-]" in mol_smiles: 190 | logger.debug("Detected [nH-]; setting number of Hs to zero for this atom") 191 | atom.SetNumExplicitHs(0) 192 | 193 | # Update property cache 194 | mol_copy.UpdatePropertyCache(strict=False) 195 | 196 | except Exception as e: 197 | logger.warning("Error setting atom properties: {}", str(e)) 198 | return None 199 | 200 | return mol_copy 201 | 202 | 203 | def _set_nitrogen_properties( 204 | atom: Chem.Atom, charge: int, bond_order_total: int 205 | ) -> None: 206 | """Set properties for nitrogen atoms based on charge and bonding.""" 207 | atom_idx = atom.GetIdx() 208 | is_aromatic = atom.GetIsAromatic() 209 | degree = atom.GetDegree() 210 | logger.trace( 211 | "Setting N properties: index={}, charge={}, bond_order={}, aromatic={}, degree={}", 212 | atom_idx, 213 | charge, 214 | bond_order_total, 215 | is_aromatic, 216 | degree, 217 | ) 218 | 219 | # Handling niche cases of aromatics often detected on NADP 220 | if charge == 1 and bond_order_total == 4.0 and is_aromatic and degree == 3: 221 | return 222 | 223 | atom.SetFormalCharge(charge) 224 | logger.debug("Set formal charge to {}", charge) 225 | 226 | # Set explicit hydrogens based on charge and bond order 227 | h_count_map = { 228 | (1, 1): 3, 229 | 
(1, 2): 2, 230 | (1, 3): 1, # Positive charge 231 | (0, 1): 2, 232 | (0, 2): 1, # Neutral 233 | (-1, 1): 1, 234 | (-1, 2): 0, # Negative charge 235 | } 236 | 237 | h_count = h_count_map.get((int(charge), int(bond_order_total)), -1) 238 | if h_count != -1: 239 | logger.debug("Setting hydrogen count to {}", h_count) 240 | atom.SetNumExplicitHs(h_count) 241 | 242 | 243 | def _set_other_element_properties( 244 | atom: Chem.Atom, charge: int, element: int, bond_order_total: float 245 | ) -> None: 246 | """Set properties for non-nitrogen atoms.""" 247 | atom_idx = atom.GetIdx() 248 | is_aromatic = atom.GetIsAromatic() 249 | degree = atom.GetDegree() 250 | logger.trace( 251 | "Setting {} properties: index={}, charge={}, bond_order={}, aromatic={}, degree={}", 252 | element, 253 | atom_idx, 254 | charge, 255 | bond_order_total, 256 | is_aromatic, 257 | degree, 258 | ) 259 | 260 | atom.SetFormalCharge(charge) 261 | logger.debug("Set formal charge to {}", charge) 262 | 263 | # Special handling for oxygen and sulfur 264 | if element in (8, 16): # O and S 265 | if charge == 0 and bond_order_total == 1: 266 | atom.SetNumExplicitHs(1) 267 | logger.debug("Set explicit hydrogens for this atom to 1") 268 | elif charge == -1 and bond_order_total == 1: 269 | atom.SetNumExplicitHs(0) 270 | logger.debug("Set explicit hydrogens for this atom to 0") 271 | -------------------------------------------------------------------------------- /dimorphite_dl/protonate/data.py: -------------------------------------------------------------------------------- 1 | import importlib.resources as pkg_resources 2 | from collections.abc import Iterator 3 | 4 | from loguru import logger 5 | from rdkit import Chem 6 | 7 | from dimorphite_dl.protonate.site import PKaDatum, SubstructureDatum 8 | 9 | 10 | class PKaData: 11 | _data: list[SubstructureDatum] = [] 12 | """All loaded data for our protonation substructures.""" 13 | 14 | _instance = None 15 | 16 | def __new__(cls): 17 | if cls._instance is None: 18 | 
cls._instance = super().__new__(cls) 19 | cls._load_data() 20 | return cls._instance 21 | 22 | @classmethod 23 | def _load_data(cls) -> None: 24 | lines = cls._load_lines() 25 | data = [] 26 | for line in lines: 27 | data.append(cls._parse_substructure_line(line)) 28 | cls._data = data 29 | 30 | @classmethod 31 | def _load_lines(cls) -> list[str]: 32 | """Load the substructure SMARTS file, filtering out comments and blank lines. 33 | 34 | Returns: 35 | List of valid SMARTS lines from the file. 36 | 37 | Raises: 38 | FileNotFoundError: If the substructure file cannot be found. 39 | IOError: If there are issues reading the file. 40 | """ 41 | logger.trace("Loading substructure data from site_substructures.smarts") 42 | 43 | try: 44 | with pkg_resources.open_text( 45 | "dimorphite_dl.smarts", "site_substructures.smarts" 46 | ) as f: 47 | lines = [] 48 | line_count = 0 49 | valid_count = 0 50 | 51 | for line in f: 52 | line_count += 1 53 | stripped = line.strip() 54 | 55 | # Skip empty lines and comments 56 | if stripped and not stripped.startswith("#"): 57 | lines.append(stripped) 58 | valid_count += 1 59 | 60 | logger.info("Loaded {} valid SMARTS patterns", valid_count) 61 | return lines 62 | 63 | except FileNotFoundError: 64 | logger.error("Could not find site_substructures.smarts file") 65 | raise 66 | except Exception as e: 67 | logger.error("Error reading substructure file: {}", str(e)) 68 | raise IOError(f"Failed to read substructure file: {e}") 69 | 70 | @classmethod 71 | def _parse_substructure_line(cls, line: str) -> SubstructureDatum: 72 | """Parse a single line from the substructure data file. 73 | 74 | Args: 75 | line: Line from the substructure file 76 | 77 | Returns: 78 | SubstructureDatum object. 79 | 80 | Notes: 81 | Below is an example line of the tab separated file. 82 | 83 | ```text 84 | *Azide [N+0:1]=[N+:2]=[N+0:3]-[H] 2 4.65 0.07071067811865513 85 | ``` 86 | 87 | This contains the following information separated by tabs. 
88 | 89 | - Name of the substructure. A `*` prefix indicates that it is an aromatic 90 | nitrogen that needs special treatment. 91 | - [SMARTS](https://www.daylight.com/dayhtml/doc/theory/theory.smarts.html) 92 | of this particular substructure. 93 | - Data about the protonation site always in a set of threes. You can have 94 | more than one site. 95 | - The site index. 96 | - pKa mean 97 | - pKa standard deviation. 98 | """ 99 | parts = line.split() 100 | if len(parts) < 3: 101 | logger.warning("Invalid line format (too few parts): '{}'", line) 102 | raise ValueError 103 | 104 | name = parts[0] 105 | logger.trace("Substructure name is {}", name) 106 | smarts = parts[1] 107 | logger.trace("Substructure SMARTS is {}", smarts) 108 | mol = cls._create_rdkit_mol(smarts) 109 | 110 | # Parse pKa ranges (groups of 3: site, mean, std) 111 | pka_data = cls._parse_pka_line(parts[2:]) 112 | return SubstructureDatum(name=name, smarts=smarts, pkas=pka_data, mol=mol) 113 | 114 | @classmethod 115 | def _create_rdkit_mol(cls, smarts: str) -> Chem.Mol: 116 | # Create mol object from SMARTS 117 | try: 118 | logger.trace("Attempting to make RDKit mol from SMARTS") 119 | mol = Chem.MolFromSmarts(smarts) 120 | if mol is None: 121 | logger.warning("Invalid SMARTS pattern: {}", smarts) 122 | raise ValueError 123 | except Exception as e: 124 | logger.warning("Error creating mol from SMARTS '{}' : {}", smarts, str(e)) 125 | raise ValueError 126 | return mol 127 | 128 | @classmethod 129 | def _parse_pka_line(cls, line_parts: list[str]) -> list[PKaDatum]: 130 | if len(line_parts) % 3 != 0: 131 | logger.warning( 132 | "Invalid pKa data format, expected groups of 3, got {}", len(line_parts) 133 | ) 134 | raise ValueError 135 | 136 | pka_data = [] 137 | for i in range(0, len(line_parts), 3): 138 | try: 139 | idx_site = int(line_parts[i]) 140 | mean = float(line_parts[i + 1]) 141 | stdev = float(line_parts[i + 2]) 142 | pka_data.append(PKaDatum(idx_site=idx_site, mean=mean, stdev=stdev)) 143 | 
except (ValueError, IndexError) as e: 144 | logger.warning("Error parsing pKa data: {}", line_parts) 145 | raise ValueError from e 146 | return pka_data 147 | 148 | @classmethod 149 | def get_substructures(cls) -> Iterator[SubstructureDatum]: 150 | for substruct in cls._data: 151 | yield substruct 152 | -------------------------------------------------------------------------------- /dimorphite_dl/protonate/detect.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module provides functionality to detect protonation sites in molecules 3 | using substructure matching with comprehensive error handling and validation. 4 | """ 5 | 6 | from collections.abc import Iterator 7 | 8 | from loguru import logger 9 | from rdkit import Chem 10 | 11 | from dimorphite_dl.mol import MoleculeRecord 12 | from dimorphite_dl.protonate.data import PKaData 13 | from dimorphite_dl.protonate.site import ProtonationSite, SubstructureDatum 14 | 15 | 16 | class ProtonationSiteDetectionError(Exception): 17 | """Raised when protonation site detection encounters an error.""" 18 | 19 | pass 20 | 21 | 22 | class ProtonationSiteDetector: 23 | """ 24 | Robust detector for finding protonation sites in molecules. 25 | 26 | Uses substructure matching to identify potential protonation sites 27 | based on known pKa patterns. Handles atom protection to prevent 28 | overlapping matches. 29 | """ 30 | 31 | def __init__(self, validate_sites: bool = True, max_sites_per_molecule: int = 50): 32 | """ 33 | Initialize the detector with explicit configuration. 
34 | 35 | Args: 36 | validate_sites: Whether to validate detected sites (explicit, not default) 37 | max_sites_per_molecule: Maximum sites to detect per molecule (bounded) 38 | """ 39 | assert isinstance(validate_sites, bool) 40 | assert isinstance(max_sites_per_molecule, int) 41 | assert max_sites_per_molecule > 0 42 | assert max_sites_per_molecule <= 1000 # Reasonable upper bound 43 | 44 | self.validate_sites = validate_sites 45 | self.max_sites_per_molecule = max_sites_per_molecule 46 | self.pka_data = PKaData() 47 | 48 | # Initialize statistics - all counters start at zero 49 | self._stats_molecules_processed = 0 50 | self._stats_sites_found = 0 51 | self._stats_sites_validated = 0 52 | self._stats_sites_rejected = 0 53 | self._stats_substructures_matched = 0 54 | 55 | def find_sites( 56 | self, mol_record: MoleculeRecord 57 | ) -> tuple[MoleculeRecord, list[ProtonationSite]]: 58 | """ 59 | Find protonation sites in a molecule. This is the main entry point. 60 | 61 | Args: 62 | mol_record: MoleculeRecord to analyze 63 | 64 | Returns: 65 | Tuple of (updated_mol_record, list_of_protonation_sites) 66 | 67 | Raises: 68 | ProtonationSiteDetectionError: If detection fails critically 69 | """ 70 | assert isinstance(mol_record, MoleculeRecord) 71 | assert mol_record.smiles # SMILES cannot be empty 72 | 73 | logger.debug("Finding protonation sites for '{}'", mol_record.smiles) 74 | self._stats_molecules_processed += 1 75 | 76 | try: 77 | prepared_mol = mol_record.prepare_for_protonation() 78 | if prepared_mol is None: 79 | logger.warning("Failed to prepare molecule: '{}'", mol_record.smiles) 80 | return mol_record, [] 81 | 82 | mol_record.mol = prepared_mol 83 | gen_sites = self._detect_all_sites_in_molecule(prepared_mol) 84 | 85 | sites_found = [] 86 | for site in gen_sites: 87 | if self.validate_sites: 88 | if not site.is_valid(): 89 | continue 90 | sites_found.append(site) 91 | 92 | sites_count = 0 93 | for site in sites_found: 94 | sites_count += len(site.pkas) 95 | 
96 | logger.info( 97 | "Found {} protonation site(s) for '{}'", sites_count, mol_record.smiles 98 | ) 99 | return mol_record, sites_found 100 | 101 | except Exception as error: 102 | logger.error( 103 | "Critical error detecting sites for '{}': {}", 104 | mol_record.smiles, 105 | str(error), 106 | ) 107 | raise ProtonationSiteDetectionError(f"Detection failed: {error}") from error 108 | 109 | def _detect_all_sites_in_molecule(self, mol: Chem.Mol) -> Iterator[ProtonationSite]: 110 | """ 111 | Detect all protonation sites in the prepared molecule. 112 | 113 | Args: 114 | mol: Prepared RDKit mol object 115 | 116 | Yields: 117 | Detected protonation sites. 118 | """ 119 | assert mol is not None 120 | assert mol.GetNumAtoms() > 0 121 | 122 | total_matches_found = 0 123 | 124 | for substructure_data in self._iterate_available_substructures(): 125 | if substructure_data.mol is None: 126 | logger.debug( 127 | "Skipping substructure '{}' - no mol object", substructure_data.name 128 | ) 129 | continue 130 | 131 | matches = self._find_unprotected_matches_for_substructure( 132 | mol, substructure_data 133 | ) 134 | 135 | n_matches = len(matches) 136 | if n_matches == 0: 137 | continue 138 | 139 | total_matches_found += n_matches 140 | self._stats_substructures_matched += 1 141 | n_sites = 0 142 | 143 | for site in self._create_sites_from_matches( 144 | mol, matches, substructure_data 145 | ): 146 | n_sites += len(site.pkas) 147 | if n_sites >= self.max_sites_per_molecule: 148 | break 149 | 150 | yield site 151 | 152 | mol = self._protect_matched_atoms_in_molecule(mol, matches) 153 | 154 | def _iterate_available_substructures(self) -> Iterator[SubstructureDatum]: 155 | """ 156 | Get available substructure patterns for matching. 
def _find_unprotected_matches_for_substructure(
    self, mol: Chem.Mol, substructure_data: SubstructureDatum
) -> list[tuple[int, ...]]:
    """
    Find unprotected matches for a specific substructure pattern.

    Runs a single substructure search, then keeps only the matches accepted
    by ``_filter_matches_by_protection_status``. Any exception raised while
    searching or filtering is logged as a warning and treated as "no
    matches", so one failing pattern does not abort detection.

    Args:
        mol: RDKit mol object to search in
        substructure_data: Substructure pattern to match (its ``mol`` must
            not be None)

    Returns:
        List of tuples containing atom indices for unprotected matches;
        empty if the pattern does not occur or an error was caught.
    """
    assert mol is not None
    assert substructure_data is not None
    assert substructure_data.mol is not None

    try:
        # One search is enough: an absent pattern simply gives an empty
        # result, so a preliminary HasSubstructMatch would only repeat work.
        all_matches = list(mol.GetSubstructMatches(substructure_data.mol))
        if not all_matches:
            return []

        total_matches = len(all_matches)
        logger.debug(
            "Found {} '{}' group(s)", total_matches, substructure_data.name
        )

        unprotected_matches = self._filter_matches_by_protection_status(
            mol, all_matches
        )

        # The template now has a placeholder for every argument; previously
        # the substructure name was passed but never rendered.
        logger.debug(
            "{}/{} '{}' matches were unprotected",
            len(unprotected_matches),
            total_matches,
            substructure_data.name,
        )

        return unprotected_matches

    except Exception as error:
        logger.warning(
            "Error finding matches for substructure '{}': {}",
            substructure_data.name,
            str(error),
        )
        return []
def _create_sites_from_matches(
    self,
    mol: Chem.Mol,
    matches: list[tuple[int, ...]],
    substructure_data: SubstructureDatum,
) -> Iterator[ProtonationSite]:
    """
    Lazily wrap each substructure match in a ProtonationSite.

    Every site receives the same ``mol`` plus the pKa list, SMARTS string
    and name taken from ``substructure_data``; only the matched atom
    indices differ between sites.

    Args:
        mol: RDKit mol object used to detect this protonation site.
        matches: List of atom index tuples, one per match
        substructure_data: Substructure information shared by all matches

    Yields:
        One ProtonationSite per entry in ``matches``, in the same order.
    """
    assert isinstance(matches, list)
    assert isinstance(substructure_data, SubstructureDatum)

    yield from (
        ProtonationSite(
            mol=mol,
            idxs_match=tuple(atom_idxs),
            pkas=substructure_data.pkas,
            smarts=substructure_data.smarts,
            name=substructure_data.name,
        )
        for atom_idxs in matches
    )
def canonicalize_smiles_list(
    mols: list[Chem.Mol], original_smiles: str = ""
) -> list[str]:
    """
    Generate canonical SMILES from molecule objects.

    ``None`` entries in ``mols`` are skipped, as are molecules for which
    ``_generate_single_canonical_smiles`` returns ``None``. Duplicates are
    removed and the result keeps first-seen order, so the same input always
    produces the same output list. Any exception is logged as a warning and
    results in an empty list.

    Args:
        mols: List of RDKit mol objects to convert
        original_smiles: Original SMILES, used only as context in log messages

    Returns:
        List of unique canonical SMILES strings in first-seen order
    """
    assert isinstance(mols, list)
    assert isinstance(original_smiles, str)

    if len(mols) == 0:
        return []

    logger.debug("Generating canonical SMILES for {} molecules", len(mols))
    context_msg = f" for '{original_smiles}'" if original_smiles else ""

    try:
        # dict keys deduplicate while preserving insertion order. A set would
        # return the strings in an order that depends on string hashing,
        # which is randomised per interpreter run.
        unique_smiles: dict[str, None] = {}
        valid_mol_count = 0

        for mol in mols:
            if mol is None:
                continue

            valid_mol_count += 1
            canonical_smiles = _generate_single_canonical_smiles(mol)
            if canonical_smiles is not None:
                unique_smiles[canonical_smiles] = None

        smiles_list = list(unique_smiles)

        logger.debug(
            "Generated {} unique canonical SMILES from {} valid molecules{}",
            len(smiles_list),
            valid_mol_count,
            context_msg,
        )

        return smiles_list

    except Exception as error:
        logger.warning(
            "Error in canonical SMILES generation{}: {}", context_msg, str(error)
        )
        return []
# Convenience functions for backward compatibility
def find(
    mol_record: MoleculeRecord,
    validate_sites: bool = True,
    max_sites_per_molecule: int = 50,
) -> tuple[MoleculeRecord, list[ProtonationSite]]:
    """
    One-shot helper: build a ProtonationSiteDetector and run it on a record.

    Args:
        mol_record: MoleculeRecord to analyze
        validate_sites: Forwarded unchanged to the ProtonationSiteDetector
            constructor
        max_sites_per_molecule: Forwarded unchanged to the
            ProtonationSiteDetector constructor; must be a positive integer

    Returns:
        The result of ``ProtonationSiteDetector.find_sites(mol_record)``,
        annotated as (updated_mol_record, list_of_protonation_sites)
    """
    assert isinstance(mol_record, MoleculeRecord)
    assert isinstance(validate_sites, bool)
    assert isinstance(max_sites_per_molecule, int)
    assert max_sites_per_molecule > 0

    return ProtonationSiteDetector(
        validate_sites=validate_sites,
        max_sites_per_molecule=max_sites_per_molecule,
    ).find_sites(mol_record)
class ProtonationState(Enum):
    """
    Enumeration of possible protonation states for a site.

    Values are explicitly assigned for clarity and debugging.
    """

    UNKNOWN = 0
    DEPROTONATED = 1
    PROTONATED = 2
    BOTH = 3

    def to_str(self) -> str:
        """
        Convert protonation state to string representation.

        Returns:
            "DEPROTONATED", "PROTONATED" or "BOTH" for those members;
            "UNKNOWN" for anything else
        """
        # Use explicit if-elif chain for clarity (TigerStyle)
        if self == ProtonationState.DEPROTONATED:
            return "DEPROTONATED"
        elif self == ProtonationState.PROTONATED:
            return "PROTONATED"
        elif self == ProtonationState.BOTH:
            return "BOTH"
        else:
            return "UNKNOWN"

    def get_charges(self) -> list[int]:
        """
        Get the formal charges associated with this protonation state.

        Returns:
            [-1] for DEPROTONATED, [0] for PROTONATED, [-1, 0] for BOTH and
            an empty list for UNKNOWN. A new list is returned on every call.
        """
        # Use explicit if-elif chain for clarity (TigerStyle)
        if self == ProtonationState.DEPROTONATED:
            return [-1]
        elif self == ProtonationState.PROTONATED:
            return [0]
        elif self == ProtonationState.BOTH:
            return [-1, 0]
        else:
            return []


class PKaDatum:
    """
    Data structure for pKa information at a specific site.

    Contains the site index, mean pKa value, and standard deviation
    for calculating protonation states at different pH values.
    """

    def __init__(self, idx_site: int, mean: float, stdev: float):
        """
        Initialize pKa data with validation.

        Only the types and the sign of ``idx_site`` are checked; ``mean``
        and ``stdev`` are stored as given (converted to float) with no
        range check.

        Args:
            idx_site: Site index (non-negative integer)
            mean: Mean pKa value
            stdev: Standard deviation of the pKa value
        """
        assert isinstance(idx_site, int)
        assert isinstance(mean, (int, float))
        assert isinstance(stdev, (int, float))
        assert idx_site >= 0, f"Site index must be non-negative, got: {idx_site}"

        # Index of the atom we would protonate in the SMARTS substructure pattern
        self.idx_site = idx_site
        self.mean = float(mean)
        self.stdev = float(stdev)

    def get_state(
        self, ph_min: float, ph_max: float, precision: float
    ) -> ProtonationState:
        """
        Calculate protonation state for given pH range and precision.

        The pKa is widened to the interval ``mean ± precision * stdev``.
        If that interval overlaps ``[ph_min, ph_max]`` (touching counts as
        overlap) the result is BOTH. Otherwise the site is PROTONATED when
        the mean lies above ``ph_max`` and DEPROTONATED when it lies below
        ``ph_min``. UNKNOWN is returned only if none of these comparisons
        holds.

        Args:
            ph_min: Minimum pH value (must not exceed ph_max)
            ph_max: Maximum pH value
            precision: Multiplier applied to the standard deviation
                (non-negative)

        Returns:
            ProtonationState based on pH range and pKa statistics
        """
        assert isinstance(ph_min, (int, float))
        assert isinstance(ph_max, (int, float))
        assert isinstance(precision, (int, float))
        # Messages now state the condition that is actually enforced.
        assert ph_min <= ph_max, (
            f"ph_min ({ph_min}) must be less than or equal to ph_max ({ph_max})"
        )
        assert precision >= 0.0, f"precision must be non-negative, got: {precision}"

        # Calculate effective pKa range based on precision
        effective_stdev = precision * self.stdev
        pka_min = self.mean - effective_stdev
        pka_max = self.mean + effective_stdev

        # Determine protonation state based on pH and pKa overlap
        # Use explicit conditions for clarity (TigerStyle)
        pka_overlaps_ph_range = (pka_min <= ph_max) and (ph_min <= pka_max)
        if pka_overlaps_ph_range:
            protonation_state = ProtonationState.BOTH
        elif self.mean > ph_max:
            protonation_state = ProtonationState.PROTONATED
        elif self.mean < ph_min:
            protonation_state = ProtonationState.DEPROTONATED
        else:
            protonation_state = ProtonationState.UNKNOWN

        return protonation_state
140 | """ 141 | 142 | name: str = "" 143 | smarts: str = "" 144 | mol: Chem.Mol | None = None 145 | pkas: list[PKaDatum] = field(default_factory=list) 146 | 147 | def __post_init__(self): 148 | """Validate substructure data after initialization.""" 149 | assert isinstance(self.name, str) 150 | assert isinstance(self.smarts, str) 151 | assert isinstance(self.pkas, list) 152 | 153 | # Name and SMARTS should not be empty for valid substructures 154 | if len(self.name) > 0 or len(self.smarts) > 0: 155 | assert len(self.name) > 0, "Substructure name cannot be empty" 156 | assert len(self.smarts) > 0, "SMARTS pattern cannot be empty" 157 | 158 | # Validate all pKa data entries 159 | for pka in self.pkas: 160 | assert isinstance(pka, PKaDatum) 161 | 162 | def has_valid_pattern(self) -> bool: 163 | """ 164 | Check if substructure has a valid molecular pattern. 165 | 166 | Returns: 167 | True if mol object exists and is valid 168 | """ 169 | return self.mol is not None 170 | 171 | def has_pka_data(self) -> bool: 172 | """ 173 | Check if substructure has pKa data available. 174 | 175 | Returns: 176 | True if pKa data list is non-empty 177 | """ 178 | return len(self.pkas) > 0 179 | 180 | def get_pka_count(self) -> int: 181 | """ 182 | Get the number of pKa data points for this substructure. 183 | 184 | Returns: 185 | Number of pKa data entries 186 | """ 187 | return len(self.pkas) 188 | 189 | def is_valid_for_matching(self) -> bool: 190 | """ 191 | Check if substructure is valid for pattern matching. 192 | 193 | Returns: 194 | True if both pattern and pKa data are available 195 | """ 196 | return self.has_valid_pattern() and self.has_pka_data() 197 | 198 | 199 | @dataclass 200 | class ProtonationSite: 201 | """ 202 | Data structure for detected protonation site information. 203 | 204 | Contains atom indices and associated substructure data 205 | for a specific protonation site in a molecule. 
def get_states(
    self, ph_min: float, ph_max: float, precision: float
) -> Iterator[tuple[int, ProtonationState]]:
    """
    Lazily evaluate every pKa entry of this site against a pH window.

    For each entry in ``self.pkas`` the molecule atom index is looked up as
    ``self.idxs_match[entry.idx_site]`` and paired with the state returned
    by ``entry.get_state(ph_min, ph_max, precision)``.

    Args:
        ph_min: Minimum pH value (must not exceed ph_max)
        ph_max: Maximum pH value
        precision: Precision factor for pKa calculation (non-negative)

    Yields:
        ``(atom_index, ProtonationState)`` for each pKa datum, in list order
    """
    for numeric_arg in (ph_min, ph_max, precision):
        assert isinstance(numeric_arg, (int, float))
    assert ph_min <= ph_max, (
        f"ph_min ({ph_min}) must be less than or equal to ph_max ({ph_max})"
    )
    assert precision >= 0.0, f"precision must be positive, got: {precision}"

    expected = len(self.pkas)
    assert expected > 0, "Cannot generate states without pKa data"

    produced = 0
    for produced, datum in enumerate(self.pkas, start=1):
        assert isinstance(datum, PKaDatum)
        yield (
            self.idxs_match[datum.idx_site],
            datum.get_state(ph_min, ph_max, precision),
        )

    # Sanity check once the generator has been fully consumed.
    assert produced == expected, (
        f"Expected {expected} states, generated {produced}"
    )
def validate_ph_range(ph_min: float, ph_max: float) -> bool:
    """
    Validate pH range parameters.

    A range is valid when both bounds are ``int`` or ``float``, both lie in
    the closed interval [0, 14], and ``ph_min`` is strictly less than
    ``ph_max``.

    Args:
        ph_min: Minimum pH value
        ph_max: Maximum pH value

    Returns:
        True if pH range is valid, False otherwise
    """
    # Explicit checks instead of assert statements: asserts are removed when
    # Python runs with -O, which would make this function accept any input.
    if not isinstance(ph_min, (int, float)):
        return False
    if not isinstance(ph_max, (int, float)):
        return False
    if not (0.0 <= ph_min <= 14.0):
        return False
    if not (0.0 <= ph_max <= 14.0):
        return False
    return ph_min < ph_max
330 | 331 | Args: 332 | idx_site: Site index 333 | mean: Mean pKa value 334 | stdev: Standard deviation 335 | 336 | Returns: 337 | PKaDatum object or None if parameters are invalid 338 | """ 339 | try: 340 | return PKaDatum(idx_site, mean, stdev) 341 | except (AssertionError, ValueError): 342 | return None 343 | 344 | 345 | def create_protonation_site_safe( 346 | mol: Chem.Mol, 347 | idxs_match: tuple[int, ...], 348 | substructure: SubstructureDatum | None = None, 349 | ) -> ProtonationSite | None: 350 | """ 351 | Create ProtonationSite with error handling. 352 | 353 | Args: 354 | mol: RDKit Mol object we are creating a protonation site for. 355 | idxs_match: Atom indices of substructure match. 356 | substructure: Substructure data 357 | 358 | Returns: 359 | ProtonationSite object or None if parameters are invalid 360 | """ 361 | try: 362 | if substructure is None: 363 | substructure = SubstructureDatum() 364 | return ProtonationSite( 365 | mol=mol, 366 | idxs_match=idxs_match, 367 | pkas=substructure.pkas, 368 | smarts=substructure.smarts, 369 | name=substructure.name, 370 | ) 371 | except (AssertionError, ValueError): 372 | return None 373 | -------------------------------------------------------------------------------- /dimorphite_dl/smarts/site_substructures.smarts: -------------------------------------------------------------------------------- 1 | *Azide [N+0:1]=[N+:2]=[N+0:3]-[H] 2 4.65 0.07071067811865513 2 | Nitro [C,c,N,n,O,o:1]-[NX3:2](=[O:3])-[O:4]-[H] 3 -1000.0 0 3 | AmidineGuanidine1 [N:1]-[C:2](-[N:3])=[NX2:4]-[H:5] 3 12.025333333333334 1.5941046150769165 4 | AmidineGuanidine2 [C:1](-[N:2])=[NX2+0:3] 2 10.035538461538462 2.1312826469414716 5 | Sulfate [SX4:1](=[O:2])(=[O:3])([O:4]-[C,c,N,n:5])-[OX2:6]-[H] 5 -2.36 1.3048043093561141 6 | Sulfonate [SX4:1](=[O:2])(=[O:3])(-[C,c,N,n:4])-[OX2:5]-[H] 4 -1.8184615384615386 1.4086213481855594 7 | Sulfinic_acid [SX3:1](=[O:2])-[O:3]-[H] 2 1.7933333333333332 0.4372070447739835 8 | Phenyl_carboxyl 
[c,n,o:1]-[C:2](=[O:3])-[O:4]-[H] 3 3.463441968255319 1.2518054407928614 9 | Carboxyl [C:1](=[O:2])-[O:3]-[H] 2 3.456652971502591 1.2871420886834017 10 | Thioic_acid [C,c,N,n:1](=[O,S:2])-[SX2,OX2:3]-[H] 2 0.678267 1.497048763660801 11 | Phenyl_Thiol [c,n:1]-[SX2:2]-[H] 1 4.978235294117647 2.6137000480499806 12 | Thiol [C,N:1]-[SX2:2]-[H] 1 9.12448275862069 1.3317968158171463 13 | 14 | # [*]OP(=O)(O[H])O[H]. Note that this matches terminal phosphate of ATP, ADP, AMP. 15 | Phosphate [PX4:1](=[O:2])(-[OX2:3]-[H])(-[O+0:4])-[OX2:5]-[H] 2 2.4182608695652172 1.1091177991945305 5 6.5055 0.9512787792174668 16 | 17 | # Note that Internal_phosphate_polyphos_chain and 18 | # Initial_phosphate_like_in_ATP_ADP were added on 6/2/2020 to better detail with 19 | # molecules that have polyphosphate chains (e.g., ATP, ADP, NADH, etc.). Unlike 20 | # the other protonation states, these two were not determined by analyzing a set 21 | # of many compounds with experimentally determined pKa values. 22 | 23 | # For Internal_phosphate_polyphos_chain, we use a mean pKa value of 0.9, per 24 | # DOI: 10.7554/eLife.38821. For the precision value we use 1.0, which is roughly 25 | # the precision of the two ionizable hydroxyls from Phosphate (see above). Note 26 | # that when using recursive SMARTS strings, RDKit considers only the first atom 27 | # to be a match. Subsequent atoms define the environment. 28 | Internal_phosphate_polyphos_chain [$([PX4:1](=O)([OX2][PX4](=O)([OX2])(O[H]))([OX2][PX4](=O)(O[H])([OX2])))][O:2]-[H] 1 0.9 1.0 29 | 30 | # For Initial_phosphate_like_in_ATP_ADP, we use the same values found for the 31 | # lower-pKa hydroxyl of Phosphate (above). 32 | Initial_phosphate_like_in_ATP_ADP [$([PX4:1]([OX2][C,c,N,n])(=O)([OX2][PX4](=O)([OX2])(O[H])))]O-[H] 1 2.4182608695652172 1.1091177991945305 33 | 34 | # [*]P(=O)(O[H])O[H]. 
Cannot match terminal phosphate of ATP because O not among [C,c,N,n] 35 | Phosphonate [PX4:1](=[O:2])(-[OX2:3]-[H])(-[C,c,N,n:4])-[OX2:5]-[H] 2 1.8835714285714287 0.5925999820080644 5 7.247254901960784 0.8511476450801531 36 | 37 | Phenol [c,n,o:1]-[O:2]-[H] 1 7.065359866910526 3.277356122295936 38 | Peroxide1 [O:1]([$(C=O),$(C[Cl]),$(CF),$(C[Br]),$(CC#N):2])-[O:3]-[H] 2 8.738888888888889 0.7562592839596507 39 | Peroxide2 [C:1]-[O:2]-[O:3]-[H] 2 11.978235294117647 0.8697645895163075 40 | O=C-C=C-OH [O:1]=[C;R:2]-[C;R:3]=[C;R:4]-[O:5]-[H] 4 3.554 0.803339458581667 41 | Vinyl_alcohol [C:1]=[C:2]-[O:3]-[H] 2 8.871850714285713 1.660200255394124 42 | Alcohol [C:1]-[O:2]-[H] 1 14.780384615384616 2.546464970533435 43 | N-hydroxyamide [C:1](=[O:2])-[N:3]-[O:4]-[H] 3 9.301904761904762 1.2181897185891002 44 | *Ringed_imide1 [O,S:1]=[C;R:2]([$([#8]),$([#7]),$([#16]),$([#6][Cl]),$([#6]F),$([#6][Br]):3])-[N;R:4]([C;R:5]=[O,S:6])-[H] 3 6.4525 0.5555627777308341 45 | *Ringed_imide2 [O,S:1]=[C;R:2]-[N;R:3]([C;R:4]=[O,S:5])-[H] 2 8.681666666666667 1.8657779975741713 46 | *Imide [F,Cl,Br,S,s,P,p:1][#6:2][CX3:3](=[O,S:4])-[NX3+0:5]([CX3:6]=[O,S:7])-[H] 4 2.466666666666667 1.4843629385474877 47 | *Imide2 [O,S:1]=[CX3:2]-[NX3+0:3]([CX3:4]=[O,S:5])-[H] 2 10.23 1.1198214143335534 48 | *Amide_electronegative [C:1](=[O:2])-[N:3](-[Br,Cl,I,F,S,O,N,P:4])-[H] 2 3.4896 2.688124315081677 49 | *Amide [C:1](=[O:2])-[N:3]-[H] 2 12.00611111111111 4.512491341218857 50 | *Sulfonamide [SX4:1](=[O:2])(=[O:3])-[NX3+0:4]-[H] 3 7.9160326086956525 1.9842121316708763 51 | Anilines_primary [c:1]-[NX3+0:2]([H:3])[H:4] 1 3.899298673194805 2.068768503987161 52 | Anilines_secondary [c:1]-[NX3+0:2]([H:3])[!H:4] 1 4.335408163265306 2.1768842022330843 53 | Anilines_tertiary [c:1]-[NX3+0:2]([!H:3])[!H:4] 1 4.16690685045614 2.005865735782679 54 | Aromatic_nitrogen_unprotonated [n+0&H0:1] 0 4.3535441240733945 2.0714072661859584 55 | Amines_primary_secondary_tertiary [C:1]-[NX3+0:2] 1 8.159107682388349 
2.5183597445318147 56 | 57 | # e.g., [*]P(=O)(O[H])[*]. Note that cannot match the internal phosphates of ATP, because 58 | # oxygen is not among [C,c,N,n,F,Cl,Br,I] 59 | Phosphinic_acid [PX4:1](=[O:2])(-[C,c,N,n,F,Cl,Br,I:3])(-[C,c,N,n,F,Cl,Br,I:4])-[OX2:5]-[H] 4 2.9745 0.6867886750744557 60 | 61 | # e.g., [*]OP(=O)(O[H])O[*]. Cannot match ATP because P not among [C,c,N,n,F,Cl,Br,I] 62 | Phosphate_diester [PX4:1](=[O:2])(-[OX2:3]-[C,c,N,n,F,Cl,Br,I:4])(-[O+0:5]-[C,c,N,n,F,Cl,Br,I:4])-[OX2:6]-[H] 6 2.7280434782608696 2.5437448856908316 63 | 64 | # e.g., [*]P(=O)(O[H])O[*]. Cannot match ATP because O not among [C,c,N,n,F,Cl,Br,I]. 65 | Phosphonate_ester [PX4:1](=[O:2])(-[OX2:3]-[C,c,N,n,F,Cl,Br,I:4])(-[C,c,N,n,F,Cl,Br,I:5])-[OX2:6]-[H] 5 2.0868 0.4503028610465036 66 | 67 | Primary_hydroxyl_amine [C,c:1]-[O:2]-[NH2:3] 2 4.035714285714286 0.8463816543155368 68 | *Indole_pyrrole [c;R:1]1[c;R:2][c;R:3][c;R:4][n;R:5]1[H] 4 14.52875 4.06702491591416 69 | *Aromatic_nitrogen_protonated [n:1]-[H] 0 7.17 2.94602395490212 70 | -------------------------------------------------------------------------------- /docs/.nav.yml: -------------------------------------------------------------------------------- 1 | nav: 2 | - Home: index.md 3 | - API: api 4 | - Development: development.md 5 | 6 | sort: 7 | type: natural 8 | ignore_case: true 9 | by: title 10 | direction: asc 11 | flatten_single_child_sections: true 12 | -------------------------------------------------------------------------------- /docs/.overrides/main.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block header %} 4 | {{ super() }} 5 | 6 | 7 | 8 | {% endblock %} 9 | 10 | 11 | -------------------------------------------------------------------------------- /docs/css/base.css: -------------------------------------------------------------------------------- 1 | 2 | /*Make the content wider and relative to window size.*/ 3 | .md-grid { 4 | 
max-width: 85% 5 | } 6 | 7 | :root { 8 | --md-tooltip-width: 600px; 9 | } 10 | 11 | @page { 12 | size: letter; 13 | max-width: 100%; 14 | margin-top: 1in; 15 | margin-right: 0in; 16 | margin-bottom: 1in; 17 | margin-left: 0in; 18 | } 19 | 20 | .md-typeset h2 { 21 | line-height: 1.13; 22 | } 23 | -------------------------------------------------------------------------------- /docs/css/colors.css: -------------------------------------------------------------------------------- 1 | [data-md-color-scheme="default"] { 2 | 3 | /* Primary color shades */ 4 | --md-primary-fg-color: #025099; 5 | --md-primary-fg-color--light: #0437AD; 6 | --md-primary-bg-color: #ffffff; /* Header text */ 7 | --md-primary-bg-color--light: #DBDBDB; /* Secondary header text */ 8 | 9 | /* Default color shades */ 10 | --md-default-fg-color: #646464; /* ??? */ 11 | --md-default-fg-color--light: #7A7A7A; /* h1 */ 12 | --md-default-fg-color--lighter: #9B9B9B; /* ??? */ 13 | --md-default-fg-color--lightest: #BCBCBC; /* ??? 
*/ 14 | 15 | --md-default-bg-color: #FAFAFA; /* Body background */ 16 | --md-default-bg-color--light: #FAFAFA; 17 | --md-default-bg-color--lighter: #FAFAFA; 18 | --md-default-bg-color--lightest: #FAFAFA; 19 | 20 | /* Code color shades */ 21 | --md-code-fg-color: #36464e; /* Code block text color */ 22 | --md-code-bg-color: #f1f1f1; /* Code block background */ 23 | 24 | /* Code highlighting color shades */ 25 | --md-code-hl-color: #0000ff; 26 | --md-code-hl-color--light: #0000ff; 27 | --md-code-hl-number-color: #d52a2a; 28 | --md-code-hl-special-color: #db1457; 29 | --md-code-hl-function-color: #a846b9; 30 | --md-code-hl-constant-color: #6e59d9; 31 | --md-code-hl-keyword-color: #3f6ec6; 32 | --md-code-hl-string-color: #1c7d4d; 33 | --md-code-hl-name-color: #36464e; 34 | --md-code-hl-operator-color: var(--md-primary-fg-color); 35 | --md-code-hl-punctuation-color: var(--md-primary-fg-color); 36 | --md-code-hl-comment-color: var(--md-primary-fg-color); 37 | --md-code-hl-generic-color: var(--md-primary-fg-color); 38 | --md-code-hl-variable-color: var(--md-primary-fg-color); 39 | 40 | /* Typeset color shades */ 41 | --md-typeset-color: #212529; /* Main text color */ 42 | 43 | /* Typeset `a` color shades */ 44 | --md-typeset-a-color: #01a0d7; /* Link color */ 45 | 46 | /* Typeset `table` color shades */ 47 | --md-typeset-table-color: #a5a5a5; /* Outline color */ 48 | --md-typeset-table-color--light: #e3e2e2; /* Hover color */ 49 | 50 | /* Footer color shades */ 51 | --md-footer-fg-color: #ffffff; /* ??? */ 52 | --md-footer-fg-color--light: #e9ecef; /* Footer text */ 53 | --md-footer-fg-color--lighter: #adb5bd; /* ??? 
*/ 54 | --md-footer-bg-color: #000000; 55 | --md-footer-bg-color--dark: #212529; /* Footer background */ 56 | 57 | /* Accent color shades */ 58 | --md-accent-fg-color: #032779; /* Hover over link */ 59 | --md-accent-fg-color--transparent: #caf0f8; /* Hover over transparent (e.g., code with link) */ 60 | --md-accent-bg-color: #ffffff; 61 | --md-accent-bg-color--light: #e5e5e5; 62 | 63 | /* Admonition colors */ 64 | --md-admonition-fg-color: #212529; 65 | --md-admonition-bg-color: #FAFAFA; 66 | } 67 | 68 | [data-md-color-scheme="dark"] { 69 | 70 | /* Primary color shades */ 71 | --md-primary-fg-color: #23243D; 72 | --md-primary-fg-color--light: #0437AD; 73 | --md-primary-bg-color: #ffffff; /* Header text */ 74 | --md-primary-bg-color--light: #DBDBDB; /* Secondary header text */ 75 | 76 | /* Default color shades */ 77 | --md-default-fg-color: #e2e4e9; /* ??? */ 78 | --md-default-fg-color--light: #ffffff; /* h1 */ 79 | --md-default-fg-color--lighter: #e2e4e9; /* ??? */ 80 | --md-default-fg-color--lightest: #e2e4e9; /* ??? 
*/ 81 | 82 | --md-default-bg-color: #212529; /* Body background */ 83 | --md-default-bg-color--light: #FAFAFA; 84 | --md-default-bg-color--lighter: #FAFAFA; 85 | --md-default-bg-color--lightest: #FAFAFA; 86 | 87 | /* Code color shades */ 88 | --md-code-fg-color: #dddddd; /* Code block text color */ 89 | --md-code-bg-color: #333333; /* Code block background */ 90 | 91 | /* Code highlighting color shades */ 92 | --md-code-hl-color: #aeaeff; 93 | --md-code-hl-color--light: #aeaeff; 94 | --md-code-hl-number-color: #ff9494; 95 | --md-code-hl-special-color: #ffa0c0; 96 | --md-code-hl-function-color: #f3adff; 97 | --md-code-hl-constant-color: #bdaeff; 98 | --md-code-hl-keyword-color: #a0c1ff; 99 | --md-code-hl-string-color: #9fffcf; 100 | --md-code-hl-name-color: #f5f5f5; 101 | --md-code-hl-operator-color: #a6f0ff; 102 | --md-code-hl-punctuation-color: #a6f0ff; 103 | --md-code-hl-comment-color: #a6f0ff; 104 | --md-code-hl-generic-color: #a6f0ff; 105 | --md-code-hl-variable-color: #a6f0ff; 106 | 107 | /* Typeset color shades */ 108 | --md-typeset-color: #ffffff; /* Main text color */ 109 | 110 | /* Typeset `a` color shades */ 111 | --md-typeset-a-color: #96E4FE; /* Link color */ 112 | 113 | /* Typeset `table` color shades */ 114 | --md-typeset-table-color: #a5a5a5; /* Outline color */ 115 | --md-typeset-table-color--light: #343a40; /* Hover color */ 116 | 117 | /* Footer color shades */ 118 | --md-footer-fg-color: #ffffff; /* ??? */ 119 | --md-footer-fg-color--light: #e9ecef; /* Footer text */ 120 | --md-footer-fg-color--lighter: #adb5bd; /* ??? 
*/ 121 | --md-footer-bg-color: #000000; 122 | --md-footer-bg-color--dark: #171717; /* Footer background */ 123 | 124 | /* Accent color shades */ 125 | --md-accent-fg-color: #90e0ef; /* Hover over link */ 126 | --md-accent-fg-color--transparent: #6D6D6D; /* Hover over transparent (e.g., code with link) */ 127 | --md-accent-bg-color: #ffffff; 128 | --md-accent-bg-color--light: #e5e5e5; 129 | 130 | /* Admonition colors */ 131 | --md-admonition-fg-color: #ffffff; 132 | --md-admonition-bg-color: #212529; 133 | 134 | .highlight-ipynb { 135 | --jp-mirror-editor-string-color: #98c379; 136 | --jp-mirror-editor-number-color: #d19a66; 137 | --jp-mirror-editor-keyword-color: #c678dd; 138 | --jp-mirror-editor-operator-color: #c678dd; 139 | } 140 | 141 | 142 | 143 | .highlight-ipynb { 144 | margin: 0; 145 | padding: 5px 10px; 146 | background-color: #23262A; 147 | } 148 | 149 | .highlight-ipynb .nf { 150 | color: #61afef; 151 | } 152 | 153 | .highlight-ipynb .p { 154 | color: #ffffff; 155 | } 156 | 157 | .highlight-ipynb .nb { 158 | color: #56b6c2; 159 | } 160 | 161 | .highlight-ipynb .kc { 162 | color: #d19a66; 163 | } 164 | 165 | .highlight-ipynb .c1 { 166 | color: #8f8f8f; 167 | } 168 | 169 | .jupyter-wrapper .jp-InputArea-editor { 170 | position: relative; 171 | border-color: #30363C; 172 | } 173 | 174 | .jupyter-wrapper .highlight pre { 175 | background-color: transparent; 176 | padding: 10px; 177 | overflow: auto; 178 | } 179 | } 180 | -------------------------------------------------------------------------------- /docs/css/jupyter.css: -------------------------------------------------------------------------------- 1 | html 2 | /* 3 | This adjusts the font size of Jupyter notebook code blocks to be closer to normal. 
4 | */ 5 | .highlight { 6 | font-size: 85%; 7 | } 8 | -------------------------------------------------------------------------------- /docs/css/launchy.css: -------------------------------------------------------------------------------- 1 | .launchy-container { 2 | display: flex; 3 | justify-content: flex-end; 4 | position: relative; 5 | margin-top: -4.4em; 6 | margin-bottom: 2em; 7 | margin-right: 0.4em; 8 | } 9 | -------------------------------------------------------------------------------- /docs/css/mkdocstrings.css: -------------------------------------------------------------------------------- 1 | /*Indentation.*/ 2 | div.doc-contents:not(.first) { 3 | padding-left: 35px; /*25px is the default*/ 4 | border-left: .15rem solid #ededed; 5 | } 6 | 7 | .doc-heading .highlight { 8 | font-size: 18px; 9 | background-color: transparent; 10 | } 11 | 12 | /*Mark external links as such. */ 13 | a.external::after, 14 | a.autorefs-external::after { 15 | /* */ 16 | mask-image: url('data:image/svg+xml,'); 17 | -webkit-mask-image: url('data:image/svg+xml,'); 18 | content: ' '; 19 | 20 | display: inline-block; 21 | vertical-align: middle; 22 | position: relative; 23 | 24 | height: 1em; 25 | width: 1em; 26 | background-color: var(--md-typeset-a-color); 27 | } 28 | 29 | a.external:hover::after, 30 | a.autorefs-external:hover::after { 31 | background-color: var(--md-accent-fg-color); 32 | } 33 | 34 | /* Fancier color for operators such as * and |. */ 35 | .doc-signature .o { 36 | color: var(--md-code-hl-special-color); 37 | } 38 | 39 | /* Fancier color for constants such as None, True, and False. */ 40 | .doc-signature .kc { 41 | color: var(--md-code-hl-constant-color); 42 | } 43 | 44 | /* Fancier color for built-in types (only useful when cross-references are used). 
*/ 45 | .doc-signature .n > a[href^="https://docs.python.org/"][href*="/functions.html#"], 46 | .doc-signature .n > a[href^="https://docs.python.org/"][href*="/stdtypes.html#"] { 47 | color: var(--md-code-hl-constant-color); 48 | } 49 | -------------------------------------------------------------------------------- /docs/development.md: -------------------------------------------------------------------------------- 1 | # Development 2 | 3 | This comprehensive guide provides detailed instructions to help maintainers effectively develop, test, document, build, and release new versions of `dimorphite_dl`. 4 | 5 | ## Setting up the Development Environment 6 | 7 | `dimorphite_dl` utilizes [`pixi`](https://pixi.sh/latest/) for managing environments and dependencies, streamlining the setup process. Follow these precise steps to configure your development environment: 8 | 9 | 1. **Clone the repository:** 10 | Begin by obtaining a local copy of the `dimorphite_dl` codebase: 11 | 12 | ```bash 13 | git clone git@GitHub.com:durrantlab/dimorphite_dl.git 14 | cd dimorphite_dl 15 | ``` 16 | 2. **Install dependencies:** 17 | Install all necessary dependencies by running: 18 | 19 | ```bash 20 | pixi install 21 | ``` 22 | 3. **Activate the development environment:** 23 | To enter the isolated virtual environment configured specifically for `dimorphite_dl` development, execute: 24 | 25 | ```bash 26 | pixi shell 27 | ``` 28 | 29 | You are now fully prepared and equipped to develop `dimorphite_dl`. 30 | 31 | ## Code Formatting and Style Guide 32 | 33 | Maintaining consistent style and formatting across the codebase is crucial for readability and maintainability. 34 | `dimorphite_dl` employs automated formatting tools configured to enforce standardized style guidelines. 
35 | Execute the following command to apply formatting automatically: 36 | 37 | ```bash 38 | pixi run format 39 | ``` 40 | 41 | This command runs `markdownlint-cli2` to enforce markdown formatting standards, `isort` for managing imports, and `ruff` for Python formatting, ensuring your contributions align with project conventions. 42 | 43 | ## Documentation 44 | 45 | `dimorphite_dl`'s documentation is built using MkDocs, allowing easy creation and maintenance of high-quality documentation. 46 | To locally preview documentation changes, serve the documentation by running: 47 | 48 | ```bash 49 | pixi run -e docs serve 50 | ``` 51 | 52 | After execution, open your web browser and visit [`http://127.0.0.1:8000/`](http://127.0.0.1:8000/) to review changes in real-time. 53 | 54 | ## Testing 55 | 56 | Writing and maintaining tests is essential for ensuring code correctness, reliability, and stability. 57 | Execute `dimorphite_dl`'s tests with: 58 | 59 | ```bash 60 | pixi run -e dev tests 61 | ``` 62 | 63 | Additionally, you can evaluate test coverage to identify untested areas and improve overall reliability by running: 64 | 65 | ```bash 66 | pixi run -e dev coverage 67 | ``` 68 | 69 | Review the generated coverage reports to address any gaps in testing. 70 | 71 | ## Bumping Version 72 | 73 | Releasing a new version of `dimorphite_dl` requires updating version information, documenting changes, and creating a corresponding release tag. Follow these steps precisely to ensure consistency and traceability: 74 | 75 | 1. **Update the changelog:** 76 | Document all notable changes since the previous release in the `CHANGELOG.md` file. Follow a consistent and clear format to help users understand what has changed. 77 | 78 | 2. **Commit the changes:** 79 | Stage and commit the version bump and changelog update using a clear and standardized message, for example: 80 | 81 | ```bash 82 | git add . 83 | git commit -m "bump: v1.2.5" 84 | ``` 85 | 86 | 3. 
**Tag the commit:** 87 | Create a version tag that follows the `vX.Y.Z` format (for example, `v1.2.5`): 88 | 89 | ```bash 90 | git tag v1.2.5 91 | git push origin main --tags 92 | ``` 93 | 94 | 4. **Create a GitHub release:** 95 | Navigate to the [GitHub Releases](https://github.com/durrantlab/dimorphite_dl/releases) page and draft a new release: 96 | 97 | - Tag version: `v1.2.5` 98 | - Release title: `v1.2.5` 99 | - Description: Copy the relevant changelog section or summarize the key changes. 100 | 101 | Attach the built package files from the `dist/` directory, if desired. 102 | 103 | ## Building the Package 104 | 105 | Prepare `dimorphite_dl` for publishing or distribution by building the package. 106 | Execute: 107 | 108 | ```bash 109 | pixi run build 110 | ``` 111 | 112 | Upon completion, inspect the `dist` directory for the generated distribution files, which are ready for publication. 113 | 114 | ## Publishing to PyPI 115 | 116 | Once the version number is updated and the package is built, it can be published to PyPI. 117 | Execute: 118 | 119 | ```bash 120 | pixi run publish 121 | ``` 122 | 123 | For preliminary testing or release candidates, it is highly recommended to publish to TestPyPI first. 124 | Execute: 125 | 126 | ```bash 127 | pixi run publish-test 128 | ``` 129 | 130 | Publishing to TestPyPI allows you to validate packaging correctness and installation processes without affecting production users. 131 | 132 | ## Maintenance Best Practices 133 | 134 | To maintain high quality and reliability of `dimorphite_dl`, adhere to the following best practices: 135 | 136 | - Regularly synchronize your local repository with the main branch to incorporate the latest updates: 137 | 138 | ```bash 139 | git pull origin main 140 | ``` 141 | - Frequently review and address open issues and pull requests on GitHub. 142 | - Clearly document changes in commit messages, issue descriptions, and pull requests. 
143 | - Routinely verify dependencies and update them as necessary to maintain compatibility and security. 144 | 145 | Adhering to these guidelines ensures a robust, stable, and continuously improving `dimorphite_dl` project. 146 | 147 | This expanded documentation guide covers the entire workflow comprehensively, providing clarity and precision for effective `dimorphite_dl` project maintenance. 148 | -------------------------------------------------------------------------------- /docs/gen_ref_pages.py: -------------------------------------------------------------------------------- 1 | """Generate the code reference pages.""" 2 | 3 | import os 4 | from pathlib import Path 5 | 6 | import mkdocs_gen_files 7 | 8 | SRC_DIR = "dimorphite_dl" 9 | WRITE_DIR = "api" 10 | 11 | for path in sorted(Path(SRC_DIR).rglob("*.py")): # 12 | module_path = path.relative_to(SRC_DIR).with_suffix("") # 13 | 14 | doc_path = path.relative_to(SRC_DIR).with_suffix(".md") # 15 | 16 | if not os.path.exists(Path(WRITE_DIR)): 17 | os.mkdir(Path(WRITE_DIR)) 18 | 19 | full_doc_path = Path(WRITE_DIR, doc_path) # 20 | 21 | parts = list(module_path.parts) 22 | 23 | if parts[-1] == "__init__": # 24 | parts = parts[:-1] 25 | elif parts[-1] == "__main__": 26 | continue 27 | 28 | if len(parts) == 0: 29 | continue 30 | 31 | with mkdocs_gen_files.open(full_doc_path, "w") as fd: # 32 | identifier = ".".join(parts) # 33 | 34 | print("::: " + identifier, file=fd) # 35 | 36 | mkdocs_gen_files.set_edit_path(full_doc_path, path) # 37 | -------------------------------------------------------------------------------- /docs/img/launchy/colab.svg: -------------------------------------------------------------------------------- 1 | 2 | 17 | 19 | 40 | 43 | 47 | 51 | 55 | 59 | 63 | 64 | 65 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | --8<-- "README.md" 2 | 
-------------------------------------------------------------------------------- /docs/js/mathjax-config.js: -------------------------------------------------------------------------------- 1 | window.MathJax = { 2 | tex: { 3 | inlineMath: [["\\(", "\\)"], ["$", "$"]], 4 | displayMath: [["\\[", "\\]"], ["$$", "$$"]], 5 | processEscapes: true, 6 | processEnvironments: true 7 | }, 8 | options: { 9 | ignoreHtmlClass: ".*|", 10 | processHtmlClass: "arithmatex" 11 | } 12 | }; 13 | 14 | document$.subscribe(() => { 15 | MathJax.startup.output.clearCache() 16 | MathJax.typesetClear() 17 | MathJax.texReset() 18 | MathJax.typesetPromise() 19 | }) 20 | -------------------------------------------------------------------------------- /hooks/launchy.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from bs4 import BeautifulSoup 4 | 5 | 6 | def is_jupyter(markdown=None, html=None): 7 | if html: 8 | return '
' in html 9 | if markdown: 10 | return False 11 | raise ValueError("Must provide either markdown or html") 12 | 13 | 14 | def on_page_content(html, page, config, files): 15 | """Adds a Google colab button to launch Jupyter files""" 16 | # Only Jupyter notebooks will have this div. 17 | if is_jupyter(html=html): 18 | page_url = page.url 19 | if "/index.ipynb" == page.file.abs_src_path[-12:]: 20 | page_url += "index.ipynb" 21 | else: 22 | page_url = page_url[:-1] + ".ipynb" 23 | colab_url = os.path.join(config["colab_base_url"], page_url) 24 | colab_launch_html = f""" 25 |
26 | 27 | Colab Image 28 | 29 |
30 | """ 31 | soup = BeautifulSoup(html, "html.parser") 32 | h1_tag = soup.find("h1") 33 | h1_tag.insert_after(BeautifulSoup(colab_launch_html, "html.parser")) 34 | return soup.prettify() 35 | return html 36 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | docs_dir: docs 2 | 3 | site_name: dimorphite_dl 4 | site_author: durrantlab 5 | 6 | repo_name: durrantlab/dimorphite_dl 7 | repo_url: https://github.com/durrantlab/dimorphite_dl 8 | copyright: CC BY-NC-SA 4.0 by OASCI 9 | 10 | # https://squidfunk.github.io/mkdocs-material/ 11 | theme: 12 | name: material 13 | custom_dir: docs/.overrides 14 | language: en 15 | # logo: img/logo.svg 16 | palette: 17 | # Palette toggle for light mode 18 | - scheme: default 19 | toggle: 20 | icon: material/lightbulb-outline 21 | name: Switch to dark mode 22 | 23 | # Palette toggle for dark mode 24 | - scheme: dark 25 | toggle: 26 | icon: material/lightbulb 27 | name: Switch to light mode 28 | font: 29 | text: Roboto 30 | code: Roboto Mono 31 | icon: 32 | repo: fontawesome/brands/github 33 | annotation: material/star-four-points-circle 34 | features: 35 | - content.code.annotate 36 | - content.code.copy 37 | - content.code.select 38 | - content.tooltips 39 | - content.tabs.link 40 | - navigation.tabs 41 | - navigation.tabs.sticky 42 | - navigation.tracking 43 | - navigation.top 44 | - navigation.indexes 45 | - navigation.path 46 | - navigation.prune 47 | - toc.follow 48 | - search.suggest 49 | 50 | validation: 51 | omitted_files: warn 52 | absolute_links: warn 53 | unrecognized_links: warn 54 | 55 | # Options need to be indented twice for some reason? 
56 | plugins: 57 | - search 58 | - autorefs 59 | - material-plausible 60 | - gen-files: 61 | scripts: 62 | - docs/gen_ref_pages.py 63 | - mkdocstrings: 64 | handlers: 65 | python: 66 | inventories: 67 | - "https://docs.python.org/3/objects.inv" 68 | paths: ["dimorphite_dl"] 69 | options: 70 | show_source: false 71 | show_root_heading: false 72 | annotations_path: brief 73 | docstring_style: google 74 | merge_init_into_class: true 75 | docstring_section_style: spacy 76 | show_if_no_docstring: true 77 | show_labels: false 78 | parameter_headings: false 79 | show_symbol_type_heading: true 80 | show_symbol_type_toc: true 81 | - mkdocs-jupyter: 82 | no_input: False 83 | include_requirejs: true 84 | include_source: True 85 | ignore: ["*.py"] 86 | remove_tag_config: 87 | remove_input_tags: 88 | - hide_code 89 | - awesome-nav 90 | - glightbox 91 | - macros 92 | - print-site 93 | - git-revision-date-localized: 94 | type: iso_datetime 95 | timezone: America/Detroit 96 | fallback_to_build_date: true 97 | 98 | hooks: 99 | - hooks/launchy.py 100 | colab_base_url: https://colab.research.google.com/github/durrantlab/dimorphite_dl/blob/main/study 101 | 102 | 103 | extra: 104 | generator: false 105 | 106 | extra_css: 107 | - css/base.css 108 | - css/colors.css 109 | - css/jupyter.css 110 | - css/mkdocstrings.css 111 | - css/launchy.css 112 | 113 | extra_javascript: 114 | - js/mathjax-config.js 115 | 116 | markdown_extensions: 117 | - abbr 118 | - toc: 119 | permalink: true 120 | - admonition 121 | - attr_list 122 | - def_list 123 | - footnotes 124 | - md_in_html 125 | - tables 126 | - pymdownx.arithmatex: 127 | generic: true 128 | - pymdownx.betterem 129 | - pymdownx.caret 130 | - pymdownx.details 131 | - pymdownx.highlight: 132 | anchor_linenums: true 133 | line_spans: __span 134 | pygments_lang_class: true 135 | - pymdownx.inlinehilite 136 | - pymdownx.keys 137 | - pymdownx.mark 138 | - pymdownx.smartsymbols 139 | - pymdownx.snippets 140 | - pymdownx.superfences: 141 | 
custom_fences: 142 | - name: mermaid 143 | class: mermaid 144 | format: !!python/name:pymdownx.superfences.fence_code_format 145 | - pymdownx.tabbed: 146 | alternate_style: true 147 | - pymdownx.tasklist: 148 | custom_checkbox: true 149 | - pymdownx.tilde 150 | -------------------------------------------------------------------------------- /pixi.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | authors = [ 3 | "durrantlab ", 4 | ] 5 | channels = ["conda-forge"] 6 | name = "dimorphite_dl" 7 | description = "Adds hydrogen atoms to molecular representations as specified by pH" 8 | platforms = ["win-64", "linux-64", "osx-64"] 9 | license = "Apache-2.0" 10 | readme = "README.md" 11 | 12 | [pypi-dependencies] 13 | dimorphite_dl = { path = ".", editable = true } 14 | 15 | [environments] 16 | dev = ["dev"] 17 | docs = ["docs"] 18 | 19 | [tasks] 20 | 21 | [dependencies] 22 | python = "==3.13" 23 | rdkit = ">=2020.3.3,<2026" 24 | 25 | [feature.dev.dependencies] 26 | isort = ">=5.12.0" 27 | pylint = ">=3.0.1" 28 | mypy = ">=1.6.0" 29 | pytest = ">=7.4.2" 30 | pytest-cov = ">=4.1.0" 31 | coverage = ">=7.3.1" 32 | pytest-html = ">=4.0.1" 33 | colorama = ">=0.4.6" 34 | basedpyright = ">=1.29.1,<2" 35 | ruff = ">=0.11.10,<0.12" 36 | twine = ">=6.1.0,<7" 37 | ipykernel = ">=6.29.5,<7" 38 | 39 | [feature.dev.tasks] 40 | mdlint = { cmd = ["markdownlint-cli2", '"**/*.{md,markdown}"', "--fix", "--config", ".markdownlint.yaml", "||", "true"] } 41 | isort = { cmd = ["isort", "--settings-path", ".isort.cfg", "./dimorphite_dl", "./tests", "||", "true"] } 42 | ruff = { cmd = ["ruff", "format", "--config", ".ruff.toml", "./dimorphite_dl", "./tests", "||", "true"] } 43 | format = { depends-on = ["mdlint", "isort", "ruff"] } 44 | tests = { cmd = [ 45 | "PYTHONPATH=.", 46 | "pytest", 47 | "-c", 48 | ".pytest.ini", 49 | "--cov='dimorphite_dl'", 50 | "--cov-report=xml", 51 | "--junit-xml=report.xml", 52 | "--failed-first", 53 | ]} 54 | 
coverage = { cmd = ["coverage", "report"] } 55 | cleanup-build = { cmd = ["rm", "-rf", "./build", "./dist"] } 56 | build = { cmd = ["python3", "-m", "build"], depends-on = ["cleanup-build"]} 57 | publish-test = { cmd = ["twine", "upload", "--repository", "testpypi", "dist/*"] } 58 | publish = { cmd = ["twine", "upload", "dist/*"] } 59 | 60 | [feature.dev.pypi-dependencies] 61 | build = ">=1.2.2.post1,<2" 62 | mypy-extensions = ">=1.0.0" 63 | black = { version = ">=23.10.0", extras = ["jupyter"] } 64 | pyrefly = ">=0.16.0, <0.17" 65 | setuptools-scm = ">=8.0.0" 66 | 67 | [feature.docs.dependencies] 68 | mkdocs = ">=1.6.1,<2" 69 | mkdocs-material = ">=9.6.5,<10" 70 | mkdocstrings = ">=0.28.2,<0.29" 71 | mkdocstrings-python = ">=1.16.2,<2" 72 | pymdown-extensions = ">=10.14.3,<11" 73 | mkdocs-table-reader-plugin = ">=3.1.0,<4" 74 | mkdocs-gen-files = ">=0.4.0,<0.5" 75 | mkdocs-macros-plugin = ">=1.3.7,<2" 76 | mkdocs-jupyter = ">=0.25.1,<0.26" 77 | mkdocs-glightbox = ">=0.4.0,<0.5" 78 | mkdocs-git-revision-date-localized-plugin = ">=1.2.9,<2" 79 | 80 | [feature.docs.pypi-dependencies] 81 | material-plausible-plugin = ">=0.2.0,<0.3" 82 | mkdocs-print-site-plugin = ">=2.6.0,<3" 83 | mkdocs-awesome-nav = ">=3.0.0,<4" 84 | 85 | [feature.docs.tasks] 86 | docs = { cmd = ["rm", "-rf", "./public/", "&&", "mkdocs", "build", "-d", "public/"] } 87 | serve = { cmd = ["mkdocs", "serve"] } 88 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | authors = [ 3 | {name = "Durrant Lab @ Pitt", email = "durrantj@pitt.edu"} 4 | ] 5 | maintainers = [ 6 | {name = "Alex M. 
Maldonado", email = "alex.maldonado@pitt.edu"} 7 | ] 8 | description = "Adds hydrogen atoms to molecular representations as specified by pH" 9 | name = "dimorphite_dl" 10 | dynamic = ["version"] 11 | readme = "README.md" 12 | requires-python = ">=3.10" 13 | license = "Apache-2.0" 14 | classifiers = [ 15 | "Development Status :: 5 - Production/Stable", 16 | "Intended Audience :: Developers", 17 | "Intended Audience :: Science/Research", 18 | "Natural Language :: English", 19 | "Programming Language :: Python", 20 | "Programming Language :: Python :: 3", 21 | "Programming Language :: Python :: 3.10", 22 | "Programming Language :: Python :: 3.11", 23 | "Programming Language :: Python :: 3.12", 24 | "Programming Language :: Python :: 3.13", 25 | "Topic :: Scientific/Engineering :: Chemistry", 26 | ] 27 | 28 | # TODO: Keep this here until pixi releases building capabilities 29 | dependencies = [ 30 | "loguru>=0.7.2,<0.8", 31 | "rdkit>=2020.3.3,<2026", 32 | ] 33 | 34 | [project.urls] 35 | Documentation = "https://durrantlab.github.io/dimorphite_dl" 36 | Repository = "https://github.com/durrantlab/dimorphite_dl" 37 | Issues = "https://github.com/durrantlab/dimorphite_dl/issues" 38 | Changelog = "https://github.com/durrantlab/dimorphite_dl/blob/main/CHANGELOG.md" 39 | 40 | [project.scripts] 41 | dimorphite_dl = "dimorphite_dl.cli:run_cli" 42 | 43 | [build-system] 44 | requires = ["setuptools>=61.0", "setuptools-scm>=8", "wheel"] 45 | 46 | [tool.setuptools.packages.find] 47 | where = ["."] 48 | include = ["dimorphite_dl*"] 49 | 50 | [tool.setuptools.package-data] 51 | dimorphite_dl = ["smarts/*.smarts"] 52 | 53 | [tool.setuptools_scm] 54 | write_to = "dimorphite_dl/_version.py" 55 | version_scheme = "guess-next-dev" 56 | local_scheme = "node-and-timestamp" 57 | 58 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import 
pytest 4 | from rdkit import Chem 5 | 6 | from dimorphite_dl import enable_logging 7 | from dimorphite_dl.io import SMILESProcessor 8 | 9 | TEST_DIR = os.path.dirname(__file__) 10 | 11 | 12 | def compare_smiles(smiles1, smiles2): 13 | detected_can = Chem.MolToSmiles(Chem.MolFromSmiles(smiles1), isomericSmiles=True) 14 | assert isinstance(detected_can, str) 15 | expected_can = Chem.MolToSmiles(Chem.MolFromSmiles(smiles2), isomericSmiles=True) 16 | assert isinstance(expected_can, str) 17 | assert detected_can == expected_can, f"got {smiles1}, expected {smiles2}" 18 | 19 | 20 | def compare_smarts(smarts1, smarts2): 21 | detected_can = Chem.MolToSmarts(Chem.MolFromSmarts(smarts1)) 22 | expected_can = Chem.MolToSmarts(Chem.MolFromSmarts(smarts2)) 23 | assert detected_can == expected_can, f"got {smarts1}, expected {smarts2}" 24 | 25 | 26 | @pytest.fixture(scope="session", autouse=True) 27 | def turn_on_logging(): 28 | enable_logging(0) 29 | 30 | 31 | # Pytest fixtures for reusable test data 32 | @pytest.fixture 33 | def sample_smiles_list(): 34 | """Fixture providing a sample list of SMILES strings.""" 35 | return ["CCO", "CCC", "c1ccccc1", "CC(C)C", "CCN"] 36 | 37 | 38 | @pytest.fixture 39 | def sample_smiles_file(tmp_path): 40 | """Fixture providing a temporary SMILES file.""" 41 | content = "CCO ethanol\nCCC propane\nc1ccccc1 benzene\n" 42 | file_path = tmp_path / "test_molecules.smi" 43 | file_path.write_text(content) 44 | return str(file_path) 45 | 46 | 47 | @pytest.fixture 48 | def processor_no_validation(): 49 | """Fixture providing a processor with validation disabled.""" 50 | return SMILESProcessor(validate_smiles=False) 51 | -------------------------------------------------------------------------------- /tests/files/sample_molecules.smi: -------------------------------------------------------------------------------- 1 | C#CCO Alcohol 2 | C(=O)N Amide 3 | CC(=O)NOC(C)=O Amide_electronegative 4 | COC(=N)N AmidineGuanidine2 5 | Brc1ccc(C2NCCS2)cc1 
Amines_primary_secondary_tertiary 6 | CC(=O)[n+]1ccc(N)cc1 Anilines_primary 7 | CCNc1ccccc1 Anilines_secondary 8 | Cc1ccccc1N(C)C Anilines_tertiary 9 | BrC1=CC2=C(C=C1)NC=C2 Aromatic_nitrogen_protonated 10 | C-N=[N+]=[N@H] Azide 11 | BrC(C(O)=O)CBr Carboxyl 12 | NC(NN=O)=N AmidineGuanidine1 13 | C(F)(F)(F)C(=O)NC(=O)C Imide 14 | O=C(C)NC(C)=O Imide2 15 | CC(C)(C)C(N(C)O)=O N-hydroxyamide 16 | C[N+](O)=O Nitro 17 | O=C1C=C(O)CC1 O=C-C=C-OH 18 | C1CC1OO Peroxide2 19 | C(=O)OO Peroxide1 20 | Brc1cc(O)cc(Br)c1 Phenol 21 | CC(=O)c1ccc(S)cc1 Phenyl_Thiol 22 | C=CCOc1ccc(C(=O)O)cc1 Phenyl_carboxyl 23 | COP(=O)(O)OC Phosphate_diester 24 | CP(C)(=O)O Phosphinic_acid 25 | CC(C)OP(C)(=O)O Phosphonate_ester 26 | CC1(C)OC(=O)NC1=O Ringed_imide1 27 | O=C(N1)C=CC1=O Ringed_imide2 28 | O=S(OC)(O)=O Sulfate 29 | COc1ccc(S(=O)O)cc1 Sulfinic_acid 30 | CS(N)(=O)=O Sulfonamide 31 | CC(=O)CSCCS(O)(=O)=O Sulfonate 32 | CC(=O)S Thioic_acid 33 | C(C)(C)(C)(S) Thiol 34 | Brc1cc[nH+]cc1 Aromatic_nitrogen_unprotonated 35 | C=C(O)c1c(C)cc(C)cc1C Vinyl_alcohol 36 | CC(=O)ON Primary_hydroxyl_amine 37 | O=P(O)(O)OCCCC Phosphate 38 | CC(P(O)(O)=O)C Phosphonate 39 | -------------------------------------------------------------------------------- /tests/mol/test_detect_substruct.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from conftest import compare_smarts, compare_smiles # type: ignore 3 | from rdkit import Chem 4 | 5 | from dimorphite_dl.mol import MoleculeRecord 6 | from dimorphite_dl.protonate.detect import ProtonationSiteDetector 7 | 8 | 9 | @pytest.mark.parametrize( 10 | ("smiles", "smiles_prepped_correct", "expected_smarts", "expected_idxs_match"), 11 | [ 12 | ("C#CCO", "[H]C#CC([H])([H])O[H]", "[C:1]-[O:2]-[#1]", (2, 3, 7)), 13 | ("Brc1cc[nH+]cc1", "[H]c1nc([H])c([H])c(Br)c1[H]", "[n&+0&H0:1]", (4,)), 14 | ( 15 | "C-N=[N+]=[N@H]", 16 | "[H]N=[N+]=NC([H])([H])[H]", 17 | "[N&+0:1]=[N&+:2]=[N&+0:3]-[#1]", 18 | (1, 2, 3, 7), 19 | ), 
20 | ( 21 | "O=P(O)(O)OCCCC", 22 | "[H]OP(=O)(O[H])OC([H])([H])C([H])([H])C([H])([H])C([H])([H])[H]", 23 | "[P&X4:1](=[O:2])(-[O&X2:3]-[#1])(-[O&+0:4])-[O&X2:5]-[#1]", 24 | (5, 6, 7, 18, 4, 8, 19), 25 | ), 26 | ], 27 | ) 28 | def test_substructure_detect( 29 | smiles, smiles_prepped_correct, expected_smarts, expected_idxs_match 30 | ): 31 | mol_record = MoleculeRecord(smiles) 32 | 33 | detector = ProtonationSiteDetector() 34 | 35 | # prepare molecule 36 | mol = mol_record.prepare_for_protonation() 37 | smiles_prepped = Chem.MolToSmiles(mol) 38 | compare_smiles(smiles_prepped, smiles_prepped_correct) 39 | 40 | # detect substructures 41 | substructures = list(detector._detect_all_sites_in_molecule(mol)) 42 | sub_match = substructures[0] 43 | 44 | # instead of raw string equality, canonicalize both SMARTS and compare 45 | compare_smarts(sub_match.smarts, expected_smarts) 46 | # atom indices should still be the same 47 | assert sub_match.idxs_match == expected_idxs_match 48 | -------------------------------------------------------------------------------- /tests/mol/test_neutralize.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from dimorphite_dl.mol import MoleculeRecord 4 | 5 | 6 | @pytest.mark.parametrize( 7 | ("input_smiles", "exp_azides", "exp_neutral", "exp_canonical"), 8 | [ 9 | ("C#CCO", "C#CCO", "C#CCO", "C#CCO"), 10 | ("Brc1cc[nH+]cc1", "Brc1cc[nH+]cc1", "Brc1ccncc1", "Brc1ccncc1"), 11 | ("C-N=[N+]=[N@H]", "C-N=[N+]=[N@H]", "CN=[N+]=N", "CN=[N+]=N"), 12 | ("O=P(O)(O)OCCCC", "O=P(O)(O)OCCCC", "CCCCOP(=O)(O)O", "CCCCOP(=O)(O)O"), 13 | ], 14 | ) 15 | def test_molecule_preparation_steps( 16 | input_smiles, exp_azides, exp_neutral, exp_canonical 17 | ): 18 | mol = MoleculeRecord(input_smiles) 19 | mol.process_azides() 20 | assert mol.smiles == exp_azides, ( 21 | f"after process_azides: got {mol.smiles!r}, expected {exp_azides!r}" 22 | ) 23 | mol.neutralize() 24 | assert mol.smiles == exp_neutral, ( 25 
| f"after neutralize: got {mol.smiles!r}, expected {exp_neutral!r}" 26 | ) 27 | mol.make_canonical() 28 | assert mol.smiles == exp_canonical, ( 29 | f"after make_canonical: got {mol.smiles!r}, expected {exp_canonical!r}" 30 | ) 31 | -------------------------------------------------------------------------------- /tests/protonate/test_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for loading protonation site data 3 | """ 4 | 5 | from dimorphite_dl.protonate.data import PKaData 6 | 7 | 8 | def test_data_init(): 9 | pka_data = PKaData() 10 | pka_data2 = PKaData() 11 | assert pka_data == pka_data2 12 | 13 | n_substructures = len(pka_data._data) 14 | assert n_substructures == 41 15 | -------------------------------------------------------------------------------- /tests/protonate/test_run.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from conftest import compare_smiles # type: ignore 3 | 4 | from dimorphite_dl import protonate_smiles 5 | 6 | 7 | # Every molecule should be protonated 8 | @pytest.mark.parametrize( 9 | ("smiles_input", "smiles_correct"), 10 | [ 11 | ("C#CCO", "C#CCO"), # alcohol 12 | ("C(=O)N", "NC=O"), # Amide, 13 | ("CC(=O)NOC(C)=O", "CC(=O)NOC(C)=O"), # Amide_electronegative, 14 | ("COC(=N)N", "COC(N)=[NH2+]"), # AmidineGuanidine2, 15 | ( 16 | "Brc1ccc(C2NCCS2)cc1", 17 | "Brc1ccc(C2[NH2+]CCS2)cc1", 18 | ), # Amines_primary_secondary_tertiary, 19 | ("CC(=O)[n+]1ccc(N)cc1", "CC(=O)[n+]1ccc([NH3+])cc1"), # Anilines_primary, 20 | ("CCNc1ccccc1", "CC[NH2+]c1ccccc1"), # Anilines_secondary, 21 | ("Cc1ccccc1N(C)C", "Cc1ccccc1[NH+](C)C"), # Anilines_tertiary, 22 | ("BrC1=CC2=C(C=C1)NC=C2", "Brc1ccc2[nH]ccc2c1"), # Indole_pyrrole, 23 | ("O=c1cc[nH]cc1", "O=c1cc[nH]cc1"), # Aromatic_nitrogen_protonated, 24 | ("C-N=[N+]=[N@H]", "CN=[N+]=N"), # Azide, 25 | ("BrC(C(O)=O)CBr", "O=C(O)C(Br)CBr"), # Carboxyl, 26 | ("NC(NN=O)=N", "NC(=[NH2+])NN=O"), 
@pytest.mark.parametrize(
    ("smiles_input", "smiles_correct"),
    [
        ("C#CCO", "C#CC[O-]"),  # Alcohol
        ("C(=O)N", "[NH-]C=O"),  # Amide
        ("CC(=O)NOC(C)=O", "CC(=O)[N-]OC(C)=O"),  # Amide_electronegative
        ("COC(=N)N", "COC(=N)N"),  # AmidineGuanidine2
        (
            "Brc1ccc(C2NCCS2)cc1",
            "Brc1ccc(C2NCCS2)cc1",
        ),  # Amines_primary_secondary_tertiary
        ("CC(=O)[n+]1ccc(N)cc1", "CC(=O)[n+]1ccc(N)cc1"),  # Anilines_primary
        ("CCNc1ccccc1", "CCNc1ccccc1"),  # Anilines_secondary
        ("Cc1ccccc1N(C)C", "Cc1ccccc1N(C)C"),  # Anilines_tertiary
        ("BrC1=CC2=C(C=C1)NC=C2", "Brc1ccc2[n-]ccc2c1"),  # Indole_pyrrole
        ("O=c1cc[nH]cc1", "O=c1cc[n-]cc1"),  # Aromatic_nitrogen_protonated
        ("C-N=[N+]=[N@H]", "CN=[N+]=[N-]"),  # Azide
        ("BrC(C(O)=O)CBr", "O=C([O-])C(Br)CBr"),  # Carboxyl
        ("NC(NN=O)=N", "N=C(N)NN=O"),  # AmidineGuanidine1
        ("C(F)(F)(F)C(=O)NC(=O)C", "CC(=O)[N-]C(=O)C(F)(F)F"),  # Imide
        ("O=C(C)NC(C)=O", "CC(=O)[N-]C(C)=O"),  # Imide2
        ("CC(C)(C)C(N(C)O)=O", "CN([O-])C(=O)C(C)(C)C"),  # N-hydroxyamide
        ("C[N+](O)=O", "C[N+](=O)[O-]"),  # Nitro
        ("O=C1C=C(O)CC1", "O=C1C=C([O-])CC1"),  # O=C-C=C-OH
        ("C1CC1OO", "[O-]OC1CC1"),  # Peroxide2
        ("C(=O)OO", "O=CO[O-]"),  # Peroxide1
        ("Brc1cc(O)cc(Br)c1", "[O-]c1cc(Br)cc(Br)c1"),  # Phenol
        ("CC(=O)c1ccc(S)cc1", "CC(=O)c1ccc([S-])cc1"),  # Phenyl_Thiol
        ("C=CCOc1ccc(C(=O)O)cc1", "C=CCOc1ccc(C(=O)[O-])cc1"),  # Phenyl_carboxyl
        ("COP(=O)(O)OC", "COP(=O)([O-])OC"),  # Phosphate_diester
        ("CP(C)(=O)O", "CP(C)(=O)[O-]"),  # Phosphinic_acid
        ("CC(C)OP(C)(=O)O", "CC(C)OP(C)(=O)[O-]"),  # Phosphonate_ester
        ("CC1(C)OC(=O)NC1=O", "CC1(C)OC(=O)[N-]C1=O"),  # Ringed_imide1
        ("O=C(N1)C=CC1=O", "O=C1C=CC(=O)[N-]1"),  # Ringed_imide2
        ("O=S(OC)(O)=O", "COS(=O)(=O)[O-]"),  # Sulfate
        ("COc1ccc(S(=O)O)cc1", "COc1ccc(S(=O)[O-])cc1"),  # Sulfinic_acid
        ("CS(N)(=O)=O", "CS([NH-])(=O)=O"),  # Sulfonamide
        ("CC(=O)CSCCS(O)(=O)=O", "CC(=O)CSCCS(=O)(=O)[O-]"),  # Sulfonate
        ("CC(=O)S", "CC(=O)[S-]"),  # Thioic_acid
        ("C(C)(C)(C)(S)", "CC(C)(C)[S-]"),  # Thiol
        ("Brc1cc[nH+]cc1", "Brc1ccncc1"),  # Aromatic_nitrogen_unprotonated
        ("C=C(O)c1c(C)cc(C)cc1C", "C=C([O-])c1c(C)cc(C)cc1C"),  # Vinyl_alcohol
        ("CC(=O)ON", "CC(=O)ON"),  # Primary_hydroxyl_amine
        ("O=P(O)(O)OCCCC", "CCCCOP(=O)([O-])[O-]"),  # Phosphate
        ("CC(P(O)(O)=O)C", "CC(C)P(=O)([O-])[O-]"),  # Phosphonate
    ],
)
def test_very_basic(smiles_input, smiles_correct):
    """Check the single variant produced when pH is pinned at an extreme high value.

    ``ph_min`` and ``ph_max`` are both set to the same very large number, so
    exactly one protonation state is expected for every input; it is then
    compared against ``smiles_correct``.
    """
    extreme_ph = 10000000

    variants = list(
        protonate_smiles(
            smiles_input,
            ph_min=extreme_ph,
            ph_max=extreme_ph,
            precision=0.5,
        )
    )

    # Exactly one state must come back before it is compared.
    assert len(variants) == 1
    compare_smiles(variants[0], smiles_correct)
14.52875, 173 | ], # Indole_pyrrole 174 | [ 175 | "O=c1cc[nH]cc1", 176 | "O=c1cc[nH]cc1", 177 | "O=c1cc[n-]cc1", 178 | 7.17, 179 | ], # Aromatic_nitrogen_protonated 180 | ["C-N=[N+]=[N@H]", "CN=[N+]=N", "CN=[N+]=[N-]", 4.65], # Azide 181 | [ 182 | "BrC(C(O)=O)CBr", 183 | "O=C(O)C(Br)CBr", 184 | "O=C([O-])C(Br)CBr", 185 | 3.456652971502591, 186 | ], # Carboxyl 187 | [ 188 | "NC(NN=O)=N", 189 | "NC(=[NH2+])NN=O", 190 | "N=C(N)NN=O", 191 | 12.025333333333334, 192 | ], # AmidineGuanidine1 193 | [ 194 | "C(F)(F)(F)C(=O)NC(=O)C", 195 | "CC(=O)NC(=O)C(F)(F)F", 196 | "CC(=O)[N-]C(=O)C(F)(F)F", 197 | 2.466666666666667, 198 | ], # Imide 199 | ["O=C(C)NC(C)=O", "CC(=O)NC(C)=O", "CC(=O)[N-]C(C)=O", 10.23], # Imide2 200 | [ 201 | "CC(C)(C)C(N(C)O)=O", 202 | "CN(O)C(=O)C(C)(C)C", 203 | "CN([O-])C(=O)C(C)(C)C", 204 | 9.301904761904762, 205 | ], # N-hydroxyamide 206 | ["C[N+](O)=O", "C[N+](=O)O", "C[N+](=O)[O-]", -1000.0], # Nitro 207 | ["C1CC1OO", "OOC1CC1", "[O-]OC1CC1", 11.978235294117647], # Peroxide2 208 | ["C(=O)OO", "O=COO", "O=CO[O-]", 8.738888888888889], # Peroxide1 209 | [ 210 | "Brc1cc(O)cc(Br)c1", 211 | "Oc1cc(Br)cc(Br)c1", 212 | "[O-]c1cc(Br)cc(Br)c1", 213 | 7.065359866910526, 214 | ], # Phenol 215 | [ 216 | "CC(=O)c1ccc(S)cc1", 217 | "CC(=O)c1ccc(S)cc1", 218 | "CC(=O)c1ccc([S-])cc1", 219 | 4.978235294117647, 220 | ], # Phenyl_Thiol 221 | [ 222 | "C=CCOc1ccc(C(=O)O)cc1", 223 | "C=CCOc1ccc(C(=O)O)cc1", 224 | "C=CCOc1ccc(C(=O)[O-])cc1", 225 | 3.463441968255319, 226 | ], # Phenyl_carboxyl 227 | [ 228 | "COP(=O)(O)OC", 229 | "COP(=O)(O)OC", 230 | "COP(=O)([O-])OC", 231 | 2.7280434782608696, 232 | ], # Phosphate_diester 233 | ["CP(C)(=O)O", "CP(C)(=O)O", "CP(C)(=O)[O-]", 2.9745], # Phosphinic_acid 234 | [ 235 | "CC(C)OP(C)(=O)O", 236 | "CC(C)OP(C)(=O)O", 237 | "CC(C)OP(C)(=O)[O-]", 238 | 2.0868, 239 | ], # Phosphonate_ester 240 | [ 241 | "CC1(C)OC(=O)NC1=O", 242 | "CC1(C)OC(=O)NC1=O", 243 | "CC1(C)OC(=O)[N-]C1=O", 244 | 6.4525, 245 | ], # Ringed_imide1 246 | [ 247 | 
def test_no_carbanion():
    """Protonating this molecule must not produce any variant with a carbanion."""
    smi = (
        "Cc1nc2cc(-c3[nH]c4cc5ccccc5c5c4c3CCN(C(=O)O)[C@@H]5O)cc3c(=O)[nH][nH]c(n1)c23"
    )
    output = list(protonate_smiles(smi))

    # Upper-casing each SMILES makes the check match an aromatic "[c-]" as well
    # as an aliphatic "[C-]".
    offenders = [variant for variant in output if "[C-]" in variant.upper()]

    # Use a plain assertion so pytest reports the failure (and the offending
    # variants) instead of relying on a hand-raised RuntimeError and a print.
    assert not offenders, "Processing " + smi + " produced a molecule with a carbanion!"


def test_max_variants():
    """The number of generated variants must be capped (guards against an old bug)."""
    smi = "CCCC[C@@H](C(=O)N)NC(=O)[C@@H](NC(=O)[C@@H](NC(=O)[C@@H](NC(=O)[C@H](C(C)C)NC(=O)[C@@H](NC(=O)[C@H](Cc1c[nH]c2c1cccc2)NC(=O)[C@@H](NC(=O)[C@@H](Cc1ccc(cc1)O)N)CCC(=O)N)C)C)Cc1nc[nH]c1)Cc1ccccc1"
    output = list(protonate_smiles(smi))

    assert len(output) == 128, f"Should produce 128 mol, but produced {len(output)}"


@pytest.mark.parametrize("smiles", [r"CCC(C)=C(Cl)C/C(I)=C(\C)F"])
def test_no_protonation_sites(smiles):
    """A molecule without protonation sites should come back as a single, unchanged variant."""
    # Materialise the result the same way the other tests do before using
    # len() and indexing on it.
    output = list(protonate_smiles(smiles))
    assert len(output) == 1
    compare_smiles(output[0], smiles)
362 | ( 363 | "O=P(O)(OP(O)(OCC1C(O)C(O)C(N2C=NC3=C(N)N=CN=C32)O1)=O)OCC(O4)C(O)C(O)C4[N+]5=CC=CC(C(N)=O)=C5", 364 | 0.5, 365 | "NC(=O)c1ccc[n+](C2OC(COP(=O)(O)OP(=O)(O)OCC3OC(n4c[nH+]c5c([NH3+])[nH+]c[nH+]c54)C(O)C3O)C(O)C2O)c1", 366 | ), 367 | # Changed output NC(=O)c1ccc[n+](C2OC(COP(=O)([O-])OP(=O)([O-])OCC3OC(n4cnc5c([NH3+])ncnc54)C(O)C3O)C(O)C2O)c1 368 | # to NC(=O)c1ccc[n+](C2OC(COP(=O)([O-])OP(=O)([O-])OCC3OC(n4c[nH+]c5c([NH3+])[nH+]c[nH+]c54)C(O)C3O)C(O)C2O)c1 369 | # Old version of dimorphite would inconsistently handle failed protonation by sometimes referring to 370 | # the last successful site OR the last successful site TYPE. 371 | ( 372 | "O=P(O)(OP(O)(OCC1C(O)C(O)C(N2C=NC3=C(N)N=CN=C32)O1)=O)OCC(O4)C(O)C(O)C4[N+]5=CC=CC(C(N)=O)=C5", 373 | 2.5, 374 | "NC(=O)c1ccc[n+](C2OC(COP(=O)([O-])OP(=O)([O-])OCC3OC(n4c[nH+]c5c([NH3+])[nH+]c[nH+]c54)C(O)C3O)C(O)C2O)c1", 375 | ), 376 | ( 377 | "O=P(O)(OP(O)(OCC1C(O)C(O)C(N2C=NC3=C(N)N=CN=C32)O1)=O)OCC(O4)C(O)C(O)C4[N+]5=CC=CC(C(N)=O)=C5", 378 | 7.4, 379 | "NC(=O)c1ccc[n+](C2OC(COP(=O)([O-])OP(=O)([O-])OCC3OC(n4cnc5c(N)ncnc54)C(O)C3O)C(O)C2O)c1", 380 | ), 381 | ], 382 | ) 383 | def test_multiple_ph(smiles_input, ph, smiles_correct): 384 | output = protonate_smiles( 385 | smiles_input, ph_min=ph, ph_max=ph, precision=0.0, validate_output=True 386 | ) 387 | assert len(output) == 1 388 | 389 | compare_smiles(output[0], smiles_correct) 390 | -------------------------------------------------------------------------------- /tests/tmp/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /training_data/README.md: -------------------------------------------------------------------------------- 1 | Training Data 2 | ============= 3 | 4 | Format 5 | ------ 6 | 7 | To allow others to reproduce our work, we here include the data used to 8 | calculate typical pKa ranges for 38 
ionizable substructures. Please see the 9 | `training_data.json` file. 10 | 11 | The keys of the JSON are the labels of each substructure (e.g., 12 | "Thioic_acid"). The JSON values are lists of pKa values. For example: 13 | 14 | ``` json 15 | { 16 | "Aromatic_nitrogen_protonated": [ 17 | 7.7, 3.0, 11.1, ... 18 | ], 19 | "Vinyl_alcohol": [ 20 | 9.2, 9.5, 9.5, ... 21 | ] 22 | } 23 | ``` 24 | 25 | In the case of "Phosphate" and "Phosphonate" groups, the values are lists of 26 | two pKa values (pKa1 and pKa2). Where one of these pKa values is unavailable, 27 | it is listed as `null`. For example: 28 | 29 | ``` json 30 | { 31 | "Phosphonate": [ 32 | [1.1, 6.5], [2.7, 8.4], [null, 8.7], ... 33 | ] 34 | } 35 | ``` 36 | 37 | Reaxys Terms and Conditions 38 | --------------------------- 39 | 40 | Most of the pKa data used to train Dimorphite-DL was taken from the [Reaxys 41 | database](https://www.reaxys.com/#/about-content), owned and operated by 42 | Elsevier Information Systems GmbH. [Facts are not 43 | copyrightable](https://www.copyright.gov/help/faq/faq-protect.html), but in 44 | using the database we did agree to Elsevier's [Terms and 45 | Conditions](https://www.elsevier.com/legal/elsevier-website-terms-and-conditions). 46 | 47 | Ideally, we would like to include both the SMILES strings and precise 48 | catalogued pKa values for all training examples. But, given the Terms and 49 | Conditions, it is unclear whether this use is permissible: 50 | 51 | > Unless otherwise set out herein, content comprised within the Services, 52 | > including text... and other information (collectively, the "Content")... is 53 | > owned by Elsevier, its licensors or its content providers and is protected 54 | > by copyright, trademark and other intellectual property and unfair 55 | > competition laws. 56 | 57 | Do the catalogued SMILES strings and pKa values fall under this definition of 58 | "content"? As facts, they are not copyrightable, which perhaps suggests they do not. 
On 59 | the other hand, publication is certainly a kind of "scholarly use": 60 | 61 | > ...you may print or download Content from the Services for your own 62 | > personal, non-commercial, informational or scholarly use, provided that you 63 | > keep intact all copyright and other proprietary notices. 64 | 65 | But, later in the terms, publication seems to be expressly prohibited: 66 | 67 | > You may not copy, display, distribute, modify, publish, reproduce, store, 68 | > transmit, post, translate or create other derivative works from, or sell, 69 | > rent or license all or any part of the Content... in any medium to anyone, 70 | > except as otherwise expressly permitted under these Terms and Conditions, or 71 | > any relevant license or subscription agreement or authorization by us. 72 | 73 | We emailed Reaxys seeking clarification but did not hear back from them. 74 | 75 | Solution 76 | -------- 77 | 78 | Given this uncertainty, we opted not to publish the exact SMILES structures 79 | taken from the Reaxys database. We further opted to round the pKa values to 80 | the nearest tenth, to avoid directly redistributing Reaxys data. The data we 81 | do provide should allow others to recalculate our pKa ranges with reasonable 82 | accuracy. 
83 | -------------------------------------------------------------------------------- /training_data/training_data.json: -------------------------------------------------------------------------------- 1 | { 2 | "Azide": [ 3 | 4.6, 4.7 4 | ], 5 | "Nitro": [ 6 | -1000.0 7 | ], 8 | "AmidineGuanidine1": [ 9 | 13.6, 10.1, 13.4, 10.4, 11.7, 13.6, 11.5, 13.7, 11.8, 12.1, 13.6, 13.4, 10 | 8.3, 10.9, 12.3 11 | ], 12 | "AmidineGuanidine2": [ 13 | 11.1, 10.3, 8.3, 11.1, 10.9, 9.2, 8.9, 7.7, 6.7, 8.3, 6.8, 10.7, 4.8, 14 | 8.1, 9.2, 8.5, 8.1, 9.1, 8.0, 9.0, 9.7, 9.3, 9.3, 8.6, 8.1, 9.5, 10.3, 15 | 9.8, 8.6, 8.3, 7.9, 9.1, 8.4, 9.2, 8.5, 9.4, 4.7, 8.3, 10.7, 12.4, 16 | 12.3, 12.5, 11.6, 14.4, 13.9, 13.9, 7.5, 12.2, 8.9, 9.1, 14.3, 10.6, 17 | 12.2, 12.3, 12.4, 12.3, 12.3, 12.2, 10.9, 12.5, 12.6, 12.0, 11.7, 11.2, 18 | 11.5 19 | ], 20 | "Sulfate": [ 21 | -1.5, -1.4, -1.7, -1.4, -1.2, -3.4, -3.9, -4.4 22 | ], 23 | "Sulfonate": [ 24 | -1.9, -1.7, -5.9, -2.8, -0.8, -0.6, -2.5, -1.0, -1.3, -1.0, -0.6, -1.8, 25 | -1.8 26 | ], 27 | "Sulfinic_acid": [ 28 | 1.9, 1.7, 2.0, 1.5, 2.2, 1.0, 2.3, 1.4, 2.2 29 | ], 30 | "Phenyl_carboxyl": [ 31 | 3.7, 4.5, 3.7, 3.0, 3.8, 3.1, 4.5, 4.5, 4.3, 4.3, 3.6, 4.5, 4.5, 4.5, 32 | 1.9, 0.1, 3.6, 3.6, 4.5, 4.2, 2.8, 3.8, 4.2, 3.9, 4.2, 3.9, 4.3, 3.6, 33 | 3.9, 4.6, 3.8, 3.5, 3.4, 4.1, -0.6, 3.3, 4.0, 2.9, 3.4, 3.7, 4.0, 4.2, 34 | 3.6, -0.6, 4.3, 4.2, 3.4, 2.4, 4.1, 2.5, 2.7, 3.8, 3.8, 3.9, 3.8, 3.8, 35 | 3.8, 4.1, 3.5, 3.6, 3.1, 3.9, 3.5, 2.9, 3.8, 4.5, 3.0, 3.6, 1.5, 3.3, 36 | 3.2, 1.5, 3.3, 3.4, 3.1, 3.2, 2.4, 1.7, 1.9, 2.3, 3.9, 3.9, 3.8, 4.1, 37 | 4.4, 5.7, 5.3, 1.8, 4.3, 3.4, 3.1, 3.1, 3.1, 3.1, 3.2, 4.4, 3.6, 4.0, 38 | 4.4, 4.4, 4.1, 5.2, 5.5, 4.2, 3.9, 4.4, 4.5, 3.5, 5.0, 4.3, 3.3, 3.4, 39 | 3.4, 3.8, 4.3, 4.1, 5.3, 5.5, 4.0, 3.6, 4.2, 3.7, -0.6, 3.5, 3.6, -0.6, 40 | 2.9, 3.5, -0.5, -0.5, -0.5, 3.7, 3.7, 3.5, 2.9, 1.7, 4.2, 5.0, 4.2, 41 | 3.6, 1.2 42 | ], 43 | "Carboxyl": [ 44 | 4.2, 4.6, 4.6, 4.8, 5.8, 3.2, 2.7, 4.5, 4.7, 4.7, 4.2, 1.9, 3.2, 2.6, 45 
| 3.6, 4.7, 5.1, 4.6, 4.6, 4.7, 3.2, 4.2, 4.0, 4.8, 3.2, 4.2, 3.8, 2.4, 46 | 3.8, 3.8, 3.8, 2.6, 3.7, 4.7, 4.8, -0.7, 2.6, 2.5, 3.7, 3.4, 4.7, 4.9, 47 | 4.6, -0.7, 4.7, 1.7, 1.9, 2.6, 3.5, 3.6, 5.0, 4.8, 3.4, 4.1, 4.5, 4.5, 48 | 3.6, 4.7, 1.9, 2.6, 4.8, 4.8, 3.6, 4.9, 4.9, 3.6, 4.1, -0.7, 3.6, -0.7, 49 | -0.7, -0.7, 4.6, 4.9, 3.6, 3.8, 2.6, 3.8, 5.7, 1.1, 2.7, 4.3, 3.1, 2.5, 50 | 3.6, 3.4, 4.3, 4.6, 4.7, 1.7, 4.2, 4.5, 4.6, 2.6, 2.9, 5.4, 5.0, 3.4, 51 | -0.5, 3.5, 4.2, 4.5, 4.2, 4.5, 4.6, 4.3, 4.5, 3.3, 1.8, 4.3, 2.0, 3.5, 52 | 4.4, 4.7, 2.6, 2.8, 3.7, 4.6, 4.0, 2.0, 4.4, 4.7, 3.1, 2.6, 3.5, 4.8, 53 | 3.2, 3.9, 3.7, 3.9, 3.5, 4.6, 4.9, 5.0, 1.3, 1.4, 4.1, 4.5, 5.0, 5.0, 54 | 5.3, 5.0, 1.8, 2.6, 2.4, 3.2, 1.3, 2.7, 2.6, 3.4, 3.1, 2.3, 2.4, 2.2, 55 | 4.0, 4.0, 4.0, 3.9, 4.0, 1.8, 3.8, 4.4, 4.6, 4.8, 3.2, 3.4, 4.0, 3.9, 56 | 3.9, 2.2, 2.9, 3.0, 3.0, 2.0, 2.0, 2.0, 1.5, 3.2, 3.4, 3.4, 3.2, 3.3, 57 | -0.7, 3.3, 1.4, 3.1, 3.2, 3.3, 4.6, 4.3, 4.3, 4.3, -0.6, 1.9, 1.9, 2.7, 58 | 2.5, 1.9, 2.0, 3.1, 2.1, 2.2, 2.2, 2.5, 1.4, 1.1, 1.2, 0.2, 4.4, 2.5, 59 | 0.7, 0.3, 3.2, 2.9, 3.4, 2.5, 4.2, 5.1, 4.1, 3.3, 3.3, 4.9, 4.8, 4.8, 60 | 3.6, 3.5, 4.8, 5.0, 4.9, 3.9, 3.6, 3.4, 3.1, 3.3, -0.6, 4.3, 4.0, 3.7, 61 | 4.4, 4.5, 3.3, 3.4, 3.4, 4.4, 5.0, 3.6, 3.0, 4.7, 4.8, 4.4, 4.3, 4.6, 62 | 5.0, 4.5, 4.2, 4.8, 4.1, 4.9, 4.9, 5.0, 5.1, 5.0, 4.9, 5.0, 5.0, 4.5, 63 | 4.0, 4.4, 4.2, -0.7, 4.9, 4.3, 4.7, 4.7, 4.4, 4.3, 4.4, 4.5, 3.1, 3.5, 64 | 3.4, 2.8, 3.1, 3.1, 3.1, 3.1, 3.2, 3.1, 3.1, 3.2, 3.1, 3.1, 3.1, 3.2, 65 | 3.1, 3.2, 3.0, 3.1, 3.2, 4.1, 3.6, 2.4, 1.9, 2.5, 2.5, 2.6, 2.4, 2.5, 66 | 2.5, 2.6, 2.5, 2.4, 2.1, 2.7, 2.7, 2.8, 2.7, 2.6, 2.7, 2.8, 2.7, 2.7, 67 | 2.7, 2.9, 3.0, 4.3, 3.7, 3.5, 3.7, 3.8, 3.6, 3.6, 3.5, 3.6, 3.6, 3.5, 68 | 3.5, 3.5, 3.5, 3.9, 3.9, 3.8, 3.8, 3.6, 5.2, 5.3, 5.4, 4.9, 5.4, -0.7, 69 | -0.6, 3.9, 4.3, 3.7, 3.9, 4.1, 4.1, 4.1, 3.9, 4.6, 4.5, 4.5, 4.3, 4.5, 70 | 4.4, 4.3, 4.0, 4.1, 3.4, -0.6, 2.0, 2.3, 4.0, 2.8, 3.3, 4.8, 3.8, 2.6, 71 | 4.4, 4.8, 4.8, 4.5, 4.7, 2.0, 
1.8, 1.8 72 | ], 73 | "Thioic_acid": [ 74 | -0.6, 1.7, -0.6, 2.5, -0.4, -0.4, 2.6 75 | ], 76 | "Phenyl_Thiol": [ 77 | 5.3, 6.3, -0.8, 6.4, 2.7, 2.8, 4.9, -0.8, 5.5, 5.9, 7.2, 6.2, 7.0, 5.4, 78 | 5.5, 8.6, 6.6 79 | ], 80 | "Thiol": [ 81 | 7.7, 7.3, 7.7, 7.2, 7.6, 7.5, 9.4, 10.4, 10.2, 10.6, 10.3, 8.5, 10.0, 82 | 10.9, 8.6, 7.9, 11.1, 11.2, 7.9, 9.4, 7.3, 10.7, 9.3, 7.9, 9.5, 8.4, 83 | 9.9, 10.2, 10.2 84 | ], 85 | "Phosphate": [ 86 | [2.0, 6.5], [2.0, 6.5], [2.3, 6.8], [4.2, 9.7], [4.0, 7.3], [2.7, 6.7], 87 | [1.3, 5.8], [2.0, 6.8], [2.9, 5.9], [5.9, 6.2], [1.6, 6.7], [1.0, 5.8], 88 | [2.0, 5.8], [2.0, 5.6], [3.4, 6.2], [3.1, 7.0], [1.8, null], 89 | [1.8, null], [1.6, null], [1.6, null], [null, 6.6], [null, 6.2], 90 | [null, 4.8], [1.7, null], [2.4, null], [2.5, null], [null, 6.7] 91 | ], 92 | "Phosphonate": [ 93 | [1.1, 6.5], [2.7, 8.4], [2.8, 8.7], [1.3, 6.5], [2.7, 8.5], [2.9, 9.0], 94 | [2.6, 8.2], [2.5, 8.2], [2.1, 7.5], [1.4, 6.7], [1.3, 6.5], [1.3, 6.7], 95 | [1.6, 7.0], [1.1, 6.3], [2.3, 7.5], [2.4, 7.3], [1.8, 7.2], [3.8, 8.1], 96 | [1.7, 6.9], [1.8, 8.4], [2.4, 8.1], [1.4, 6.3], [1.8, 7.1], [1.9, 7.3], 97 | [1.7, 6.8], [1.6, 4.8], [2.7, 8.4], [1.3, 6.5], [1.9, 7.3], [1.6, 6.7], 98 | [2.2, 7.8], [1.9, 7.4], [1.6, 7.0], [1.4, 6.6], [1.5, 6.7], [1.4, 6.6], 99 | [1.6, 7.0], [2.1, 7.3], [1.7, 7.1], [1.8, 7.2], [1.1, 5.6], [1.3, 5.9], 100 | [null, 8.0], [null, 7.1], [null, 8.1], [null, 7.8], [null, 8.6], 101 | [null, 6.6], [null, 7.2], [null, 6.8], [null, 8.2] 102 | ], 103 | "Phenol": [ 104 | 3.9, 4.3, 5.2, 6.0, 7.6, 5.2, 6.3, 7.8, 4.7, 6.6, 10.0, 5.0, 10.0, 8.9, 105 | 0.1, -0.7, -0.8, 10.2, -1.1, -1.0, 6.3, 6.5, 6.4, 6.3, 10.3, 10.3, 7.2, 106 | 6.7, -1.0, 8.3, -1.0, 8.1, 8.4, -1.0, 7.1, 6.1, 8.3, 8.7, -1.0, 10.1, 107 | 9.6, 7.7, 8.0, 6.9, 8.3, 10.2, -0.7, -0.8, 8.4, 7.4, 7.8, 7.8, 2.3, 108 | 7.4, 9.9, 7.2, 7.9, 10.0, 9.2, 8.0, 9.7, 9.5, 9.5, 10.5, 5.9, 5.3, 109 | 10.6, 10.1, 6.6, 10.6, 8.1, 12.6, 6.4, 7.3, 5.6, 6.3, 10.5, 10.4, 10.0, 110 | 10.6, 10.2, 8.9, 11.8, 
7.7, 7.8, 6.3, 5.4, 7.1, 7.2, 9.6, 7.8, 6.1, 111 | 8.6, 6.8, 2.1, 8.6, 8.2, 8.3, 8.0, 8.6, -0.6, -0.8, 8.3, 7.4, 6.2, 7.4, 112 | 3.3, 8.9, 6.6, 6.6, 8.2, 1.6, 7.2, 6.1, 8.3, 3.9, 3.8, 4.1, 7.8, 1.6, 113 | 6.4, 1.4, 2.2, 2.0, 6.0, 8.7, 7.9, 5.2, 10.7, 10.8, 10.9, 6.3, 6.6, 114 | -0.9, 10.0, 10.3, 7.6, 7.6, -0.9, -0.9, 9.9, -1.0, 7.8, 8.3, 8.3, 9.6, 115 | -0.9, 10.2, 9.5, 9.1, 9.1, 9.9, 9.2, 9.6, 7.8, 7.4, 5.6, 9.0, 9.1, 9.0, 116 | 8.4, 8.8, 9.8, 9.5, 10.2, 8.6, 9.9, 7.2, 10.7, 8.4, 8.4, 8.8, 10.0, 6.8, 117 | 8.2, 7.3, 9.3, 9.9, 11.9, 10.4, 9.6, 8.1, 5.4, 9.3, 9.3, 9.4, 8.2, 118 | 10.1, 9.8, 7.3 119 | ], 120 | "Peroxide1": [ 121 | 7.1, 9.3, 9.8, 8.6, 9.6, 8.1, 9.6, 8.9, 8.2, 9.0, 9.4, 9.0, 8.7, 8.9, 122 | 7.8, 7.4, 9.0, 8.9 123 | ], 124 | "Peroxide2": [ 125 | 12.6, 11.5, 11.2, 11.2, 11.9, 10.6, 13.3, 11.4, 12.8, 11.0, 12.8, 12.8, 126 | 10.5, 12.5, 12.6, 12.4, 12.6 127 | ], 128 | "O=C-C=C-OH": [ 129 | 4.2, 3.6, 3.6, 3.6, 3.1, 3.6, 1.7, 4.0, 3.5, 3.6, 3.6, 2.4, 3.2, 5.2, 130 | 4.3 131 | ], 132 | "Vinyl_alcohol": [ 133 | 9.2, 9.5, 9.5, 9.6, 9.5, 3.9, 8.5, 10.7, 10.4, 8.6, 9.5, 10.5, 9.3, 134 | 9.2, 9.1, 9.4, 10.5, 9.4, 7.4, 5.7, 6.8 135 | ], 136 | "Alcohol": [ 137 | 14.6, 15.4, 15.5, 15.6, 15.7, 15.1, 15.1, 13.3, 14.3, 12.4, 14.9, 15.5, 138 | 9.2, 12.2, 15.4, 12.1, 12.2, 16.0, 17.0, 14.4, 13.3, 24.0, 14.0, 15.1, 139 | 16.8, 15.2 140 | ], 141 | "N-hydroxyamide": [ 142 | 8.0, 9.0, 8.3, 11.1, 10.8, 8.4, 8.8, 10.5, 8.5, 10.1, 8.7, 8.2, 9.8, 143 | 10.8, 8.1, 11.2, 11.1, 8.2, 9.9, 7.7, 8.3 144 | ], 145 | "Ringed_imide1": [ 146 | 6.1, 6.9, 7.5, 5.8, 6.2, 6.4, 6.0, 6.6 147 | ], 148 | "Ringed_imide2": [ 149 | 9.3, 9.5, 9.6, 11.4, 10.5, 10.2, 10.0, 4.4, 9.8, 8.7, 9.4, 8.8, 7.7, 150 | 6.2, 10.5, 11.2, 9.4, 9.4, 7.7, 6.2, 8.7, 8.3, 6.3, 5.2 151 | ], 152 | "Imide": [ 153 | 1.2, 2.1, 4.1 154 | ], 155 | "Imide2": [ 156 | 9.8, 10.3, 9.1, 9.4, 11.0, 10.3, 12.9, 10.2, 10.3, 9.1 157 | ], 158 | "Amide_electronegative": [ 159 | -0.6, -0.7, 6.8, 6.8, 9.3, 1.7, 1.9, 2.1, 2.4, 8.9, 1.5, 2.0, 
2.1, 2.2, 160 | -0.8, 5.0, 4.2, 3.8, 1.6, 5.9, 2.9, 4.0, 4.7, 4.9, 4.5 161 | ], 162 | "Amide": [ 163 | 10.4, 3.4, 10.0, 13.5, 8.2, 11.7, 3.5, 9.3, 9.6, 13.4, 13.4, 13.3, 164 | 18.5, 18.5, 15.1, 19.4, 10.4, 14.5 165 | ], 166 | "Sulfonamide": [ 167 | 5.0, 7.8, 10.7, 8.5, 3.8, 7.6, 8.9, 9.7, 11.5, 8.0, 9.4, 10.8, 8.6, 168 | 8.3, 9.6, 9.6, 8.5, 9.5, 7.5, 7.9, 9.2, 8.9, 9.0, 6.5, 6.5, 6.3, 6.8, 169 | 3.9, 2.8, 4.9, 3.5, 9.0, 8.2, 9.3, 7.5, 6.8, 7.5, 10.3, 7.2, 7.7, 8.8, 170 | 8.2, 9.4, 10.3, 8.2, 6.2 171 | ], 172 | "Anilines_primary": [ 173 | 3.5, 3.6, 0.3, 9.2, 2.3, 2.3, 5.0, 4.4, 4.3, 2.5, 4.4, 2.2, 3.1, 5.3, 174 | 4.2, 4.4, 5.2, 4.4, 9.2, 2.4, 3.5, 3.7, 4.1, 4.5, 4.4, 4.7, 4.8, 4.4, 175 | 4.3, 4.9, 4.9, 4.6, 5.1, 3.0, 4.7, 3.9, 4.4, 2.0, 3.4, -0.4, 2.6, 10.6, 176 | 1.2, 0.9, 2.4, 5.0, 4.0, 5.2, 2.8, 3.8, 4.0, 3.8, 2.9, 1.5, 4.4, 3.5, 177 | 4.7, 4.7, 4.5, 13.8, 3.4, 3.6, -0.5, 3.5, 3.6, 3.6, 2.7, 4.6, 2.5, 3.8, 178 | 2.6, 3.8, 1.8, 3.9, 2.5, 3.9, 3.9 179 | ], 180 | "Anilines_secondary": [ 181 | 7.2, 4.2, 4.4, 7.0, 6.8, 6.3, 1.2, 4.1, 4.6, 5.1, 5.2, 5.1, 5.1, 5.4, 182 | 4.9, 4.6, 3.3, 5.0, 4.2, 2.5, 3.7, 4.0, 6.1, 5.4, 5.9, 4.9, 3.7, 4.6, 183 | 8.6, 4.6, 4.6, 3.3, 5.0, 4.3, 2.5, 3.7, 4.0, 7.3, 6.5, -0.8, 0.3, 0.8, 184 | 4.5, -0.9, 7.2, 7.2, 0.8, -0.7, 5.2 185 | ], 186 | "Anilines_tertiary": [ 187 | 4.1, 3.9, 3.6, 4.0, 4.1, 2.1, 3.3, 4.9, 6.4, 5.7, 6.1, 8.0, 7.2, 6.6, 188 | 0.7, 1.4, 7.2, 4.8, 4.1, 2.6, 2.6, 2.6, 2.2, 7.5, 5.4, 4.4, 5.8, 4.2, 189 | 2.7, 2.4, 2.2, 2.1, 1.9, 4.6, 3.3, 5.1, 6.0, 2.6, 8.8, 5.5, 4.8, 4.1, 190 | 5.8, 4.0, 5.6, 4.1, 2.3, 0.7, 1.8, 3.0, 2.6, 0.6, 4.5, 1.6, 4.5, 7.1, 191 | 7.8 192 | ], 193 | "Aromatic_nitrogen_unprotonated": [ 194 | 0.9, 2.9, -0.4, 2.8, 4.0, 5.0, 5.0, 3.7, 4.9, 5.7, 4.9, 4.9, 5.6, 2.3, 195 | 3.3, 3.7, 2.6, 5.0, 5.7, 5.8, 5.8, 5.7, 6.7, 6.0, 6.0, 2.7, 5.5, 6.0, 196 | 5.3, 4.3, 3.2, 4.8, 6.0, 5.6, 6.0, 8.4, 9.0, 8.2, 3.8, 1.8, 3.3, 3.4, 197 | 3.5, 3.4, 3.0, 4.1, 1.8, 5.0, 0.9, 4.8, 3.4, 4.9, 6.7, 1.5, 3.1, 4.2, 198 | 5.4, 3.6, 
4.4, 9.6, 4.9, 7.5, 6.4, 6.0, 5.2, 2.3, 6.8, 3.5, 6.0, 5.7, 199 | 6.8, 6.2, 6.7, 3.1, 6.6, 0.7, 2.7, 0.6, -0.4, 3.0, 1.8, 3.3, 3.5, 1.5, 200 | 2.0, 3.4, 3.5, 4.2, 4.0, 12.7, 2.3, 2.6, 2.3, 2.4, 4.2, 6.2, 0.7, 4.6, 201 | 4.9, 4.2, 4.7, 5.0, 4.2, 4.6, 5.4, 2.7, 5.6, 5.7, 3.7 202 | ], 203 | "Amines_primary_secondary_tertiary": [ 204 | 5.0, 5.5, 8.5, 10.8, 7.0, 11.0, 10.8, 11.3, 11.1, 9.7, 9.0, 9.6, 11.3, 205 | 9.5, 6.3, 7.9, -1.1, 9.1, 10.2, 9.8, 9.1, 5.2, 9.4, 7.1, 7.9, 9.8, 8.1, 206 | 7.8, 8.7, 8.3, 7.4, 6.9, 9.7, 7.0, 9.3, 9.6, 8.7, 9.3, 9.9, 10.3, 7.9, 207 | 7.5, 8.6, 10.6, 8.9, 7.2, 11.9, 8.8, 7.0, 7.8, 10.4, 10.6, 9.2, 8.3, 208 | 8.5, 7.7, 8.8, 8.0, 7.5, 6.1, 6.9, 8.7, 9.1, 5.8, 3.4, 11.1, 10.2, 9.4, 209 | 10.0, 9.7, 9.5, 10.1, 9.6, 11.4, 11.6, 11.9, 11.3, 11.0, 10.3, 7.5, 210 | 8.3, 10.9, 8.2, 10.9, 11.3, 9.0, 8.6, 9.2, 8.9, 7.0, 6.7, 8.3, 8.3, 211 | 10.6, 6.3, 8.3, 8.5, 8.3, 11.9, 10.2, 10.1, 10.6, 8.3, 10.0, 10.7, 4.5, 212 | 10.7, 11.2, 6.7, 6.0, 10.0, 10.9, 8.3, 6.6, 4.2, 4.6, 10.7, 10.5, 10.7, 213 | 10.8, 10.0, 9.1, 6.9, 6.2, 6.5, 4.3, 3.7, 3.6, 8.9, 8.7, 6.2, 9.3, 214 | 10.7, 5.0, 10.4, 8.9, 6.2, 6.7, 6.6, 3.9, 9.0, 8.5, 8.9, 10.1, 4.5, 215 | 8.7, 10.5, 1.0, 7.6, 7.5, 8.7, 1.9, 8.6, 7.5, 6.3, 11.9, 10.6, 10.4, 216 | 10.7, 7.7, 11.0, 12.1, 9.2, 11.1, 9.8, 0.9, 7.7, 3.1, 5.2, 9.0, 8.8, 217 | 9.6, 8.6, 7.0, 7.7, 7.3, 8.4, 8.3, 5.7, 6.0, 6.5, 6.7, 6.5, 3.9, 6.4, 218 | 6.7, 6.7, 9.3, 9.3, 6.6, 5.5, 9.4, 8.0, 8.4, 8.1, 7.8, 7.7, 8.1, 8.1, 219 | 5.5, 10.7, 9.9, 10.4, 8.9, 10.4, 9.9, 7.2, 10.5, 4.5, 5.0, 9.8, 9.4, 220 | 4.8, 8.0, 8.3, 6.5, 7.3, 8.0, 4.2, 9.5, 8.1, 8.1, 9.4, 9.2, 9.8, 9.6, 221 | 9.7, 9.9, 9.9, 9.9, 9.9, 8.1, 8.3, 8.6, 8.9, 8.2, 8.1, 7.0, 8.3, 7.8, 222 | 7.7, 9.1, 8.7, 7.9, 9.9, 10.4, 10.5, 10.2, 10.1, 10.7, 6.2, 6.0, 10.8, 223 | 10.6, 9.0, 6.1, 9.7, 1.7, 7.2, 7.8, 8.7, 7.2, 8.4, 9.4, 7.7, 3.8, 9.1, 224 | 7.3, 9.0, 8.8, 8.9, 9.7, 9.9, 9.4, 9.0, -0.9, 9.2, 9.4, -0.9, 8.7, 5.5, 225 | -0.9, 8.8, 8.3, 10.5, 9.3, 9.6, 9.5, 10.5, 10.6, 11.4, 10.6, 11.4, 
6.8, 226 | 5.3, 9.1, 9.8, 5.3, 5.1, 5.3, 7.0, 7.5, 9.7, 10.2, 6.3, 8.8, 6.2, 5.0, 227 | 0.6, 3.1, 6.6, 7.4, 6.0, 2.6, 8.1, 6.1, 5.8, 5.8, 7.8, 10.1, 9.3, 7.0, 228 | 2.0, 4.5, 1.1, 5.3, -0.6, 4.5, 6.1, 7.0, 8.0, 6.5, 0.8, 0.4, 8.9, 9.1, 229 | 10.7, 9.2, 9.5, -0.9, 5.3, 5.7, 7.1, 5.8, 10.0, 11.5, 5.7, 7.1, 9.5, 230 | 8.6, 9.1, 8.8, 8.6, 9.7, 10.2, 10.4, 10.5, 9.9, 10.4, 10.8, 10.3, 8.8, 231 | 8.2, 7.3, 11.0, 6.8, 10.1, 9.7, 9.8, 9.6, 9.7, 9.8, 10.1, 11.0, 7.7, 232 | 9.6, 8.7, 8.9, 9.2, 9.7, 9.4, 8.6, 8.1, 8.3, 6.2, 8.0, 7.3, 7.0, 12.2, 233 | 10.3, -0.8, 7.6, 6.1, 7.5, 8.1, 3.1, 7.1, 9.1, 8.9, 10.0, 7.6, 9.2, 234 | 8.7, 8.8, 10.7, 10.9, 8.7, 7.7, 7.5, 14.8, 14.8, 14.7 235 | ], 236 | "Phosphinic_acid": [ 237 | 3.3, 3.3, 4.2, 3.6, 3.2, 3.2, 3.2, 3.3, 3.2, 2.4, 2.7, 0.8, 3.1, 3.5, 238 | 2.0, 3.1, 2.5, 2.7, 3.1, 3.0 239 | ], 240 | "Phosphate_diester": [ 241 | 1.1, 1.0, 1.1, 1.2, 1.1, 1.0, 1.1, 0.9, 0.7, 0.5, 3.6, 3.9, 3.6, 1.6, 242 | 2.9, 4.2, 3.5, 12.7, 3.8, 4.1, 3.5, 3.4, 2.4 243 | ], 244 | "Phosphonate_ester": [ 245 | 2.0, 2.2, 2.0, 1.9, 1.9, 1.9, 1.9, 2.0, 1.8, 2.0, 1.7, 1.7, 3.7, 2.1, 246 | 2.2, 2.3, 2.2, 2.1, 2.2, 2.2, 2.2, 2.4, 2.3, 2.3, 0.9 247 | ], 248 | "primary_hydroxyl_amine_2": [ 249 | 4.1, 4.6, 4.4, 4.2, 4.4, 4.3, 2.1 250 | ], 251 | "Indole_pyrrole": [ 252 | 14.9, 15.3, 14.5, 0.1, 17.5, 17.0, 16.5, 16.6, 16.3, 16.3, 15.2, 15.4, 253 | 13.0, 12.5, 15.2, 16.1 254 | ], 255 | "Aromatic_nitrogen_protonated": [ 256 | 7.7, 3.0, 11.1, 9.4, 8.3, 2.9, 8.7, 6.3 257 | ] 258 | } 259 | --------------------------------------------------------------------------------