├── .bumpversion.cfg ├── .gitignore ├── .readthedocs.yml ├── CHANGELOG.md ├── CONTRIBUTING.rst ├── LICENSE ├── MANIFEST.in ├── README.rst ├── azure-pipelines.yml ├── docs ├── Makefile └── source │ ├── _images │ ├── mol1.png │ └── mol1s.png │ ├── api.rst │ ├── conf.py │ ├── guide │ ├── charge.rst │ ├── cli.rst │ ├── contributing.rst │ ├── fragment.rst │ ├── gettingstarted.rst │ ├── install.rst │ ├── intro.rst │ ├── standardize.rst │ ├── tautomer.rst │ └── validate.rst │ └── index.rst ├── environment.yml ├── examples ├── README.rst └── standardization.ipynb ├── molvs ├── __init__.py ├── charge.py ├── cli.py ├── errors.py ├── fragment.py ├── metal.py ├── normalize.py ├── resonance.py ├── standardize.py ├── tautomer.py ├── utils.py ├── validate.py └── validations.py ├── setup.py └── tests ├── __init__.py ├── test_charge.py ├── test_fragment.py ├── test_metal.py ├── test_normalize.py ├── test_resonance.py ├── test_standardize.py ├── test_tautomer.py └── test_validate.py /.bumpversion.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 0.1.1 3 | commit = True 4 | tag = True 5 | 6 | [bumpversion:file:setup.py] 7 | 8 | [bumpversion:file:molvs/__init__.py] 9 | 10 | [bumpversion:file:docs/source/guide/install.rst] 11 | 12 | [bumpversion:file:docs/source/conf.py] 13 | 14 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | *.so 6 | .Python 7 | env/ 8 | build/ 9 | develop-eggs/ 10 | dist/ 11 | downloads/ 12 | eggs/ 13 | .eggs/ 14 | lib/ 15 | lib64/ 16 | parts/ 17 | sdist/ 18 | var/ 19 | *.egg-info/ 20 | .installed.cfg 21 | *.egg 22 | pip-log.txt 23 | pip-delete-this-directory.txt 24 | htmlcov/ 25 | .tox/ 26 | .coverage 27 | .coverage.* 28 | .cache 29 | .pytest_cache 30 | nosetests.xml 31 | coverage.xml 32 | .hypothesis/ 33 
| docs/build/ 34 | .ipynb_checkpoints 35 | .python-version 36 | .env 37 | .venv/ 38 | venv/ 39 | ENV/ 40 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | build: 3 | image: latest 4 | conda: 5 | environment: environment.yml 6 | formats: all 7 | python: 8 | version: 3.7 9 | install: 10 | - method: pip 11 | path: . 12 | sphinx: 13 | configuration: docs/source/conf.py 14 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Change Log 2 | 3 | ## [v0.1.1](https://github.com/mcs07/MolVS/tree/v0.1.1) (2018-04-11) 4 | [Full Changelog](https://github.com/mcs07/MolVS/compare/v0.1.0...v0.1.1) 5 | 6 | **Implemented enhancements:** 7 | 8 | - Add hydrogen to REMOVE\_FRAGMENTS list [\#23](https://github.com/mcs07/MolVS/pull/23) ([JoshuaMeyers](https://github.com/JoshuaMeyers)) 9 | 10 | **Fixed bugs:** 11 | 12 | - Fluorine considered metal [\#24](https://github.com/mcs07/MolVS/issues/24) 13 | - Fix mistake in metal SMARTS [\#25](https://github.com/mcs07/MolVS/pull/25) ([mcs07](https://github.com/mcs07)) 14 | 15 | **Closed issues:** 16 | 17 | - MolVS 0.1.0 Standardization fails on Python 3 [\#22](https://github.com/mcs07/MolVS/issues/22) 18 | - molvs hugs on some molecules [\#21](https://github.com/mcs07/MolVS/issues/21) 19 | 20 | ## [v0.1.0](https://github.com/mcs07/MolVS/tree/v0.1.0) (2018-02-07) 21 | [Full Changelog](https://github.com/mcs07/MolVS/compare/v0.0.9...v0.1.0) 22 | 23 | **Implemented enhancements:** 24 | 25 | - Add support for using conda in development [\#19](https://github.com/mcs07/MolVS/pull/19) ([mcs07](https://github.com/mcs07)) 26 | 27 | **Fixed bugs:** 28 | 29 | - error standardizing ionization [\#15](https://github.com/mcs07/MolVS/issues/15) 30 | - Molecule did not standardise 
overnight [\#14](https://github.com/mcs07/MolVS/issues/14) 31 | - Problem with \_\_repr\_\_ method of TautomerTransform class [\#7](https://github.com/mcs07/MolVS/issues/7) 32 | - Fixed forced charge corrections [\#18](https://github.com/mcs07/MolVS/pull/18) ([mcs07](https://github.com/mcs07)) 33 | - Fixing infinite loop issue in reionization code [\#17](https://github.com/mcs07/MolVS/pull/17) ([coleb](https://github.com/coleb)) 34 | - Fix TautomerTransform repr - fixes \#7 [\#8](https://github.com/mcs07/MolVS/pull/8) ([mcs07](https://github.com/mcs07)) 35 | 36 | ## [v0.0.9](https://github.com/mcs07/MolVS/tree/v0.0.9) (2017-01-27) 37 | [Full Changelog](https://github.com/mcs07/MolVS/compare/v0.0.8...v0.0.9) 38 | 39 | ## [v0.0.8](https://github.com/mcs07/MolVS/tree/v0.0.8) (2016-12-12) 40 | [Full Changelog](https://github.com/mcs07/MolVS/compare/v0.0.7...v0.0.8) 41 | 42 | **Fixed bugs:** 43 | 44 | - Standardizer gets stuck on a molecule [\#4](https://github.com/mcs07/MolVS/issues/4) 45 | - Fix reionizer infinite loop - fixes \#4 [\#5](https://github.com/mcs07/MolVS/pull/5) ([mcs07](https://github.com/mcs07)) 46 | 47 | ## [v0.0.7](https://github.com/mcs07/MolVS/tree/v0.0.7) (2016-12-04) 48 | [Full Changelog](https://github.com/mcs07/MolVS/compare/v0.0.6...v0.0.7) 49 | 50 | ## [v0.0.6](https://github.com/mcs07/MolVS/tree/v0.0.6) (2016-12-04) 51 | [Full Changelog](https://github.com/mcs07/MolVS/compare/v0.0.5...v0.0.6) 52 | 53 | ## [v0.0.5](https://github.com/mcs07/MolVS/tree/v0.0.5) (2016-02-15) 54 | [Full Changelog](https://github.com/mcs07/MolVS/compare/v0.0.4...v0.0.5) 55 | 56 | ## [v0.0.4](https://github.com/mcs07/MolVS/tree/v0.0.4) (2016-01-05) 57 | [Full Changelog](https://github.com/mcs07/MolVS/compare/v0.0.3...v0.0.4) 58 | 59 | ## [v0.0.3](https://github.com/mcs07/MolVS/tree/v0.0.3) (2014-08-19) 60 | [Full Changelog](https://github.com/mcs07/MolVS/compare/v0.0.2...v0.0.3) 61 | 62 | ## [v0.0.2](https://github.com/mcs07/MolVS/tree/v0.0.2) (2014-08-06) 63 | 
-------------------------------------------------------------------------------- /CONTRIBUTING.rst: -------------------------------------------------------------------------------- 1 | Contributing 2 | ============ 3 | 4 | .. sectionauthor:: Matt Swain 5 | 6 | Contributions of any kind are greatly appreciated! 7 | 8 | Feedback 9 | -------- 10 | 11 | The `Issue Tracker`_ is the best place to post any feature ideas, requests and bug reports. 12 | 13 | The following are especially welcome: 14 | 15 | - General feedback on whether any standardization stages should work differently. 16 | - Specific molecules that don't validate or standardize as expected. 17 | - Ideas for new validation and standardization stages. 18 | 19 | Contributing 20 | ------------ 21 | 22 | If you are able to contribute changes yourself, just fork the `source code`_ on GitHub, make changes and file a pull 23 | request. All contributions are welcome, no matter how big or small. 24 | 25 | The following are especially welcome: 26 | 27 | - New validation or standardization stages. 28 | - Alternative tautomer transforms and scores. 29 | - Lists of salts and solvents to strip out. 30 | - New or improved documentation of existing features. 31 | 32 | Quick guide to contributing 33 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~ 34 | 35 | 1. `Fork the MolVS repository on GitHub`_, then clone your fork to your local machine:: 36 | 37 | git clone https://github.com/<username>/MolVS.git 38 | cd MolVS 39 | 40 | 2. Install the development requirements into a `conda environment`_:: 41 | 42 | conda env create -n molvs -f environment.yml 43 | source activate molvs 44 | 45 | 3. Create a new branch for your changes:: 46 | 47 | git checkout -b <branch-name> 48 | 49 | 4. Make your changes or additions. Ideally add some tests and ensure they pass by running:: 50 | 51 | pytest 52 | 53 | 5. Commit your changes and push to your fork on GitHub:: 54 | 55 | git add . 56 | git commit -m "<description of changes>" 57 | git push origin <branch-name> 58 | 59 | 6. `Submit a pull request`_. 
60 | 61 | Tips 62 | ~~~~ 63 | 64 | - Follow the `PEP8`_ style guide. 65 | - Include docstrings as described in `PEP257`_. 66 | - Try and include tests that cover your changes. 67 | - Try to write `good commit messages`_. 68 | - Consider `squashing your commits`_ with rebase. 69 | - Read the GitHub help page on `Using pull requests`_. 70 | 71 | .. _`Issue Tracker`: https://github.com/mcs07/MolVS/issues 72 | .. _`source code`: https://github.com/mcs07/MolVS 73 | .. _`Fork the MolVS repository on GitHub`: https://github.com/mcs07/MolVS/fork 74 | .. _`conda environment`: https://conda.io/docs/ 75 | .. _`Submit a pull request`: https://github.com/mcs07/MolVS/compare/ 76 | .. _`squashing your commits`: http://gitready.com/advanced/2009/02/10/squashing-commits-with-rebase.html 77 | .. _`PEP8`: https://www.python.org/dev/peps/pep-0008 78 | .. _`PEP257`: https://www.python.org/dev/peps/pep-0257 79 | .. _`good commit messages`: http://tbaggery.com/2008/04/19/a-note-about-git-commit-messages.html 80 | .. _`Using pull requests`: https://help.github.com/articles/using-pull-requests 81 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright 2019 Matt Swain 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.rst 2 | include LICENSE 3 | recursive-include tests *.py 4 | recursive-include docs * 5 | recursive-include requirements *.txt 6 | prune docs/build 7 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | MolVS: Molecule Validation and Standardization 2 | ============================================== 3 | 4 | .. image:: https://img.shields.io/pypi/v/MolVS.svg?style=flat-square 5 | :alt: PyPI package 6 | :target: https://pypi.python.org/pypi/MolVS 7 | 8 | .. image:: https://img.shields.io/conda/vn/conda-forge/molvs.svg?style=flat-square 9 | :alt: Conda package 10 | :target: https://anaconda.org/conda-forge/molvs 11 | 12 | .. image:: https://img.shields.io/github/license/mcs07/MolVS.svg?style=flat-square 13 | :alt: MIT license 14 | :target: https://github.com/mcs07/MolVS/blob/master/LICENSE 15 | 16 | .. image:: https://img.shields.io/azure-devops/build/mcs07/MolVS/1.svg?style=flat-square 17 | :alt: Azure DevOps tests 18 | :target: https://dev.azure.com/mcs07/MolVS/_build?definitionId=1 19 | 20 | **MolVS** is a molecule validation and standardization tool, written in Python using the `RDKit chemistry framework`_. 
21 | 22 | Building a collection of chemical structures from different sources can be difficult due to differing representations, 23 | drawing conventions and mistakes. MolVS can standardize chemical structures to improve data quality, help with 24 | de-duplication and identify relationships between molecules. 25 | 26 | There are sensible defaults that make it easy to get started:: 27 | 28 | >>> from molvs import standardize_smiles 29 | >>> standardize_smiles('[Na]OC(=O)c1ccc(C[S+2]([O-])([O-]))cc1') 30 | '[Na+].O=C([O-])c1ccc(CS(=O)=O)cc1' 31 | 32 | Installation 33 | ------------ 34 | 35 | To install MolVS with Anaconda Python, simply run:: 36 | 37 | conda install -c conda-forge molvs 38 | 39 | Alternatively, try one of the other `installation options`_. 40 | 41 | Documentation 42 | ------------- 43 | 44 | Full documentation is available at https://molvs.readthedocs.io. 45 | 46 | Contribute 47 | ---------- 48 | 49 | - Feature ideas and bug reports are welcome on the `Issue Tracker`_. 50 | - Fork the `source code`_ on GitHub, make changes and send a pull request. 51 | 52 | License 53 | ------- 54 | 55 | MolVS is licensed under the `MIT license`_. 56 | 57 | Similar projects 58 | ---------------- 59 | 60 | There are a number of projects with similar goals that take differing approaches: 61 | 62 | - `Francis Atkinson's Standardiser`_ 63 | - `RSC Chemistry Validation and Standardization Platform (CVSP)`_ 64 | - `PubChem Standardization Service`_ 65 | - `Tripod Structure standardizer`_ 66 | - `FDA Substance Registration System Standard Operating Procedure`_ 67 | - `ChemAxon Structure Standardizer`_ 68 | 69 | 70 | .. _`RDKit chemistry framework`: http://www.rdkit.org 71 | .. _`installation options`: http://molvs.readthedocs.io/en/latest/guide/install.html 72 | .. _`source code`: https://github.com/mcs07/MolVS 73 | .. _`Issue Tracker`: https://github.com/mcs07/MolVS/issues 74 | .. _`MIT license`: https://github.com/mcs07/MolVS/blob/master/LICENSE 75 | .. 
_`Francis Atkinson's Standardiser`: https://wwwdev.ebi.ac.uk/chembl/extra/francis/standardiser/ 76 | .. _`RSC Chemistry Validation and Standardization Platform (CVSP)`: http://cvsp.chemspider.com 77 | .. _`PubChem Standardization Service`: https://pubchem.ncbi.nlm.nih.gov/standardize/standardize.cgi 78 | .. _`Tripod Structure standardizer`: https://tripod.nih.gov/?p=61 79 | .. _`FDA Substance Registration System Standard Operating Procedure`: http://www.fda.gov/downloads/ForIndustry/DataStandards/SubstanceRegistrationSystem-UniqueIngredientIdentifierUNII/ucm127743.pdf 80 | .. _`ChemAxon Structure Standardizer`: http://www.chemaxon.com/products/standardizer/ 81 | -------------------------------------------------------------------------------- /azure-pipelines.yml: -------------------------------------------------------------------------------- 1 | jobs: 2 | - job: Test 3 | strategy: 4 | matrix: 5 | linux_python36: 6 | VM_IMAGE: ubuntu-16.04 7 | PYTHON_VERSION: '3.6' 8 | linux_python37: 9 | VM_IMAGE: ubuntu-16.04 10 | PYTHON_VERSION: '3.7' 11 | mac_python36: 12 | VM_IMAGE: macOS-10.13 13 | PYTHON_VERSION: '3.6' 14 | mac_python37: 15 | VM_IMAGE: macOS-10.13 16 | PYTHON_VERSION: '3.7' 17 | win_python36: 18 | VM_IMAGE: vs2017-win2016 19 | PYTHON_VERSION: '3.6' 20 | win_python37: 21 | VM_IMAGE: vs2017-win2016 22 | PYTHON_VERSION: '3.7' 23 | maxParallel: 4 24 | pool: 25 | vmImage: $(VM_IMAGE) 26 | steps: 27 | - bash: echo "##vso[task.prependpath]/usr/share/miniconda/bin" 28 | condition: eq( variables['Agent.OS'], 'Linux' ) 29 | displayName: Add conda to PATH (Linux) 30 | - bash: echo "##vso[task.prependpath]$CONDA/bin" 31 | condition: eq( variables['Agent.OS'], 'Darwin' ) 32 | displayName: Add conda to PATH (Mac) 33 | - powershell: Write-Host "##vso[task.prependpath]$env:CONDA\Scripts" 34 | condition: eq( variables['Agent.OS'], 'Windows_NT' ) 35 | displayName: Add conda to PATH (Windows) 36 | - script: conda env create --quiet --name molvs python=$(PYTHON_VERSION) 37 | 
displayName: Create conda environment 38 | - script: conda env update --quiet --name molvs --file environment.yml 39 | displayName: Install dependencies 40 | - bash: | 41 | source activate molvs 42 | python -m pip install . --no-deps 43 | pytest --junitxml=result.xml 44 | displayName: Run tests (Linux/Mac) 45 | condition: ne( variables['Agent.OS'], 'Windows_NT' ) 46 | - script: | 47 | call activate molvs 48 | python -m pip install . --no-deps 49 | pytest --junitxml=result.xml 50 | displayName: Run tests (Windows) 51 | condition: eq( variables['Agent.OS'], 'Windows_NT' ) 52 | - task: PublishTestResults@2 53 | inputs: 54 | testResultsFiles: result.xml 55 | testRunTitle: Python $(PYTHON_VERSION) ($(VM_IMAGE)) 56 | condition: succeededOrFailed() 57 | 58 | - job: Package 59 | dependsOn: Test 60 | condition: and(succeeded(), or(eq(variables['Build.SourceBranch'], 'refs/heads/master'), startsWith(variables['Build.SourceBranch'], 'refs/tags/'))) 61 | pool: 62 | vmImage: ubuntu-16.04 63 | steps: 64 | - bash: echo "##vso[task.prependpath]/usr/share/miniconda/bin" 65 | displayName: Add conda to PATH (Linux) 66 | - script: conda env create --quiet --name molvs --file environment.yml 67 | displayName: Create conda environment 68 | - script: | 69 | source activate molvs 70 | python setup.py sdist bdist_wheel --universal 71 | displayName: Build sdist and wheel 72 | - task: PublishPipelineArtifact@0 73 | inputs: 74 | artifactName: molvs-$(Build.BuildNumber) 75 | targetPath: dist 76 | displayName: Publish dist artifact 77 | 78 | - job: PyPI 79 | dependsOn: Package 80 | condition: and(succeeded(), startsWith(variables['Build.SourceBranch'], 'refs/tags/')) 81 | pool: 82 | vmImage: ubuntu-16.04 83 | steps: 84 | - task: DownloadPipelineArtifact@0 85 | inputs: 86 | artifactName: molvs-$(Build.BuildNumber) 87 | targetPath: dist 88 | displayName: Download dist artifact 89 | - task: TwineAuthenticate@0 90 | inputs: 91 | externalFeeds: pypi 92 | displayName: Authenticate with PyPI 93 | - 
script: twine upload -r pypi --config-file $PYPIRC_PATH dist/* 94 | displayName: Upload to PyPI 95 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source 21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an 
epub" 35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 36 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 38 | @echo " text to make text files" 39 | @echo " man to make manual pages" 40 | @echo " texinfo to make Texinfo files" 41 | @echo " info to make Texinfo files and run them through makeinfo" 42 | @echo " gettext to make PO message catalogs" 43 | @echo " changes to make an overview of all changed/added/deprecated items" 44 | @echo " xml to make Docutils-native XML files" 45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 46 | @echo " linkcheck to check all external links for integrity" 47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 48 | 49 | clean: 50 | rm -rf $(BUILDDIR)/* 51 | 52 | html: 53 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 54 | @echo 55 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 56 | 57 | dirhtml: 58 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 59 | @echo 60 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 61 | 62 | singlehtml: 63 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 64 | @echo 65 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 66 | 67 | pickle: 68 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 69 | @echo 70 | @echo "Build finished; now you can process the pickle files." 71 | 72 | json: 73 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 74 | @echo 75 | @echo "Build finished; now you can process the JSON files." 76 | 77 | htmlhelp: 78 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 79 | @echo 80 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 81 | ".hhp project file in $(BUILDDIR)/htmlhelp." 
82 | 83 | qthelp: 84 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 85 | @echo 86 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 87 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 88 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/MolVS.qhcp" 89 | @echo "To view the help file:" 90 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/MolVS.qhc" 91 | 92 | devhelp: 93 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 94 | @echo 95 | @echo "Build finished." 96 | @echo "To view the help file:" 97 | @echo "# mkdir -p $$HOME/.local/share/devhelp/MolVS" 98 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/MolVS" 99 | @echo "# devhelp" 100 | 101 | epub: 102 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 103 | @echo 104 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 105 | 106 | latex: 107 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 108 | @echo 109 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 110 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 111 | "(use \`make latexpdf' here to do that automatically)." 112 | 113 | latexpdf: 114 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 115 | @echo "Running LaTeX files through pdflatex..." 116 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 117 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 118 | 119 | latexpdfja: 120 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 121 | @echo "Running LaTeX files through platex and dvipdfmx..." 122 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 123 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 124 | 125 | text: 126 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 127 | @echo 128 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 
129 | 130 | man: 131 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 132 | @echo 133 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 134 | 135 | texinfo: 136 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 137 | @echo 138 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 139 | @echo "Run \`make' in that directory to run these through makeinfo" \ 140 | "(use \`make info' here to do that automatically)." 141 | 142 | info: 143 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 144 | @echo "Running Texinfo files through makeinfo..." 145 | make -C $(BUILDDIR)/texinfo info 146 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 147 | 148 | gettext: 149 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 150 | @echo 151 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 152 | 153 | changes: 154 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 155 | @echo 156 | @echo "The overview file is in $(BUILDDIR)/changes." 157 | 158 | linkcheck: 159 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 160 | @echo 161 | @echo "Link check complete; look for any errors in the above output " \ 162 | "or in $(BUILDDIR)/linkcheck/output.txt." 163 | 164 | doctest: 165 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 166 | @echo "Testing of doctests in the sources finished, look at the " \ 167 | "results in $(BUILDDIR)/doctest/output.txt." 168 | 169 | xml: 170 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 171 | @echo 172 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 173 | 174 | pseudoxml: 175 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 176 | @echo 177 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 
178 | -------------------------------------------------------------------------------- /docs/source/_images/mol1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mcs07/MolVS/d815fe52d160abcecbcbf117e6437bf727dbd8ad/docs/source/_images/mol1.png -------------------------------------------------------------------------------- /docs/source/_images/mol1s.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mcs07/MolVS/d815fe52d160abcecbcbf117e6437bf727dbd8ad/docs/source/_images/mol1s.png -------------------------------------------------------------------------------- /docs/source/api.rst: -------------------------------------------------------------------------------- 1 | .. _api: 2 | 3 | API documentation 4 | ================= 5 | 6 | .. sectionauthor:: Matt Swain 7 | 8 | .. module:: molvs 9 | 10 | This part of the documentation is automatically generated from the MolVS source code and comments. 11 | 12 | The MolVS package is made up of the following modules: 13 | 14 | .. contents:: 15 | :local: 16 | :depth: 1 17 | 18 | .. automodule:: molvs.standardize 19 | .. autoclass:: molvs.standardize.Standardizer(normalizations=NORMALIZATIONS, acid_base_pairs=ACID_BASE_PAIRS, tautomer_transforms=TAUTOMER_TRANSFORMS, tautomer_scores=TAUTOMER_SCORES, max_restarts=MAX_RESTARTS, max_tautomers=MAX_TAUTOMERS, prefer_organic=PREFER_ORGANIC) 20 | :special-members: __call__ 21 | :members: 22 | .. autofunction:: molvs.standardize.standardize_smiles 23 | .. autofunction:: molvs.standardize.enumerate_tautomers_smiles 24 | .. autofunction:: molvs.standardize.canonicalize_tautomer_smiles 25 | 26 | .. automodule:: molvs.normalize 27 | .. autodata:: molvs.normalize.NORMALIZATIONS 28 | :annotation: 29 | .. autodata:: molvs.normalize.MAX_RESTARTS 30 | :annotation: = 200 31 | .. autoclass:: molvs.normalize.Normalization 32 | :members: 33 | .. 
autoclass:: molvs.normalize.Normalizer(normalizations=NORMALIZATIONS, max_restarts=MAX_RESTARTS) 34 | :special-members: __call__ 35 | :members: 36 | 37 | .. automodule:: molvs.metal 38 | .. autoclass:: molvs.metal.MetalDisconnector 39 | :special-members: __call__ 40 | :members: 41 | 42 | .. automodule:: molvs.tautomer 43 | .. autodata:: molvs.tautomer.TAUTOMER_TRANSFORMS 44 | :annotation: 45 | .. autodata:: molvs.tautomer.TAUTOMER_SCORES 46 | :annotation: 47 | .. autodata:: molvs.tautomer.MAX_TAUTOMERS 48 | :annotation: = 1000 49 | .. autoclass:: molvs.tautomer.TautomerTransform 50 | :members: 51 | .. autoclass:: molvs.tautomer.TautomerScore 52 | :members: 53 | .. autoclass:: molvs.tautomer.TautomerCanonicalizer(transforms=TAUTOMER_TRANSFORMS, scores=TAUTOMER_SCORES, max_tautomers=MAX_TAUTOMERS) 54 | :special-members: __call__ 55 | :members: 56 | .. autoclass:: molvs.tautomer.TautomerEnumerator(transforms=TAUTOMER_TRANSFORMS, max_tautomers=MAX_TAUTOMERS) 57 | :special-members: __call__ 58 | :members: 59 | 60 | .. automodule:: molvs.fragment 61 | .. autodata:: molvs.fragment.REMOVE_FRAGMENTS 62 | :annotation: 63 | .. autodata:: molvs.fragment.LEAVE_LAST 64 | :annotation: = True 65 | .. autodata:: molvs.fragment.PREFER_ORGANIC 66 | :annotation: = False 67 | .. autoclass:: molvs.fragment.FragmentPattern 68 | :members: 69 | .. autofunction:: molvs.fragment.is_organic 70 | .. autoclass:: molvs.fragment.FragmentRemover(fragments=REMOVE_FRAGMENTS, leave_last=LEAVE_LAST) 71 | :special-members: __call__ 72 | :members: 73 | .. autoclass:: molvs.fragment.LargestFragmentChooser(prefer_organic=PREFER_ORGANIC) 74 | :special-members: __call__ 75 | :members: 76 | 77 | .. automodule:: molvs.charge 78 | .. autodata:: molvs.charge.ACID_BASE_PAIRS 79 | :annotation: 80 | .. autoclass:: molvs.charge.AcidBasePair 81 | :members: 82 | .. autoclass:: molvs.charge.Reionizer(acid_base_pairs=ACID_BASE_PAIRS) 83 | :special-members: __call__ 84 | :members: 85 | .. 
autoclass:: molvs.charge.Uncharger 86 | :special-members: __call__ 87 | :members: 88 | 89 | .. automodule:: molvs.validate 90 | .. autodata:: molvs.validate.SIMPLE_FORMAT 91 | :annotation: = '%(levelname)s: [%(validation)s] %(message)s' 92 | .. autodata:: molvs.validate.LONG_FORMAT 93 | :annotation: = '%(asctime)s - %(levelname)s - %(validation)s - %(message)s' 94 | .. autoclass:: molvs.validate.Validator(validations=VALIDATIONS, log_format=SIMPLE_FORMAT, level=logging.INFO, stdout=False, raw=False) 95 | :special-members: __call__ 96 | :members: 97 | .. autofunction:: molvs.validate.validate_smiles 98 | 99 | .. _molvs_validations: 100 | .. automodule:: molvs.validations 101 | .. autodata:: molvs.validations.VALIDATIONS 102 | :annotation: 103 | .. autoclass:: molvs.validations.Validation 104 | :members: 105 | .. autoclass:: molvs.validations.SmartsValidation 106 | :members: 107 | .. autoclass:: molvs.validations.IsNoneValidation 108 | .. autoclass:: molvs.validations.NoAtomValidation 109 | .. autoclass:: molvs.validations.DichloroethaneValidation 110 | .. autoclass:: molvs.validations.FragmentValidation 111 | .. autoclass:: molvs.validations.NeutralValidation 112 | .. autoclass:: molvs.validations.IsotopeValidation 113 | 114 | .. automodule:: molvs.cli 115 | 116 | .. automodule:: molvs.errors 117 | .. autoexception:: molvs.errors.MolVSError() 118 | .. autoexception:: molvs.errors.StandardizeError() 119 | .. autoexception:: molvs.errors.ValidateError() 120 | .. autoexception:: molvs.errors.StopValidateError() 121 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # MolVS documentation build configuration file, created by sphinx-quickstart on Thu Apr 24 14:35:38 2014. 4 | # This file is execfile()d with the current directory set to its containing dir. 
5 | # Note that not all possible configuration values are present in this autogenerated file. 6 | # All configuration values have a default; values that are commented out serve to show the default. 7 | 8 | import sys 9 | import os 10 | 11 | # on_rtd is whether we are on readthedocs.org 12 | on_rtd = os.environ.get('READTHEDOCS', None) == 'True' 13 | 14 | # If extensions (or modules to document with autodoc) are in another directory, add these directories to sys.path here. 15 | # If the directory is relative to the documentation root, use os.path.abspath to make it absolute, like shown here. 16 | sys.path.insert(0, os.path.abspath('../..')) 17 | 18 | # -- General configuration ------------------------------------------------ 19 | 20 | # If your documentation needs a minimal Sphinx version, state it here. 21 | #needs_sphinx = '1.0' 22 | 23 | # Add any Sphinx extension module names here, as strings. They can be extensions coming with Sphinx 24 | # (named 'sphinx.ext.*') or your custom ones. 25 | extensions = [ 26 | 'sphinx.ext.autodoc', 27 | 'sphinx.ext.intersphinx', 28 | 'sphinx.ext.coverage', 29 | 'sphinx.ext.ifconfig', 30 | 'sphinx.ext.viewcode', 31 | 'sphinx.ext.extlinks', 32 | ] 33 | 34 | # Add any paths that contain templates here, relative to this directory. 35 | templates_path = ['_templates'] 36 | 37 | # The suffix of source filenames. 38 | source_suffix = '.rst' 39 | 40 | # The encoding of source files. 41 | #source_encoding = 'utf-8-sig' 42 | 43 | # The master toctree document. 44 | master_doc = 'index' 45 | 46 | # General information about the project. 47 | project = u'MolVS' 48 | copyright = u'2016, Matt Swain' 49 | 50 | # The version info for the project you're documenting, acts as replacement for |version| and |release|, also used in 51 | # various other places throughout the built documents. 52 | # The short X.Y version. 53 | version = '0.1.1' 54 | # The full version, including alpha/beta/rc tags. 
55 | release = '0.1.1' 56 | 57 | # The language for content autogenerated by Sphinx. Refer to documentation for a list of supported languages. 58 | #language = None 59 | 60 | # There are two options for replacing |today|: either, you set today to some non-false value, then it is used: 61 | #today = '' 62 | # Else, today_fmt is used as the format for a strftime call. 63 | #today_fmt = '%B %d, %Y' 64 | 65 | # List of patterns, relative to source directory, that match files and directories to ignore when looking for source 66 | # files. 67 | exclude_patterns = [] 68 | 69 | # The reST default role (used for this markup: `text`) to use for all documents. 70 | #default_role = None 71 | 72 | # If true, '()' will be appended to :func: etc. cross-reference text. 73 | #add_function_parentheses = True 74 | 75 | # If true, the current module name will be prepended to all description unit titles (such as .. function::). 76 | #add_module_names = True 77 | 78 | # If true, sectionauthor and moduleauthor directives will be shown in the output. They are ignored by default. 79 | #show_authors = False 80 | 81 | # The name of the Pygments (syntax highlighting) style to use. 82 | pygments_style = 'sphinx' 83 | 84 | # A list of ignored prefixes for module index sorting. 85 | #modindex_common_prefix = [] 86 | 87 | # If true, keep warnings as "system message" paragraphs in the built documents. 88 | #keep_warnings = False 89 | 90 | 91 | # -- Options for HTML output ---------------------------------------------- 92 | 93 | # The theme to use for HTML and HTML Help pages. See the documentation for a list of builtin themes. 94 | if not on_rtd: # only import and set the theme if we're building docs locally 95 | import sphinx_rtd_theme 96 | html_theme = 'sphinx_rtd_theme' 97 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 98 | 99 | #html_theme = 'default' 100 | 101 | # Theme options are theme-specific and customize the look and feel of a theme further. 
For a list of options available 102 | # for each theme, see the documentation. 103 | #html_theme_options = {} 104 | 105 | # Add any paths that contain custom themes here, relative to this directory. 106 | #html_theme_path = [] 107 | 108 | # The name for this set of Sphinx documents. If None, it defaults to " v documentation". 109 | #html_title = None 110 | 111 | # A shorter title for the navigation bar. Default is the same as html_title. 112 | #html_short_title = None 113 | 114 | # The name of an image file (relative to this directory) to place at the top of the sidebar. 115 | #html_logo = None 116 | 117 | # The name of an image file (within the static path) to use as favicon of the docs. This file should be a Windows icon 118 | # file (.ico) being 16x16 or 32x32 pixels large. 119 | #html_favicon = None 120 | 121 | # Add any paths that contain custom static files (such as style sheets) here, relative to this directory. They are 122 | # copied after the builtin static files, so a file named "default.css" will overwrite the builtin "default.css". 123 | html_static_path = ['_static'] 124 | 125 | # Add any extra paths that contain custom files (such as robots.txt or .htaccess) here, relative to this directory. 126 | # These files are copied directly to the root of the documentation. 127 | #html_extra_path = [] 128 | 129 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, using the given strftime format. 130 | #html_last_updated_fmt = '%b %d, %Y' 131 | 132 | # If true, SmartyPants will be used to convert quotes and dashes to typographically correct entities. 133 | #html_use_smartypants = True 134 | 135 | # Custom sidebar templates, maps document names to template names. 136 | #html_sidebars = {} 137 | 138 | # Additional templates that should be rendered to pages, maps page names to template names. 139 | #html_additional_pages = {} 140 | 141 | # If false, no module index is generated. 
142 | #html_domain_indices = True 143 | 144 | # If false, no index is generated. 145 | #html_use_index = True 146 | 147 | # If true, the index is split into individual pages for each letter. 148 | #html_split_index = False 149 | 150 | # If true, links to the reST sources are added to the pages. 151 | #html_show_sourcelink = True 152 | 153 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 154 | #html_show_sphinx = True 155 | 156 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 157 | #html_show_copyright = True 158 | 159 | # If true, an OpenSearch description file will be output, and all pages will contain a tag referring to it. The 160 | # value of this option must be the base URL from which the finished HTML is served. 161 | #html_use_opensearch = '' 162 | 163 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 164 | #html_file_suffix = None 165 | 166 | # Output file base name for HTML help builder. 167 | htmlhelp_basename = 'MolVSdoc' 168 | 169 | 170 | # -- Options for LaTeX output --------------------------------------------- 171 | 172 | latex_elements = { 173 | # The paper size ('letterpaper' or 'a4paper'). 174 | 'papersize': 'a4paper', 175 | 176 | # The font size ('10pt', '11pt' or '12pt'). 177 | 'pointsize': '12pt', 178 | 179 | # Additional stuff for the LaTeX preamble. 180 | #'preamble': '', 181 | } 182 | 183 | # Grouping the document tree into LaTeX files. List of tuples 184 | # (source start file, target name, title, author, documentclass [howto, manual, or own class]). 185 | latex_documents = [ 186 | ('index', 'MolVS.tex', u'MolVS Documentation', u'Matt Swain', 'manual'), 187 | ] 188 | 189 | # The name of an image file (relative to this directory) to place at the top of the title page. 190 | #latex_logo = None 191 | 192 | # For "manual" documents, if this is true, then toplevel headings are parts, not chapters. 
193 | latex_use_parts = False 194 | 195 | # If true, show page references after internal links. 196 | latex_show_pagerefs = True 197 | 198 | # If true, show URL addresses after external links. 199 | latex_show_urls = True 200 | 201 | # Documents to append as an appendix to all manuals. 202 | #latex_appendices = [] 203 | 204 | # If false, no module index is generated. 205 | latex_domain_indices = False 206 | 207 | 208 | # -- Options for manual page output --------------------------------------- 209 | 210 | # One entry per manual page. List of tuples (source start file, name, description, authors, manual section). 211 | man_pages = [ 212 | ('index', 'molvs', u'MolVS Documentation', [u'Matt Swain'], 1) 213 | ] 214 | 215 | # If true, show URL addresses after external links. 216 | #man_show_urls = False 217 | 218 | 219 | # -- Options for Texinfo output ------------------------------------------- 220 | 221 | # Grouping the document tree into Texinfo files. List of tuples 222 | # (source start file, target name, title, author, dir menu entry, description, category) 223 | texinfo_documents = [ 224 | ('index', 'MolVS', u'MolVS Documentation', u'Matt Swain', 'MolVS', 225 | 'One line description of project.', 'Miscellaneous'), 226 | ] 227 | 228 | # Documents to append as an appendix to all manuals. 229 | #texinfo_appendices = [] 230 | 231 | # If false, no module index is generated. 232 | #texinfo_domain_indices = True 233 | 234 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 235 | #texinfo_show_urls = 'footnote' 236 | 237 | # If true, do not generate a @detailmenu in the "Top" node's menu. 238 | #texinfo_no_detailmenu = False 239 | 240 | 241 | # Example configuration for intersphinx: refer to the Python standard library. 
242 | intersphinx_mapping = { 243 | 'python': ('https://docs.python.org/3', None), 244 | 'rdkit': ('http://www.rdkit.org/docs', None) 245 | } 246 | 247 | # Sort autodoc members by the order they appear in the source code 248 | autodoc_member_order = 'bysource' 249 | 250 | # Concatenate the class and __init__ docstrings together 251 | autoclass_content = 'both' 252 | -------------------------------------------------------------------------------- /docs/source/guide/charge.rst: -------------------------------------------------------------------------------- 1 | .. _charge: 2 | 3 | Charges 4 | ======= 5 | 6 | .. sectionauthor:: Matt Swain 7 | 8 | This page gives details on dealing with charges in molecules. 9 | 10 | 11 | Acid reionization 12 | ----------------- 13 | 14 | - Ensure the strongest acid groups ionize first in partially ionized molecules. 15 | 16 | Neutralization 17 | -------------- 18 | 19 | - Attempt to neutralize charges by adding and/or removing hydrogens where possible. 20 | - Not always possible to produce a neutral molecule. 21 | -------------------------------------------------------------------------------- /docs/source/guide/cli.rst: -------------------------------------------------------------------------------- 1 | .. _cli: 2 | 3 | Command Line Tool 4 | ================= 5 | 6 | .. sectionauthor:: Matt Swain 7 | 8 | MolVS comes with a simple command line tool that allows standardization and validation by typing ``molvs`` at the 9 | command line. 
10 | 11 | Standardization 12 | --------------- 13 | 14 | See standardization help by typing ``molvs standardize -h``:: 15 | 16 | usage: molvs standardize [infile] [-i {smi,mol,sdf}] [-O ] 17 | [-o {smi,mol,sdf}] [-: ] 18 | 19 | positional arguments: 20 | infile input filename 21 | 22 | optional arguments: 23 | -i {smi,mol,sdf}, --intype {smi,mol,sdf} 24 | input filetype 25 | -: , --smiles 26 | input SMILES instead of file 27 | -O , --outfile 28 | output filename 29 | -o {smi,mol,sdf}, --outtype {smi,mol,sdf} 30 | output filetype 31 | 32 | 33 | 34 | 35 | Validation 36 | ---------- 37 | 38 | See validation help by typing ``molvs validate -h``:: 39 | 40 | usage: molvs validate [infile] [-i {smi,mol,sdf}] [-O ] 41 | [-: ] 42 | 43 | positional arguments: 44 | infile input filename 45 | 46 | optional arguments: 47 | -i {smi,mol,sdf}, --intype {smi,mol,sdf} 48 | input filetype 49 | -: , --smiles 50 | input SMILES instead of file 51 | -O , --outfile 52 | output filename 53 | 54 | 55 | Examples 56 | -------- 57 | 58 | SMILES standardization: 59 | 60 | .. code-block:: bash 61 | 62 | $ molvs standardize -:"C[n+]1c([N-](C))cccc1" 63 | CN=c1ccccn1C 64 | 65 | Specifying an output format: 66 | 67 | .. code-block:: bash 68 | 69 | $ molvs standardize -:"[N](=O)(=O)O" -o mol 70 | 71 | RDKit 72 | 73 | 4 3 0 0 0 0 0 0 0 0999 V2000 74 | 0.0000 0.0000 0.0000 N 0 0 0 0 0 0 0 0 0 0 0 0 75 | 0.0000 0.0000 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0 76 | 0.0000 0.0000 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0 77 | 0.0000 0.0000 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0 78 | 1 2 1 0 79 | 1 3 2 0 80 | 1 4 1 0 81 | M CHG 2 1 1 2 -1 82 | M END 83 | 84 | Using stdin: 85 | 86 | .. code-block:: bash 87 | 88 | $ echo "C[n+]1c([N-](C))cccc1" | molvs standardize 89 | CN=c1ccccn1C 90 | 91 | Specifying an input file: 92 | 93 | .. code-block:: bash 94 | 95 | $ molvs standardize example.mol 96 | CN=c1ccccn1C 97 | 98 | Specifying an output file: 99 | 100 | .. 
code-block:: bash 101 | 102 | $ molvs standardize example.mol -O output.smi 103 | $ molvs standardize example.mol -O output.mol 104 | $ molvs standardize example.mol -O output -o mol 105 | 106 | Logging validations to stdout: 107 | 108 | .. code-block:: bash 109 | 110 | $ molvs validate -:"O=C([O-])c1ccccc1" 111 | INFO: [NeutralValidation] Not an overall neutral system (-1) 112 | 113 | Logging validations to a file: 114 | 115 | .. code-block:: bash 116 | 117 | $ molvs validate -:"O=C([O-])c1ccccc1" -O logs.txt 118 | -------------------------------------------------------------------------------- /docs/source/guide/contributing.rst: -------------------------------------------------------------------------------- 1 | .. _contributing: 2 | 3 | .. include:: ../../../CONTRIBUTING.rst 4 | -------------------------------------------------------------------------------- /docs/source/guide/fragment.rst: -------------------------------------------------------------------------------- 1 | .. _fragment: 2 | 3 | Fragments 4 | ========= 5 | 6 | .. sectionauthor:: Matt Swain 7 | 8 | This page gives details on dealing with fragments. 9 | 10 | The term fragment refers to covalently bonded units. A molecule can contain multiple fragments. 11 | 12 | 13 | Getting the largest fragment 14 | ---------------------------- 15 | 16 | - LargestFragmentChooser 17 | 18 | Filtering out fragments 19 | ----------------------- 20 | 21 | - FragmentRemover 22 | -------------------------------------------------------------------------------- /docs/source/guide/gettingstarted.rst: -------------------------------------------------------------------------------- 1 | .. _gettingstarted: 2 | 3 | Getting started 4 | =============== 5 | 6 | .. sectionauthor:: Matt Swain 7 | 8 | This page gives an introduction on how to get started with MolVS. This assumes you already have MolVS 9 | :ref:`installed <install>`. 10 | 11 | TODO... 
12 | -------------------------------------------------------------------------------- /docs/source/guide/install.rst: -------------------------------------------------------------------------------- 1 | .. _install: 2 | 3 | Installation 4 | ============ 5 | 6 | .. sectionauthor:: Matt Swain 7 | 8 | MolVS supports Python versions 2.7 and 3.5+. 9 | 10 | There are a variety of ways to download and install MolVS. 11 | 12 | Option 1: Use conda (recommended) 13 | --------------------------------- 14 | 15 | The easiest and recommended way to install is using conda. `Anaconda Python`_ is a self-contained Python environment 16 | that is particularly useful for scientific applications. If you don't already have it, start by installing `Miniconda`_, 17 | which includes a complete Python distribution and the conda package manager. Choose the Python 3 version, unless you 18 | have a particular reason why you must use Python 2. 19 | 20 | To install MolVS, at the command line, run:: 21 | 22 | conda config --add channels conda-forge 23 | conda install molvs 24 | 25 | This will add the `conda-forge`_ channel to your conda config, then install MolVS and all its dependencies into your 26 | conda environment. 27 | 28 | Option 2: Use pip 29 | ----------------- 30 | 31 | An alternative method is to install using pip:: 32 | 33 | pip install molvs 34 | 35 | This will download the latest version of MolVS, and place it in your `site-packages` folder so it is automatically 36 | available to all your python scripts. 37 | 38 | .. note:: 39 | 40 | MolVS requires RDKit, which cannot be installed using pip. On the Mac, you can use Homebrew:: 41 | 42 | brew tap mcs07/cheminformatics 43 | brew install rdkit 44 | 45 | The official RDKit documentation has `installation instructions for a variety of platforms`_. 
46 | 47 | 48 | Option 3: Download the latest release 49 | ------------------------------------- 50 | 51 | Alternatively, `download the latest release`_ manually and install yourself:: 52 | 53 | tar -xzvf MolVS-0.1.1.tar.gz 54 | cd MolVS-0.1.1 55 | python setup.py install 56 | 57 | The setup.py command will install MolVS in your `site-packages` folder so it is automatically available to all your 58 | python scripts. 59 | 60 | Option 4: Clone the repository 61 | ------------------------------ 62 | 63 | The latest development version of MolVS is always `available on GitHub`_. This version is not guaranteed to be 64 | stable, but may include new features that have not yet been released. Simply clone the repository and install as usual:: 65 | 66 | git clone https://github.com/mcs07/MolVS.git 67 | cd MolVS 68 | python setup.py install 69 | 70 | .. _`Anaconda Python`: https://www.continuum.io/anaconda-overview 71 | .. _`Miniconda`: http://conda.pydata.org/miniconda.html 72 | .. _`conda-forge`: https://conda-forge.org/ 73 | .. _`installation instructions for a variety of platforms`: http://www.rdkit.org/docs/Install.html 74 | .. _`install it using get-pip.py`: http://www.pip-installer.org/en/latest/installing.html 75 | .. _`download the latest release`: https://github.com/mcs07/MolVS/releases/ 76 | .. _`available on GitHub`: https://github.com/mcs07/MolVS 77 | -------------------------------------------------------------------------------- /docs/source/guide/intro.rst: -------------------------------------------------------------------------------- 1 | .. _introduction: 2 | 3 | Introduction 4 | ============ 5 | 6 | .. sectionauthor:: Matt Swain 7 | 8 | Building a collection of chemical structures from various different sources is difficult. There are differing 9 | file formats, molecular representations, drawing conventions, and things that are just plain wrong. 
10 | 11 | A lot of this arises due to our chemical models being an imperfect description of reality, but even within the idealized 12 | models there is often no single correct answer to whether two differently represented molecules are actually "the same". 13 | Whether tautomers or isomers of the same molecule should be considered equivalent or distinct entities can depend 14 | entirely on the specific application. 15 | 16 | MolVS tries to address this problem through customizable validation and standardization processes, combined with the 17 | concept of "parent" molecule relationships to allow multiple simultaneous degrees of standardization. 18 | 19 | This guide provides a quick tour through MolVS concepts and functionality. 20 | 21 | 22 | MolVS license 23 | ------------- 24 | 25 | MolVS is released under the MIT License. This is a short, permissive software license that allows commercial use, 26 | modifications, distribution, sublicensing and private use. Basically, you can do whatever you want with MolVS as long as 27 | you include the original copyright and license in any copies or derivative projects. 28 | 29 | See the `LICENSE file`_ for the full text of the license. 30 | 31 | .. _`LICENSE file`: https://github.com/mcs07/MolVS/blob/master/LICENSE 32 | -------------------------------------------------------------------------------- /docs/source/guide/standardize.rst: -------------------------------------------------------------------------------- 1 | .. _standardize: 2 | 3 | Standardization 4 | =============== 5 | 6 | .. sectionauthor:: Matt Swain 7 | 8 | This page gives details on the standardization process. 
9 | 10 | 11 | Standardizing a molecule 12 | ------------------------ 13 | 14 | The ``standardize_smiles`` function provides a quick and easy way to get the standardized version of a given SMILES 15 | string:: 16 | 17 | >>> from molvs import standardize_smiles 18 | >>> standardize_smiles('C[n+]1c([N-](C))cccc1') 19 | 'CN=c1ccccn1C' 20 | 21 | While this is convenient for one-off cases, it's inefficient when dealing with multiple molecules and doesn't allow any 22 | customization of the standardization process. 23 | 24 | The ``Standardizer`` class provides flexibility to specify custom standardization stages and efficiently standardize 25 | multiple molecules:: 26 | 27 | >>> from rdkit import Chem 28 | >>> mol = Chem.MolFromSmiles('[Na]OC(=O)c1ccc(C[S+2]([O-])([O-]))cc1') 29 | 30 | .. image:: ../_images/mol1.png 31 | 32 | :: 33 | 34 | >>> from molvs import Standardizer 35 | >>> s = Standardizer() 36 | >>> smol = s.standardize(mol) 37 | 38 | .. image:: ../_images/mol1s.png 39 | 40 | The standardization process 41 | --------------------------- 42 | 43 | TODO: Explain this properly... 44 | 45 | RDKit Sanitize 46 | ~~~~~~~~~~~~~~ 47 | 48 | - Nitro N=O: ``CN(=O)=O >> C[N+](=O)[O-]`` and ``C1=CC=CN(=O)=C1 >> C1=CC=C[N+]([O-])=C1`` 49 | - Azide N#N: ``C-N=N#N >> C-N=[N+]=[N-]`` 50 | - Perchlorate: ``Cl(=O)(=O)(=O)[O-] >> [Cl+3]([O-])([O-])([O-])[O-]`` 51 | - Calculate explicit and implicit valence of all atoms. Fails when atoms have illegal valence. 52 | - Calculate symmetrized SSSR. Slowest step, fails in rare cases. 53 | - Kekulize. Fails if a Kekule form cannot be found or non-ring bonds are marked as aromatic. 54 | - Assign radicals if hydrogens set and bonds+hydrogens+charge < valence. 55 | - Set aromaticity, if none set in input. Go round rings, Huckel rule to set atoms+bonds as aromatic. 56 | - Set conjugated property on bonds where applicable. 57 | - Set hybridisation property on atoms. 58 | - Remove chirality markers from sp and sp2 hybridised centers. 
59 | 60 | RDKit RemoveHs 61 | ~~~~~~~~~~~~~~ 62 | 63 | - RDKit implementation detail - this is the preferred way to store the molecule. 64 | - Remove explicit H count from atoms, instead infer it on the fly from valence model. 65 | 66 | Disconnect metals 67 | ~~~~~~~~~~~~~~~~~ 68 | 69 | - Break covalent bonds between metals and organic atoms under certain conditions. 70 | - First, disconnect N, O, F from any metal. Then disconnect other non-metals from transition metals (with exceptions). 71 | - For every bond broken, adjust the charges of the begin and end atoms accordingly. 72 | - In future, we might attempt to replace with zero-order bonds. 73 | 74 | Apply normalization rules 75 | ~~~~~~~~~~~~~~~~~~~~~~~~~ 76 | 77 | - A series of transformations to correct common drawing errors and standardize functional groups. Includes: 78 | - Uncharge-separate sulfones 79 | - Charge-separate nitro groups 80 | - Charge-separate pyridine oxide 81 | - Charge-separate azide 82 | - Charge-separate diazo and azo groups 83 | - Charge-separate sulfoxides 84 | - Hydrazine-diazonium system 85 | 86 | Reionize acids 87 | ~~~~~~~~~~~~~~ 88 | 89 | If a molecule with multiple acid groups is partially ionized, ensure the strongest acids ionize 90 | first. 91 | 92 | The algorithm works as follows: 93 | 94 | - Use SMARTS to find the strongest protonated acid and the weakest ionized acid. 95 | - If the ionized acid is weaker than the protonated acid, swap proton and repeat. 96 | 97 | Recalculate stereochemistry 98 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~ 99 | 100 | - Use built-in RDKit functionality to force a clean recalculation of stereochemistry 101 | -------------------------------------------------------------------------------- /docs/source/guide/tautomer.rst: -------------------------------------------------------------------------------- 1 | .. _tautomer: 2 | 3 | Tautomers 4 | ========= 5 | 6 | .. sectionauthor:: Matt Swain 7 | 8 | This page gives details on tautomer enumeration and canonicalization. 
9 | 10 | Background 11 | ---------- 12 | 13 | Tautomers are sets of molecules that readily interconvert with each other through the movement of a hydrogen atom. 14 | Tautomers have the same molecular formula and net charge, but they differ in terms of the positions of hydrogens and 15 | the associated changes in adjacent double and single bonds. 16 | 17 | Because they rapidly interconvert, for many applications tautomers are considered to be the same chemical compound. 18 | And even in situations where it is important to treat tautomers as distinct compounds, it is still useful to be 19 | aware of the tautomerism relationships between molecules in a collection. 20 | 21 | Varying tautomeric forms of the same molecule can have significantly different fingerprints and descriptors, which can 22 | negatively impact models for things like property prediction if they are used inconsistently. 23 | 24 | There are two main tautomerism tasks that MolVS carries out: 25 | 26 | - Tautomer enumeration: Finding the set of all the different possible tautomeric forms of a molecule. 27 | - Tautomer canonicalization: Consistently picking one of the tautomers to be the canonical tautomer for the set. 28 | 29 | Tautomer enumeration 30 | -------------------- 31 | 32 | - All possible tautomers are generated using a series of transform rules. 33 | - Remove stereochemistry from double bonds that are single in at least 1 tautomer. 34 | 35 | Tautomer canonicalization 36 | ------------------------- 37 | 38 | - Enumerate all possible tautomers using transform rules. 39 | - Use scoring system to determine canonical tautomer. 40 | - Canonical tautomer should be "reasonable" from a chemist's point of view, but isn't guaranteed to be the most energetically favourable. 41 | 42 | -------------------------------------------------------------------------------- /docs/source/guide/validate.rst: -------------------------------------------------------------------------------- 1 | .. 
_validate: 2 | 3 | Validation 4 | ========== 5 | 6 | .. sectionauthor:: Matt Swain 7 | 8 | The MolVS :class:`~molvs.validate.Validator` provides a way to identify and log unusual and potentially troublesome 9 | characteristics of a molecule. 10 | 11 | The validation process makes no actual changes to a molecule – that is left to the standardization process, which fixes 12 | many of the issues identified through validation. There is no real requirement to validate a molecule before or after 13 | standardizing it - the process simply provides additional information about potential problems. 14 | 15 | Validating a molecule 16 | --------------------- 17 | 18 | The :func:`~molvs.validate.validate_smiles` function is a convenient way to quickly validate a single SMILES 19 | string:: 20 | 21 | >>> from molvs import validate_smiles 22 | >>> validate_smiles('O=C([O-])c1ccccc1') 23 | ['INFO: [NeutralValidation] Not an overall neutral system (-1)'] 24 | 25 | It returns a list of log messages as strings. 26 | 27 | The :class:`~molvs.validate.Validator` class provides more flexibility when working with multiple molecules or when a 28 | custom :class:`~molvs.validations.Validation` list is required:: 29 | 30 | >>> fmt = '%(asctime)s - %(levelname)s - %(validation)s - %(message)s' 31 | >>> validator = Validator(log_format=fmt) 32 | >>> mol = Chem.MolFromSmiles('[2H]C(Cl)(Cl)Cl') 33 | >>> validator.validate(mol) 34 | ['2014-08-05 16:04:23,682 - INFO - IsotopeValidation - Molecule contains isotope 2H'] 35 | 36 | 37 | 38 | Available validations 39 | --------------------- 40 | 41 | The :ref:`API documentation <molvs_validations>` contains a full list of the individual validations that are available. 42 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. MolVS documentation master file, created by sphinx-quickstart on Thu Apr 24 14:35:38 2014. 
2 | 3 | MolVS: Molecule Validation and Standardization 4 | ============================================== 5 | 6 | .. sectionauthor:: Matt Swain 7 | 8 | **MolVS** is a molecule validation and standardization tool, written in Python using the `RDKit chemistry framework`_. 9 | 10 | Building a collection of chemical structures from different sources can be difficult due to differing representations, 11 | drawing conventions and mistakes. MolVS can standardize chemical structures to improve data quality, help with 12 | de-duplication and identify relationships between molecules. 13 | 14 | There are sensible defaults that make it easy to get started:: 15 | 16 | >>> from molvs import standardize_smiles 17 | >>> standardize_smiles('[Na]OC(=O)c1ccc(C[S+2]([O-])([O-]))cc1') 18 | '[Na+].O=C([O-])c1ccc(CS(=O)=O)cc1' 19 | 20 | Each standardization module is also available separately, allowing the development of custom standardization processes. 21 | 22 | Features 23 | -------- 24 | 25 | - Normalization of functional groups to a consistent format. 26 | - Recombination of separated charges. 27 | - Breaking of bonds to metal atoms. 28 | - Competitive reionization to ensure strongest acids ionize first in partially ionized molecules. 29 | - Tautomer enumeration and canonicalization. 30 | - Neutralization of charges. 31 | - Standardization or removal of stereochemistry information. 32 | - Filtering of salt and solvent fragments. 33 | - Generation of fragment, isotope, charge, tautomer or stereochemistry insensitive parent structures. 34 | - Validations to identify molecules with unusual and potentially troublesome characteristics. 35 | 36 | User guide 37 | ---------- 38 | 39 | A step-by-step guide to getting started with MolVS. 40 | 41 | .. 
toctree:: 42 | :maxdepth: 1 43 | 44 | guide/intro 45 | guide/install 46 | guide/gettingstarted 47 | guide/validate 48 | guide/standardize 49 | guide/tautomer 50 | guide/fragment 51 | guide/charge 52 | guide/cli 53 | guide/contributing 54 | 55 | API documentation 56 | ----------------- 57 | 58 | Comprehensive API documentation with information on every function, class and method. This is automatically generated 59 | from the MolVS source code and comments. 60 | 61 | .. toctree:: 62 | :maxdepth: 2 63 | 64 | api 65 | 66 | 67 | Useful links 68 | ------------ 69 | 70 | - `MolVS on GitHub`_ 71 | - `MolVS on PyPI`_ 72 | - `Issue tracker`_ 73 | - `Release history`_ 74 | - `MolVS Travis CI`_ 75 | 76 | .. _`RDKit chemistry framework`: http://www.rdkit.org 77 | .. _`MolVS on GitHub`: https://github.com/mcs07/MolVS 78 | .. _`MolVS on PyPI`: https://pypi.python.org/pypi/MolVS 79 | .. _`Issue tracker`: https://github.com/mcs07/MolVS/issues 80 | .. _`Release history`: https://github.com/mcs07/MolVS/releases 81 | .. _`MolVS Travis CI`: https://travis-ci.org/mcs07/MolVS 82 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: molvs 2 | channels: 3 | - conda-forge 4 | - defaults 5 | dependencies: 6 | - pip=19.0.3 7 | - pytest=4.3.1 8 | - rdkit=2018.09.1 9 | - six=1.12.0 10 | - twine=1.13.0 11 | -------------------------------------------------------------------------------- /examples/README.rst: -------------------------------------------------------------------------------- 1 | Examples 2 | ======== 3 | 4 | This directory contains Jupyter iPython notebooks with examples of how to use MolVS. 
5 | 6 | Viewing 7 | ------- 8 | 9 | The notebooks can be viewed easily using `nbviewer`_: 10 | 11 | - `Standardization notebook`_ 12 | 13 | Interacting 14 | ----------- 15 | 16 | You can run the notebooks in an interactive and editable form by downloading the ``.ipynb`` files. You will need to have 17 | Jupyter installed:: 18 | 19 | pip install jupyter 20 | 21 | To start the notebook server, run:: 22 | 23 | jupyter notebook 24 | 25 | Then import the downloaded ``.ipynb`` files. 26 | 27 | .. _`nbviewer`: http://nbviewer.ipython.org 28 | .. _`Standardization notebook`: http://nbviewer.ipython.org/github/mcs07/molvs/blob/master/examples/standardization.ipynb 29 | -------------------------------------------------------------------------------- /molvs/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | MolVS - Molecule Validation and Standardization 4 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 5 | 6 | MolVS is a python tool built on top of RDKit that performs validation and standardization of chemical structures. 
class AcidBasePair(object):
    """A named acid together with its conjugate base, each described by a SMARTS pattern.

    Keeping AcidBasePairs in a strength-ordered sequence lets callers make sure the strongest acids in a molecule are
    ionized first.
    """

    def __init__(self, name, acid, base):
        """Store the name and the raw SMARTS strings for both forms.

        The SMARTS strings are only turned into query molecules when the ``acid`` / ``base`` properties are accessed.

        :param string name: A name for this AcidBasePair.
        :param string acid: SMARTS pattern for the protonated acid.
        :param string base: SMARTS pattern for the conjugate ionized base.
        """
        log.debug('Initializing AcidBasePair: %s', name)
        self.name, self.acid_str, self.base_str = name, acid, base

    @memoized_property
    def acid(self):
        """Query molecule parsed from the acid SMARTS string."""
        log.debug('Loading AcidBasePair acid: %s', self.name)
        pattern = Chem.MolFromSmarts(self.acid_str)
        return pattern

    @memoized_property
    def base(self):
        """Query molecule parsed from the conjugate base SMARTS string."""
        log.debug('Loading AcidBasePair base: %s', self.name)
        pattern = Chem.MolFromSmarts(self.base_str)
        return pattern

    def __repr__(self):
        fields = (self.name, self.acid_str, self.base_str)
        return 'AcidBasePair({!r}, {!r}, {!r})'.format(*fields)

    def __str__(self):
        return self.name
AcidBasePair('-OBO2H2', 'OB([OH])[OH]', 'OB([OH])[O-]'), 79 | AcidBasePair('-BO2H2', '[!O]B([OH])[OH]', '[!O]B([OH])[O-]'), 80 | AcidBasePair('phenol', 'c[OH]', 'c[O-]'), 81 | AcidBasePair('SH (aliphatic)', 'C[SH]', 'C[S-]'), 82 | AcidBasePair('(-OBO2H)-', 'OB([O-])[OH]', 'OB([O-])[O-]'), 83 | AcidBasePair('(-BO2H)-', '[!O]B([O-])[OH]', '[!O]B([O-])[O-]'), 84 | AcidBasePair('cyclopentadiene', 'C1=CC=C[CH2]1', 'c1ccc[cH-]1'), 85 | AcidBasePair('-CONH2', 'C(=O)[NH2]', 'C(=O)[NH-]'), 86 | AcidBasePair('imidazole', 'c1cnc[nH]1', 'c1cnc[n-]1'), 87 | AcidBasePair('-OH (aliphatic alcohol)', '[CX4][OH]', '[CX4][O-]'), 88 | AcidBasePair('alpha-carbon-hydrogen-keto group', 'O=C([!O])[C!H0+0]', 'O=C([!O])[C-]'), 89 | AcidBasePair('alpha-carbon-hydrogen-acetyl ester group', 'OC(=O)[C!H0+0]', 'OC(=O)[C-]'), 90 | AcidBasePair('sp carbon hydrogen', 'C#[CH]', 'C#[C-]'), 91 | AcidBasePair('alpha-carbon-hydrogen-sulfone group', 'CS(=O)(=O)[C!H0+0]', 'CS(=O)(=O)[C-]'), 92 | AcidBasePair('alpha-carbon-hydrogen-sulfoxide group', 'C[SD3](=O)[C!H0+0]', 'C[SD3](=O)[C-]'), 93 | AcidBasePair('-NH2', '[CX4][NH2]', '[CX4][NH-]'), 94 | AcidBasePair('benzyl hydrogen', 'c[CX4H2]', 'c[CX3H-]'), 95 | AcidBasePair('sp2-carbon hydrogen', '[CX3]=[CX3!H0+0]', '[CX3]=[CX2-]'), 96 | AcidBasePair('sp3-carbon hydrogen', '[CX4!H0+0]', '[CX3-]'), 97 | ) 98 | 99 | 100 | class ChargeCorrection(object): 101 | """An atom that should have a certain charge applied, defined by a SMARTS pattern.""" 102 | 103 | def __init__(self, name, smarts, charge): 104 | """Initialize a ChargeCorrection with the following parameters: 105 | 106 | :param string name: A name for this ForcedAtomCharge. 107 | :param string smarts: SMARTS pattern to match. Charge is applied to the first atom. 108 | :param int charge: The charge to apply. 
class Reionizer(object):
    """A class to fix charges and reionize a molecule such that the strongest acids ionize first."""

    def __init__(self, acid_base_pairs=ACID_BASE_PAIRS, charge_corrections=CHARGE_CORRECTIONS):
        """Initialize a Reionizer with the following parameters:

        :param acid_base_pairs: A list of :class:`AcidBasePairs <molvs.charge.AcidBasePair>` to reionize, sorted from
                                strongest to weakest.
        :param charge_corrections: A list of :class:`ChargeCorrections <molvs.charge.ChargeCorrection>`.
        """
        log.debug('Initializing Reionizer')
        self.acid_base_pairs = acid_base_pairs
        self.charge_corrections = charge_corrections

    def __call__(self, mol):
        """Calling a Reionizer instance like a function is the same as calling its reionize(mol) method."""
        return self.reionize(mol)

    def reionize(self, mol):
        """Enforce charges on certain atoms, then perform competitive reionization.

        First, charge corrections are applied to ensure, for example, that free metals are correctly ionized. Then, if
        a molecule with multiple acid groups is partially ionized, ensure the strongest acids ionize first.

        The algorithm works as follows:

        - Use SMARTS to find the strongest protonated acid and the weakest ionized acid.
        - If the ionized acid is weaker than the protonated acid, swap proton and repeat.

        Note that no copy is made: the atoms of the molecule passed in are modified directly and that same molecule
        object is returned.

        :param mol: The molecule to reionize.
        :type mol: rdkit.Chem.rdchem.Mol
        :return: The reionized molecule.
        :rtype: rdkit.Chem.rdchem.Mol
        """
        log.debug('Running Reionizer')

        start_charge = Chem.GetFormalCharge(mol)

        # Apply forced charge corrections. The charge is applied to the first atom of each SMARTS match.
        for cc in self.charge_corrections:
            for match in mol.GetSubstructMatches(cc.smarts):
                atom = mol.GetAtomWithIdx(match[0])
                log.info('Applying charge correction %s (%s %+d)', cc.name, atom.GetSymbol(), cc.charge)
                atom.SetFormalCharge(cc.charge)

        current_charge = Chem.GetFormalCharge(mol)
        charge_diff = current_charge - start_charge
        # If molecule is now neutral, assume everything is now fixed
        # But otherwise, if charge has become more positive, look for additional protonated acid groups to ionize
        if current_charge != 0:
            while charge_diff > 0:
                ppos, poccur = self._strongest_protonated(mol)
                if ppos is None:
                    break
                log.info('Ionizing %s to balance previous charge corrections', self.acid_base_pairs[ppos].name)
                # The last atom of the SMARTS match is the one carrying the acidic hydrogen
                patom = mol.GetAtomWithIdx(poccur[-1])
                patom.SetFormalCharge(patom.GetFormalCharge() - 1)
                if patom.GetNumExplicitHs() > 0:
                    patom.SetNumExplicitHs(patom.GetNumExplicitHs() - 1)
                patom.UpdatePropertyCache()
                charge_diff -= 1

        # Pairs of atom indices a proton has already been moved between, used to detect oscillation
        already_moved = set()
        while True:
            ppos, poccur = self._strongest_protonated(mol)
            ipos, ioccur = self._weakest_ionized(mol)
            if ioccur and poccur and ppos < ipos:
                if poccur[-1] == ioccur[-1]:
                    # Bad! H wouldn't be moved, resulting in infinite loop.
                    log.warning('Aborted reionization due to unexpected situation')
                    break

                key = tuple(sorted([poccur[-1], ioccur[-1]]))
                if key in already_moved:
                    log.warning('Aborting reionization to avoid infinite loop due to it being ambiguous where to put a Hydrogen')
                    break
                already_moved.add(key)

                log.info('Moved proton from %s to %s', self.acid_base_pairs[ppos].name, self.acid_base_pairs[ipos].name)

                # Remove hydrogen from strongest protonated
                patom = mol.GetAtomWithIdx(poccur[-1])
                patom.SetFormalCharge(patom.GetFormalCharge() - 1)
                # If no implicit Hs to autoremove, and at least 1 explicit H to remove, reduce explicit count by 1
                if patom.GetNumImplicitHs() == 0 and patom.GetNumExplicitHs() > 0:
                    patom.SetNumExplicitHs(patom.GetNumExplicitHs() - 1)
                    # TODO: Remove any chiral label on patom?
                patom.UpdatePropertyCache()

                # Add hydrogen to weakest ionized
                iatom = mol.GetAtomWithIdx(ioccur[-1])
                iatom.SetFormalCharge(iatom.GetFormalCharge() + 1)
                # Increase explicit H count if no implicit, or aromatic N or P, or non default valence state.
                # All three tests concern iatom, the atom that gains the hydrogen (the aromatic test previously
                # inspected patom, the proton donor, by mistake).
                default_valences = list(Chem.GetPeriodicTable().GetValenceList(iatom.GetAtomicNum()))
                if (iatom.GetNoImplicit() or
                        (iatom.GetAtomicNum() in (7, 15) and iatom.GetIsAromatic()) or
                        iatom.GetTotalValence() not in default_valences):
                    iatom.SetNumExplicitHs(iatom.GetNumExplicitHs() + 1)
                iatom.UpdatePropertyCache()
            else:
                break

        # TODO: Canonical ionization position if multiple equivalent positions?

        Chem.SanitizeMol(mol)
        return mol

    def _strongest_protonated(self, mol):
        """Return ``(position, match)`` for the first acid pattern (strongest first) found in mol.

        ``position`` is the index into ``self.acid_base_pairs`` and ``match`` is the first substructure match of that
        pair's acid pattern. Returns ``(None, None)`` if no acid pattern matches.
        """
        for position, pair in enumerate(self.acid_base_pairs):
            for occurrence in mol.GetSubstructMatches(pair.acid):
                return position, occurrence
        return None, None

    def _weakest_ionized(self, mol):
        """Return ``(position, match)`` for the last base pattern (i.e. weakest acid) found in mol.

        ``position`` is the index into ``self.acid_base_pairs`` and ``match`` is the first substructure match of that
        pair's base pattern. Returns ``(None, None)`` if no base pattern matches.
        """
        for position, pair in enumerate(reversed(self.acid_base_pairs)):
            for occurrence in mol.GetSubstructMatches(pair.base):
                # Convert the index in the reversed sequence back to an index in acid_base_pairs
                return len(self.acid_base_pairs) - position - 1, occurrence
        return None, None
278 | :rtype: rdkit.Chem.rdchem.Mol 279 | """ 280 | log.debug('Running Uncharger') 281 | mol = copy.deepcopy(mol) 282 | 283 | # Neutralize positive charges 284 | pos_remainder = 0 285 | neg_count = 0 286 | for atom in mol.GetAtoms(): 287 | # Remove hydrogen from positive atoms and reduce formal change until neutral or no more hydrogens 288 | while atom.GetFormalCharge() > 0 and atom.GetNumExplicitHs() > 0: 289 | atom.SetNumExplicitHs(atom.GetNumExplicitHs() - 1) 290 | atom.SetFormalCharge(atom.GetFormalCharge() - 1) 291 | log.info('Removed positive charge') 292 | chg = atom.GetFormalCharge() 293 | if chg > 0: 294 | # Record number of non-neutralizable positive charges 295 | pos_remainder += chg 296 | elif chg < 0: 297 | # Record total number of negative charges 298 | neg_count += -chg 299 | 300 | # Choose negative charges to leave in order to balance non-neutralizable positive charges 301 | neg_skip = self._get_neg_skip(mol, pos_remainder) 302 | 303 | # Neutralize remaining negative charges 304 | for atom in mol.GetAtoms(): 305 | log.info(atom.GetIdx()) 306 | if atom.GetIdx() in neg_skip: 307 | continue 308 | # Make sure to stop when neg_count <= pos_remainder, as it is possible that neg_skip is not large enough 309 | while atom.GetFormalCharge() < 0 and neg_count > pos_remainder: 310 | atom.SetNumExplicitHs(atom.GetNumExplicitHs() + 1) 311 | atom.SetFormalCharge(atom.GetFormalCharge() + 1) 312 | neg_count -= 1 313 | log.info('Removed negative charge') 314 | return mol 315 | 316 | def _get_neg_skip(self, mol, pos_count): 317 | """Get negatively charged atoms to skip (up to pos_count).""" 318 | neg_skip = set() 319 | if pos_count: 320 | # Get negative oxygens in charge-separated nitro groups TODO: Any other special cases to skip? 
class MolvsParser(argparse.ArgumentParser):
    """ArgumentParser that reports problems as ``Error: <message>`` followed by the full help text."""

    def error(self, message):
        """Write the error message to stderr, print the help text and exit with status 2.

        :param message: Description of what went wrong.
        """
        # Format as text: building the message from bytes fails on Python 3, and sys.stderr expects text anyway.
        sys.stderr.write('Error: {}\n\n'.format(message))
        self.print_help()
        sys.exit(2)


def main():
    """Main function for molvs command line interface."""

    # Root options
    parser = MolvsParser(epilog='use "molvs <command> -h" to show help for a specific command')
    subparsers = parser.add_subparsers(title='Available commands')

    # Options common to all commands
    common_parser = MolvsParser(add_help=False)
    common_parser.add_argument('infile', nargs='?', help='input filename', type=argparse.FileType('r'), default=sys.stdin)
    common_parser.add_argument('-i', '--intype', help='input filetype', choices=FILETYPES)
    common_parser.add_argument('-:', '--smiles', help='input SMILES instead of file', metavar='<smiles>')
    common_parser.add_argument('-O', '--outfile', help='output filename', type=argparse.FileType('w'), default=sys.stdout, metavar='<outfile>')

    # Standardize options
    standardize_parser = subparsers.add_parser('standardize', help='standardize a molecule', parents=[common_parser])
    standardize_parser.add_argument('-o', '--outtype', help='output filetype', choices=FILETYPES)
    standardize_parser.set_defaults(func=standardize_main)

    # Validate options
    validate_parser = subparsers.add_parser('validate', help='validate a molecule', parents=[common_parser])
    validate_parser.set_defaults(func=validate_main)

    args = parser.parse_args()
    # On Python 3 sub-commands are optional, so ``func`` is missing when none was given
    func = getattr(args, 'func', None)
    if func is None:
        parser.error('no command specified')
    try:
        func(args)
    except Exception as e:
        # Top-level boundary: report any failure to the user. Exceptions have no ``message`` attribute on Python 3.
        sys.stderr.write('Error: {}\n\n'.format(e))
        parser.print_help()
        sys.exit(2)


def _read_mol(args):
    """Read the input molecule described by the parsed command line arguments.

    An explicit ``--smiles`` value wins; otherwise the format is taken from ``--intype`` or the input file name, and
    SMILES is assumed when neither identifies the format.
    """
    if args.smiles:
        return Chem.MolFromSmiles(args.smiles)
    elif args.intype in {'smi', 'smiles'} or args.infile.name.endswith(('smi', 'smiles')):
        return Chem.MolFromSmiles(args.infile.read())
    elif args.intype in {'mol', 'sdf'} or args.infile.name.endswith(('mol', 'sdf')):
        return Chem.MolFromMolBlock(args.infile.read())
    else:
        return Chem.MolFromSmiles(args.infile.read())


def _write_mol(mol, args):
    """Write mol to ``args.outfile`` in the format given by ``--outtype`` or the output file name (default SMILES)."""
    if args.outtype in {'smi', 'smiles'} or args.outfile.name.endswith(('smi', 'smiles')):
        args.outfile.write(Chem.MolToSmiles(mol))
        args.outfile.write('\n')
    elif args.outtype in {'mol', 'sdf'} or args.outfile.name.endswith(('mol', 'sdf')):
        args.outfile.write(Chem.MolToMolBlock(mol))
    else:
        args.outfile.write(Chem.MolToSmiles(mol))
        args.outfile.write('\n')


def standardize_main(args):
    """Entry point for the ``standardize`` command: read, standardize and write a molecule."""
    mol = _read_mol(args)
    s = Standardizer()
    mol = s.standardize(mol)
    _write_mol(mol, args)


def validate_main(args):
    """Entry point for the ``validate`` command: write each validation message on its own line."""
    mol = _read_mol(args)
    v = Validator()
    # Named ``message`` rather than ``log`` so the module-level logger is not shadowed
    for message in v.validate(mol):
        args.outfile.write(message)
        args.outfile.write('\n')
class FragmentPattern(object):
    """A named fragment described by a SMARTS pattern."""

    def __init__(self, name, smarts):
        """Store the name and the raw SMARTS string for this fragment.

        The SMARTS string is only turned into a query molecule when the ``smarts`` property is accessed.

        :param name: A name for this FragmentPattern.
        :param smarts: A SMARTS pattern.
        """
        self.name, self.smarts_str = name, smarts

    @memoized_property
    def smarts(self):
        """Query molecule parsed from the SMARTS string."""
        pattern = Chem.MolFromSmarts(self.smarts_str)
        return pattern

    def __repr__(self):
        fields = (self.name, self.smarts_str)
        return 'FragmentPattern({!r}, {!r})'.format(*fields)

    def __str__(self):
        return self.name
51 | REMOVE_FRAGMENTS = ( 52 | FragmentPattern('hydrogen', '[H]'), 53 | FragmentPattern('fluorine', '[F]'), 54 | FragmentPattern('chlorine', '[Cl]'), 55 | FragmentPattern('bromine', '[Br]'), 56 | FragmentPattern('iodine', '[I]'), 57 | FragmentPattern('lithium', '[Li]'), 58 | FragmentPattern('sodium', '[Na]'), 59 | FragmentPattern('potassium', '[K]'), 60 | FragmentPattern('calcium', '[Ca]'), 61 | FragmentPattern('magnesium', '[Mg]'), 62 | FragmentPattern('aluminium', '[Al]'), 63 | FragmentPattern('barium', '[Ba]'), 64 | FragmentPattern('bismuth', '[Bi]'), 65 | FragmentPattern('silver', '[Ag]'), 66 | FragmentPattern('strontium', '[Sr]'), 67 | FragmentPattern('zinc', '[Zn]'), 68 | FragmentPattern('ammonia/ammonium', '[#7]'), 69 | FragmentPattern('water/hydroxide', '[#8]'), 70 | FragmentPattern('methyl amine', '[#6]-[#7]'), 71 | FragmentPattern('sulfide', 'S'), 72 | FragmentPattern('nitrate', '[#7](=[#8])(-[#8])-[#8]'), 73 | FragmentPattern('phosphate', '[P](=[#8])(-[#8])(-[#8])-[#8]'), 74 | FragmentPattern('hexafluorophosphate', '[P](-[#9])(-[#9])(-[#9])(-[#9])(-[#9])-[#9]'), 75 | FragmentPattern('sulfate', '[S](=[#8])(=[#8])(-[#8])-[#8]'), 76 | FragmentPattern('methyl sulfonate', '[#6]-[S](=[#8])(=[#8])(-[#8])'), 77 | FragmentPattern('trifluoromethanesulfonic acid', '[#8]-[S](=[#8])(=[#8])-[#6](-[#9])(-[#9])-[#9]'), 78 | FragmentPattern('trifluoroacetic acid', '[#9]-[#6](-[#9])(-[#9])-[#6](=[#8])-[#8]'), 79 | FragmentPattern('1,2-dichloroethane', '[Cl]-[#6]-[#6]-[Cl]'), 80 | FragmentPattern('1,2-dimethoxyethane', '[#6]-[#8]-[#6]-[#6]-[#8]-[#6]'), 81 | FragmentPattern('1,4-dioxane', '[#6]-1-[#6]-[#8]-[#6]-[#6]-[#8]-1'), 82 | FragmentPattern('1-methyl-2-pyrrolidinone', '[#6]-[#7]-1-[#6]-[#6]-[#6]-[#6]-1=[#8]'), 83 | FragmentPattern('2-butanone', '[#6]-[#6]-[#6](-[#6])=[#8]'), 84 | FragmentPattern('acetate/acetic acid', '[#8]-[#6](-[#6])=[#8]'), 85 | FragmentPattern('acetone', '[#6]-[#6](-[#6])=[#8]'), 86 | FragmentPattern('acetonitrile', '[#6]-[#6]#[N]'), 87 | 
FragmentPattern('benzene', '[#6]1[#6][#6][#6][#6][#6]1'), 88 | FragmentPattern('butanol', '[#8]-[#6]-[#6]-[#6]-[#6]'), 89 | FragmentPattern('t-butanol', '[#8]-[#6](-[#6])(-[#6])-[#6]'), 90 | FragmentPattern('chloroform', '[Cl]-[#6](-[Cl])-[Cl]'), 91 | FragmentPattern('cycloheptane', '[#6]-1-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-1'), 92 | FragmentPattern('cyclohexane', '[#6]-1-[#6]-[#6]-[#6]-[#6]-[#6]-1'), 93 | FragmentPattern('dichloromethane', '[Cl]-[#6]-[Cl]'), 94 | FragmentPattern('diethyl ether', '[#6]-[#6]-[#8]-[#6]-[#6]'), 95 | FragmentPattern('diisopropyl ether', '[#6]-[#6](-[#6])-[#8]-[#6](-[#6])-[#6]'), 96 | FragmentPattern('dimethyl formamide', '[#6]-[#7](-[#6])-[#6]=[#8]'), 97 | FragmentPattern('dimethyl sulfoxide', '[#6]-[S](-[#6])=[#8]'), 98 | FragmentPattern('ethanol', '[#8]-[#6]-[#6]'), 99 | FragmentPattern('ethyl acetate', '[#6]-[#6]-[#8]-[#6](-[#6])=[#8]'), 100 | FragmentPattern('formic acid', '[#8]-[#6]=[#8]'), 101 | FragmentPattern('heptane', '[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]'), 102 | FragmentPattern('hexane', '[#6]-[#6]-[#6]-[#6]-[#6]-[#6]'), 103 | FragmentPattern('isopropanol', '[#8]-[#6](-[#6])-[#6]'), 104 | FragmentPattern('methanol', '[#8]-[#6]'), 105 | FragmentPattern('N,N-dimethylacetamide', '[#6]-[#7](-[#6])-[#6](-[#6])=[#8]'), 106 | FragmentPattern('pentane', '[#6]-[#6]-[#6]-[#6]-[#6]'), 107 | FragmentPattern('propanol', '[#8]-[#6]-[#6]-[#6]'), 108 | FragmentPattern('pyridine', '[#6]-1=[#6]-[#6]=[#7]-[#6]=[#6]-1'), 109 | FragmentPattern('t-butyl methyl ether', '[#6]-[#8]-[#6](-[#6])(-[#6])-[#6]'), 110 | FragmentPattern('tetrahydrofurane', '[#6]-1-[#6]-[#6]-[#8]-[#6]-1'), 111 | FragmentPattern('toluene', '[#6]-[#6]~1~[#6]~[#6]~[#6]~[#6]~[#6]~1'), 112 | FragmentPattern('xylene', '[#6]-[#6]~1~[#6](-[#6])~[#6]~[#6]~[#6]~[#6]~1') 113 | ) 114 | 115 | #: The default value for whether to ensure at least one fragment is left after FragmentRemover is applied. 
LEAVE_LAST = True

#: The default value for whether LargestFragmentChooser sees organic fragments as "larger" than inorganic fragments.
PREFER_ORGANIC = False


def is_organic(fragment):
    """Return true if fragment contains at least one carbon atom.

    :param fragment: The fragment as an RDKit Mol object.
    """
    # TODO: Consider a different definition?
    # Could allow only H, C, N, O, S, P, F, Cl, Br, I
    return any(a.GetAtomicNum() == 6 for a in fragment.GetAtoms())


class FragmentRemover(object):
    """A class for filtering out fragments using SMARTS patterns."""

    def __init__(self, fragments=REMOVE_FRAGMENTS, leave_last=LEAVE_LAST):
        """Initialize a FragmentRemover with an optional custom list of :class:`~molvs.fragment.FragmentPattern`.

        Setting leave_last to True will ensure at least one fragment is left in the molecule, even if it is matched by a
        :class:`~molvs.fragment.FragmentPattern`. Fragments are removed in the order specified in the list, so place
        those you would prefer to be left towards the end of the list. If all the remaining fragments match the same
        :class:`~molvs.fragment.FragmentPattern`, they will all be left.

        :param fragments: A list of :class:`~molvs.fragment.FragmentPattern` to remove.
        :param bool leave_last: Whether to ensure at least one fragment is left.
        """
        log.debug('Initializing FragmentRemover')
        self.fragments = fragments
        self.leave_last = leave_last

    def __call__(self, mol):
        """Calling a FragmentRemover instance like a function is the same as calling its remove(mol) method."""
        return self.remove(mol)

    def remove(self, mol):
        """Return the molecule with specified fragments removed.

        :param mol: The molecule to remove fragments from.
        :type mol: rdkit.Chem.rdchem.Mol
        :return: The molecule with fragments removed.
        :rtype: rdkit.Chem.rdchem.Mol
        """
        log.debug('Running FragmentRemover')
        # Iterate FragmentPatterns and remove matching fragments
        for frag in self.fragments:
            # If nothing is left or leave_last and only one fragment, end here
            if mol.GetNumAtoms() == 0 or (self.leave_last and len(Chem.GetMolFrags(mol)) <= 1):
                break
            # Apply removal for this FragmentPattern
            removed = Chem.DeleteSubstructs(mol, frag.smarts, onlyFrags=True)
            if self.leave_last and removed.GetNumAtoms() == 0:
                # All the remaining fragments match this pattern - leave them all.
                # Checked before logging so that a removal which is discarded is not reported as having happened.
                break
            if mol.GetNumAtoms() != removed.GetNumAtoms():
                log.info('Removed fragment: %s', frag.name)
            mol = removed
        return mol


class LargestFragmentChooser(object):
    """A class for selecting the largest covalent unit in a molecule with multiple fragments."""

    def __init__(self, prefer_organic=PREFER_ORGANIC):
        """Initialize a LargestFragmentChooser.

        If prefer_organic is set to True, any organic fragment will be considered larger than any inorganic fragment. A
        fragment is considered organic if it contains a carbon atom.

        :param bool prefer_organic: Whether to prioritize organic fragments above all others.
        """
        log.debug('Initializing LargestFragmentChooser')
        self.prefer_organic = prefer_organic

    def __call__(self, mol):
        """Calling a LargestFragmentChooser instance like a function is the same as calling its choose(mol) method."""
        return self.choose(mol)

    def choose(self, mol):
        """Return the largest covalent unit.

        The largest fragment is determined by number of atoms (including hydrogens). Ties are broken by taking the
        fragment with the higher molecular weight, and then by taking the first alphabetically by SMILES if needed.

        If the molecule contains no fragments at all, it is returned unchanged.

        :param mol: The molecule to choose the largest fragment from.
        :type mol: rdkit.Chem.rdchem.Mol
        :return: The largest fragment.
        :rtype: rdkit.Chem.rdchem.Mol
        """
        log.debug('Running LargestFragmentChooser')
        # TODO: Alternatively allow a list of fragments to be passed as the mol parameter
        fragments = Chem.GetMolFrags(mol, asMols=True)
        if not fragments:
            # Nothing to choose from; without this guard the final lookup on ``largest`` would fail on None
            log.debug('No fragments found, returning molecule unchanged')
            return mol
        largest = None
        for f in fragments:
            smiles = Chem.MolToSmiles(f, isomericSmiles=True)
            log.debug('Fragment: %s', smiles)
            organic = is_organic(f)
            if self.prefer_organic:
                # Skip this fragment if not organic and we already have an organic fragment as the largest so far
                if largest and largest['organic'] and not organic:
                    continue
                # Reset largest if it wasn't organic and this fragment is organic
                if largest and organic and not largest['organic']:
                    largest = None
            # Count atoms, including the hydrogens attached to each atom
            atoms = sum(1 + a.GetTotalNumHs() for a in f.GetAtoms())
            # Skip this fragment if fewer atoms than the largest
            if largest and atoms < largest['atoms']:
                continue
            # Skip this fragment if equal number of atoms but weight is lower
            weight = rdMolDescriptors.CalcExactMolWt(f)
            if largest and atoms == largest['atoms'] and weight < largest['weight']:
                continue
            # Skip this fragment if equal atoms and equal weight but smiles comes last alphabetically
            if largest and atoms == largest['atoms'] and weight == largest['weight'] and smiles > largest['smiles']:
                continue
            # Otherwise this is the largest so far
            log.debug('New largest fragment: %s (%s)', smiles, atoms)
            largest = {'smiles': smiles, 'fragment': f, 'atoms': atoms, 'weight': weight, 'organic': organic}
        return largest['fragment']
class MetalDisconnector(object):
    """Class for breaking covalent bonds between metals and organic atoms under certain conditions."""

    def __init__(self):
        """Prepare the two SMARTS queries that identify metal bonds to break."""
        log.debug('Initializing MetalDisconnector')
        # Initialize SMARTS to identify relevant substructures
        # TODO: Use atomic numbers instead of element symbols in SMARTS to allow for isotopes?
        # Any listed metal bonded to N, O or F
        self._metal_nof = Chem.MolFromSmarts('[Li,Na,K,Rb,Cs,Fr,Be,Mg,Ca,Sr,Ba,Ra,Sc,Ti,V,Cr,Mn,Fe,Co,Ni,Cu,Zn,Al,Ga,Y,Zr,Nb,Mo,Tc,Ru,Rh,Pd,Ag,Cd,In,Sn,Hf,Ta,W,Re,Os,Ir,Pt,Au,Hg,Tl,Pb,Bi]~[N,O,F]')
        # Transition metals and Al bonded to the other listed non-metals
        self._metal_non = Chem.MolFromSmarts('[Al,Sc,Ti,V,Cr,Mn,Fe,Co,Ni,Cu,Zn,Y,Zr,Nb,Mo,Tc,Ru,Rh,Pd,Ag,Cd,Hf,Ta,W,Re,Os,Ir,Pt,Au]~[B,C,Si,P,As,Sb,S,Se,Te,Cl,Br,I,At]')

    def __call__(self, mol):
        """Calling a MetalDisconnector instance like a function is the same as calling its disconnect(mol) method."""
        return self.disconnect(mol)

    def disconnect(self, mol):
        """Break covalent bonds between metals and organic atoms under certain conditions.

        The algorithm works as follows:

        - Disconnect N, O, F from any metal.
        - Disconnect other non-metals from transition metals + Al (but not Hg, Ga, Ge, In, Sn, As, Tl, Pb, Bi, Po).
        - For every bond broken, adjust the charges of the begin and end atoms accordingly.

        :param mol: The input molecule.
        :type mol: rdkit.Chem.rdchem.Mol
        :return: The molecule with metals disconnected.
        :rtype: rdkit.Chem.rdchem.Mol
        """
        log.debug('Running MetalDisconnector')
        # Remove bonds that match SMARTS
        for pattern in (self._metal_nof, self._metal_non):
            matches = mol.GetSubstructMatches(pattern)
            editable = Chem.RWMol(mol)
            bond_orders = []
            for begin, end in matches:
                # TODO: Could get the valence contributions of the bond instead of GetBondTypeAsDouble?
                bond = mol.GetBondBetweenAtoms(begin, end)
                bond_orders.append(int(bond.GetBondTypeAsDouble()))
                editable.RemoveBond(begin, end)
            # Adjust neighbouring charges accordingly
            mol = editable.GetMol()
            for (begin, end), order in zip(matches, bond_orders):
                # First atom of the match (the metal side of the SMARTS) gains charge
                first = mol.GetAtomWithIdx(begin)
                first.SetFormalCharge(first.GetFormalCharge() + order)
                # Second atom of the match (the non-metal side) loses the same amount
                second = mol.GetAtomWithIdx(end)
                second.SetFormalCharge(second.GetFormalCharge() - order)
                log.info('Removed covalent bond between %s and %s', first.GetSymbol(), second.GetSymbol())
        Chem.SanitizeMol(mol)
        return mol
class Normalization(object):
    """A normalization transform defined by reaction SMARTS."""

    def __init__(self, name, transform):
        """
        :param string name: A name for this Normalization
        :param string transform: Reaction SMARTS to define the transformation.
        """
        log.debug('Initializing Normalization: %s', name)
        self.name = name
        # Only the SMARTS text is stored here; the reaction object is built by the ``transform`` property
        self.transform_str = transform

    @memoized_property
    def transform(self):
        """The reaction built from ``transform_str`` with ``AllChem.ReactionFromSmarts``.

        NOTE(review): presumably ``memoized_property`` caches the result after the first access - confirm in utils.
        """
        log.debug('Loading Normalization transform: %s', self.name)
        return AllChem.ReactionFromSmarts(str(self.transform_str))

    def __repr__(self):
        return 'Normalization({!r}, {!r})'.format(self.name, self.transform_str)

    def __str__(self):
        return self.name


#: The default list of Normalization transforms.
NORMALIZATIONS = (
    # Opposite of #2.1 in InChI technical manual? Covered by RDKit Sanitization.
    Normalization('Nitro to N+(O-)=O', '[N,P,As,Sb;X3:1](=[O,S,Se,Te:2])=[O,S,Se,Te:3]>>[*+1:1]([*-1:2])=[*:3]'),
    Normalization('Sulfone to S(=O)(=O)', '[S+2:1]([O-:2])([O-:3])>>[S+0:1](=[O-0:2])(=[O-0:3])'),
    Normalization('Pyridine oxide to n+O-', '[n:1]=[O:2]>>[n+:1][O-:2]'),
    Normalization('Azide to N=N+=N-', '[*,H:1][N:2]=[N:3]#[N:4]>>[*,H:1][N:2]=[N+:3]=[N-:4]'),
    Normalization('Diazo/azo to =N+=N-', '[*:1]=[N:2]#[N:3]>>[*:1]=[N+:2]=[N-:3]'),
    Normalization('Sulfoxide to -S+(O-)-', '[!O:1][S+0;X3:2](=[O:3])[!O:4]>>[*:1][S+1:2]([O-:3])[*:4]'),
    # Equivalent to #1.5 in InChI technical manual
    Normalization('Phosphate to P(O-)=O', '[O,S,Se,Te;-1:1][P+;D4:2][O,S,Se,Te;-1:3]>>[*+0:1]=[P+0;D5:2][*-1:3]'),
    # Equivalent to #1.8 in InChI technical manual
    Normalization('C/S+N to C/S=N+', '[C,S;X3+1:1]([NX3:2])[NX3!H0:3]>>[*+0:1]([N:2])=[N+:3]'),
    # Equivalent to #1.8 in InChI technical manual
    Normalization('P+N to P=N+', '[P;X4+1:1]([NX3:2])[NX3!H0:3]>>[*+0:1]([N:2])=[N+:3]'),
    Normalization('Normalize hydrazine-diazonium', '[CX4:1][NX3H:2]-[NX3H:3][CX4:4][NX2+:5]#[NX1:6]>>[CX4:1][NH0:2]=[NH+:3][C:4][N+0:5]=[NH:6]'),
    # Equivalent to #1.3 in InChI technical manual
    Normalization('Recombine 1,3-separated charges', '[N,P,As,Sb,O,S,Se,Te;-1:1]-[A+0:2]=[N,P,As,Sb,O,S,Se,Te;+1:3]>>[*-0:1]=[*:2]-[*+0:3]'),
    Normalization('Recombine 1,3-separated charges', '[n,o,p,s;-1:1]:[a:2]=[N,O,P,S;+1:3]>>[*-0:1]:[*:2]-[*+0:3]'),
    Normalization('Recombine 1,3-separated charges', '[N,O,P,S;-1:1]-[a:2]:[n,o,p,s;+1:3]>>[*-0:1]=[*:2]:[*+0:3]'),
    Normalization('Recombine 1,5-separated charges', '[N,P,As,Sb,O,S,Se,Te;-1:1]-[A+0:2]=[A:3]-[A:4]=[N,P,As,Sb,O,S,Se,Te;+1:5]>>[*-0:1]=[*:2]-[*:3]=[*:4]-[*+0:5]'),
    Normalization('Recombine 1,5-separated charges', '[n,o,p,s;-1:1]:[a:2]:[a:3]:[c:4]=[N,O,P,S;+1:5]>>[*-0:1]:[*:2]:[*:3]:[c:4]-[*+0:5]'),
    Normalization('Recombine 1,5-separated charges', '[N,O,P,S;-1:1]-[c:2]:[a:3]:[a:4]:[n,o,p,s;+1:5]>>[*-0:1]=[c:2]:[*:3]:[*:4]:[*+0:5]'),
    # Conjugated cation rules taken from Francis Atkinson's standardiser. Those that can reduce aromaticity aren't included
    Normalization('Normalize 1,3 conjugated cation', '[N,O;+0!H0:1]-[A:2]=[N!$(*[O-]),O;+1H0:3]>>[*+1:1]=[*:2]-[*+0:3]'),
    Normalization('Normalize 1,3 conjugated cation', '[n;+0!H0:1]:[c:2]=[N!$(*[O-]),O;+1H0:3]>>[*+1:1]:[*:2]-[*+0:3]'),
    #Normalization('Normalize 1,3 conjugated cation', '[N,O;+0!H0:1]-[c:2]:[n!$(*[O-]),o;+1H0:3]>>[*+1:1]=[*:2]:[*+0:3]'),
    Normalization('Normalize 1,5 conjugated cation', '[N,O;+0!H0:1]-[A:2]=[A:3]-[A:4]=[N!$(*[O-]),O;+1H0:5]>>[*+1:1]=[*:2]-[*:3]=[*:4]-[*+0:5]'),
    Normalization('Normalize 1,5 conjugated cation', '[n;+0!H0:1]:[a:2]:[a:3]:[c:4]=[N!$(*[O-]),O;+1H0:5]>>[n+1:1]:[*:2]:[*:3]:[*:4]-[*+0:5]'),
    # Normalization('Normalize 1,5 conjugated cation', '[N,O;+0!H0:1]-[c:2]:[a:3]:[a:4]:[n!$(*[O-]),o;+1H0:5]>>[*+1:1]=[c:2]:[*:3]:[*:4]:[*+0:5]'),
    # Normalization('Normalize 1,5 conjugated cation', '[n;+0!H0:1]1:[a:2]:[a:3]:[a:4]:[n!$(*[O-]);+1H0:5]1>>[n+1:1]1:[*:2]:[*:3]:[*:4]:[n+0:5]1'),
    # Normalization('Normalize 1,5 conjugated cation', '[n;+0!H0:1]:[a:2]:[a:3]:[a:4]:[n!$(*[O-]);+1H0:5]>>[n+1:1]:[*:2]:[*:3]:[*:4]:[n+0:5]'),
    # Equivalent to #1.6 in InChI technical manual. RDKit Sanitization handles this for perchlorate.
    Normalization('Charge normalization', '[F,Cl,Br,I,At;-1:1]=[O:2]>>[*-0:1][O-:2]'),
    Normalization('Charge recombination', '[N,P,As,Sb;-1:1]=[C+;v3:2]>>[*+0:1]#[C+0:2]'),
)
# InChI technical manual has many additional rules that cover situations that are disallowed by RDKit


#: The default value for the maximum number of times to attempt to apply the series of normalizations.
MAX_RESTARTS = 200


class Normalizer(object):
    """A class for applying Normalization transforms.

    This class is typically used to apply a series of Normalization transforms to correct functional groups and
    recombine charges. Each transform is repeatedly applied until no further changes occur.
    """

    def __init__(self, normalizations=NORMALIZATIONS, max_restarts=MAX_RESTARTS):
        """Initialize a Normalizer with an optional custom list of :class:`~molvs.normalize.Normalization` transforms.

        :param normalizations: A list of :class:`~molvs.normalize.Normalization` transforms to apply.
        :param int max_restarts: The maximum number of times to attempt to apply the series of normalizations (default
                                 200).
        """
        log.debug('Initializing Normalizer')
        self.normalizations = normalizations
        self.max_restarts = max_restarts

    def __call__(self, mol):
        """Calling a Normalizer instance like a function is the same as calling its normalize(mol) method."""
        return self.normalize(mol)

    def normalize(self, mol):
        """Apply a series of Normalization transforms to correct functional groups and recombine charges.

        A series of transforms are applied to the molecule. For each Normalization, the transform is applied repeatedly
        until no further changes occur. If any changes occurred, we go back and start from the first Normalization
        again, in case the changes mean an earlier transform is now applicable. The molecule is returned once the entire
        series of Normalizations cause no further changes or if max_restarts (default 200) is reached.

        A molecule without any fragments (no atoms) is returned as an unchanged copy.

        :param mol: The molecule to normalize.
        :type mol: rdkit.Chem.rdchem.Mol
        :return: The normalized molecule.
        :rtype: rdkit.Chem.rdchem.Mol
        """
        log.debug('Running Normalizer')
        # Normalize each fragment separately to get around quirky RunReactants behaviour
        fragments = [self._normalize_fragment(fragment) for fragment in Chem.GetMolFrags(mol, asMols=True)]
        if not fragments:
            # Nothing to normalize: without this guard fragments.pop() below would raise IndexError
            return Chem.Mol(mol)
        # Join normalized fragments into a single molecule again. The last fragment is taken as the starting point,
        # which is the order the output atoms have always had; it is kept so atom indices do not change.
        outmol = fragments.pop()
        for fragment in fragments:
            outmol = Chem.CombineMols(outmol, fragment)
        Chem.SanitizeMol(outmol)
        return outmol

    def _normalize_fragment(self, mol):
        """Apply the normalizations to a single fragment, restarting from the first rule after every change.

        :param mol: A single-fragment molecule.
        :return: The normalized fragment. If ``max_restarts`` is exhausted a warning is logged and the current state
                 is returned.
        """
        for _ in six.moves.range(self.max_restarts):
            # Iterate through Normalization transforms and apply each in order
            for normalization in self.normalizations:
                product = self._apply_transform(mol, normalization.transform)
                if product:
                    # If transform changed mol, go back to first rule and apply each again
                    log.info('Rule applied: %s', normalization.name)
                    mol = product
                    break
            else:
                # For loop finishes normally, all applicable transforms have been applied
                return mol
        # If we're still going after max_restarts (default 200), stop and warn, but still return the mol
        log.warning('Gave up normalization after %s restarts', self.max_restarts)
        return mol

    def _apply_transform(self, mol, rule):
        """Repeatedly apply normalization transform to molecule until no changes occur.

        It is possible for multiple products to be produced when a rule is applied. The rule is applied repeatedly to
        each of the products, for at most 20 rounds. Products that fail sanitization are discarded. When a round
        yields no products, the first product of the previous round (sorted alphabetically by SMILES) is returned.

        :param mol: The molecule to transform.
        :param rule: A reaction object providing ``RunReactants``.
        :return: The transformed molecule, or None if the rule was not applicable. None is also returned if the rule
                 was still producing products after 20 rounds, so such a rule is treated by the caller as not applied.
        """
        mols = [mol]
        for n in six.moves.range(20):
            products = {}
            for reactant in mols:
                for product_set in rule.RunReactants((reactant,)):
                    product = product_set[0]
                    # SanitizeMol with catchErrors=True returns 0 on success
                    if Chem.SanitizeMol(product, catchErrors=True) == 0:
                        # Keying by SMILES collapses duplicate products
                        products[Chem.MolToSmiles(product, isomericSmiles=True)] = product
            if not products:
                # If n == 0, the rule was not applicable and we return None
                return mols[0] if n > 0 else None
            mols = [products[s] for s in sorted(products)]
        # The rule never stopped producing products (for example it oscillates between two forms): give up on it.
        return None


# ---- molvs/resonance.py: Resonance (mesomeric) transformations ----

MAX_STRUCTURES = 1000


class ResonanceEnumerator(object):
    """Simple wrapper around RDKit ResonanceMolSupplier.

    The constructor options are translated into ResonanceMolSupplier flags each time :meth:`enumerate` is called.
    """

    def __init__(self, kekule_all=False, allow_incomplete_octets=False, unconstrained_cations=False,
                 unconstrained_anions=False, allow_charge_separation=False, max_structures=MAX_STRUCTURES):
        """

        :param bool allow_incomplete_octets: include resonance structures whose octets are less complete than the most octet-complete structure.
        :param bool allow_charge_separation: include resonance structures featuring charge separation also when uncharged resonance structures exist.
        :param bool kekule_all: enumerate all possible degenerate Kekule resonance structures (the default is to include just one).
        :param bool unconstrained_cations: if False positively charged atoms left and right of N with an incomplete octet are acceptable only if the conjugated group has a positive total formal charge.
        :param bool unconstrained_anions: if False, negatively charged atoms left of N are acceptable only if the conjugated group has a negative total formal charge.
        :param int max_structures: Maximum number of resonance forms.
        """
        self.kekule_all = kekule_all
        self.allow_incomplete_octets = allow_incomplete_octets
        self.unconstrained_cations = unconstrained_cations
        self.unconstrained_anions = unconstrained_anions
        self.allow_charge_separation = allow_charge_separation
        self.max_structures = max_structures

    def __call__(self, mol):
        """Calling a ResonanceEnumerator instance like a function is the same as calling its enumerate(mol) method."""
        return self.enumerate(mol)

    def enumerate(self, mol):
        """Enumerate all possible resonance forms and return them as a list.

        :param mol: The input molecule.
        :type mol: rdkit.Chem.rdchem.Mol
        :return: A list of all possible resonance forms of the molecule (at most ``max_structures``).
        :rtype: list of rdkit.Chem.rdchem.Mol
        """
        # Combine the enabled options into a single bit mask for ResonanceMolSupplier
        flags = 0
        if self.kekule_all:
            flags |= Chem.KEKULE_ALL
        if self.allow_incomplete_octets:
            flags |= Chem.ALLOW_INCOMPLETE_OCTETS
        if self.allow_charge_separation:
            flags |= Chem.ALLOW_CHARGE_SEPARATION
        if self.unconstrained_anions:
            flags |= Chem.UNCONSTRAINED_ANIONS
        if self.unconstrained_cations:
            flags |= Chem.UNCONSTRAINED_CATIONS
        results = []
        for result in Chem.ResonanceMolSupplier(mol, flags=flags, maxStructs=self.max_structures):
            # This seems necessary? ResonanceMolSupplier only does a partial sanitization
            Chem.SanitizeMol(result)
            results.append(result)
        return results

    # Potentially interesting: getNumConjGrps(), getBondConjGrpIdx() and getAtomConjGrpIdx()


def enumerate_resonance_smiles(smiles):
    """Return a set of resonance forms as SMILES strings, given a SMILES string.

    :param smiles: A SMILES string.
    :returns: A set containing SMILES strings for every possible resonance form.
    :rtype: set of strings.
    """
    # No explicit SanitizeMol call is needed here: MolFromSmiles sanitizes by default
    mol = Chem.MolFromSmiles(smiles)
    mesomers = ResonanceEnumerator().enumerate(mol)
    return {Chem.MolToSmiles(m, isomericSmiles=True) for m in mesomers}
class Standardizer(object):
    """The main class for performing standardization of molecules and deriving parent molecules.

    The primary usage is via the :meth:`~molvs.standardize.Standardizer.standardize` method::

        s = Standardizer()
        mol1 = Chem.MolFromSmiles('C1=CC=CC=C1')
        mol2 = s.standardize(mol1)

    There are separate methods to derive fragment, charge, tautomer, isotope and stereo parent molecules.

    """

    def __init__(self, normalizations=NORMALIZATIONS, acid_base_pairs=ACID_BASE_PAIRS,
                 charge_corrections=CHARGE_CORRECTIONS, tautomer_transforms=TAUTOMER_TRANSFORMS,
                 tautomer_scores=TAUTOMER_SCORES, max_restarts=MAX_RESTARTS, max_tautomers=MAX_TAUTOMERS,
                 prefer_organic=PREFER_ORGANIC):
        """Initialize a Standardizer with optional custom parameters.

        :param normalizations: A list of Normalizations to apply (default: :data:`~molvs.normalize.NORMALIZATIONS`).
        :param acid_base_pairs: A list of AcidBasePairs for competitive reionization (default:
                                :data:`~molvs.charge.ACID_BASE_PAIRS`).
        :param charge_corrections: A list of ChargeCorrections to apply (default:
                                   :data:`~molvs.charge.CHARGE_CORRECTIONS`).
        :param tautomer_transforms: A list of TautomerTransforms to apply (default:
                                    :data:`~molvs.tautomer.TAUTOMER_TRANSFORMS`).
        :param tautomer_scores: A list of TautomerScores used to determine canonical tautomer (default:
                                :data:`~molvs.tautomer.TAUTOMER_SCORES`).
        :param max_restarts: The maximum number of times to attempt to apply the series of normalizations (default:
                             :data:`~molvs.normalize.MAX_RESTARTS`).
        :param max_tautomers: The maximum number of tautomers to enumerate (default:
                              :data:`~molvs.tautomer.MAX_TAUTOMERS`).
        :param prefer_organic: Whether to prioritize organic fragments when choosing fragment parent (default:
                               :data:`~molvs.fragment.PREFER_ORGANIC`).
        """
        log.debug('Initializing Standardizer')
        self.normalizations = normalizations
        self.acid_base_pairs = acid_base_pairs
        self.charge_corrections = charge_corrections
        self.tautomer_transforms = tautomer_transforms
        self.tautomer_scores = tautomer_scores
        self.max_restarts = max_restarts
        self.max_tautomers = max_tautomers
        self.prefer_organic = prefer_organic

    def __call__(self, mol):
        """Calling a Standardizer instance like a function is the same as calling its
        :meth:`~molvs.standardize.Standardizer.standardize` method."""
        return self.standardize(mol)

    def standardize(self, mol):
        """Return a standardized version of the given molecule.

        The input molecule is deep-copied first, so it is not modified. The standardization process then consists of
        the following stages, in this order: RDKit :py:func:`~rdkit.Chem.rdmolops.SanitizeMol`, RDKit
        :py:func:`~rdkit.Chem.rdmolops.RemoveHs`, :class:`~molvs.metal.MetalDisconnector`,
        :class:`~molvs.normalize.Normalizer`, :class:`~molvs.charge.Reionizer`, RDKit
        :py:func:`~rdkit.Chem.rdmolops.AssignStereochemistry`.

        :param mol: The molecule to standardize.
        :type mol: rdkit.Chem.rdchem.Mol
        :returns: The standardized molecule.
        :rtype: rdkit.Chem.rdchem.Mol
        """
        mol = copy.deepcopy(mol)
        Chem.SanitizeMol(mol)
        mol = Chem.RemoveHs(mol)
        mol = self.disconnect_metals(mol)
        mol = self.normalize(mol)
        mol = self.reionize(mol)
        Chem.AssignStereochemistry(mol, force=True, cleanIt=True)
        # TODO: Check this removes symmetric stereocenters
        return mol

    def tautomer_parent(self, mol, skip_standardize=False):
        """Return the tautomer parent of a given molecule.

        The canonical tautomer is determined and then standardized again.

        :param mol: The input molecule.
        :type mol: rdkit.Chem.rdchem.Mol
        :param bool skip_standardize: Set to True if mol has already been standardized.
        :returns: The tautomer parent molecule.
        :rtype: rdkit.Chem.rdchem.Mol
        """
        if not skip_standardize:
            mol = self.standardize(mol)
        tautomer = self.canonicalize_tautomer(mol)
        tautomer = self.standardize(tautomer)
        return tautomer

    def fragment_parent(self, mol, skip_standardize=False):
        """Return the fragment parent of a given molecule.

        The fragment parent is the largest organic covalent unit in the molecule.

        :param mol: The input molecule.
        :type mol: rdkit.Chem.rdchem.Mol
        :param bool skip_standardize: Set to True if mol has already been standardized.
        :returns: The fragment parent molecule.
        :rtype: rdkit.Chem.rdchem.Mol
        """
        if not skip_standardize:
            mol = self.standardize(mol)
        # TODO: Consider applying FragmentRemover first to remove salts, solvents?
        fragment = self.largest_fragment(mol)
        return fragment

    def stereo_parent(self, mol, skip_standardize=False):
        """Return the stereo parent of a given molecule.

        The stereo parent has all stereochemistry information removed from tetrahedral centers and double bonds.

        :param mol: The input molecule.
        :type mol: rdkit.Chem.rdchem.Mol
        :param bool skip_standardize: Set to True if mol has already been standardized.
        :returns: The stereo parent molecule.
        :rtype: rdkit.Chem.rdchem.Mol
        """
        if not skip_standardize:
            mol = self.standardize(mol)
        else:
            # standardize() already works on a copy; copy here too so the caller's molecule is left untouched
            mol = copy.deepcopy(mol)
        Chem.RemoveStereochemistry(mol)
        return mol

    def isotope_parent(self, mol, skip_standardize=False):
        """Return the isotope parent of a given molecule.

        The isotope parent has all atoms replaced with the most abundant isotope for that element.

        :param mol: The input molecule.
        :type mol: rdkit.Chem.rdchem.Mol
        :param bool skip_standardize: Set to True if mol has already been standardized.
        :returns: The isotope parent molecule.
        :rtype: rdkit.Chem.rdchem.Mol
        """
        if not skip_standardize:
            mol = self.standardize(mol)
        else:
            # standardize() already works on a copy; copy here too so the caller's molecule is left untouched
            mol = copy.deepcopy(mol)
        # Replace isotopes with common weight
        for atom in mol.GetAtoms():
            atom.SetIsotope(0)
        return mol

    def charge_parent(self, mol, skip_standardize=False):
        """Return the charge parent of a given molecule.

        The charge parent is the uncharged version of the fragment parent.

        :param mol: The input molecule.
        :type mol: rdkit.Chem.rdchem.Mol
        :param bool skip_standardize: Set to True if mol has already been standardized.
        :returns: The charge parent molecule, or None if no fragment parent was found.
        :rtype: rdkit.Chem.rdchem.Mol
        """
        # TODO: All ionized acids and bases should be neutralised.
        if not skip_standardize:
            mol = self.standardize(mol)
        fragment = self.fragment_parent(mol, skip_standardize=True)
        if not fragment:
            return None
        uncharged = self.uncharge(fragment)
        # During final standardization, the Reionizer ensures any remaining charges are in the right places
        uncharged = self.standardize(uncharged)
        return uncharged

    def super_parent(self, mol, skip_standardize=False):
        """Return the super parent of a given molecule.

        The super parent is fragment, charge, isotope, stereochemistry and tautomer insensitive. From the input
        molecule, the largest fragment is taken. This is uncharged and then isotope and stereochemistry information is
        discarded. Finally, the canonical tautomer is determined and returned.

        :param mol: The input molecule.
        :type mol: rdkit.Chem.rdchem.Mol
        :param bool skip_standardize: Set to True if mol has already been standardized.
        :returns: The super parent molecule.
        :rtype: rdkit.Chem.rdchem.Mol
        """
        if not skip_standardize:
            mol = self.standardize(mol)
        # We don't need to get fragment parent, because the charge parent is the largest fragment
        mol = self.charge_parent(mol, skip_standardize=True)
        mol = self.isotope_parent(mol, skip_standardize=True)
        mol = self.stereo_parent(mol, skip_standardize=True)
        mol = self.tautomer_parent(mol, skip_standardize=True)
        mol = self.standardize(mol)
        return mol

    def standardize_with_parents(self, mol):
        """Standardize a molecule and derive its tautomer parent and super parent in one call.

        The molecule is standardized once and that result is reused for both parents.

        :param mol: The input molecule.
        :type mol: rdkit.Chem.rdchem.Mol
        :returns: A dictionary with the keys ``'standardized'``, ``'tautomer_parent'`` and ``'super_parent'``.
        :rtype: dict of rdkit.Chem.rdchem.Mol
        """
        standardized = self.standardize(mol)
        tautomer = self.tautomer_parent(standardized, skip_standardize=True)
        # Named super_parent rather than super so the builtin is not shadowed
        super_parent = self.super_parent(standardized, skip_standardize=True)
        # TODO: Add other parents - have optional argument to specify which are wanted
        mols = {
            'standardized': standardized,
            'tautomer_parent': tautomer,
            'super_parent': super_parent
        }
        return mols

    # TODO: All unique tautomers
    # TODO: All unique fragments (each has to be standardized again?)

    @memoized_property
    def disconnect_metals(self):
        """
        :returns: A callable :class:`~molvs.metal.MetalDisconnector` instance.
        """
        return MetalDisconnector()

    @memoized_property
    def normalize(self):
        """
        :returns: A callable :class:`~molvs.normalize.Normalizer` instance.
        """
        return Normalizer(normalizations=self.normalizations, max_restarts=self.max_restarts)

    @memoized_property
    def reionize(self):
        """
        :returns: A callable :class:`~molvs.charge.Reionizer` instance.
        """
        return Reionizer(acid_base_pairs=self.acid_base_pairs, charge_corrections=self.charge_corrections)

    @memoized_property
    def uncharge(self):
        """
        :returns: A callable :class:`~molvs.charge.Uncharger` instance.
        """
        return Uncharger(acid_base_pairs=self.acid_base_pairs)

    @memoized_property
    def remove_fragments(self):
        """
        :returns: A callable :class:`~molvs.fragment.FragmentRemover` instance.
        """
        return FragmentRemover()

    @memoized_property
    def largest_fragment(self):
        """
        :returns: A callable :class:`~molvs.fragment.LargestFragmentChooser` instance.
        """
        return LargestFragmentChooser(prefer_organic=self.prefer_organic)

    @memoized_property
    def enumerate_tautomers(self):
        """
        :returns: A callable :class:`~molvs.tautomer.TautomerEnumerator` instance.
        """
        return TautomerEnumerator(transforms=self.tautomer_transforms, max_tautomers=self.max_tautomers)

    @memoized_property
    def canonicalize_tautomer(self):
        """
        :returns: A callable :class:`~molvs.tautomer.TautomerCanonicalizer` instance.
        """
        return TautomerCanonicalizer(transforms=self.tautomer_transforms, scores=self.tautomer_scores,
                                     max_tautomers=self.max_tautomers)


def standardize_smiles(smiles):
    """Return a standardized canonical SMILES string given a SMILES string.

    Note: This is a convenience function for quickly standardizing a single SMILES string. It is more efficient to use
    the :class:`~molvs.standardize.Standardizer` class directly when working with many molecules or when custom options
    are needed.

    :param string smiles: The SMILES for the molecule.
    :returns: The SMILES for the standardized molecule.
    :rtype: string.
    """
    # Skip sanitize as standardize does this anyway
    mol = Chem.MolFromSmiles(smiles, sanitize=False)
    mol = Standardizer().standardize(mol)
    return Chem.MolToSmiles(mol, isomericSmiles=True)


def enumerate_tautomers_smiles(smiles):
    """Return a set of tautomers as SMILES strings, given a SMILES string.

    The molecule is standardized with a default :class:`~molvs.standardize.Standardizer` before its tautomers are
    enumerated with a default :class:`~molvs.tautomer.TautomerEnumerator`.

    :param smiles: A SMILES string.
    :returns: A set containing SMILES strings for every possible tautomer.
    :rtype: set of strings.
    """
    # Skip sanitize as standardize does this anyway
    mol = Chem.MolFromSmiles(smiles, sanitize=False)
    mol = Standardizer().standardize(mol)
    tautomers = TautomerEnumerator().enumerate(mol)
    return {Chem.MolToSmiles(m, isomericSmiles=True) for m in tautomers}


def canonicalize_tautomer_smiles(smiles):
    """Return a standardized canonical tautomer SMILES string given a SMILES string.

    Note: This is a convenience function for quickly standardizing and finding the canonical tautomer for a single
    SMILES string. It is more efficient to use the :class:`~molvs.standardize.Standardizer` class directly when working
    with many molecules or when custom options are needed.

    :param string smiles: The SMILES for the molecule.
    :returns: The SMILES for the standardized canonical tautomer.
    :rtype: string.
    """
    # Skip sanitize as standardize does this anyway
    mol = Chem.MolFromSmiles(smiles, sanitize=False)
    mol = Standardizer().standardize(mol)
    tautomer = TautomerCanonicalizer().canonicalize(mol)
    return Chem.MolToSmiles(tautomer, isomericSmiles=True)
class TautomerTransform(object):
    """A rule describing how one tautomer is converted into another.

    The rule is a SMARTS pattern in which a hydrogen moves from the first atom of the pattern to the last atom. Unless
    told otherwise, the alternating single and double bonds along the pattern are swapped to account for the hydrogen
    movement. A transform may instead spell out the resulting bond orders and the resulting atom charges.
    """

    # Symbols accepted in the ``bonds`` and ``charges`` specification strings
    BONDMAP = {'-': BondType.SINGLE, '=': BondType.DOUBLE, '#': BondType.TRIPLE, ':': BondType.AROMATIC}
    CHARGEMAP = {'+': 1, '0': 0, '-': -1}

    def __init__(self, name, smarts, bonds=(), charges=(), radicals=()):
        """Create a TautomerTransform from a name, a SMARTS pattern and optional bond and charge specifications.

        The SMARTS pattern match is applied to a Kekule form of the molecule, so use explicit single and double bonds
        rather than aromatic.

        Custom bonds are given as a string made of ``-``, ``=``, ``#`` and ``:`` (single, double, triple and aromatic
        bonds). Custom charges are given as a string made of ``+``, ``0`` and ``-`` (+1, 0 and -1 charges). Any other
        character raises KeyError.

        :param string name: A name for this TautomerTransform.
        :param string smarts: SMARTS pattern to match for the transform.
        :param string bonds: Optional specification for the resulting bonds.
        :param string charges: Optional specification for the resulting charges on the atoms.
        :param radicals: Accepted, but neither stored nor used by this initializer.
        """
        self.name = name
        self.tautomer_str = smarts
        # Translate each specification symbol into its RDKit bond type / integer charge
        bond_types = []
        for symbol in bonds:
            bond_types.append(self.BONDMAP[symbol])
        self.bonds = bond_types
        charge_values = []
        for symbol in charges:
            charge_values.append(self.CHARGEMAP[symbol])
        self.charges = charge_values
        # TODO: Raise error (ValueError?) if bonds and charges lists are not the correct length

    @memoized_property
    def tautomer(self):
        """The query molecule built from the SMARTS pattern with ``Chem.MolFromSmarts``."""
        return Chem.MolFromSmarts(self.tautomer_str)

    def __repr__(self):
        parts = (self.name, self.tautomer_str, self.bonds, self.charges)
        return 'TautomerTransform({!r}, {!r}, {!r}, {!r})'.format(*parts)

    def __str__(self):
        return self.name


class TautomerScore(object):
    """A SMARTS-defined substructure together with the score it contributes when choosing the canonical tautomer."""

    def __init__(self, name, smarts, score):
        """Create a TautomerScore from a name, a SMARTS pattern and a score.

        :param name: A name for this TautomerScore.
        :param smarts: SMARTS pattern to match a substructure.
        :param score: The score to assign for this substructure.
        """
        self.name = name
        self.smarts_str = smarts
        self.score = score

    @memoized_property
    def smarts(self):
        """The query molecule built from the SMARTS pattern with ``Chem.MolFromSmarts``."""
        return Chem.MolFromSmarts(self.smarts_str)

    def __repr__(self):
        parts = (self.name, self.smarts_str, self.score)
        return 'TautomerScore({!r}, {!r}, {!r})'.format(*parts)

    def __str__(self):
        return self.name


#: The default list of TautomerTransforms.
TAUTOMER_TRANSFORMS = (
    # Each entry is TautomerTransform(name, smarts[, bonds][, charges]). In every pattern the hydrogen moves from the
    # first atom to the last atom. When ``bonds`` is not given, the alternating single and double bonds along the
    # pattern are swapped; when it is given, it lists the resulting bonds (``-`` single, ``=`` double, ``#`` triple).
    # ``charges`` lists the resulting atom charges (``+``, ``0``, ``-``).
    # NOTE(review): the "f" / "r" name suffixes presumably mean forward / reverse direction - confirm.
    # NOTE(review): TautomerTransform documents that patterns are matched against a Kekule form, yet a few patterns
    # below use lowercase (aromatic) atoms such as [c] or nX2 - confirm these can still match.
    TautomerTransform('1,3 (thio)keto/enol f', '[CX4!H0]-[C]=[O,S,Se,Te;X1]'),
    TautomerTransform('1,3 (thio)keto/enol r', '[O,S,Se,Te;X2!H0]-[C]=[C]'),
    TautomerTransform('1,5 (thio)keto/enol f', '[CX4,NX3;!H0]-[C]=[C][CH0]=[O,S,Se,Te;X1]'),
    TautomerTransform('1,5 (thio)keto/enol r', '[O,S,Se,Te;X2!H0]-[CH0]=[C]-[C]=[C,N]'),
    TautomerTransform('aliphatic imine f', '[CX4!H0]-[C]=[NX2]'),
    TautomerTransform('aliphatic imine r', '[NX3!H0]-[C]=[CX3]'),
    TautomerTransform('special imine f', '[N!H0]-[C]=[CX3R0]'),
    TautomerTransform('special imine r', '[CX4!H0]-[c]=[n]'),
    TautomerTransform('1,3 aromatic heteroatom H shift f', '[#7!H0]-[#6R1]=[O,#7X2]'),
    TautomerTransform('1,3 aromatic heteroatom H shift r', '[O,#7;!H0]-[#6R1]=[#7X2]'),
    TautomerTransform('1,3 heteroatom H shift', '[#7,S,O,Se,Te;!H0]-[#7X2,#6,#15]=[#7,#16,#8,Se,Te]'),
    TautomerTransform('1,5 aromatic heteroatom H shift', '[#7,#16,#8;!H0]-[#6,#7]=[#6]-[#6,#7]=[#7,#16,#8;H0]'),
    TautomerTransform('1,5 aromatic heteroatom H shift f', '[#7,#16,#8,Se,Te;!H0]-[#6,nX2]=[#6,nX2]-[#6,#7X2]=[#7X2,S,O,Se,Te]'),
    TautomerTransform('1,5 aromatic heteroatom H shift r', '[#7,S,O,Se,Te;!H0]-[#6,#7X2]=[#6,nX2]-[#6,nX2]=[#7,#16,#8,Se,Te]'),
    TautomerTransform('1,7 aromatic heteroatom H shift f', '[#7,#8,#16,Se,Te;!H0]-[#6,#7X2]=[#6,#7X2]-[#6,#7X2]=[#6]-[#6,#7X2]=[#7X2,S,O,Se,Te,CX3]'),
    TautomerTransform('1,7 aromatic heteroatom H shift r', '[#7,S,O,Se,Te,CX4;!H0]-[#6,#7X2]=[#6]-[#6,#7X2]=[#6,#7X2]-[#6,#7X2]=[NX2,S,O,Se,Te]'),
    TautomerTransform('1,9 aromatic heteroatom H shift f', '[#7,O;!H0]-[#6,#7X2]=[#6,#7X2]-[#6,#7X2]=[#6,#7X2]-[#6,#7X2]=[#6,#7X2]-[#6,#7X2]=[#7,O]'),
    TautomerTransform('1,11 aromatic heteroatom H shift f', '[#7,O;!H0]-[#6,nX2]=[#6,nX2]-[#6,nX2]=[#6,nX2]-[#6,nX2]=[#6,nX2]-[#6,nX2]=[#6,nX2]-[#6,nX2]=[#7X2,O]'),
    TautomerTransform('furanone f', '[O,S,N;!H0]-[#6r5]=[#6X3r5;$([#6]([#6r5])=[#6r5])]'),
    TautomerTransform('furanone r', '[#6r5!H0;$([#6]([#6r5])[#6r5])]-[#6r5]=[O,S,N]'),
    # The transforms below with an explicit ``bonds`` argument state the resulting bond orders directly
    TautomerTransform('keten/ynol f', '[C!H0]=[C]=[O,S,Se,Te;X1]', bonds='#-'),
    TautomerTransform('keten/ynol r', '[O,S,Se,Te;!H0X2]-[C]#[C]', bonds='=='),
    TautomerTransform('ionic nitro/aci-nitro f', '[C!H0]-[N+;$([N][O-])]=[O]'),
    TautomerTransform('ionic nitro/aci-nitro r', '[O!H0]-[N+;$([N][O-])]=[C]'),
    TautomerTransform('oxim/nitroso f', '[O!H0]-[N]=[C]'),
    TautomerTransform('oxim/nitroso r', '[C!H0]-[N]=[O]'),
    TautomerTransform('oxim/nitroso via phenol f', '[O!H0]-[N]=[C]-[C]=[C]-[C]=[OH0]'),
    TautomerTransform('oxim/nitroso via phenol r', '[O!H0]-[c]=[c]-[c]=[c]-[N]=[OH0]'),
    TautomerTransform('cyano/iso-cyanic acid f', '[O!H0]-[C]#[N]', bonds='=='),
    TautomerTransform('cyano/iso-cyanic acid r', '[N!H0]=[C]=[O]', bonds='#-'),
    # TautomerTransform('formamidinesulfinic acid f', '[O,N;!H0]-[C]=[S,Se,Te]=[O]', bonds='=--'),  # TODO: WAT!?
    # TautomerTransform('formamidinesulfinic acid r', '[O!H0]-[S,Se,Te]-[C]=[O,N]', bonds='=--'),
    # The isocyanide transforms keep the triple bond and set the resulting atom charges explicitly
    TautomerTransform('isocyanide f', '[C-0!H0]#[N+0]', bonds='#', charges='-+'),
    TautomerTransform('isocyanide r', '[N+!H0]#[C-]', bonds='#', charges='-+'),
    TautomerTransform('phosphonic acid f', '[OH]-[PH0]', bonds='='),
    TautomerTransform('phosphonic acid r', '[PH]=[O]', bonds='-'),
)

#: The default list of TautomerScores.
133 | TAUTOMER_SCORES = ( 134 | TautomerScore('benzoquinone', '[#6]1([#6]=[#6][#6]([#6]=[#6]1)=,:[N,S,O])=,:[N,S,O]', 25), 135 | TautomerScore('oxim', '[#6]=[N][OH]', 4), 136 | TautomerScore('C=O', '[#6]=,:[#8]', 2), 137 | TautomerScore('N=O', '[#7]=,:[#8]', 2), 138 | TautomerScore('P=O', '[#15]=,:[#8]', 2), 139 | TautomerScore('C=hetero', '[#6]=[!#1;!#6]', 1), 140 | TautomerScore('methyl', '[CX4H3]', 1), 141 | TautomerScore('guanidine terminal=N', '[#7][#6](=[NR0])[#7H0]', 1), 142 | TautomerScore('guanidine endocyclic=N', '[#7;R][#6;R]([N])=[#7;R]', 2), 143 | TautomerScore('aci-nitro', '[#6]=[N+]([O-])[OH]', -4), 144 | ) 145 | 146 | #: The default value for the maximum number of tautomers to enumerate, a limit to prevent combinatorial explosion. 147 | MAX_TAUTOMERS = 1000 148 | 149 | 150 | class TautomerCanonicalizer(object): 151 | """ 152 | 153 | """ 154 | 155 | def __init__(self, transforms=TAUTOMER_TRANSFORMS, scores=TAUTOMER_SCORES, max_tautomers=MAX_TAUTOMERS): 156 | """ 157 | 158 | :param transforms: A list of TautomerTransforms to use to enumerate tautomers. 159 | :param scores: A list of TautomerScores to use to choose the canonical tautomer. 160 | :param max_tautomers: The maximum number of tautomers to enumerate, a limit to prevent combinatorial explosion. 161 | """ 162 | self.transforms = transforms 163 | self.scores = scores 164 | self.max_tautomers = max_tautomers 165 | 166 | def __call__(self, mol): 167 | """Calling a TautomerCanonicalizer instance like a function is the same as calling its canonicalize(mol) method.""" 168 | return self.canonicalize(mol) 169 | 170 | def canonicalize(self, mol): 171 | """Return a canonical tautomer by enumerating and scoring all possible tautomers. 172 | 173 | :param mol: The input molecule. 174 | :type mol: rdkit.Chem.rdchem.Mol 175 | :return: The canonical tautomer. 
176 | :rtype: rdkit.Chem.rdchem.Mol 177 | """ 178 | # TODO: Overload the mol parameter to pass a list of pre-enumerated tautomers 179 | tautomers = self._enumerate_tautomers(mol) 180 | if len(tautomers) == 1: 181 | return tautomers[0] 182 | # Calculate score for each tautomer 183 | highest = None 184 | for t in tautomers: 185 | smiles = Chem.MolToSmiles(t, isomericSmiles=True) 186 | log.debug('Tautomer: %s', smiles) 187 | score = 0 188 | # Add aromatic ring scores 189 | ssr = Chem.GetSymmSSSR(t) 190 | for ring in ssr: 191 | btypes = {t.GetBondBetweenAtoms(*pair).GetBondType() for pair in pairwise(ring)} 192 | elements = {t.GetAtomWithIdx(idx).GetAtomicNum() for idx in ring} 193 | if btypes == {BondType.AROMATIC}: 194 | log.debug('Score +100 (aromatic ring)') 195 | score += 100 196 | if elements == {6}: 197 | log.debug('Score +150 (carbocyclic aromatic ring)') 198 | score += 150 199 | # Add SMARTS scores 200 | for tscore in self.scores: 201 | for match in t.GetSubstructMatches(tscore.smarts): 202 | log.debug('Score %+d (%s)', tscore.score, tscore.name) 203 | score += tscore.score 204 | # Add (P,S,Se,Te)-H scores 205 | for atom in t.GetAtoms(): 206 | if atom.GetAtomicNum() in {15, 16, 34, 52}: 207 | hs = atom.GetTotalNumHs() 208 | if hs: 209 | log.debug('Score %+d (%s-H bonds)', -hs, atom.GetSymbol()) 210 | score -= hs 211 | # Set as highest if score higher or if score equal and smiles comes first alphabetically 212 | if not highest or highest['score'] < score or (highest['score'] == score and smiles < highest['smiles']): 213 | log.debug('New highest tautomer: %s (%s)', smiles, score) 214 | highest = {'smiles': smiles, 'tautomer': t, 'score': score} 215 | return highest['tautomer'] 216 | 217 | @memoized_property 218 | def _enumerate_tautomers(self): 219 | return TautomerEnumerator(self.transforms, self.max_tautomers) 220 | 221 | 222 | class TautomerEnumerator(object): 223 | """ 224 | 225 | """ 226 | 227 | def __init__(self, transforms=TAUTOMER_TRANSFORMS, 
max_tautomers=MAX_TAUTOMERS): 228 | """ 229 | 230 | :param transforms: A list of TautomerTransforms to use to enumerate tautomers. 231 | :param max_tautomers: The maximum number of tautomers to enumerate (limit to prevent combinatorial explosion). 232 | """ 233 | self.transforms = transforms 234 | self.max_tautomers = max_tautomers 235 | 236 | def __call__(self, mol): 237 | """Calling a TautomerEnumerator instance like a function is the same as calling its enumerate(mol) method.""" 238 | return self.enumerate(mol) 239 | 240 | def enumerate(self, mol): 241 | """Enumerate all possible tautomers and return them as a list. 242 | 243 | :param mol: The input molecule. 244 | :type mol: rdkit.Chem.rdchem.Mol 245 | :return: A list of all possible tautomers of the molecule. 246 | :rtype: list of rdkit.Chem.rdchem.Mol 247 | """ 248 | smiles = Chem.MolToSmiles(mol, isomericSmiles=True) 249 | tautomers = {smiles: copy.deepcopy(mol)} 250 | # Create a kekulized form of the molecule to match the SMARTS against 251 | kekulized = copy.deepcopy(mol) 252 | Chem.Kekulize(kekulized) 253 | kekulized = {smiles: kekulized} 254 | done = set() 255 | while len(tautomers) < self.max_tautomers: 256 | for tsmiles in sorted(tautomers): 257 | if tsmiles in done: 258 | continue 259 | for transform in self.transforms: 260 | for match in kekulized[tsmiles].GetSubstructMatches(transform.tautomer): 261 | # log.debug('Matched rule: %s to %s for %s', transform.name, tsmiles, match) 262 | # Create a copy of in the input molecule so we can modify it 263 | # Use kekule form so bonds are explicitly single/double instead of aromatic 264 | product = copy.deepcopy(kekulized[tsmiles]) 265 | # Remove a hydrogen from the first matched atom and add one to the last 266 | first = product.GetAtomWithIdx(match[0]) 267 | last = product.GetAtomWithIdx(match[-1]) 268 | # log.debug('%s: H%s -> H%s' % (first.GetSymbol(), first.GetTotalNumHs(), first.GetTotalNumHs() - 1)) 269 | # log.debug('%s: H%s -> H%s' % 
(last.GetSymbol(), last.GetTotalNumHs(), last.GetTotalNumHs() + 1)) 270 | first.SetNumExplicitHs(max(0, first.GetTotalNumHs() - 1)) 271 | last.SetNumExplicitHs(last.GetTotalNumHs() + 1) 272 | # Remove any implicit hydrogens from the first and last atoms now we have set the count explicitly 273 | first.SetNoImplicit(True) 274 | last.SetNoImplicit(True) 275 | # Adjust bond orders 276 | for bi, pair in enumerate(pairwise(match)): 277 | if transform.bonds: 278 | # Set the resulting bond types as manually specified in the transform 279 | # log.debug('%s-%s: %s -> %s' % (product.GetAtomWithIdx(pair[0]).GetSymbol(), product.GetAtomWithIdx(pair[1]).GetSymbol(), product.GetBondBetweenAtoms(*pair).GetBondType(), transform.bonds[bi])) 280 | product.GetBondBetweenAtoms(*pair).SetBondType(transform.bonds[bi]) 281 | else: 282 | # If no manually specified bond types, just swap single and double bonds 283 | current_bond_type = product.GetBondBetweenAtoms(*pair).GetBondType() 284 | product.GetBondBetweenAtoms(*pair).SetBondType(BondType.DOUBLE if current_bond_type == BondType.SINGLE else BondType.SINGLE) 285 | # log.debug('%s-%s: %s -> %s' % (product.GetAtomWithIdx(pair[0]).GetSymbol(), product.GetAtomWithIdx(pair[1]).GetSymbol(), current_bond_type, product.GetBondBetweenAtoms(*pair).GetBondType())) 286 | # Adjust charges 287 | if transform.charges: 288 | for ci, idx in enumerate(match): 289 | atom = product.GetAtomWithIdx(idx) 290 | # log.debug('%s: C%s -> C%s' % (atom.GetSymbol(), atom.GetFormalCharge(), atom.GetFormalCharge() + transform.charges[ci])) 291 | atom.SetFormalCharge(atom.GetFormalCharge() + transform.charges[ci]) 292 | try: 293 | Chem.SanitizeMol(product) 294 | smiles = Chem.MolToSmiles(product, isomericSmiles=True) 295 | log.debug('Applied rule: %s to %s', transform.name, tsmiles) 296 | if smiles not in tautomers: 297 | log.debug('New tautomer produced: %s' % smiles) 298 | kekulized_product = copy.deepcopy(product) 299 | Chem.Kekulize(kekulized_product) 300 | 
tautomers[smiles] = product 301 | kekulized[smiles] = kekulized_product 302 | else: 303 | log.debug('Previous tautomer produced again: %s' % smiles) 304 | except ValueError: 305 | log.debug('ValueError Applying rule: %s', transform.name) 306 | done.add(tsmiles) 307 | if len(tautomers) == len(done): 308 | break 309 | else: 310 | log.warning('Tautomer enumeration stopped at maximum %s', self.max_tautomers) 311 | # Clean up stereochemistry 312 | for tautomer in tautomers.values(): 313 | Chem.AssignStereochemistry(tautomer, force=True, cleanIt=True) 314 | for bond in tautomer.GetBonds(): 315 | if bond.GetBondType() == BondType.DOUBLE and bond.GetStereo() > BondStereo.STEREOANY: 316 | begin = bond.GetBeginAtomIdx() 317 | end = bond.GetEndAtomIdx() 318 | for othertautomer in tautomers.values(): 319 | if not othertautomer.GetBondBetweenAtoms(begin, end).GetBondType() == BondType.DOUBLE: 320 | neighbours = tautomer.GetAtomWithIdx(begin).GetBonds() + tautomer.GetAtomWithIdx(end).GetBonds() 321 | for otherbond in neighbours: 322 | if otherbond.GetBondDir() in {BondDir.ENDUPRIGHT, BondDir.ENDDOWNRIGHT}: 323 | otherbond.SetBondDir(BondDir.NONE) 324 | Chem.AssignStereochemistry(tautomer, force=True, cleanIt=True) 325 | log.debug('Removed stereochemistry from unfixed double bond') 326 | break 327 | return list(tautomers.values()) 328 | -------------------------------------------------------------------------------- /molvs/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | molvs.utils 4 | ~~~~~~~~~~~ 5 | 6 | This module contains miscellaneous utility functions. 
7 | 8 | """ 9 | 10 | from __future__ import print_function 11 | from __future__ import unicode_literals 12 | from __future__ import division 13 | import functools 14 | from itertools import tee 15 | 16 | import six 17 | 18 | 19 | def memoized_property(fget): 20 | """Decorator to create memoized properties.""" 21 | attr_name = '_{}'.format(fget.__name__) 22 | 23 | @functools.wraps(fget) 24 | def fget_memoized(self): 25 | if not hasattr(self, attr_name): 26 | setattr(self, attr_name, fget(self)) 27 | return getattr(self, attr_name) 28 | return property(fget_memoized) 29 | 30 | 31 | def pairwise(iterable): 32 | """Utility function to iterate in a pairwise fashion.""" 33 | a, b = tee(iterable) 34 | next(b, None) 35 | return six.moves.zip(a, b) 36 | -------------------------------------------------------------------------------- /molvs/validate.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | molvs.validate 4 | ~~~~~~~~~~~~~~ 5 | 6 | This module contains the main :class:`~molvs.validate.Validator` class that can be used to perform all 7 | :class:`Validations `, as well as the :func:`~molvs.validate.validate_smiles()` 8 | convenience function. 9 | 10 | """ 11 | 12 | from __future__ import print_function 13 | from __future__ import unicode_literals 14 | from __future__ import division 15 | import logging 16 | import sys 17 | 18 | from rdkit import Chem 19 | 20 | from .errors import StopValidateError 21 | from .validations import VALIDATIONS 22 | 23 | 24 | #: The default format for log messages. 25 | SIMPLE_FORMAT = '%(levelname)s: [%(validation)s] %(message)s' 26 | 27 | #: A more detailed format for log messages. Specify when initializing a Validator. 
28 | LONG_FORMAT = '%(asctime)s - %(levelname)s - %(validation)s - %(message)s' 29 | 30 | 31 | class LogHandler(logging.Handler): 32 | """A simple logging Handler that just stores logs in an array until flushed.""" 33 | 34 | def __init__(self): 35 | logging.Handler.__init__(self) 36 | self.logs = [] 37 | 38 | @property 39 | def logmessages(self): 40 | return [self.format(record) for record in self.logs] 41 | 42 | def emit(self, record): 43 | """Append the record.""" 44 | self.logs.append(record) 45 | 46 | def flush(self): 47 | """Clear the log records.""" 48 | self.acquire() 49 | try: 50 | self.logs = [] 51 | finally: 52 | self.release() 53 | 54 | def close(self): 55 | """Close the handler.""" 56 | self.flush() 57 | logging.Handler.close(self) 58 | 59 | 60 | class Validator(object): 61 | """The main class for running :class:`Validations ` on molecules.""" 62 | 63 | def __init__(self, validations=VALIDATIONS, log_format=SIMPLE_FORMAT, level=logging.INFO, stdout=False, raw=False): 64 | """Initialize a Validator with the following parameters: 65 | 66 | :param validations: A list of Validations to apply (default: :data:`~molvs.validations.VALIDATIONS`). 67 | :param string log_format: A string format (default: :data:`~molvs.validate.SIMPLE_FORMAT`). 68 | :param level: The minimum logging level to output. 69 | :param bool stdout: Whether to send log messages to standard output. 70 | :param bool raw: Whether to return raw :class:`~logging.LogRecord` objects instead of formatted log strings. 
71 | """ 72 | self.raw = raw 73 | # Set up logger and add default LogHandler 74 | self.log = logging.getLogger(type(self).__name__) 75 | self.log.setLevel(level) 76 | self.handler = LogHandler() 77 | self.handler.setFormatter(logging.Formatter(log_format)) 78 | self.log.addHandler(self.handler) 79 | # Add stdout StreamHandler if specified in parameters 80 | if stdout: 81 | strhdlr = logging.StreamHandler(sys.stdout) 82 | strhdlr.setFormatter(logging.Formatter(log_format)) 83 | self.log.addHandler(strhdlr) 84 | # Instantiate the validations 85 | self.validations = [validation(self.log) for validation in validations] 86 | 87 | def __call__(self, mol): 88 | """Calling a Validator instance like a function is the same as calling its 89 | :meth:`~molvs.validate.Validator.validate` method.""" 90 | return self.validate(mol) 91 | 92 | def validate(self, mol): 93 | """""" 94 | # Clear any log messages from previous runs 95 | self.handler.flush() 96 | # Run every validation, stopping if StopValidateError is raised 97 | for validation in self.validations: 98 | try: 99 | validation(mol) 100 | except StopValidateError: 101 | break 102 | return self.handler.logs if self.raw else self.handler.logmessages 103 | 104 | 105 | def validate_smiles(smiles): 106 | """Return log messages for a given SMILES string using the default validations. 107 | 108 | Note: This is a convenience function for quickly validating a single SMILES string. It is more efficient to use 109 | the :class:`~molvs.validate.Validator` class directly when working with many molecules or when custom options 110 | are needed. 111 | 112 | :param string smiles: The SMILES for the molecule. 113 | :returns: A list of log messages. 114 | :rtype: list of strings. 
115 | """ 116 | # Skip sanitize as standardize does this anyway 117 | mol = Chem.MolFromSmiles(smiles) 118 | logs = Validator().validate(mol) 119 | return logs 120 | -------------------------------------------------------------------------------- /molvs/validations.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | molvs.validations 4 | ~~~~~~~~~~~~~~~~~ 5 | 6 | This module contains all the built-in :class:`Validations `. 7 | 8 | """ 9 | 10 | from __future__ import print_function 11 | from __future__ import unicode_literals 12 | from __future__ import division 13 | import logging 14 | 15 | from rdkit import Chem 16 | 17 | from .errors import StopValidateError 18 | from .fragment import REMOVE_FRAGMENTS 19 | 20 | 21 | class Validation(object): 22 | """The base class that all :class:`~molvs.validations.Validation` subclasses must inherit from.""" 23 | 24 | def __init__(self, log): 25 | self.log = logging.LoggerAdapter(log, {'validation': type(self).__name__}) 26 | 27 | def __call__(self, mol): 28 | try: 29 | self.log.debug('Running %s', type(self).__name__) 30 | self.run(mol) 31 | except Exception as e: 32 | if isinstance(e, StopValidateError): 33 | raise e 34 | else: 35 | self.log.debug('Validation failed: %s', e) 36 | 37 | def run(self, mol): 38 | """""" 39 | raise NotImplementedError("Validation subclasses must implement the run method") 40 | 41 | 42 | class SmartsValidation(Validation): 43 | """Abstract superclass for :class:`Validations ` that log a message if a SMARTS 44 | pattern matches the molecule. 45 | 46 | Subclasses can override the following attributes: 47 | """ 48 | 49 | #: The logging level of the message. 50 | level = logging.INFO 51 | 52 | #: The message to log if the SMARTS pattern matches the molecule. 53 | message = 'Molecule matched %(smarts)s' 54 | 55 | #: Whether the SMARTS pattern should match an entire covalent unit. 
56 | entire_fragment = False 57 | 58 | def __init__(self, log): 59 | super(SmartsValidation, self).__init__(log) 60 | self._smarts = Chem.MolFromSmarts(self.smarts) 61 | 62 | @property 63 | def smarts(self): 64 | """The SMARTS pattern as a string. Subclasses must implement this.""" 65 | raise NotImplementedError('SmartsValidation subclasses must have a smarts attribute') 66 | 67 | def _check_matches(self, mol): 68 | if mol.HasSubstructMatch(self._smarts): 69 | self.log.log(self.level, self.message, {'smarts': self.smarts}) 70 | 71 | def _check_matches_fragment(self, mol): 72 | matches = frozenset(frozenset(match) for match in mol.GetSubstructMatches(self._smarts)) 73 | fragments = frozenset(frozenset(frag) for frag in Chem.GetMolFrags(mol)) 74 | if matches & fragments: 75 | self.log.log(self.level, self.message, {'smarts': self.smarts}) 76 | 77 | def run(self, mol): 78 | if self.entire_fragment: 79 | self._check_matches_fragment(mol) 80 | else: 81 | self._check_matches(mol) 82 | 83 | 84 | class IsNoneValidation(Validation): 85 | """Logs an error if ``None`` is passed to the Validator. 86 | 87 | This can happen if RDKit failed to parse an input format. If the molecule is ``None``, no subsequent validations 88 | will run. 89 | """ 90 | 91 | def run(self, mol): 92 | if mol is None: 93 | self.log.error('Molecule is None') 94 | raise StopValidateError() 95 | 96 | 97 | class NoAtomValidation(Validation): 98 | """Logs an error if the molecule has zero atoms. 99 | 100 | If the molecule has no atoms, no subsequent validations will run. 101 | """ 102 | 103 | def run(self, mol): 104 | if mol.GetNumAtoms() == 0: 105 | self.log.error('No atoms are present') 106 | raise StopValidateError() 107 | 108 | 109 | class DichloroethaneValidation(SmartsValidation): 110 | """Logs if 1,2-dichloroethane is present. 111 | 112 | This is provided as an example of how to subclass :class:`~molvs.validations.SmartsValidation` to check for the 113 | presence of a substructure. 
114 | """ 115 | level = logging.INFO 116 | smarts = '[Cl]-[#6]-[#6]-[Cl]' 117 | entire_fragment = True 118 | message = '1,2-Dichloroethane is present' 119 | 120 | 121 | class FragmentValidation(Validation): 122 | """Logs if certain fragments are present. 123 | 124 | Subclass and override the ``fragments`` class attribute to customize the list of 125 | :class:`FragmentPatterns `. 126 | """ 127 | 128 | #: A list of :class:`FragmentPatterns ` to check for. 129 | fragments = REMOVE_FRAGMENTS 130 | 131 | def run(self, mol): 132 | for fp in self.fragments: 133 | matches = frozenset(frozenset(match) for match in mol.GetSubstructMatches(fp.smarts)) 134 | fragments = frozenset(frozenset(frag) for frag in Chem.GetMolFrags(mol)) 135 | if matches & fragments: 136 | self.log.info('%s is present', fp.name) 137 | 138 | 139 | class NeutralValidation(Validation): 140 | """Logs if not an overall neutral system.""" 141 | 142 | def run(self, mol): 143 | charge = Chem.GetFormalCharge(mol) 144 | if not charge == 0: 145 | chargestring = '+%s' % charge if charge > 0 else '%s' % charge 146 | self.log.info('Not an overall neutral system (%s)', chargestring) 147 | 148 | 149 | class IsotopeValidation(Validation): 150 | """Logs if molecule contains isotopes.""" 151 | 152 | def run(self, mol): 153 | isotopes = set() 154 | for atom in mol.GetAtoms(): 155 | isotope = atom.GetIsotope() 156 | if not isotope == 0: 157 | isotopes.add('%s%s' % (isotope, atom.GetSymbol())) 158 | for isotope in isotopes: 159 | self.log.info('Molecule contains isotope %s', isotope) 160 | 161 | 162 | #: The default list of :class:`Validations ` used by :class:`~molvs.validate.Validator`. 
# IsNoneValidation and NoAtomValidation come first: both raise StopValidateError on failure.
VALIDATIONS = (
    IsNoneValidation,
    NoAtomValidation,
    #DichloroethaneValidation,
    FragmentValidation,
    NeutralValidation,
    IsotopeValidation,
)



# - WARN/ERROR: Are all atoms defined/real - no query atoms or invalid elements, r-group things
# - INFO: Contains unknown stereo (Perform stereochemistry perception first?)
# - INFO: Nonstandard tautomer (log SMILES of tautomer parent, or the name of the tautomer transform?)
# - WARN: InChI generation failed
# - WARN: Contains covalent bond to metal (that would be broken by MetalDisconnector)
# - WARN: Contains solvent molecules (in addition to other fragments)
# - WARN: More than 99 rings causes problems with SMILES
# - INFO: Cis azo dye is unusual
# - WARN: Adjacent atoms with like charges (i.e. both positive or both negative)
# - INFO: Has more than one radical centre
# - INFO: ethane, methane molecules present
# - INFO: Boron, Sulfur atoms with no explicit bonds
# - INFO: Solvent molecules present (only if also other fragments)
# - INFO: One unknown stereocentre and no defined stereocentres (probably racemate, so info not warn)
# - WARN: More than one undefined stereocentre and no defined stereocentres
# - INFO: One undefined stereocentre and at least one defined stereocentre (epimer or mixture of anomers, so info not warn)
# - WARN: More than one undefined stereocentre and at least one defined stereocentre
# - INFO: Unknown double bond stereochemistry
# - WARN: Ring containing stereobonds?
# - INFO: Not canonical tautomer


# Coordinates?
# Info - Lack of coordinates? Uneven bond lengths?
198 | 199 | # Web services (needs to be optional) 200 | # Info - Could not match to ChemSpider ID, PubChem CID 201 | # UniChem from EBI could be useful here, otherwise use each API directly 202 | 203 | 204 | 205 | 206 | # Allow definition of MolSchema to set custom validations on e.g. 207 | 208 | # People can define a filterer 209 | # This has a series of validations, and the required output - e.g. no error or no warns? 210 | 211 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | from setuptools import setup 6 | 7 | 8 | if os.path.exists('README.rst'): 9 | long_description = open('README.rst').read() 10 | else: 11 | long_description = '''MolVS is a molecule validation and standardization tool, written in Python using the RDKit 12 | chemistry framework. Building a collection of chemical structures from different sources can be difficult due to 13 | differing representations, drawing conventions and mistakes. MolVS can standardize chemical structures to improve data 14 | quality, help with de-duplication and identify relationships between molecules. 
15 | ''' 16 | 17 | setup( 18 | name='MolVS', 19 | version='0.1.1', 20 | author='Matt Swain', 21 | author_email='m.swain@me.com', 22 | license='MIT', 23 | url='https://github.com/mcs07/MolVS', 24 | packages=['molvs'], 25 | description='Molecule Validation and Standardization', 26 | long_description=long_description, 27 | keywords='chemistry cheminformatics rdkit', 28 | zip_safe=False, 29 | tests_require=['pytest'], 30 | install_requires=['six'], 31 | entry_points={'console_scripts': ['molvs = molvs.cli:main']}, 32 | classifiers=[ 33 | 'Environment :: Console', 34 | 'Intended Audience :: Developers', 35 | 'Intended Audience :: Healthcare Industry', 36 | 'Intended Audience :: Science/Research', 37 | 'License :: OSI Approved :: MIT License', 38 | 'Operating System :: OS Independent', 39 | 'Programming Language :: Python :: 2', 40 | 'Programming Language :: Python :: 2.7', 41 | 'Programming Language :: Python :: 3', 42 | 'Programming Language :: Python :: 3.4', 43 | 'Programming Language :: Python :: 3.5', 44 | 'Programming Language :: Python :: 3.6', 45 | 'Topic :: Scientific/Engineering', 46 | 'Topic :: Scientific/Engineering :: Bio-Informatics', 47 | 'Topic :: Scientific/Engineering :: Chemistry', 48 | 'Topic :: Software Development :: Libraries :: Python Modules', 49 | ], 50 | ) 51 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mcs07/MolVS/d815fe52d160abcecbcbf117e6437bf727dbd8ad/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_charge.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """Tests for charge.py""" 4 | 5 | from __future__ import print_function 6 | from __future__ import unicode_literals 7 | from __future__ import division 8 | import 
logging 9 | 10 | from rdkit import Chem 11 | 12 | from molvs.standardize import Standardizer, standardize_smiles 13 | from molvs.charge import Reionizer, Uncharger 14 | 15 | 16 | logging.basicConfig(level=logging.DEBUG) 17 | 18 | 19 | def charge_parent_smiles(smiles, prefer_organic=False): 20 | """Utility function that returns the charge parent SMILES for given a SMILES string.""" 21 | mol = Chem.MolFromSmiles(smiles, sanitize=False) 22 | mol = Standardizer(prefer_organic=prefer_organic).charge_parent(mol) 23 | if mol: 24 | return Chem.MolToSmiles(mol, isomericSmiles=True) 25 | 26 | 27 | def uncharge_smiles(smiles): 28 | """Utility function that returns the uncharged SMILES for a given SMILES string.""" 29 | mol = Chem.MolFromSmiles(smiles) 30 | u = Uncharger() 31 | mol = u.uncharge(mol) 32 | if mol: 33 | return Chem.MolToSmiles(mol, isomericSmiles=True) 34 | 35 | 36 | def test_charge_parent(): 37 | """Test neutralization of ionized acids and bases.""" 38 | assert charge_parent_smiles('C(C(=O)[O-])(Cc1n[n-]nn1)(C[NH3+])(C[N+](=O)[O-])') == 'NCC(Cc1nn[nH]n1)(C[N+](=O)[O-])C(=O)O' 39 | 40 | 41 | def test_charge_parent2(): 42 | """Test preservation of zwitterion.""" 43 | assert charge_parent_smiles('n(C)1cc[n+]2cccc([O-])c12') == 'Cn1cc[n+]2cccc([O-])c12' 44 | 45 | 46 | def test_charge_parent3(): 47 | """Choline should be left with a positive charge.""" 48 | assert charge_parent_smiles('C[N+](C)(C)CCO') == 'C[N+](C)(C)CCO' 49 | 50 | 51 | def test_charge_parent4(): 52 | """This should have the hydrogen removed to give deanol as a charge parent.""" 53 | assert charge_parent_smiles('C[NH+](C)CCO') == 'CN(C)CCO' 54 | 55 | 56 | def test_charge_parent5(): 57 | """Sodium benzoate to benzoic acid.""" 58 | assert charge_parent_smiles('[Na+].O=C([O-])c1ccccc1') == 'O=C(O)c1ccccc1' 59 | 60 | 61 | def test_charge_parent6(): 62 | """Benzoate ion to benzoic acid.""" 63 | assert charge_parent_smiles('O=C([O-])c1ccccc1') == 'O=C(O)c1ccccc1' 64 | 65 | 66 | def test_charge_parent7(): 
67 | """Charges in histidine should be neutralized.""" 68 | assert charge_parent_smiles('[NH3+]C(Cc1cnc[nH]1)C(=O)[O-]') == 'NC(Cc1cnc[nH]1)C(=O)O' 69 | 70 | 71 | def test_charge_parent8(): 72 | """Pick organic fragment and neutralise.""" 73 | assert charge_parent_smiles('C[NH+](C)(C).[Cl-]') == 'CN(C)C' 74 | 75 | 76 | def test_charge_parent9(): 77 | """No organic fragments.""" 78 | assert charge_parent_smiles('[N+](=O)([O-])[O-]') == 'O=[N+]([O-])O' 79 | 80 | 81 | def test_charge_parent10(): 82 | """No organic fragments.""" 83 | assert charge_parent_smiles('[N+](=O)([O-])[O-]', prefer_organic=True) == 'O=[N+]([O-])O' 84 | 85 | 86 | def test_charge_parent11(): 87 | """Larger inorganic fragment should be chosen.""" 88 | assert charge_parent_smiles('[N+](=O)([O-])[O-].[CH2]') == 'O=[N+]([O-])O' 89 | 90 | 91 | def test_charge_parent12(): 92 | """Smaller organic fragment should be chosen over larger inorganic fragment.""" 93 | assert charge_parent_smiles('[N+](=O)([O-])[O-].[CH2]', prefer_organic=True) == '[CH2]' 94 | 95 | 96 | def test_charge_parent13(): 97 | """Single oxygen should be protonated, the other left to balance the positive nitrogen.""" 98 | assert charge_parent_smiles('C[N+](C)(C)CC([O-])C[O-]') == 'C[N+](C)(C)CC([O-])CO' 99 | 100 | 101 | def test_charge_parent14(): 102 | """Strongest acid should be left ionized.""" 103 | assert charge_parent_smiles('[O-]C(=O)C[n+]1ccn2cccc([O-])c12') == 'O=C([O-])C[n+]1ccn2cccc(O)c21' 104 | 105 | 106 | def test_charge_parent15(): 107 | """All charges should be neutralized.""" 108 | assert charge_parent_smiles('[NH+](C)(C)CC([O-])C[O-]') == 'CN(C)CC(O)CO' 109 | 110 | 111 | def test_charge_parent16(): 112 | """All charges should be neutralized.""" 113 | assert charge_parent_smiles('CNCC([O-])C[O-]') == 'CNCC(O)CO' 114 | 115 | 116 | def test_standardize(): 117 | """Test table salt.""" 118 | assert standardize_smiles('[Na].[Cl]') == '[Cl-].[Na+]' 119 | 120 | 121 | def test_reionize(): 122 | """Test reionizer moves proton to 
weaker acid.""" 123 | mol = Chem.MolFromSmiles('C1=C(C=CC(=C1)[S]([O-])=O)[S](O)(=O)=O') 124 | r = Reionizer() 125 | mol = r.reionize(mol) 126 | assert Chem.MolToSmiles(mol) == 'O=S(O)c1ccc(S(=O)(=O)[O-])cc1' 127 | 128 | 129 | def test_reionize2(): 130 | """Test charged carbon doesn't get recognised as alpha-carbon-hydrogen-keto.""" 131 | mol = Chem.MolFromSmiles('CCOC(=O)C(=O)[CH-]C#N') 132 | r = Reionizer() 133 | mol = r.reionize(mol) 134 | assert Chem.MolToSmiles(mol) == 'CCOC(=O)C(=O)[CH-]C#N' 135 | 136 | 137 | def test_reionize3(): 138 | """""" 139 | mol = Chem.MolFromSmiles('C[N+]1=C[CH-]N(C(=N)N)/C1=C/[N+](=O)[O-]') 140 | r = Reionizer() 141 | mol = r.reionize(mol) 142 | assert Chem.MolToSmiles(mol) == 'C[N+]1=CCN(C(=N)N)/C1=[C-]/[N+](=O)[O-]' 143 | 144 | 145 | def test_should_complete(): 146 | """Reionization should not infinitely loop forever on these molecules.""" 147 | # GitHub Issue #14 148 | assert standardize_smiles('CCCCCCCCCCCCCCCCCC(=O)CC(=C)C(=O)O[Ti](=O)(OC(C)C)C(C)C') == 'C=C(CC(=O)[CH-]CCCCCCCCCCCCCCCC)C(=O)[O-].CC(C)[O-].CCC.[O-2].[Ti+5]' 149 | assert standardize_smiles('OP(=O)(O)[O-].OP(=O)([O-])[O-].[O-]S(=O)(=O)[O-].[Na+].[Na+].[Na+].[Mg+2].[Cl-].[Cl-].[K+].[K+]') == 'O=P([O-])(O)O.O=P([O-])([O-])O.O=S(=O)([O-])[O-].[Cl-].[Cl-].[K+].[K+].[Mg+2].[Na+].[Na+].[Na+]' 150 | 151 | 152 | def test_forced_charge1(): 153 | """Test forced charge correction maintaining overall neutral charge.""" 154 | assert standardize_smiles('[Na].O=C(O)c1ccccc1') == 'O=C([O-])c1ccccc1.[Na+]' 155 | 156 | 157 | def test_forced_charge2(): 158 | """Test forced charge correction with no corresponding proton for neutralization.""" 159 | # GitHub Issue #15 160 | assert standardize_smiles('[Na].[Na]') == '[Na+].[Na+]' 161 | # TODO: Arguably should become selenite ion... O=[Se]([O-])[O-]. Need an AcidBasePair? 
162 | assert standardize_smiles('[Na].[Na].O[Se](O)=O') == 'O=[Se](O)O.[Na+].[Na+]' 163 | 164 | 165 | def test_uncharge(): 166 | """Test neutralization of ionized acids and bases.""" 167 | assert uncharge_smiles('C(C(=O)[O-])(Cc1n[n-]nn1)(C[NH3+])(C[N+](=O)[O-])') == 'NCC(Cc1nn[nH]n1)(C[N+](=O)[O-])C(=O)O' 168 | 169 | 170 | def test_uncharge2(): 171 | """Test preservation of zwitterion.""" 172 | assert uncharge_smiles('n(C)1cc[n+]2cccc([O-])c12') == 'Cn1cc[n+]2cccc([O-])c12' 173 | 174 | 175 | def test_uncharge3(): 176 | """Choline should be left with a positive charge.""" 177 | assert uncharge_smiles('C[N+](C)(C)CCO') == 'C[N+](C)(C)CCO' 178 | 179 | 180 | def test_uncharge4(): 181 | """This should have the hydrogen removed to give deanol as a charge parent.""" 182 | assert uncharge_smiles('C[NH+](C)CCO') == 'CN(C)CCO' 183 | 184 | 185 | def test_uncharge5(): 186 | """Overall system is already neutral.""" 187 | assert uncharge_smiles('[Na+].O=C([O-])c1ccccc1') == 'O=C([O-])c1ccccc1.[Na+]' 188 | 189 | 190 | def test_uncharge6(): 191 | """Benzoate ion to benzoic acid.""" 192 | assert uncharge_smiles('O=C([O-])c1ccccc1') == 'O=C(O)c1ccccc1' 193 | 194 | 195 | def test_uncharge7(): 196 | """Charges in histidine should be neutralized.""" 197 | assert uncharge_smiles('[NH3+]C(Cc1cnc[nH]1)C(=O)[O-]') == 'NC(Cc1cnc[nH]1)C(=O)O' 198 | 199 | 200 | def test_uncharge8(): 201 | """Neutralize both fragments.""" 202 | assert uncharge_smiles('C[NH+](C)(C).[Cl-]') == 'CN(C)C.Cl' 203 | 204 | 205 | def test_uncharge9(): 206 | """Neutralise one oxygen.""" 207 | assert uncharge_smiles('[N+](=O)([O-])[O-]') == 'O=[N+]([O-])O' 208 | 209 | 210 | def test_uncharge11(): 211 | """Neutralise one oxygen and keep both fragments (no fragment is discarded).""" 212 | assert uncharge_smiles('[N+](=O)([O-])[O-].[CH2]') == 'O=[N+]([O-])O.[CH2]' 213 | 214 | 215 | def test_uncharge13(): 216 | """Single oxygen should be protonated, the other left to balance the positive nitrogen.""" 217 | assert 
uncharge_smiles('C[N+](C)(C)CC([O-])C[O-]') == 'C[N+](C)(C)CC([O-])CO' 218 | 219 | 220 | def test_uncharge14(): 221 | """Strongest acid should be left ionized.""" 222 | assert uncharge_smiles('[O-]C(=O)C[n+]1ccn2cccc([O-])c12') == 'O=C([O-])C[n+]1ccn2cccc(O)c21' 223 | 224 | 225 | def test_uncharge15(): 226 | """All charges should be neutralized.""" 227 | assert uncharge_smiles('[NH+](C)(C)CC([O-])C[O-]') == 'CN(C)CC(O)CO' 228 | 229 | 230 | def test_uncharge16(): 231 | """All charges should be neutralized.""" 232 | assert uncharge_smiles('CNCC([O-])C[O-]') == 'CNCC(O)CO' 233 | 234 | 235 | # def test_reionize3(): 236 | # """Test canonical ionization position when multiple equivalent possibilities.""" 237 | # mol = Chem.MolFromSmiles('CC1=CC(=CC=C1S(O)=O)S([O-])=O') 238 | # mol2 = Chem.MolFromSmiles('CC1=CC(=CC=C1S([O-])=O)S(O)=O') 239 | # r = Reionizer() 240 | # mol = r.reionize(mol) 241 | # mol2 = r.reionize(mol2) 242 | # assert Chem.MolToSmiles(mol) == 'Cc1cc(S(=O)[O-])ccc1S(=O)O' 243 | # assert Chem.MolToSmiles(mol2) == 'Cc1cc(S(=O)[O-])ccc1S(=O)O' 244 | # assert Chem.MolToSmiles(mol) == Chem.MolToSmiles(mol2) 245 | # 246 | # 247 | # def test_reionize4(): 248 | # """Test canonical ionization position when multiple equivalent possibilities.""" 249 | # mol = Chem.MolFromSmiles('CCOC(=O)C(=O)[CH-]C#N') 250 | # mol2 = Chem.MolFromSmiles('[CH2-]COC(=O)C(=O)CC#N') 251 | # r = Reionizer() 252 | # mol = r.reionize(mol) 253 | # mol2 = r.reionize(mol2) 254 | # assert Chem.MolToSmiles(mol) == '[CH2-]COC(=O)C(=O)CC#N' 255 | # assert Chem.MolToSmiles(mol2) == '' 256 | # assert Chem.MolToSmiles(mol) == Chem.MolToSmiles(mol2) 257 | -------------------------------------------------------------------------------- /tests/test_fragment.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """Tests for fragment.py""" 4 | 5 | from __future__ import print_function 6 | from __future__ import 
unicode_literals 7 | from __future__ import division 8 | import logging 9 | 10 | from rdkit import Chem 11 | 12 | from molvs.standardize import Standardizer 13 | from molvs.fragment import FragmentRemover 14 | 15 | 16 | logging.basicConfig(level=logging.DEBUG) 17 | 18 | 19 | def fragment_parent_smiles(smiles, prefer_organic=False): 20 | """Utility function that returns the fragment parent SMILES for a given SMILES string.""" 21 | mol = Chem.MolFromSmiles(smiles, sanitize=False) 22 | mol = Standardizer(prefer_organic=prefer_organic).fragment_parent(mol) 23 | return Chem.MolToSmiles(mol, isomericSmiles=True) 24 | 25 | 26 | def fragment_removal_smiles(smiles, leave_last=True): 27 | """Utility function that returns the result SMILES after FragmentRemover is applied to a given SMILES string.""" 28 | mol = Chem.MolFromSmiles(smiles) 29 | mol = FragmentRemover(leave_last=leave_last).remove(mol) 30 | return Chem.MolToSmiles(mol, isomericSmiles=True) 31 | 32 | 33 | def test_fragment_parent(): 34 | """Fragments created by standardization breaking metal-nonmetal covalent bonds.""" 35 | assert fragment_parent_smiles('[Na]OC(=O)c1ccccc1') == 'O=C([O-])c1ccccc1' 36 | 37 | 38 | def test_fragment_parent2(): 39 | """Fragments created by standardization breaking metal-nonmetal covalent bonds.""" 40 | assert fragment_parent_smiles('c1ccccc1C(=O)O[Ca]OC(=O)c1ccccc1') == 'O=C([O-])c1ccccc1' 41 | 42 | 43 | def test_fragment_parent3(): 44 | """Fragments created by standardization breaking metal-nonmetal covalent bonds.""" 45 | assert fragment_parent_smiles('[Pt](Cl)(Cl)(O)(O)(NC(C)C)NC(C)C') == 'CC(C)[NH-]' 46 | 47 | 48 | def test_fragment_parent4(): 49 | """Mercury containing compound.""" 50 | assert fragment_parent_smiles('CC[Hg]SC1=C(C=CC=C1)C(=O)[O][Na]') == 'CC[Hg]Sc1ccccc1C(=O)[O-]' 51 | 52 | 53 | def test_fragment_parent5(): 54 | """Covalent bond with metal.""" 55 | assert fragment_parent_smiles('[Ag]OC(=O)O[Ag]') == 'O=C([O-])[O-]' 56 | 57 | 58 | def test_fragment_parent6(): 59 | 
"""Salt without charges.""" 60 | assert fragment_parent_smiles('[Na].O=C(O)c1ccccc1') == 'O=C([O-])c1ccccc1' 61 | 62 | 63 | def test_fragment_parent7(): 64 | """Multiple identical fragments.""" 65 | assert fragment_parent_smiles('O=C(O)c1ccccc1.O=C(O)c1ccccc1.O=C(O)c1ccccc1') == 'O=C(O)c1ccccc1' 66 | 67 | 68 | def test_fragment_parent8(): 69 | """Multiple organic fragments of different sizes.""" 70 | assert fragment_parent_smiles('O=C(O)CCC.O=C(O)CCCC.O=C(O)CCCCC.O=C(O)CCCC') == 'CCCCCC(=O)O' 71 | 72 | 73 | def test_fragment_parent9(): 74 | """No organic fragments.""" 75 | assert fragment_parent_smiles('[N+](=O)([O-])[O-]') == 'O=[N+]([O-])[O-]' 76 | 77 | 78 | def test_fragment_parent10(): 79 | """No organic fragments.""" 80 | assert fragment_parent_smiles('[N+](=O)([O-])[O-]', prefer_organic=True) == 'O=[N+]([O-])[O-]' 81 | 82 | 83 | def test_fragment_parent11(): 84 | """Larger inorganic fragment should be chosen.""" 85 | assert fragment_parent_smiles('[N+](=O)([O-])[O-].[CH3+]') == 'O=[N+]([O-])[O-]' 86 | 87 | 88 | def test_fragment_parent12(): 89 | """Smaller organic fragment should be chosen over larger inorganic fragment.""" 90 | assert fragment_parent_smiles('[N+](=O)([O-])[O-].[CH3+]', prefer_organic=True) == '[CH3+]' 91 | 92 | 93 | def test_fragment_removal(): 94 | """Single salt removal.""" 95 | assert fragment_removal_smiles('CN(C)C.Cl') == 'CN(C)C' 96 | 97 | 98 | def test_fragment_removal2(): 99 | """Multiple salt removal.""" 100 | assert fragment_removal_smiles('CN(C)C.Cl.Cl.Br') == 'CN(C)C' 101 | 102 | 103 | def test_fragment_removal3(): 104 | """FragmentPatterns should match entire fragments only, matches within larger fragments should be left.""" 105 | assert fragment_removal_smiles('CN(Br)Cl') == 'CN(Cl)Br' 106 | 107 | 108 | def test_fragment_removal4(): 109 | """FragmentPatterns should match entire fragments only, matches within larger fragments should be left.""" 110 | assert fragment_removal_smiles('CN(Br)Cl.Cl') == 'CN(Cl)Br' 111 | 112 | 113 | 
def test_fragment_removal5(): 114 | """Charged salts.""" 115 | assert fragment_removal_smiles('C[NH+](C)(C).[Cl-]') == 'C[NH+](C)C' 116 | 117 | 118 | def test_fragment_removal6(): 119 | """Last match should be left.""" 120 | assert fragment_removal_smiles('CC(=O)O.[Na]') == 'CC(=O)O' 121 | 122 | 123 | def test_fragment_removal7(): 124 | """Last match should be removed.""" 125 | assert fragment_removal_smiles('CC(=O)O.[Na]', leave_last=False) == '' 126 | 127 | 128 | def test_fragment_removal8(): 129 | """Multiple identical last fragments should all be left.""" 130 | assert fragment_removal_smiles('Cl.Cl') == 'Cl.Cl' 131 | 132 | 133 | def test_fragment_removal9(): 134 | """Test multiple fragment removal.""" 135 | assert fragment_removal_smiles('[Na+].OC(=O)Cc1ccc(CN)cc1.OS(=O)(=O)C(F)(F)F') == 'NCc1ccc(CC(=O)O)cc1' 136 | 137 | 138 | def test_fragment_removal10(): 139 | """1,4-Dioxane should be removed.""" 140 | assert fragment_removal_smiles('c1ccccc1O.O1CCOCC1') == 'Oc1ccccc1' 141 | 142 | 143 | def test_fragment_removal11(): 144 | """Benzene should be removed.""" 145 | assert fragment_removal_smiles('c1ccccc1.CCCBr') == 'CCCBr' 146 | 147 | 148 | def test_fragment_removal12(): 149 | """Various fragments should be removed.""" 150 | assert fragment_removal_smiles('CC(NC1=CC=C(O)C=C1)=O.CCCCC.O.CCO.CCCO.C1CCCCC1.C1CCCCCC1') == 'CC(=O)Nc1ccc(O)cc1' 151 | -------------------------------------------------------------------------------- /tests/test_metal.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """Tests for metal.py""" 4 | 5 | from __future__ import print_function 6 | from __future__ import unicode_literals 7 | from __future__ import division 8 | import logging 9 | 10 | from rdkit import Chem 11 | 12 | from molvs.metal import MetalDisconnector 13 | from molvs.standardize import standardize_smiles 14 | 15 | 16 | logging.basicConfig(level=logging.DEBUG) 17 | 
18 | 19 | def test_standardize(): 20 | """Test covalent metal is disconnected during standardize.""" 21 | assert standardize_smiles('CCC(=O)O[Na]') == 'CCC(=O)[O-].[Na+]' 22 | 23 | 24 | def test_standardize2(): 25 | """Test metal ion is untouched during standardize.""" 26 | assert standardize_smiles('CCC(=O)[O-].[Na+]') == 'CCC(=O)[O-].[Na+]' 27 | 28 | 29 | def test_standardize3(): 30 | """Test Hg is disconnected from O during standardize.""" 31 | assert standardize_smiles('CCC(=O)O[Hg]') == 'CCC(=O)[O-].[Hg+]' 32 | 33 | 34 | def test_standardize4(): 35 | """Test dimethylmercury is not disconnected during standardize.""" 36 | assert standardize_smiles('C[Hg]C') == 'C[Hg]C' 37 | 38 | 39 | def test_standardize5(): 40 | """Test zirconium (IV) ethoxide.""" 41 | assert standardize_smiles('CCO[Zr](OCC)(OCC)OCC') == 'CC[O-].CC[O-].CC[O-].CC[O-].[Zr+4]' 42 | 43 | 44 | def test_standardize6(): 45 | """Test Grignard reagent.""" 46 | # TODO: Should we disconnect this? 47 | assert standardize_smiles('c1ccccc1[Mg]Br') == 'Br[Mg]c1ccccc1' 48 | 49 | 50 | def test_metaldisconnector1(): 51 | """Test direct usage of MetalDisconnector class.""" 52 | mol = Chem.MolFromSmiles('NC(CC(=O)O)C(=O)[O-].O.O.[Na+]') 53 | md = MetalDisconnector() 54 | mol = md.disconnect(mol) 55 | assert Chem.MolToSmiles(mol) == 'NC(CC(=O)O)C(=O)[O-].O.O.[Na+]' 56 | 57 | 58 | def test_metaldisconnector2(): 59 | """Test direct usage of MetalDisconnector class.""" 60 | mol = Chem.MolFromSmiles('CCC(=O)O[Na]') 61 | md = MetalDisconnector() 62 | mol = md.disconnect(mol) 63 | assert Chem.MolToSmiles(mol) == 'CCC(=O)[O-].[Na+]' 64 | -------------------------------------------------------------------------------- /tests/test_normalize.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """Tests for normalize.py""" 4 | 5 | from __future__ import print_function 6 | from __future__ import unicode_literals 7 | from __future__ 
import division 8 | import logging 9 | 10 | from rdkit import Chem 11 | 12 | from molvs.normalize import Normalizer 13 | 14 | 15 | logging.basicConfig(level=logging.DEBUG) 16 | 17 | 18 | def normalize_smiles(smiles): 19 | """Utility function that runs normalization rules on a given SMILES string.""" 20 | mol = Chem.MolFromSmiles(smiles, sanitize=False) 21 | mol = Normalizer().normalize(mol) 22 | if mol: 23 | return Chem.MolToSmiles(mol, isomericSmiles=True) 24 | 25 | 26 | def test_nitro(): 27 | """Test nitro group normalization. Actually handled automatically by RDKit.""" 28 | assert normalize_smiles('CN(=O)=O') == 'C[N+](=O)[O-]' 29 | 30 | 31 | def test_sulfoxide(): 32 | """Test sulfoxide normalization.""" 33 | assert normalize_smiles('CS(C)=O') == 'C[S+](C)[O-]' 34 | 35 | 36 | def test_sulfone(): 37 | """Test sulfone normalization.""" 38 | assert normalize_smiles('C[S+2]([O-])([O-])O') == 'CS(=O)(=O)O' 39 | 40 | 41 | def test_1_3_charge_recombination(): 42 | """Test 1,3-separated charges are recombined.""" 43 | assert normalize_smiles('CC([O-])=[N+](C)C') == 'CC(=O)N(C)C' 44 | 45 | 46 | def test_1_3_charge_recombination_aromatic(): 47 | """Test 1,3-separated charges are recombined.""" 48 | assert normalize_smiles('C[n+]1ccccc1[O-]') == 'Cn1ccccc1=O' 49 | 50 | 51 | def test_1_3_charge_recombination_exception(): 52 | """Test a case where 1,3-separated charges should not be recombined.""" 53 | assert normalize_smiles('CC12CCCCC1(Cl)[N+]([O-])=[N+]2[O-]') == 'CC12CCCCC1(Cl)[N+]([O-])=[N+]2[O-]' 54 | 55 | 56 | def test_1_5_charge_recombination(): 57 | """Test 1,5-separated charges are recombined.""" 58 | assert normalize_smiles('C[N+](C)=C\\C=C\\[O-]') == 'CN(C)C=CC=O' 59 | 60 | 61 | def test_1_5_charge_recombination_exception(): 62 | """Test a case where 1,5-separated charges should not be recombined.""" 63 | assert normalize_smiles('C[N+]1=C2C=[N+]([O-])C=CN2CCC1') == 'C[N+]1=C2C=[N+]([O-])C=CN2CCC1' 64 | -------------------------------------------------------------------------------- 
/tests/test_resonance.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """Tests for resonance.py""" 4 | 5 | from __future__ import print_function 6 | from __future__ import unicode_literals 7 | from __future__ import division 8 | import logging 9 | 10 | from rdkit import Chem 11 | 12 | from molvs.resonance import ResonanceEnumerator, enumerate_resonance_smiles 13 | 14 | 15 | logging.basicConfig(level=logging.DEBUG) 16 | 17 | 18 | def test_thiocyanate_ion(): 19 | """""" 20 | assert enumerate_resonance_smiles('[S-]C#N') == {'N#C[S-]', '[N-]=C=S'} 21 | 22 | 23 | def test_thiocyanate_ion2(): 24 | """""" 25 | assert enumerate_resonance_smiles('[N-]=C=S') == {'N#C[S-]', '[N-]=C=S'} 26 | 27 | 28 | def test_carbamimidoylbenzoic_acid(): 29 | """Custom ResonanceEnumerator options allow unconstrained charges.""" 30 | mol = Chem.MolFromSmiles('NC(=[NH2+])c1ccc(cc1)C(=O)[O-]') 31 | rs = ResonanceEnumerator().enumerate(mol) 32 | assert {Chem.MolToSmiles(r, isomericSmiles=True) for r in rs} == {'NC(=[NH2+])c1ccc(C(=O)[O-])cc1'} 33 | rs = ResonanceEnumerator(allow_incomplete_octets=True, unconstrained_anions=True, unconstrained_cations=True).enumerate(mol) 34 | assert len(rs) == 32 35 | 36 | 37 | def test_mobile_charge(): 38 | """""" 39 | assert enumerate_resonance_smiles('CN1CC[N+]2=C1C1=C(C=CC=C1)C1=CC=CC=C21') == {'CN1CC[n+]2c1c1ccccc1c1ccccc12', 'C[N+]1=C2c3ccccc3-c3ccccc3N2CC1'} 40 | 41 | 42 | def test_mobile_charge2(): 43 | """""" 44 | assert enumerate_resonance_smiles('C[N+]1=C2N(CC1)C1=CC=CC=C1C1=C2C=CC=C1') == {'CN1CC[n+]2c1c1ccccc1c1ccccc12', 'C[N+]1=C2c3ccccc3-c3ccccc3N2CC1'} 45 | 46 | 47 | 48 | 49 | # Normalization limits: From p36 of the InChI technical manual 50 | # If passed to standardizer, the local symmetry means that either N could get +ve charge, regardless of O and S locations? 
51 | # CN(C)C1=[NH+]C2=CC(=S)C(=O)C=C2N1 52 | # C[N+](C)=C1NC2=CC(=O)C(=S)C=C2N1 53 | # Can this be solved by resonance/tautomer enumeration and canonicalization? 54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /tests/test_standardize.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """Tests for standardize.py""" 4 | 5 | from __future__ import print_function 6 | from __future__ import unicode_literals 7 | from __future__ import division 8 | import logging 9 | 10 | from molvs.standardize import standardize_smiles 11 | 12 | 13 | logging.basicConfig(level=logging.DEBUG) 14 | 15 | 16 | def test_aromaticity(): 17 | """Check aromaticity is correctly perceived.""" 18 | assert standardize_smiles('C1=CC=CC=C1') == 'c1ccccc1' 19 | 20 | 21 | def test_aromaticity2(): 22 | """Both rings should be aromatic.""" 23 | assert standardize_smiles('C[N]1C=NC2=C1C(=O)N(C)C(=O)N2C') == 'Cn1c(=O)c2c(ncn2C)n(C)c1=O' 24 | assert standardize_smiles('Cn1cnc2c1c(=O)n(C)c(=O)n2C') == 'Cn1c(=O)c2c(ncn2C)n(C)c1=O' 25 | 26 | 27 | def test_aromaticity3(): 28 | """Redo incorrect aromatization.""" 29 | assert standardize_smiles('C=Cc1ccc2c(c1)NC(=O)/C/2=C\\c1ccc[nH]1') == 'C=Cc1ccc2c(c1)NC(=O)/C2=C\\c1ccc[nH]1' 30 | assert standardize_smiles('C=Cc1ccc2c(c1)[nH]c(=O)/c/2=C\\c1ccc[nH]1') == 'C=Cc1ccc2c(c1)NC(=O)/C2=C\\c1ccc[nH]1' 31 | 32 | 33 | def test_stereochemistry(): 34 | """Check stereochemistry is correctly perceived.""" 35 | assert standardize_smiles('Cl\\C=C/Cl') == 'Cl/C=C\\Cl' 36 | 37 | 38 | def test_disconnect_metal(): 39 | """Break metal-organic covalent bonds.""" 40 | assert standardize_smiles('[Na]OC(=O)c1ccccc1') == 'O=C([O-])c1ccccc1.[Na+]' 41 | 42 | 43 | def test_disconnect_metal2(): 44 | """Break metal-organic covalent bonds.""" 45 | assert standardize_smiles('c1ccccc1C(=O)O[Ca]OC(=O)c1ccccc1') == 
'O=C([O-])c1ccccc1.O=C([O-])c1ccccc1.[Ca+2]' 46 | 47 | 48 | def test_disconnect_metal3(): 49 | """Disconnect Pt in metal complex.""" 50 | assert standardize_smiles('[Pt](Cl)(Cl)(O)(O)(NC(C)C)NC(C)C') == 'CC(C)[NH-].CC(C)[NH-].[Cl-].[Cl-].[OH-].[OH-].[Pt+6]' 51 | 52 | 53 | def test_disconnect_metal4(): 54 | """Leave mercury covalently bonded.""" 55 | assert standardize_smiles('CC[Hg]SC1=C(C=CC=C1)C(=O)[O][Na]') == 'CC[Hg]Sc1ccccc1C(=O)[O-].[Na+]' 56 | 57 | 58 | def test_disconnect_metal5(): 59 | """Silver carbonate. Unsure about this one.""" 60 | assert standardize_smiles('[Ag]OC(=O)O[Ag]') == 'O=C([O-])[O-].[Ag+].[Ag+]' 61 | 62 | 63 | def test_charge_free_metal(): 64 | """Charge free neutral metal with carboxylic acid.""" 65 | assert standardize_smiles('[Na].O=C(O)c1ccccc1') == 'O=C([O-])c1ccccc1.[Na+]' 66 | 67 | 68 | def test_nitro_normalization(): 69 | """Normalize nitro group.""" 70 | assert standardize_smiles('C1(=CC=CC=C1)[N+](=O)[O-]') == 'O=[N+]([O-])c1ccccc1' 71 | 72 | 73 | def test_nitro_normalization2(): 74 | """Normalize nitro group.""" 75 | assert standardize_smiles('O=[N](=O)c1ccccc1') == 'O=[N+]([O-])c1ccccc1' 76 | 77 | 78 | def test_nitro_normalization3(): 79 | """Normalize nitro group.""" 80 | assert standardize_smiles('[O-][N+](=O)c1ccccc1') == 'O=[N+]([O-])c1ccccc1' 81 | 82 | 83 | def test_nitro_normalization4(): 84 | """Normalize nitro group.""" 85 | assert standardize_smiles('[N](=O)(=O)O') == 'O=[N+]([O-])O' 86 | 87 | 88 | def test_nitro_normalization5(): 89 | """Normalize nitro group.""" 90 | assert standardize_smiles('O[N+](=O)[O-]') == 'O=[N+]([O-])O' 91 | 92 | 93 | def test_pyridine_oxide_normalization(): 94 | """Normalize pyridine oxide.""" 95 | assert standardize_smiles('C1=[N](C=CC=C1)=O') == '[O-][n+]1ccccc1' 96 | 97 | 98 | def test_pyridine_oxide_normalization2(): 99 | """Normalize pyridine oxide.""" 100 | assert standardize_smiles('O=n1ccccc1') == '[O-][n+]1ccccc1' 101 | 102 | 103 | def test_sulfone_normalization(): 104 | """Normalize 
sulfone.""" 105 | assert standardize_smiles('C[S+2]([O-])([O-])C') == 'CS(C)(=O)=O' 106 | 107 | 108 | def test_sulfone_normalization2(): 109 | """Normalize sulfone.""" 110 | assert standardize_smiles('C[S+2]([O-])([O-])O') == 'CS(=O)(=O)O' 111 | 112 | 113 | def test_sulfoxide_normalization(): 114 | """Normalize sulfoxide.""" 115 | assert standardize_smiles('CS(=O)C') == 'C[S+](C)[O-]' 116 | 117 | 118 | def test_sulfoxide_normalization2(): 119 | """Normalize sulfoxide.""" 120 | assert standardize_smiles('COC1=CC2=C(C=C1)[N]C(=N2)[S](=O)CC3=C(C(=C(C=N3)C)OC)C') == 'COc1ccc2c(c1)N=C([S+]([O-])Cc1ncc(C)c(OC)c1C)[N]2' 121 | 122 | 123 | def test_sulfoxide_normalization3(): 124 | """Normalize sulfoxide.""" 125 | assert standardize_smiles('COc1ccc2c(c1)nc([nH]2)S(=O)Cc1ncc(c(c1C)OC)C') == 'COc1ccc2[nH]c([S+]([O-])Cc3ncc(C)c(OC)c3C)nc2c1' 126 | 127 | 128 | def test_azide_normalization(): 129 | """Normalize azide.""" 130 | assert standardize_smiles('C1(=CC=C(C=C1)N)N=[N]#N') == '[N-]=[N+]=Nc1ccc(N)cc1' 131 | 132 | 133 | def test_diazo_normalization(): 134 | """Normalize diazo.""" 135 | assert standardize_smiles('[N](#N)=C1C(NC(N=C1)=O)=O') == '[N-]=[N+]=C1C=NC(=O)NC1=O' 136 | 137 | 138 | def test_phosphate_normalization(): 139 | """Normalize phosphate.""" 140 | assert standardize_smiles('C1=NC=C([N]1)CO[P+]([O-])([O-])[O-]') == 'O=P([O-])([O-])OCC1=CN=C[N]1' 141 | 142 | 143 | def test_hydrazine_diazonium_normalization(): 144 | """Normalize hydrazine-diazonium.""" 145 | assert standardize_smiles('CNNC[N+]#N') == 'CN=[NH+]CN=N' 146 | 147 | 148 | def test_amidinium_normalization(): 149 | """Normalize amidinium.""" 150 | assert standardize_smiles('[C+](C)(N)N') == 'CC(N)=[NH2+]' 151 | 152 | 153 | def test_multi_fragment_normalization(): 154 | """All fragments should stay if one gets transformed by normalization.""" 155 | assert standardize_smiles('[Na]OC(=O)c1ccc(C[S+2]([O-])([O-]))cc1') == 'O=C([O-])c1ccc(C[S](=O)=O)cc1.[Na+]' 156 | assert 
standardize_smiles('[Na+].[O-]C(=O)c1ccc(C[S+2]([O-])([O-]))cc1') == 'O=C([O-])c1ccc(C[S](=O)=O)cc1.[Na+]' 157 | 158 | 159 | def test_1_3_nonaromatic_charge_recombination(): 160 | """Recombine non-aromatic 1,3-separated charges.""" 161 | assert standardize_smiles('C[N-]C(C)=[N+](C)C') == 'CN=C(C)N(C)C' 162 | 163 | 164 | def test_1_3_aromatic_charge_recombination(): 165 | """Recombine aromatic 1,3-separated charges.""" 166 | assert standardize_smiles('[n-]1c(=[N+](C)C)cccc1') == 'CN(C)c1ccccn1' 167 | 168 | 169 | def test_1_3_aromatic_charge_recombination2(): 170 | """Recombine aromatic 1,3-separated charges.""" 171 | assert standardize_smiles('C[n+]1c([N-](C))cccc1') == 'CN=c1ccccn1C' 172 | 173 | 174 | def test_pyrimidone_charge_recombination(): 175 | """Recombine aromatic 1,3-separated charges to form pyrimidone.""" 176 | assert standardize_smiles('[O-]c1[n+](C)cccc1') == 'Cn1ccccc1=O' 177 | 178 | 179 | def test_pyrimidone_charge_recombination2(): 180 | """Recombine aromatic 1,3-separated charges to form pyrimidone.""" 181 | assert standardize_smiles('COc1cc2ccc3c4c(OC)cc(OC)c(OC)c4c([O-])[n+](C)c3c2cc1OC') == 'COc1cc2ccc3c4c(OC)cc(OC)c(OC)c4c(=O)n(C)c3c2cc1OC' 182 | 183 | 184 | def test_1_5_nonaromatic_charge_recombination(): 185 | """Recombine non-aromatic 1,5-separated charges.""" 186 | assert standardize_smiles('C[N-]C=CC=[N+](C)C') == 'CN=CC=CN(C)C' 187 | 188 | 189 | def test_1_5_aromatic_charge_recombination(): 190 | """Recombine aromatic 1,5-separated charges.""" 191 | assert standardize_smiles('[n-]1ccc(=[N+](C)C)cc1') == 'CN(C)c1ccncc1' 192 | 193 | 194 | def test_1_5_aromatic_charge_recombination2(): 195 | """Recombine aromatic 1,5-separated charges.""" 196 | assert standardize_smiles('C[n+]1ccc([N-]C)cc1') == 'CN=c1ccn(C)cc1' 197 | 198 | 199 | def test_charge_to_protonated_atom(): 200 | """Shift positive charge from nonprotonated to protonated atom.""" 201 | assert standardize_smiles('CNC=[N+](C)C') == 'C[NH+]=CN(C)C' 202 | 203 | 204 | def 
test_charge_to_protonated_atom2(): 205 | """Shift positive charge from nonprotonated to protonated atom.""" 206 | assert standardize_smiles('[nH]1c(=[N+](C)C)cccc1') == 'CN(C)c1cccc[nH+]1' 207 | 208 | 209 | # def test_charge_to_protonated_atom3(): 210 | # """Shift positive charge from nonprotonated to protonated atom.""" 211 | # assert standardize_smiles('CNc1[n+](C)cccc1') == 'C[NH+]=c1ccccn1C' 212 | 213 | 214 | # def test_charge_to_protonated_atom4(): 215 | # """Shift positive charge from nonprotonated to protonated atom.""" 216 | # assert standardize_smiles('CNc1[n+](C)cco1') == 'C[NH+]=c1occn1C' 217 | 218 | 219 | def test_charge_to_protonated_atom5(): 220 | """Shift positive charge from nonprotonated to protonated atom.""" 221 | assert standardize_smiles('CNC=CC=[N+](C)C') == 'C[NH+]=CC=CN(C)C' 222 | 223 | 224 | def test_charge_to_protonated_atom6(): 225 | """Shift positive charge from nonprotonated to protonated atom.""" 226 | assert standardize_smiles('[nH]1ccc(=[N+](C)C)cc1') == 'CN(C)c1cc[nH+]cc1' 227 | 228 | 229 | # def test_charge_to_protonated_atom7(): 230 | # """Shift positive charge from nonprotonated to protonated atom.""" 231 | # assert standardize_smiles('CNc1cc[n+](C)cc1') == 'C[NH+]=c1ccn(C)cc1' 232 | 233 | 234 | # def test_charge_to_protonated_atom8(): 235 | # """Shift positive charge from nonprotonated to protonated atom.""" 236 | # assert standardize_smiles('C[n+]1ccc2[nH]ccc2c1') == 'Cn1ccc2[nH+]ccc-2c1' 237 | 238 | 239 | def test_charge_to_protonated_atom9(): 240 | """Shift positive charge from nonprotonated to protonated atom.""" 241 | assert standardize_smiles('CNC=CC=[N+](C)C') == 'C[NH+]=CC=CN(C)C' 242 | 243 | 244 | def test_transform_maintains_ring(): 245 | """Ensure no transforms inadvertently breaks open rings.""" 246 | assert standardize_smiles('[O-]C1=CC=CC2=CC=CC=[N+]12') == 'O=c1cccc2ccccn12' 247 | 248 | 249 | def test_equal_reionize(): 250 | """Don't change partially ionized acid with two equally strong acid groups.""" 251 | 
assert standardize_smiles('C1=C(C=CC(=C1)[S]([O-])(=O)=O)[S](O)(=O)=O') == 'O=S(=O)([O-])c1ccc(S(=O)(=O)O)cc1' 252 | 253 | 254 | def test_reionize(): 255 | """Partially ionized acid where proton should be moved to weaker acid.""" 256 | assert standardize_smiles('C1=C(C=CC(=C1)[S]([O-])=O)[S](O)(=O)=O') == 'O=S(O)c1ccc(S(=O)(=O)[O-])cc1' 257 | 258 | 259 | def test_reionize2(): 260 | """Partially ionized acid where proton should be moved to weaker acid.""" 261 | assert standardize_smiles('C1=C(C=CC(=C1)[P]([O-])(=O)O)[S](O)(=O)=O') == 'O=P(O)(O)c1ccc(S(=O)(=O)[O-])cc1' 262 | 263 | 264 | def test_reionize3(): 265 | """Partially ionized acid where proton should be moved to weaker acid.""" 266 | assert standardize_smiles('C1=C(C=C(C(=C1)O)C(=O)[O-])[S](O)(=O)=O.[Na+]') == 'O=C(O)c1cc(S(=O)(=O)[O-])ccc1O.[Na+]' 267 | 268 | 269 | def test_charge_preservation(): 270 | """Unusually charged histidine should be preserved. (See charge parent for normalization).""" 271 | assert standardize_smiles('[NH3+]C(Cc1cnc[nH]1)C(=O)[O-]') == '[NH3+]C(Cc1cnc[nH]1)C(=O)[O-]' 272 | 273 | 274 | def test_charge_preservation2(): 275 | """""" 276 | assert standardize_smiles('[Cl-].C[NH+](C)C') == 'C[NH+](C)C.[Cl-]' 277 | 278 | 279 | def test_disconnect_metal6(): 280 | """""" 281 | assert standardize_smiles('C1(CCCCC1)[Zn]Br') == '[Br-].[CH-]1CCCCC1.[Zn+2]' 282 | 283 | 284 | def test_positive_carbon_nitrogen(): 285 | """""" 286 | assert standardize_smiles('CN[C+](C)NC') == 'CNC(C)=[NH+]C' 287 | 288 | 289 | def test_positive_phosphorus_nitrogen(): 290 | """""" 291 | assert standardize_smiles('CN[P+](C)(C)NC') == 'CNP(C)(C)=[NH+]C' 292 | -------------------------------------------------------------------------------- /tests/test_tautomer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """Tests for tautomer.py""" 4 | 5 | from __future__ import print_function 6 | from __future__ import unicode_literals 7 | 
from __future__ import division 8 | import logging 9 | 10 | from molvs.standardize import enumerate_tautomers_smiles, canonicalize_tautomer_smiles 11 | 12 | 13 | logging.basicConfig(level=logging.DEBUG) 14 | 15 | 16 | def test_1_3_keto_enol_enumeration(): 17 | """Enumerate 1,3 keto/enol tautomer.""" 18 | assert enumerate_tautomers_smiles('C1(=CCCCC1)O') == {'OC1=CCCCC1', 'O=C1CCCCC1'} 19 | 20 | 21 | def test_1_3_keto_enol_enumeration2(): 22 | """Enumerate 1,3 keto/enol tautomer.""" 23 | assert enumerate_tautomers_smiles('C1(CCCCC1)=O') == {'OC1=CCCCC1', 'O=C1CCCCC1'} 24 | 25 | 26 | def test_acetophenone_keto_enol_enumeration(): 27 | """Enumerate acetophenone keto/enol tautomer.""" 28 | assert enumerate_tautomers_smiles('C(=C)(O)C1=CC=CC=C1') == {'C=C(O)c1ccccc1', 'CC(=O)c1ccccc1'} 29 | 30 | 31 | def test_acetone_keto_enol_enumeration2(): 32 | """Enumerate acetone keto/enol tautomer.""" 33 | assert enumerate_tautomers_smiles('CC(C)=O') == {'CC(C)=O', 'C=C(C)O'} 34 | 35 | 36 | def test_keto_enol_enumeration(): 37 | """keto/enol tautomer""" 38 | assert enumerate_tautomers_smiles('OC(C)=C(C)C') == {'C=C(O)C(C)C', 'CC(C)=C(C)O', 'CC(=O)C(C)C'} 39 | 40 | 41 | def test_phenylpropanone_keto_enol_enumeration(): 42 | """1-phenyl-2-propanone enol/keto""" 43 | assert enumerate_tautomers_smiles('c1(ccccc1)CC(=O)C') == {'C=C(O)Cc1ccccc1', 'CC(=O)Cc1ccccc1', 'CC(O)=Cc1ccccc1'} 44 | 45 | 46 | def test_1_5_keto_enol_enumeration(): 47 | """1,5 keto/enol tautomer""" 48 | assert enumerate_tautomers_smiles('Oc1nccc2cc[nH]c(=N)c12') == {'Nc1nccc2ccnc(O)c12', 'N=c1[nH]ccc2ccnc(O)c12', 'Nc1[nH]ccc2ccnc(=O)c1-2', 'Nc1nccc2cc[nH]c(=O)c12', 'N=c1[nH]ccc2cc[nH]c(=O)c12', 'N=c1nccc2cc[nH]c(O)c1-2'} 49 | 50 | 51 | def test_1_5_keto_enol_enumeration2(): 52 | """1,5 keto/enol tautomer""" 53 | assert enumerate_tautomers_smiles('C1(C=CCCC1)=O') == {'O=C1C=CCCC1', 'OC1=CCC=CC1', 'OC1=CC=CCC1', 'O=C1CC=CCC1', 'OC1=CCCC=C1'} 54 | 55 | 56 | def test_1_5_keto_enol_enumeration3(): 57 | """1,5 keto/enol 
tautomer""" 58 | assert enumerate_tautomers_smiles('C1(=CC=CCC1)O') == {'O=C1C=CCCC1', 'OC1=CCC=CC1', 'OC1=CC=CCC1', 'O=C1CC=CCC1', 'OC1=CCCC=C1'} 59 | 60 | 61 | def test_aliphatic_imine_enumeration(): 62 | """aliphatic imine tautomer""" 63 | assert enumerate_tautomers_smiles('C1(CCCCC1)=N') == {'N=C1CCCCC1', 'NC1=CCCCC1'} 64 | 65 | 66 | def test_aliphatic_imine_enumeration2(): 67 | """aliphatic imine tautomer""" 68 | assert enumerate_tautomers_smiles('C1(=CCCCC1)N') == {'N=C1CCCCC1', 'NC1=CCCCC1'} 69 | 70 | 71 | def test_special_imine_enumeration(): 72 | """special imine tautomer""" 73 | assert enumerate_tautomers_smiles('C1(C=CC=CN1)=CC') == {'CC=C1C=CC=CN1', 'CCc1ccccn1', 'CC=C1C=CCC=N1'} 74 | 75 | 76 | def test_special_imine_enumeration2(): 77 | """special imine tautomer""" 78 | assert enumerate_tautomers_smiles('C1(=NC=CC=C1)CC') == {'CC=C1C=CC=CN1', 'CCc1ccccn1', 'CC=C1C=CCC=N1'} 79 | 80 | 81 | def test_1_3_aromatic_heteroatom_enumeration(): 82 | """1,3 aromatic heteroatom H shift""" 83 | assert enumerate_tautomers_smiles('O=c1cccc[nH]1') == {'Oc1ccccn1', 'O=c1cccc[nH]1'} 84 | 85 | 86 | def test_1_3_aromatic_heteroatom_enumeration2(): 87 | """1,3 aromatic heteroatom H shift""" 88 | assert enumerate_tautomers_smiles('Oc1ccccn1') == {'Oc1ccccn1', 'O=c1cccc[nH]1'} 89 | 90 | 91 | def test_1_3_aromatic_heteroatom_enumeration3(): 92 | """1,3 aromatic heteroatom H shift""" 93 | assert enumerate_tautomers_smiles('Oc1ncc[nH]1') == {'Oc1ncc[nH]1', 'O=c1[nH]cc[nH]1'} 94 | 95 | 96 | def test_1_3_heteroatom_enumeration(): 97 | """1,3 heteroatom H shift""" 98 | assert enumerate_tautomers_smiles('OC(C)=NC') == {'CN=C(C)O', 'CNC(C)=O', 'C=C(O)NC'} 99 | 100 | 101 | def test_1_3_heteroatom_enumeration2(): 102 | """1,3 heteroatom H shift""" 103 | assert enumerate_tautomers_smiles('CNC(C)=O') == {'CN=C(C)O', 'CNC(C)=O', 'C=C(O)NC'} 104 | 105 | 106 | def test_1_3_heteroatom_enumeration3(): 107 | """1,3 heteroatom H shift""" 108 | assert enumerate_tautomers_smiles('S=C(N)N') == 
{'N=C(N)S', 'NC(N)=S'} 109 | 110 | 111 | def test_1_3_heteroatom_enumeration4(): 112 | """1,3 heteroatom H shift""" 113 | assert enumerate_tautomers_smiles('SC(N)=N') == {'N=C(N)S', 'NC(N)=S'} 114 | 115 | 116 | def test_1_3_heteroatom_enumeration5(): 117 | """1,3 heteroatom H shift""" 118 | assert enumerate_tautomers_smiles('N=c1[nH]ccn(C)1') == {'Cn1ccnc1N', 'Cn1cc[nH]c1=N'} 119 | 120 | 121 | def test_1_3_heteroatom_enumeration6(): 122 | """1,3 heteroatom H shift""" 123 | assert enumerate_tautomers_smiles('CN=c1[nH]cncc1') == {'CN=c1ccnc[nH]1', 'CNc1ccncn1', 'CN=c1cc[nH]cn1'} 124 | 125 | 126 | def test_1_5_aromatic_heteroatom_enumeration(): 127 | """1,5 aromatic heteroatom H shift""" 128 | assert enumerate_tautomers_smiles('Oc1cccc2ccncc12') == {'O=c1cccc2cc[nH]cc1-2', 'Oc1cccc2ccncc12'} 129 | 130 | 131 | def test_1_5_aromatic_heteroatom_enumeration2(): 132 | """1,5 aromatic heteroatom H shift""" 133 | assert enumerate_tautomers_smiles('O=c1cccc2cc[nH]cc1-2') == {'O=c1cccc2cc[nH]cc1-2', 'Oc1cccc2ccncc12'} 134 | 135 | 136 | def test_1_5_aromatic_heteroatom_enumeration3(): 137 | """1,5 aromatic heteroatom H shift""" 138 | assert enumerate_tautomers_smiles('Cc1n[nH]c2ncnn12') == {'C=C1NNc2ncnn21', 'Cc1n[nH]c2ncnn12', 'Cc1nnc2[nH]cnn12', 'C=C1NN=C2N=CNN12', 'Cc1nnc2nc[nH]n12', 'C=C1NN=C2NC=NN12'} 139 | 140 | 141 | def test_1_5_aromatic_heteroatom_enumeration4(): 142 | """1,5 aromatic heteroatom H shift""" 143 | assert enumerate_tautomers_smiles('Cc1nnc2nc[nH]n12') == {'C=C1NNc2ncnn21', 'Cc1n[nH]c2ncnn12', 'Cc1nnc2[nH]cnn12', 'C=C1NN=C2N=CNN12', 'Cc1nnc2nc[nH]n12', 'C=C1NN=C2NC=NN12'} 144 | 145 | 146 | def test_1_5_aromatic_heteroatom_enumeration5(): 147 | """1,5 aromatic heteroatom H shift""" 148 | assert enumerate_tautomers_smiles('Oc1ccncc1') == {'Oc1ccncc1', 'O=c1cc[nH]cc1'} 149 | 150 | 151 | def test_1_5_aromatic_heteroatom_enumeration6(): 152 | """1,5 aromatic heteroatom H shift""" 153 | assert enumerate_tautomers_smiles('Oc1c(cccc3)c3nc2ccncc12') == 
{'Oc1c2ccccc2nc2ccncc12', 'O=c1c2ccccc2[nH]c2ccncc12', 'O=c1c2c[nH]ccc-2nc2ccccc12'} 154 | 155 | 156 | def test_1_3_1_5_aromatic_heteroatom_enumeration(): 157 | """1,3 and 1,5 aromatic heteroatom H shift""" 158 | assert enumerate_tautomers_smiles('Oc1ncncc1') == {'Oc1ccncn1', 'O=c1ccnc[nH]1', 'O=c1cc[nH]cn1'} 159 | 160 | 161 | def test_1_5_aromatic_heteroatom_enumeration7(): 162 | """1,5 aromatic heteroatom H shift""" 163 | assert enumerate_tautomers_smiles('C2(=C1C(=NC=N1)[NH]C(=N2)N)O') == {'N=c1[nH]c2ncnc-2c(O)[nH]1', 'Nc1nc(O)c2ncnc-2[nH]1', 'N=c1nc(O)c2nc[nH]c2[nH]1', 'Nc1nc2ncnc-2c(O)[nH]1', 'N=c1nc2nc[nH]c2c(O)[nH]1', 'N=c1[nH]c(=O)c2nc[nH]c2[nH]1', 'N=c1nc(O)c2[nH]cnc2[nH]1', 'N=c1[nH]c(=O)c2[nH]cnc2[nH]1', 'Nc1nc(=O)c2nc[nH]c2[nH]1', 'Nc1nc(O)c2nc[nH]c2n1', 'Nc1nc(=O)c2[nH]cnc2[nH]1', 'N=c1nc2[nH]cnc2c(O)[nH]1', 'Nc1nc2[nH]cnc2c(=O)[nH]1', 'Nc1nc2nc[nH]c2c(=O)[nH]1', 'Nc1nc(O)c2[nH]cnc2n1'} 164 | 165 | 166 | def test_1_5_aromatic_heteroatom_enumeration8(): 167 | """1,5 aromatic heteroatom H shift""" 168 | assert enumerate_tautomers_smiles('C2(C1=C([NH]C=N1)[NH]C(=N2)N)=O') == {'N=c1[nH]c2ncnc-2c(O)[nH]1', 'Nc1nc(O)c2ncnc-2[nH]1', 'N=c1nc(O)c2nc[nH]c2[nH]1', 'Nc1nc2ncnc-2c(O)[nH]1', 'N=c1nc2nc[nH]c2c(O)[nH]1', 'N=c1[nH]c(=O)c2nc[nH]c2[nH]1', 'N=c1nc(O)c2[nH]cnc2[nH]1', 'N=c1[nH]c(=O)c2[nH]cnc2[nH]1', 'Nc1nc(=O)c2nc[nH]c2[nH]1', 'Nc1nc(O)c2nc[nH]c2n1', 'Nc1nc(=O)c2[nH]cnc2[nH]1', 'N=c1nc2[nH]cnc2c(O)[nH]1', 'Nc1nc2[nH]cnc2c(=O)[nH]1', 'Nc1nc2nc[nH]c2c(=O)[nH]1', 'Nc1nc(O)c2[nH]cnc2n1'} 169 | 170 | 171 | def test_1_5_aromatic_heteroatom_enumeration9(): 172 | """1,5 aromatic heteroatom H shift""" 173 | assert enumerate_tautomers_smiles('Oc1n(C)ncc1') == {'Cn1nccc1O', 'CN1N=CCC1=O', 'Cn1[nH]ccc1=O'} 174 | 175 | 176 | def test_1_5_aromatic_heteroatom_enumeration10(): 177 | """1,5 aromatic heteroatom H shift""" 178 | assert enumerate_tautomers_smiles('O=c1nc2[nH]ccn2cc1') == {'O=c1ccn2cc[nH]c2n1', 'Oc1ccn2ccnc2n1', 'O=c1ccn2ccnc2[nH]1'} 179 | 180 | 181 | def 
test_1_5_aromatic_heteroatom_enumeration11(): 182 | """1,5 aromatic heteroatom H shift""" 183 | assert enumerate_tautomers_smiles('N=c1nc[nH]cc1') == {'N=c1cc[nH]cn1', 'N=c1ccnc[nH]1', 'Nc1ccncn1'} 184 | 185 | 186 | def test_1_5_aromatic_heteroatom_enumeration12(): 187 | """1,5 aromatic heteroatom H shift""" 188 | assert enumerate_tautomers_smiles('N=c(c1)ccn2cc[nH]c12') == {'N=c1ccn2cc[nH]c2c1', 'Nc1ccn2ccnc2c1'} 189 | 190 | 191 | def test_1_5_aromatic_heteroatom_enumeration13(): 192 | """1,5 aromatic heteroatom H shift""" 193 | assert enumerate_tautomers_smiles('CN=c1nc[nH]cc1') == {'CN=c1ccnc[nH]1', 'CNc1ccncn1', 'CN=c1cc[nH]cn1'} 194 | 195 | 196 | def test_1_7_aromatic_heteroatom_enumeration(): 197 | """1,7 aromatic heteroatom H shift""" 198 | assert enumerate_tautomers_smiles('c1ccc2[nH]c(-c3nc4ccccc4[nH]3)nc2c1') == {'c1ccc2[nH]c(-c3nc4ccccc4[nH]3)nc2c1', 'c1ccc2c(c1)=NC(c1nc3ccccc3[nH]1)N=2', 'c1ccc2c(c1)NC(=C1N=c3ccccc3=N1)N2'} 199 | 200 | 201 | def test_1_7_aromatic_heteroatom_enumeration2(): 202 | """1,7 aromatic heteroatom H shift""" 203 | assert enumerate_tautomers_smiles('c1ccc2c(c1)NC(=C1N=c3ccccc3=N1)N2') == {'c1ccc2[nH]c(-c3nc4ccccc4[nH]3)nc2c1', 'c1ccc2c(c1)=NC(c1nc3ccccc3[nH]1)N=2', 'c1ccc2c(c1)NC(=C1N=c3ccccc3=N1)N2'} 204 | 205 | 206 | def test_1_9_aromatic_heteroatom_enumeration(): 207 | """1,9 aromatic heteroatom H shift""" 208 | assert enumerate_tautomers_smiles('CNc1ccnc2ncnn21') == {'CN=c1cc[nH]c2ncnn12', 'CN=c1ccnc2nc[nH]n12', 'CN=c1ccnc2[nH]cnn12', 'CNc1ccnc2ncnn12'} 209 | 210 | 211 | def test_1_9_aromatic_heteroatom_enumeration2(): 212 | """1,9 aromatic heteroatom H shift""" 213 | assert enumerate_tautomers_smiles('CN=c1ccnc2nc[nH]n21') == {'CN=c1ccnc2nc[nH]n12', 'CN=c1cc[nH]c2ncnn12', 'CN=c1ccnc2[nH]cnn12', 'CNc1ccnc2ncnn12'} 214 | 215 | 216 | def test_1_11_aromatic_heteroatom_enumeration(): 217 | """1,11 aromatic heteroatom H shift""" 218 | assert enumerate_tautomers_smiles('Nc1ccc(C=C2C=CC(=O)C=C2)cc1') == 
{'Nc1ccc(C=C2C=CC(=O)C=C2)cc1', 'N=C1C=CC(=CC2C=CC(=O)C=C2)C=C1', 'N=C1C=CC(=Cc2ccc(O)cc2)C=C1', 'N=C1C=CC(C=C2C=CC(=O)C=C2)C=C1'} 219 | 220 | 221 | def test_1_11_aromatic_heteroatom_enumeration2(): 222 | """1,11 aromatic heteroatom H shift""" 223 | assert enumerate_tautomers_smiles('N=C1C=CC(=Cc2ccc(O)cc2)C=C1') == {'Nc1ccc(C=C2C=CC(=O)C=C2)cc1', 'N=C1C=CC(=CC2C=CC(=O)C=C2)C=C1', 'N=C1C=CC(=Cc2ccc(O)cc2)C=C1', 'N=C1C=CC(C=C2C=CC(=O)C=C2)C=C1'} 224 | 225 | 226 | def test_heterocyclic_enumeration(): 227 | """heterocyclic tautomer""" 228 | assert enumerate_tautomers_smiles('n1ccc2ccc[nH]c12') == {'c1c[nH]c2nccc-2c1', 'c1cnc2[nH]ccc2c1'} 229 | 230 | 231 | def test_heterocyclic_enumeration2(): 232 | """heterocyclic tautomer""" 233 | assert enumerate_tautomers_smiles('c1cc(=O)[nH]c2nccn12') == {'O=c1ccn2cc[nH]c2n1', 'Oc1ccn2ccnc2n1', 'O=c1ccn2ccnc2[nH]1'} 234 | 235 | 236 | def test_heterocyclic_enumeration3(): 237 | """heterocyclic tautomer""" 238 | assert enumerate_tautomers_smiles('c1cnc2c[nH]ccc12') == {'c1cc2cc[nH]c2cn1', 'c1cc2cc[nH]cc-2n1'} 239 | 240 | 241 | def test_heterocyclic_enumeration4(): 242 | """heterocyclic tautomer""" 243 | assert enumerate_tautomers_smiles('n1ccc2c[nH]ccc12') == {'c1cc2[nH]ccc2cn1', 'c1cc2c[nH]ccc-2n1'} 244 | 245 | 246 | def test_heterocyclic_enumeration5(): 247 | """heterocyclic tautomer""" 248 | assert enumerate_tautomers_smiles('c1cnc2ccc[nH]c12') == {'c1c[nH]c2ccnc-2c1', 'c1cnc2cc[nH]c2c1'} 249 | 250 | 251 | def test_furanone_enumeration(): 252 | """furanone tautomer""" 253 | assert enumerate_tautomers_smiles('C1=CC=C(O1)O') == {'Oc1ccco1', 'O=C1CC=CO1'} 254 | 255 | 256 | def test_furanone_enumeration2(): 257 | """furanone tautomer""" 258 | assert enumerate_tautomers_smiles('O=C1CC=CO1') == {'Oc1ccco1', 'O=C1CC=CO1'} 259 | 260 | 261 | def test_keten_ynol_enumeration(): 262 | """ketene/ynol tautomer""" 263 | assert enumerate_tautomers_smiles('CC=C=O') == {'CC=C=O', 'CC#CO'} 264 | 265 | 266 | def test_keten_ynol_enumeration2(): 267 | 
"""ketene/ynol tautomer""" 268 | assert enumerate_tautomers_smiles('CC#CO') == {'CC=C=O', 'CC#CO'} 269 | 270 | 271 | def test_ionic_nitro_aci_nitro_enumeration(): 272 | """ionic nitro/aci-nitro tautomer""" 273 | assert enumerate_tautomers_smiles('C([N+](=O)[O-])C') == {'CC[N+](=O)[O-]', 'CC=[N+]([O-])O'} 274 | 275 | 276 | def test_ionic_nitro_aci_nitro_enumeration2(): 277 | """ionic nitro/aci-nitro tautomer""" 278 | assert enumerate_tautomers_smiles('C(=[N+](O)[O-])C') == {'CC[N+](=O)[O-]', 'CC=[N+]([O-])O'} 279 | 280 | 281 | def test_oxim_nitroso_enumeration(): 282 | """oxime/nitroso tautomer""" 283 | assert enumerate_tautomers_smiles('CC(C)=NO') == {'CC(C)N=O', 'CC(C)=NO', 'C=C(C)NO'} 284 | 285 | 286 | def test_oxim_nitroso_enumeration2(): 287 | """oxime/nitroso tautomer""" 288 | assert enumerate_tautomers_smiles('CC(C)N=O') == {'CC(C)N=O', 'CC(C)=NO', 'C=C(C)NO'} 289 | 290 | 291 | def test_oxim_nitroso_enumeration3(): 292 | """oxime/nitroso tautomer via phenol""" 293 | assert enumerate_tautomers_smiles('O=Nc1ccc(O)cc1') == {'O=NC1C=CC(=O)C=C1', 'O=C1C=CC(=NO)C=C1', 'O=Nc1ccc(O)cc1'} 294 | 295 | 296 | def test_oxim_nitroso_enumeration4(): 297 | """oxime/nitroso tautomer via phenol""" 298 | assert enumerate_tautomers_smiles('O=C1C=CC(=NO)C=C1') == {'O=NC1C=CC(=O)C=C1', 'O=C1C=CC(=NO)C=C1', 'O=Nc1ccc(O)cc1'} 299 | 300 | 301 | def test_cyano_iso_cyanic_acid_enumeration(): 302 | """cyano/iso-cyanic acid tautomer""" 303 | assert enumerate_tautomers_smiles('C(#N)O') == {'N#CO', 'N=C=O'} 304 | 305 | 306 | def test_cyano_iso_cyanic_acid_enumeration2(): 307 | """cyano/iso-cyanic acid tautomer""" 308 | assert enumerate_tautomers_smiles('C(=N)=O') == {'N#CO', 'N=C=O'} 309 | 310 | 311 | # TODO: Can't get my head around these... 
312 | # def test_formamidinesulfinic_acid_enumeration(): 313 | # """formamidinesulfinic acid tautomer""" 314 | # assert enumerate_tautomers_smiles('NC(N)=S(=O)=O') == {'N=C(N)S(=O)O', 'N[C](N)[S](=O)=O'} 315 | # 316 | # 317 | # def test_formamidinesulfinic_acid_enumeration2(): 318 | # """formamidinesulfinic acid tautomer""" 319 | # assert enumerate_tautomers_smiles('N=C(N)S(=O)O') == {'N=C(N)S(=O)O', 'N[C](N)[S](=O)=O'} 320 | 321 | 322 | def test_isocyanide_enumeration(): 323 | """isocyanide tautomer""" 324 | assert enumerate_tautomers_smiles('C#N') == {'[C-]#[NH+]', 'C#N'} 325 | 326 | 327 | def test_isocyanide_enumeration2(): 328 | """isocyanide tautomer""" 329 | assert enumerate_tautomers_smiles('[C-]#[NH+]') == {'[C-]#[NH+]', 'C#N'} 330 | 331 | 332 | def test_phosphonic_acid_enumeration(): 333 | """phosphonic acid tautomer""" 334 | assert enumerate_tautomers_smiles('[PH](=O)(O)(O)') == {'OP(O)O', 'O=[PH](O)O'} 335 | 336 | 337 | def test_phosphonic_acid_enumeration2(): 338 | """phosphonic acid tautomer""" 339 | assert enumerate_tautomers_smiles('P(O)(O)O') == {'OP(O)O', 'O=[PH](O)O'} 340 | 341 | 342 | def test_mobile_double_stereochemistry_enumeration(): 343 | """Remove stereochemistry from mobile double bonds""" 344 | assert enumerate_tautomers_smiles('c1(ccccc1)/C=C(/O)\\C') == {'C=C(O)Cc1ccccc1', 'CC(O)=Cc1ccccc1', 'CC(=O)Cc1ccccc1'} 345 | 346 | 347 | def test_mobile_double_stereochemistry_enumeration2(): 348 | """Remove stereochemistry from mobile double bonds""" 349 | assert enumerate_tautomers_smiles('C/C=C/C(C)=O') == {'C=C(O)C=CC', 'C=CCC(=C)O', 'CC=CC(C)=O', 'C=CCC(C)=O', 'C=CC=C(C)O'} 350 | 351 | 352 | def test_mobile_double_stereochemistry_enumeration3(): 353 | """Remove stereochemistry from mobile double bonds""" 354 | assert enumerate_tautomers_smiles('C/C=C\\C(C)=O') == {'C=C(O)C=CC', 'C=CCC(=C)O', 'CC=CC(C)=O', 'C=CCC(C)=O', 'C=CC=C(C)O'} 355 | 356 | 357 | def test_gaunine_enumeration(): 358 | """Guanine tautomers""" 359 | assert 
enumerate_tautomers_smiles('N1C(N)=NC=2N=CNC2C1=O') == { 360 | 'N=c1[nH]c(=O)c2[nH]cnc2[nH]1', 361 | 'N=c1[nH]c(=O)c2nc[nH]c2[nH]1', 362 | 'N=c1[nH]c2ncnc-2c(O)[nH]1', 363 | 'N=c1nc(O)c2[nH]cnc2[nH]1', 364 | 'N=c1nc(O)c2nc[nH]c2[nH]1', 365 | 'N=c1nc2[nH]cnc2c(O)[nH]1', 366 | 'N=c1nc2nc[nH]c2c(O)[nH]1', 367 | 'Nc1nc(=O)c2[nH]cnc2[nH]1', 368 | 'Nc1nc(=O)c2nc[nH]c2[nH]1', 369 | 'Nc1nc(O)c2[nH]cnc2n1', 370 | 'Nc1nc(O)c2nc[nH]c2n1', 371 | 'Nc1nc(O)c2ncnc-2[nH]1', 372 | 'Nc1nc2[nH]cnc2c(=O)[nH]1', 373 | 'Nc1nc2nc[nH]c2c(=O)[nH]1', 374 | 'Nc1nc2ncnc-2c(O)[nH]1' 375 | } 376 | 377 | 378 | def test_many_enumeration(): 379 | """Test a structure with hundreds of tautomers.""" 380 | assert len(enumerate_tautomers_smiles('[H][C](CO)(NC(=O)C1=C(O)C(O)=CC=C1)C(O)=O')) == 375 381 | 382 | 383 | def test_1_3_keto_enol_canonicalization(): 384 | """1,3 keto/enol tautomer""" 385 | assert canonicalize_tautomer_smiles('C1(=CCCCC1)O') == 'O=C1CCCCC1' 386 | 387 | 388 | def test_1_3_keto_enol_canonicalization2(): 389 | """1,3 keto/enol tautomer""" 390 | assert canonicalize_tautomer_smiles('C1(CCCCC1)=O') == 'O=C1CCCCC1' 391 | 392 | 393 | def test_acetophenone_keto_enol_canonicalization(): 394 | """Acetophenone keto/enol tautomer""" 395 | assert canonicalize_tautomer_smiles('C(=C)(O)C1=CC=CC=C1') == 'CC(=O)c1ccccc1' 396 | 397 | 398 | def test_acetone_keto_enol_canonicalization(): 399 | """Acetone keto/enol tautomer""" 400 | assert canonicalize_tautomer_smiles('CC(C)=O') == 'CC(C)=O' 401 | 402 | 403 | def test_keto_enol_canonicalization(): 404 | """keto/enol tautomer""" 405 | assert canonicalize_tautomer_smiles('OC(C)=C(C)C') == 'CC(=O)C(C)C' 406 | 407 | 408 | def test_phenylpropanone_keto_enol_canonicalization(): 409 | """1-phenyl-2-propanone enol/keto""" 410 | assert canonicalize_tautomer_smiles('c1(ccccc1)CC(=O)C') == 'CC(=O)Cc1ccccc1' 411 | 412 | 413 | def test_1_5_keto_enol_canonicalization(): 414 | """1,5 keto/enol tautomer""" 415 | assert 
canonicalize_tautomer_smiles('Oc1nccc2cc[nH]c(=N)c12') == 'N=c1[nH]ccc2cc[nH]c(=O)c12' 416 | 417 | 418 | def test_1_5_keto_enol_canonicalization2(): 419 | """1,5 keto/enol tautomer""" 420 | assert canonicalize_tautomer_smiles('C1(C=CCCC1)=O') == 'O=C1C=CCCC1' 421 | 422 | 423 | def test_1_5_keto_enol_canonicalization3(): 424 | """1,5 keto/enol tautomer""" 425 | assert canonicalize_tautomer_smiles('C1(=CC=CCC1)O') == 'O=C1C=CCCC1' 426 | 427 | 428 | def test_aliphatic_imine_canonicalization(): 429 | """aliphatic imine tautomer""" 430 | assert canonicalize_tautomer_smiles('C1(CCCCC1)=N') == 'N=C1CCCCC1' 431 | 432 | 433 | def test_aliphatic_imine_canonicalization2(): 434 | """aliphatic imine tautomer""" 435 | assert canonicalize_tautomer_smiles('C1(=CCCCC1)N') == 'N=C1CCCCC1' 436 | 437 | 438 | def test_special_imine_canonicalization(): 439 | """special imine tautomer""" 440 | assert canonicalize_tautomer_smiles('C1(C=CC=CN1)=CC') == 'CCc1ccccn1' 441 | 442 | 443 | def test_special_imine_canonicalization2(): 444 | """special imine tautomer""" 445 | assert canonicalize_tautomer_smiles('C1(=NC=CC=C1)CC') == 'CCc1ccccn1' 446 | 447 | 448 | def test_1_3_aromatic_heteroatom_canonicalization(): 449 | """1,3 aromatic heteroatom H shift""" 450 | assert canonicalize_tautomer_smiles('O=c1cccc[nH]1') == 'O=c1cccc[nH]1' 451 | 452 | 453 | def test_1_3_aromatic_heteroatom_canonicalization2(): 454 | """1,3 aromatic heteroatom H shift""" 455 | assert canonicalize_tautomer_smiles('Oc1ccccn1') == 'O=c1cccc[nH]1' 456 | 457 | 458 | def test_1_3_aromatic_heteroatom_canonicalization3(): 459 | """1,3 aromatic heteroatom H shift""" 460 | assert canonicalize_tautomer_smiles('Oc1ncc[nH]1') == 'O=c1[nH]cc[nH]1' 461 | 462 | 463 | def test_1_3_heteroatom_canonicalization(): 464 | """1,3 heteroatom H shift""" 465 | assert canonicalize_tautomer_smiles('OC(C)=NC') == 'CNC(C)=O' 466 | 467 | 468 | def test_1_3_heteroatom_canonicalization2(): 469 | """1,3 heteroatom H shift""" 470 | assert 
canonicalize_tautomer_smiles('CNC(C)=O') == 'CNC(C)=O' 471 | 472 | 473 | def test_1_3_heteroatom_canonicalization3(): 474 | """1,3 heteroatom H shift""" 475 | assert canonicalize_tautomer_smiles('S=C(N)N') == 'NC(N)=S' 476 | 477 | 478 | def test_1_3_heteroatom_canonicalization4(): 479 | """1,3 heteroatom H shift""" 480 | assert canonicalize_tautomer_smiles('SC(N)=N') == 'NC(N)=S' 481 | 482 | 483 | def test_1_3_heteroatom_canonicalization5(): 484 | """1,3 heteroatom H shift""" 485 | assert canonicalize_tautomer_smiles('N=c1[nH]ccn(C)1') == 'Cn1cc[nH]c1=N' 486 | 487 | 488 | def test_1_3_heteroatom_canonicalization6(): 489 | """1,3 heteroatom H shift""" 490 | assert canonicalize_tautomer_smiles('CN=c1[nH]cncc1') == 'CN=c1cc[nH]cn1' 491 | 492 | 493 | def test_1_5_aromatic_heteroatom_canonicalization(): 494 | """1,5 aromatic heteroatom H shift""" 495 | assert canonicalize_tautomer_smiles('Oc1cccc2ccncc12') == 'Oc1cccc2ccncc12' 496 | 497 | 498 | def test_1_5_aromatic_heteroatom_canonicalization2(): 499 | """1,5 aromatic heteroatom H shift""" 500 | assert canonicalize_tautomer_smiles('O=c1cccc2cc[nH]cc1-2') == 'Oc1cccc2ccncc12' 501 | 502 | 503 | def test_1_5_aromatic_heteroatom_canonicalization3(): 504 | """1,5 aromatic heteroatom H shift""" 505 | assert canonicalize_tautomer_smiles('Cc1n[nH]c2ncnn12') == 'Cc1n[nH]c2ncnn12' 506 | 507 | 508 | def test_1_5_aromatic_heteroatom_canonicalization4(): 509 | """1,5 aromatic heteroatom H shift""" 510 | assert canonicalize_tautomer_smiles('Cc1nnc2nc[nH]n12') == 'Cc1n[nH]c2ncnn12' 511 | 512 | 513 | def test_1_5_aromatic_heteroatom_canonicalization5(): 514 | """1,5 aromatic heteroatom H shift""" 515 | assert canonicalize_tautomer_smiles('Oc1ccncc1') == 'O=c1cc[nH]cc1' 516 | 517 | 518 | def test_1_5_aromatic_heteroatom_canonicalization6(): 519 | """1,5 aromatic heteroatom H shift""" 520 | assert canonicalize_tautomer_smiles('Oc1c(cccc3)c3nc2ccncc12') == 'O=c1c2ccccc2[nH]c2ccncc12' 521 | 522 | 523 | def 
test_1_3_1_5_aromatic_heteroatom_canonicalization(): 524 | """1,3 and 1,5 aromatic heteroatom H shift""" 525 | assert canonicalize_tautomer_smiles('Oc1ncncc1') == 'O=c1cc[nH]cn1' 526 | 527 | 528 | def test_1_5_aromatic_heteroatom_canonicalization7(): 529 | """1,5 aromatic heteroatom H shift""" 530 | assert canonicalize_tautomer_smiles('C2(=C1C(=NC=N1)[NH]C(=N2)N)O') == 'N=c1[nH]c(=O)c2[nH]cnc2[nH]1' 531 | 532 | 533 | def test_1_5_aromatic_heteroatom_canonicalization8(): 534 | """1,5 aromatic heteroatom H shift""" 535 | assert canonicalize_tautomer_smiles('C2(C1=C([NH]C=N1)[NH]C(=N2)N)=O') == 'N=c1[nH]c(=O)c2[nH]cnc2[nH]1' 536 | 537 | 538 | def test_1_5_aromatic_heteroatom_canonicalization9(): 539 | """1,5 aromatic heteroatom H shift""" 540 | assert canonicalize_tautomer_smiles('Oc1n(C)ncc1') == 'Cn1[nH]ccc1=O' 541 | 542 | 543 | def test_1_5_aromatic_heteroatom_canonicalization10(): 544 | """1,5 aromatic heteroatom H shift""" 545 | assert canonicalize_tautomer_smiles('O=c1nc2[nH]ccn2cc1') == 'O=c1ccn2cc[nH]c2n1' 546 | 547 | 548 | def test_1_5_aromatic_heteroatom_canonicalization11(): 549 | """1,5 aromatic heteroatom H shift""" 550 | assert canonicalize_tautomer_smiles('N=c1nc[nH]cc1') == 'N=c1cc[nH]cn1' 551 | 552 | 553 | def test_1_5_aromatic_heteroatom_canonicalization12(): 554 | """1,5 aromatic heteroatom H shift""" 555 | assert canonicalize_tautomer_smiles('N=c(c1)ccn2cc[nH]c12') == 'N=c1ccn2cc[nH]c2c1' 556 | 557 | 558 | def test_1_5_aromatic_heteroatom_canonicalization13(): 559 | """1,5 aromatic heteroatom H shift""" 560 | assert canonicalize_tautomer_smiles('CN=c1nc[nH]cc1') == 'CN=c1cc[nH]cn1' 561 | 562 | 563 | def test_1_7_aromatic_heteroatom_canonicalization(): 564 | """1,7 aromatic heteroatom H shift""" 565 | assert canonicalize_tautomer_smiles('c1ccc2[nH]c(-c3nc4ccccc4[nH]3)nc2c1') == 'c1ccc2[nH]c(-c3nc4ccccc4[nH]3)nc2c1' 566 | 567 | 568 | def test_1_7_aromatic_heteroatom_canonicalization2(): 569 | """1,7 aromatic heteroatom H shift""" 570 | assert 
canonicalize_tautomer_smiles('c1ccc2c(c1)NC(=C1N=c3ccccc3=N1)N2') == 'c1ccc2[nH]c(-c3nc4ccccc4[nH]3)nc2c1' 571 | 572 | 573 | def test_1_9_aromatic_heteroatom_canonicalization(): 574 | """1,9 aromatic heteroatom H shift""" 575 | assert canonicalize_tautomer_smiles('CNc1ccnc2ncnn21') == 'CN=c1cc[nH]c2ncnn12' 576 | 577 | 578 | def test_1_9_aromatic_heteroatom_canonicalization2(): 579 | """1,9 aromatic heteroatom H shift""" 580 | assert canonicalize_tautomer_smiles('CN=c1ccnc2nc[nH]n21') == 'CN=c1cc[nH]c2ncnn12' 581 | 582 | 583 | def test_1_11_aromatic_heteroatom_canonicalization(): 584 | """1,11 aromatic heteroatom H shift""" 585 | assert canonicalize_tautomer_smiles('Nc1ccc(C=C2C=CC(=O)C=C2)cc1') == 'Nc1ccc(C=C2C=CC(=O)C=C2)cc1' 586 | 587 | 588 | def test_1_11_aromatic_heteroatom_canonicalization2(): 589 | """1,11 aromatic heteroatom H shift""" 590 | assert canonicalize_tautomer_smiles('N=C1C=CC(=Cc2ccc(O)cc2)C=C1') == 'Nc1ccc(C=C2C=CC(=O)C=C2)cc1' 591 | 592 | 593 | def test_heterocyclic_canonicalization(): 594 | """heterocyclic tautomer""" 595 | assert canonicalize_tautomer_smiles('n1ccc2ccc[nH]c12') == 'c1cnc2[nH]ccc2c1' 596 | 597 | 598 | def test_heterocyclic_canonicalization2(): 599 | """heterocyclic tautomer""" 600 | assert canonicalize_tautomer_smiles('c1cc(=O)[nH]c2nccn12') == 'O=c1ccn2cc[nH]c2n1' 601 | 602 | 603 | def test_heterocyclic_canonicalization3(): 604 | """heterocyclic tautomer""" 605 | assert canonicalize_tautomer_smiles('c1cnc2c[nH]ccc12') == 'c1cc2cc[nH]c2cn1' 606 | 607 | 608 | def test_heterocyclic_canonicalization4(): 609 | """heterocyclic tautomer""" 610 | assert canonicalize_tautomer_smiles('n1ccc2c[nH]ccc12') == 'c1cc2[nH]ccc2cn1' 611 | 612 | 613 | def test_heterocyclic_canonicalization5(): 614 | """heterocyclic tautomer""" 615 | assert canonicalize_tautomer_smiles('c1cnc2ccc[nH]c12') == 'c1cnc2cc[nH]c2c1' 616 | 617 | 618 | def test_furanone_canonicalization(): 619 | """furanone tautomer""" 620 | assert 
canonicalize_tautomer_smiles('C1=CC=C(O1)O') == 'Oc1ccco1' 621 | 622 | 623 | def test_furanone_canonicalization2(): 624 | """furanone tautomer""" 625 | assert canonicalize_tautomer_smiles('O=C1CC=CO1') == 'Oc1ccco1' 626 | 627 | 628 | def test_keten_ynol_canonicalization(): 629 | """ketene/ynol tautomer""" 630 | assert canonicalize_tautomer_smiles('CC=C=O') == 'CC=C=O' 631 | 632 | 633 | def test_keten_ynol_canonicalization2(): 634 | """ketene/ynol tautomer""" 635 | assert canonicalize_tautomer_smiles('CC#CO') == 'CC=C=O' 636 | 637 | 638 | def test_ionic_nitro_aci_nitro_canonicalization(): 639 | """ionic nitro/aci-nitro tautomer""" 640 | assert canonicalize_tautomer_smiles('C([N+](=O)[O-])C') == 'CC[N+](=O)[O-]' 641 | 642 | 643 | def test_ionic_nitro_aci_nitro_canonicalization2(): 644 | """ionic nitro/aci-nitro tautomer""" 645 | assert canonicalize_tautomer_smiles('C(=[N+](O)[O-])C') == 'CC[N+](=O)[O-]' 646 | 647 | 648 | def test_oxim_nitroso_canonicalization(): 649 | """oxime/nitroso tautomer""" 650 | assert canonicalize_tautomer_smiles('CC(C)=NO') == 'CC(C)=NO' 651 | 652 | 653 | def test_oxim_nitroso_canonicalization2(): 654 | """oxime/nitroso tautomer""" 655 | assert canonicalize_tautomer_smiles('CC(C)N=O') == 'CC(C)=NO' 656 | 657 | 658 | def test_oxim_nitroso_phenol_canonicalization(): 659 | """oxime/nitroso tautomer via phenol""" 660 | assert canonicalize_tautomer_smiles('O=Nc1ccc(O)cc1') == 'O=Nc1ccc(O)cc1' 661 | 662 | 663 | def test_oxim_nitroso_phenol_canonicalization2(): 664 | """oxime/nitroso tautomer via phenol""" 665 | assert canonicalize_tautomer_smiles('O=C1C=CC(=NO)C=C1') == 'O=Nc1ccc(O)cc1' 666 | 667 | 668 | def test_cyano_iso_cyanic_acid_canonicalization(): 669 | """cyano/iso-cyanic acid tautomer""" 670 | assert canonicalize_tautomer_smiles('C(#N)O') == 'N=C=O' 671 | 672 | 673 | def test_cyano_iso_cyanic_acid_canonicalization2(): 674 | """cyano/iso-cyanic acid tautomer""" 675 | assert canonicalize_tautomer_smiles('C(=N)=O') == 'N=C=O' 676 | 677 | 678 | # 
def test_formamidinesulfinic_acid_canonicalization(): 679 | # """formamidinesulfinic acid tautomer""" 680 | # assert canonicalize_tautomer_smiles('N[C](N)[S](=O)=O') == 'N=C(N)S(=O)O' 681 | 682 | 683 | def test_formamidinesulfinic_acid_canonicalization2(): 684 | """formamidinesulfinic acid tautomer""" 685 | assert canonicalize_tautomer_smiles('N=C(N)S(=O)O') == 'N=C(N)S(=O)O' 686 | 687 | 688 | def test_isocyanide_canonicalization(): 689 | """isocyanide tautomer""" 690 | assert canonicalize_tautomer_smiles('C#N') == 'C#N' 691 | 692 | 693 | def test_isocyanide_canonicalization2(): 694 | """isocyanide tautomer""" 695 | assert canonicalize_tautomer_smiles('[C-]#[NH+]') == 'C#N' 696 | 697 | 698 | def test_phosphonic_acid_canonicalization(): 699 | """phosphonic acid tautomer""" 700 | assert canonicalize_tautomer_smiles('[PH](=O)(O)(O)') == 'O=[PH](O)O' 701 | 702 | 703 | def test_phosphonic_acid_canonicalization2(): 704 | """phosphonic acid tautomer""" 705 | assert canonicalize_tautomer_smiles('P(O)(O)O') == 'O=[PH](O)O' 706 | 707 | 708 | # TODO: An example that requires resonance/protonation enumeration as well as tautomer 709 | # If passed to standardizer, the local symmetry means that either N could get +ve charge, regardless of O and S locations 710 | # CN(C)C1=[NH+]C2=CC(=S)C(=O)C=C2N1 711 | # C[N+](C)=C1NC2=CC(=O)C(=S)C=C2N1 712 | -------------------------------------------------------------------------------- /tests/test_validate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """Tests for validate.py""" 4 | 5 | from __future__ import print_function 6 | from __future__ import unicode_literals 7 | from __future__ import division 8 | 9 | from molvs.validate import validate_smiles 10 | 11 | 12 | def test_none(): 13 | """IsNoneValidation should log due to SMILES parse error.""" 14 | assert validate_smiles('3478q439g98h') == ['ERROR: [IsNoneValidation] Molecule is None'] 15 | 16 
| 17 | def test_no_atoms(): 18 | """An empty SMILES produces a mol with no atoms.""" 19 | assert validate_smiles('') == [u'ERROR: [NoAtomValidation] No atoms are present'] 20 | 21 | 22 | def test_fragment(): 23 | """FragmentValidation should identify 1,2-dichloroethane.""" 24 | assert validate_smiles('ClCCCl.c1ccccc1O') == ['INFO: [FragmentValidation] 1,2-dichloroethane is present'] 25 | 26 | 27 | def test_fragment2(): 28 | """FragmentValidation should identify 1,2-dimethoxyethane.""" 29 | assert validate_smiles('COCCOC.CCCBr') == ['INFO: [FragmentValidation] 1,2-dimethoxyethane is present'] 30 | 31 | 32 | def test_charge(): 33 | """NeutralValidation should identify net overall charge.""" 34 | assert validate_smiles('O=C([O-])c1ccccc1') == ['INFO: [NeutralValidation] Not an overall neutral system (-1)'] 35 | assert validate_smiles('CN=[NH+]CN=N') == ['INFO: [NeutralValidation] Not an overall neutral system (+1)'] 36 | 37 | 38 | def test_isotope(): 39 | """IsotopeValidation should identify atoms with isotope labels.""" 40 | assert validate_smiles('[13CH4]') == ['INFO: [IsotopeValidation] Molecule contains isotope 13C'] 41 | assert validate_smiles('[2H]C(Cl)(Cl)Cl') == ['INFO: [IsotopeValidation] Molecule contains isotope 2H'] 42 | assert validate_smiles('[2H]OC([2H])([2H])[2H]') == ['INFO: [IsotopeValidation] Molecule contains isotope 2H'] 43 | --------------------------------------------------------------------------------